In [2]:
# Data wrangling libraries
import numpy as np
import pandas as pd
import os

# Data visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [31]:
# modelling
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [33]:
# evaluation metrics libraries
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import (precision_score, recall_score, f1_score,accuracy_score)

In [10]:
# Locate the data folder: a sibling 'data' directory one level above the
# notebook's current working directory
current_dir = os.getcwd()
data_dir = os.path.join(current_dir, '..', 'data')

# CSV files to load from the data folder
file_names = ['global_df.csv']

# Read every listed CSV into a dict keyed by file name; the first CSV column
# is used as the DataFrame index
dataframes = {
    csv_name: pd.read_csv(os.path.join(data_dir, csv_name), index_col=0)
    for csv_name in file_names
}

# Pull out the frame that the cells below work with
global_df = dataframes['global_df.csv']

In [25]:
# Inspect the column names of the loaded frame (displayed as the cell output)
global_df.columns

Index(['sample_id', 'rock_name', 'qap_name', 'major_id', 'method_id', 'sio2',
       'tio2', 'al2o3', 'fe2o3', 'mgo', 'cao', 'mno', 'k2o', 'na2o', 'p2o5',
       'rock_type'],
      dtype='object')

In [12]:
# Display the loaded frame as the cell output to eyeball its rows and values
global_df

Unnamed: 0,sample_id,rock_name,qap_name,major_id,method_id,sio2,tio2,al2o3,fe2o3,mgo,cao,mno,k2o,na2o,p2o5,rock_type
6,51,bt granite,monzogranite,583272,5254,74.05,0.25,13.77,0.29,0.29,1.37,0.05,4.00,3.46,0.06,granite
7,52,coarse-grained bt granite,monzogranite,521229,5254,69.28,0.31,14.91,0.53,0.88,2.23,0.08,4.43,3.15,0.10,granite
153,381,granite,monzogranite,537177,5379,70.57,0.39,14.78,0.40,0.77,2.60,0.09,3.57,3.18,0.11,granite
154,382,granite,monzogranite,565484,5379,72.72,0.11,14.67,0.56,0.21,1.20,0.07,4.63,3.76,0.06,granite
155,383,granite,monzogranite,588241,5379,74.44,0.10,14.07,0.40,0.24,1.24,0.05,4.28,3.60,0.06,granite
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47196,1022064,dyke,monzogranite,526989,1,69.75,0.33,12.11,5.74,0.04,0.09,0.07,5.01,6.13,0.06,granite
47197,1022066,dyke,monzogranite,545737,1,71.22,0.16,12.09,4.62,0.10,0.38,0.07,3.76,6.85,0.04,granite
47198,1022067,dyke,monzogranite,588567,1,74.48,0.18,11.08,3.69,0.03,0.08,0.07,4.25,5.51,0.04,granite
47200,1022087,charnockite,monzogranite,542328,1,70.99,0.50,13.63,1.63,0.42,1.95,0.04,5.27,3.10,0.15,granite


In [27]:
# Predictor columns used as model inputs
feature_columns = [
    'sio2', 'tio2', 'al2o3', 'fe2o3', 'mgo',
    'cao', 'mno', 'k2o', 'na2o', 'p2o5',
]

# Feature matrix and target labels, taken from the full frame
X_train = global_df[feature_columns]
y_train = global_df['rock_type']

In [35]:
# Hold out 30% of the rows for evaluation (fixed seed for reproducibility).
# The split is drawn from global_df itself rather than from X_train/y_train:
# this cell rebinds X_train/y_train, so feeding them back in would re-split an
# already reduced training set every time the cell is re-run. X_train.columns
# is the same before and after splitting, so it is a stable source for the
# feature column list.
X_train, X_test, y_train, y_test = train_test_split(
    global_df[X_train.columns],
    global_df['rock_type'],
    test_size=0.30,
    random_state=13,
)

In [40]:
# One pipeline per candidate classifier, keyed by the display name used in the
# printed reports. Logistic regression, KNN and SVM depend on feature
# magnitudes, so those pipelines standardize the features first; the scaler is
# fit inside the pipeline, i.e. on the training split only. The tree-based
# models are left unscaled.
pipelines = {
    'Logistic Regression': Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', LogisticRegression(max_iter=1000))
    ]),
    'Random Forest': Pipeline([
        ('classifier', RandomForestClassifier(random_state=13))
    ]),
    'Decision Tree': Pipeline([
        ('classifier', DecisionTreeClassifier(random_state=13))
    ]),
    'KNN': Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', KNeighborsClassifier())
    ]),
    'SVM': Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', SVC(random_state=13))
    ]),
    'Gradient Boosting': Pipeline([
        ('classifier', GradientBoostingClassifier(random_state=13))
    ])
}

# Train and evaluate each model
for model_name, pipeline in pipelines.items():
    print(f"Evaluating {model_name}...")

    # Fit on the training split
    pipeline.fit(X_train, y_train)

    # Predict on the held-out split (X_test / y_test). The printed heading
    # calls it "Validation", but this is the only evaluation split in use here.
    predictions = pipeline.predict(X_test)
    print(f"Validation Classification Report for {model_name}:")
    print(classification_report(y_test, predictions))

    # Confusion matrix on the same held-out split
    print(f"Confusion Matrix for {model_name}:")
    print(confusion_matrix(y_test, predictions))

    # TODO: hyperparameter tuning is not implemented for any of the models yet

    print("------------------------------------------")

Evaluating Logistic Regression...
Validation Classification Report for Logistic Regression:
              precision    recall  f1-score   support

     diorite       1.00      1.00      1.00       911
     granite       1.00      1.00      1.00      6461

    accuracy                           1.00      7372
   macro avg       1.00      1.00      1.00      7372
weighted avg       1.00      1.00      1.00      7372

Confusion Matrix for Logistic Regression:
[[ 911    0]
 [   1 6460]]
------------------------------------------
Evaluating Random Forest...
Validation Classification Report for Random Forest:
              precision    recall  f1-score   support

     diorite       1.00      1.00      1.00       911
     granite       1.00      1.00      1.00      6461

    accuracy                           1.00      7372
   macro avg       1.00      1.00      1.00      7372
weighted avg       1.00      1.00      1.00      7372

Confusion Matrix for Random Forest:
[[ 910    1]
 [   0 6461]]