In [2]:
# Data wrangling libraries
import numpy as np
import pandas as pd
import os

# Data visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [4]:
# modelling
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline

In [6]:
# evaluation metrics libraries
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import (precision_score, recall_score, f1_score,accuracy_score)

In [8]:
# Construct the directory containing the CSV files
current_dir = os.getcwd()  # Get the current working directory
data_dir = os.path.join(current_dir, '..', 'data') 

# List of file names to read
file_names = ['wholerock_cleaned.csv', 'globaL_df.csv']

# Initialize a dictionary to store DataFrames
dataframes = {}

# Loop through the file names, read each file, and store in the dictionary
for file_name in file_names:
    file_path = os.path.join(data_dir, file_name)  # Construct the full file path
    dataframes[file_name] = pd.read_csv(file_path, index_col=0)  # Read the CSV file and store in the dictionary

# Access the DataFrames using their file names
wholerock_df = dataframes['wholerock_cleaned.csv']
global_df = dataframes['globaL_df.csv']

In [10]:
wholerock_df.columns

Index(['sampno', 'sio2', 'tio2', 'al2o3', 'fe2o3', 'mgo', 'cao', 'mno', 'k2o',
       'p2o5', 'rock_type'],
      dtype='object')

In [12]:
wholerock_df

Unnamed: 0,sampno,sio2,tio2,al2o3,fe2o3,mgo,cao,mno,k2o,p2o5,rock_type
79,2158877,68.28,0.20,14.0,3.31,0.48,0.30,0.216888,3.43,0.126,granite
81,2158878,74.74,0.11,13.7,1.53,0.38,0.32,0.090370,2.95,0.133,granite
82,2158879,74.20,0.15,14.0,1.57,0.46,0.40,0.065841,3.47,0.120,granite
83,2158880,73.74,0.22,14.4,1.41,0.43,0.43,0.051640,3.37,0.126,granite
84,2158881,74.67,0.06,13.4,1.17,0.30,0.35,0.058095,3.96,0.152,granite
...,...,...,...,...,...,...,...,...,...,...,...
602,2159249,51.95,0.88,17.0,9.58,2.56,4.40,0.374390,3.74,0.463,diorite
604,2159250,69.05,0.37,14.6,2.72,1.20,1.73,0.080042,4.09,0.165,granite
606,2159251,69.41,0.34,14.6,2.65,1.21,1.31,0.087788,4.52,0.258,granite
607,2159252,60.23,0.44,15.9,5.79,1.65,3.13,0.251745,4.48,0.189,diorite


In [20]:
X_train = global_df[['sio2', 'tio2', 'al2o3', 'fe2o3', 'mgo', 'cao', 'mno', 'k2o', 'p2o5']]
y_train = global_df['rock_type']

In [22]:
X_wholerock = wholerock_df.drop(['rock_type', 'sampno'], axis=1)
y_wholerock = wholerock_df['rock_type']

In [24]:
pipelines = {
    'Logistic Regression': Pipeline([
        ('classifier', LogisticRegression(max_iter=1000))
    ]),
    'Random Forest': Pipeline([
        ('classifier', RandomForestClassifier(random_state=13))
    ]),
    'Decision Tree': Pipeline([
        ('classifier', DecisionTreeClassifier(random_state=13))
    ]),
    'KNN': Pipeline([
        ('classifier', KNeighborsClassifier())
    ]),
    'SVM': Pipeline([
        ('classifier', SVC(random_state=13))
    ]),
    'Gradient Boosting': Pipeline([
        ('classifier', GradientBoostingClassifier(random_state=13))
    ])
}

# Train and evaluate each model
for model_name, pipeline in pipelines.items():
    print(f"Evaluating {model_name}...")
    
    # Train the model on the global dataset
    pipeline.fit(X_train, y_train)
    
    # Evaluate on the validation set
    predictions = pipeline.predict(X_wholerock)
    print(f"Validation Classification Report for {model_name}:")
    print(classification_report(y_wholerock, predictions))
    
    # Confusion matrix for validation set
    print(f"Confusion Matrix for {model_name}:")
    print(confusion_matrix(y_wholerock, predictions))
    
    # Hyperparameter tuning 
    # Note: You can add hyperparameter tuning as needed for each model
    
    # Final evaluation on the test set with hyperparameter tuning
#     y_test_pred = pipeline.predict(X_test)
#     print(f"Test Classification Report for {model_name}:")
#     print(classification_report(y_test, y_test_pred))
    
#     # Confusion matrix for test set
#     print(f"Confusion Matrix for {model_name}:")
#     print(confusion_matrix(y_test, y_test_pred))
    
    print("------------------------------------------")

Evaluating Logistic Regression...
Validation Classification Report for Logistic Regression:
              precision    recall  f1-score   support

     diorite       0.67      0.05      0.09       201
     granite       0.24      0.92      0.38        66

    accuracy                           0.27       267
   macro avg       0.45      0.49      0.24       267
weighted avg       0.56      0.27      0.16       267

Confusion Matrix for Logistic Regression:
[[ 10 191]
 [  5  61]]
------------------------------------------
Evaluating Random Forest...
Validation Classification Report for Random Forest:
              precision    recall  f1-score   support

     diorite       0.57      0.04      0.07       201
     granite       0.24      0.91      0.38        66

    accuracy                           0.25       267
   macro avg       0.40      0.47      0.23       267
weighted avg       0.49      0.25      0.15       267

Confusion Matrix for Random Forest:
[[  8 193]
 [  6  60]]
-------