In [1]:
# Data wrangling libraries
import numpy as np
import pandas as pd
import os

# Data visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [2]:
# modelling
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline

In [3]:
# evaluation metrics libraries
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import (precision_score, recall_score, f1_score,accuracy_score)

In [4]:
# The CSV files live in a `data` folder one level above the current working directory.
current_dir = os.getcwd()
data_dir = os.path.join(current_dir, '..', 'data')

# Files to load; the first column of each CSV is used as the DataFrame index.
file_names = ['df_xrf.csv', 'globaL_df.csv']

# Read every file once and keep the resulting DataFrames keyed by file name.
dataframes = {
    csv_name: pd.read_csv(os.path.join(data_dir, csv_name), index_col=0)
    for csv_name in file_names
}

# Convenience handles for the two datasets used throughout the notebook.
xrf_df, global_df = dataframes['df_xrf.csv'], dataframes['globaL_df.csv']

In [27]:
# Display the global dataset read from 'globaL_df.csv' (used below as the training data).
global_df

Unnamed: 0,sample_id,rock_name,qap_name,major_id,method_id,sio2,tio2,al2o3,fe2o3,mgo,cao,mno,k2o,na2o,p2o5,rock_type
6,51,bt granite,monzogranite,583272,5254,74.05,0.25,13.77,0.29,0.29,1.37,0.05,4.00,3.46,0.06,granite
7,52,coarse-grained bt granite,monzogranite,521229,5254,69.28,0.31,14.91,0.53,0.88,2.23,0.08,4.43,3.15,0.10,granite
153,381,granite,monzogranite,537177,5379,70.57,0.39,14.78,0.40,0.77,2.60,0.09,3.57,3.18,0.11,granite
154,382,granite,monzogranite,565484,5379,72.72,0.11,14.67,0.56,0.21,1.20,0.07,4.63,3.76,0.06,granite
155,383,granite,monzogranite,588241,5379,74.44,0.10,14.07,0.40,0.24,1.24,0.05,4.28,3.60,0.06,granite
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47196,1022064,dyke,monzogranite,526989,1,69.75,0.33,12.11,5.74,0.04,0.09,0.07,5.01,6.13,0.06,granite
47197,1022066,dyke,monzogranite,545737,1,71.22,0.16,12.09,4.62,0.10,0.38,0.07,3.76,6.85,0.04,granite
47198,1022067,dyke,monzogranite,588567,1,74.48,0.18,11.08,3.69,0.03,0.08,0.07,4.25,5.51,0.04,granite
47200,1022087,charnockite,monzogranite,542328,1,70.99,0.50,13.63,1.63,0.42,1.95,0.04,5.27,3.10,0.15,granite


In [5]:
# Show which columns the XRF dataset (read from 'df_xrf.csv') provides.
xrf_df.columns

Index(['sio2', 'tio2', 'al2o3', 'fe2o3', 'mgo', 'cao', 'mno', 'k2o', 'p2o5',
       'rock_type'],
      dtype='object')

In [6]:
# Display the XRF dataset read from 'df_xrf.csv' (used below as the test data).
xrf_df

Unnamed: 0,sio2,tio2,al2o3,fe2o3,mgo,cao,mno,k2o,p2o5,rock_type
0,75.970779,0.044775,9.693397,1.079253,0.230089,0.330458,0.131090,3.611102,0.116388,granite
1,92.092015,0.016977,8.000228,0.184410,0.230089,0.403357,0.014549,2.194034,0.254139,granite
2,74.377224,0.033534,6.210955,0.340443,0.230089,0.442249,0.019978,1.760836,0.234403,granite
3,67.898437,0.028367,9.142570,0.211402,0.230089,0.483317,0.009582,3.657696,0.248790,granite
4,70.759319,0.043649,11.508511,0.338909,0.230089,0.391176,0.017399,4.299519,0.149314,granite
...,...,...,...,...,...,...,...,...,...,...
981,61.438481,0.471900,17.247564,7.289305,6.525819,5.012353,0.166892,3.017981,0.103363,diorite
982,64.703119,0.466720,15.681514,6.696700,6.084401,4.954704,0.155655,2.785818,0.102250,diorite
983,61.289781,0.439220,15.330686,7.737864,7.527860,5.540497,0.197714,3.130623,0.115308,diorite
984,56.750064,0.510952,17.576405,8.856410,8.590450,2.356354,0.166797,2.100112,0.149447,diorite


In [7]:
# Oxide columns used as model inputs.
feature_columns = ['sio2', 'tio2', 'al2o3', 'fe2o3', 'mgo', 'cao', 'mno', 'k2o', 'p2o5']

# Training set: features and rock-type labels both come from the global dataset.
X_train = global_df.loc[:, feature_columns]
y_train = global_df.loc[:, 'rock_type']

In [8]:
# Test set: the label column is the target, every remaining XRF column is a feature.
y_test = xrf_df.loc[:, 'rock_type']
X_test = xrf_df.drop(columns=['rock_type'])

In [17]:
# Standardise the features; the scaler is fitted on the training data only.
scaler = StandardScaler()

# Fit and scale the training set in one step, then reuse the fitted scaler
# for the test set.
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Print both scaled arrays for a quick visual check.
print("Scaled Training Data:\n", X_train_scaled)
print("Scaled Test Data:\n", X_test_scaled)

Scaled Training Data:
 [[ 0.58108337 -0.36595426 -0.18607168 ... -0.1546123  -0.16748542
  -0.37163684]
 [ 0.00218401 -0.25391937  0.37435576 ...  0.12760449  0.09620881
  -0.09983925]
 [ 0.1587417  -0.10453951  0.31044737 ...  0.22167675 -0.43117965
  -0.03188985]
 ...
 [ 0.63326926 -0.49666163 -1.50848379 ...  0.03353223 -0.01417482
  -0.50753564]
 [ 0.20971397  0.10085778 -0.2548961  ... -0.24868456  0.61133243
   0.23990775]
 [-1.03789302  1.2025342  -0.12707931 ...  1.25647165  0.05941427
   1.25914874]]
Scaled Test Data:
 [[ 0.81419398 -0.74916039 -2.19014224 ...  0.60822013 -0.40597435
   0.01151394]
 [ 2.77070831 -0.80106594 -3.02250896 ... -0.48810814 -1.27498013
   0.94752661]
 [ 0.62079613 -0.77015049 -3.90212098 ... -0.43703893 -1.54063566
   0.81342349]
 ...
 [-0.96752932 -0.01263338  0.5811661  ...  1.23496701 -0.70062432
   0.00417664]
 [-1.51848093  0.12130824  1.68516832 ...  0.94412786 -1.33257748
   0.23614795]
 [-1.05674536  0.42446194  1.1872879  ...  1.14700029 -0

In [21]:
# Baseline linear classifier trained on the scaled global dataset.
# max_iter is raised above the scikit-learn default so the solver has room to
# converge, and so this model matches the `LogisticRegression(max_iter=1000)`
# configuration used in the model-comparison section of this notebook.
logmodel = LogisticRegression(max_iter=1000)
logmodel.fit(X_train_scaled, y_train)

In [23]:
# Predict rock types for the scaled XRF test samples with the fitted logistic regression.
predictions = logmodel.predict(X_test_scaled)

In [25]:
# Score the current `predictions` against the XRF labels.
# In the printed matrix each row is a true class and each column a predicted class.
logreg_confusion = confusion_matrix(y_test, predictions)
logreg_report = classification_report(y_test, predictions)

print(logreg_confusion)
print(logreg_report)

[[ 48 512]
 [  0 238]]
              precision    recall  f1-score   support

     diorite       1.00      0.09      0.16       560
     granite       0.32      1.00      0.48       238

    accuracy                           0.36       798
   macro avg       0.66      0.54      0.32       798
weighted avg       0.80      0.36      0.25       798



In [30]:
# Decision tree baseline trained on the same scaled features.
# A fixed random_state makes the fitted tree, and therefore the metrics reported
# for it, repeatable across runs; 13 is the seed used in the model-comparison
# section of this notebook.
treemodel = DecisionTreeClassifier(random_state=13)
treemodel.fit(X_train_scaled, y_train)

# NOTE: this reuses the name `predictions`, replacing the logistic regression output.
predictions = treemodel.predict(X_test_scaled)

In [32]:
# Score the current `predictions` (set by the decision tree cell) against the XRF labels.
# In the printed matrix each row is a true class and each column a predicted class.
tree_confusion = confusion_matrix(y_test, predictions)
tree_report = classification_report(y_test, predictions)

print(tree_confusion)
print(tree_report)

[[ 41 519]
 [  2 236]]
              precision    recall  f1-score   support

     diorite       0.95      0.07      0.14       560
     granite       0.31      0.99      0.48       238

    accuracy                           0.35       798
   macro avg       0.63      0.53      0.31       798
weighted avg       0.76      0.35      0.24       798



## AutoML

In [10]:
# X_val, X_test, y_val, y_test = train_test_split(X_xrf, y_xrf, test_size=0.5, random_state=13, stratify=y_xrf)

In [11]:
# pipelines = {
#     'Logistic Regression': Pipeline([
#         ('classifier', LogisticRegression(max_iter=1000))
#     ]),
#     'Random Forest': Pipeline([
#         ('classifier', RandomForestClassifier(random_state=13))
#     ]),
#     'Decision Tree': Pipeline([
#         ('classifier', DecisionTreeClassifier(random_state=13))
#     ]),
#     'KNN': Pipeline([
#         ('classifier', KNeighborsClassifier())
#     ]),
#     'SVM': Pipeline([
#         ('classifier', SVC(random_state=13))
#     ]),
#     'Gradient Boosting': Pipeline([
#         ('classifier', GradientBoostingClassifier(random_state=13))
#     ])
# }

# # Train and evaluate each model
# for model_name, pipeline in pipelines.items():
#     print(f"Evaluating {model_name}...")
    
#     # Train the model on the global dataset
#     pipeline.fit(X_train, y_train)
    
#     # Evaluate on the validation set
#     y_val_pred = pipeline.predict(X_val)
#     print(f"Validation Classification Report for {model_name}:")
#     print(classification_report(y_val, y_val_pred))
    
#     # Confusion matrix for validation set
#     print(f"Confusion Matrix for {model_name}:")
#     print(confusion_matrix(y_val, y_val_pred))
    
#     # Hyperparameter tuning 
#     # Note: You can add hyperparameter tuning as needed for each model
    
#     # Final evaluation on the test set with hyperparameter tuning
# #     y_test_pred = pipeline.predict(X_test)
# #     print(f"Test Classification Report for {model_name}:")
# #     print(classification_report(y_test, y_test_pred))
    
# #     # Confusion matrix for test set
# #     print(f"Confusion Matrix for {model_name}:")
# #     print(confusion_matrix(y_test, y_test_pred))
    
#     print("------------------------------------------")

Evaluating Logistic Regression...
Validation Classification Report for Logistic Regression:
              precision    recall  f1-score   support

     diorite       0.95      0.07      0.14       280
     granite       0.31      0.99      0.48       119

    accuracy                           0.35       399
   macro avg       0.63      0.53      0.31       399
weighted avg       0.76      0.35      0.24       399

Confusion Matrix for Logistic Regression:
[[ 21 259]
 [  1 118]]
------------------------------------------
Evaluating Random Forest...
Validation Classification Report for Random Forest:
              precision    recall  f1-score   support

     diorite       0.91      0.07      0.13       280
     granite       0.31      0.98      0.47       119

    accuracy                           0.34       399
   macro avg       0.61      0.53      0.30       399
weighted avg       0.73      0.34      0.23       399

Confusion Matrix for Random Forest:
[[ 20 260]
 [  2 117]]
-------