# Logistic Regression carbon and silicon isotopes

In [21]:
# Imports
import joblib
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

## Read the CSV and Perform Basic Data Cleaning

In [13]:
C_Si_N_Al = pd.read_csv('presolargrains_C_Si_N_Al.csv')
C_Si_N_Al.head()

Unnamed: 0,Type,carbon_12_13,nitrogen_14_15,aluminum_26_27,silicon_29_28,silicon_30_28
0,X,1581.0,116.0,0.0095,-684.0,-490.0
1,X,234.75,187.2,0.3327,-683.0,-501.0
2,X,140.0,97.0,0.017,-653.0,-446.0
3,X,223.0,102.0,0.114,-600.0,-459.0
4,X,1693.89,63.2,0.2364,-588.0,-605.0


## Create a Train Test Split

In [14]:
# Set `Type` for the y values
y = C_Si_N_Al["Type"]
X = C_Si_N_Al.drop(columns=["Type"])

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

In [15]:
X_train.head()

Unnamed: 0,carbon_12_13,nitrogen_14_15,aluminum_26_27,silicon_29_28,silicon_30_28
37,348.0,75.6,0.142,-332.0,-467.0
263,30.23,420.81,0.001232,68.0,46.0
190,8.31,2081.0,0.00027,29.0,53.0
10,552.0,58.0,0.2049,-473.3,-501.9
251,4.86,5838.0,0.00116,59.0,71.0


## Pre-processing

In [16]:
# Scale the data using the MinMaxScaler
X_scaler = MinMaxScaler().fit(X_train)

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Train the Model

In [17]:
model = LogisticRegression()
model.fit(X_train_scaled, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [18]:
print(f"Training Data Score: {model.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {model.score(X_test_scaled, y_test)}")

Training Data Score: 0.6829268292682927
Testing Data Score: 0.6829268292682927


## Hyperparameter Tuning

In [19]:
# Use `GridSearchCV` to tune the model's parameters
param_grid = {'C': [1, 5, 10],
              'penalty': ["l1", "l2"]}
grid = GridSearchCV(model, param_grid, verbose=3)

In [9]:
grid.fit(X_train_scaled, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV] C=1, penalty=l1 .................................................
[CV] ........ C=1, penalty=l1, score=0.8713968957871396, total=   0.3s
[CV] C=1, penalty=l1 .................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s remaining:    0.0s


[CV] ......... C=1, penalty=l1, score=0.872192958136956, total=   0.2s
[CV] C=1, penalty=l1 .................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.5s remaining:    0.0s


[CV] ........ C=1, penalty=l1, score=0.8731260410882843, total=   0.2s
[CV] C=1, penalty=l2 .................................................
[CV] ........ C=1, penalty=l2, score=0.8481152993348116, total=   0.0s
[CV] C=1, penalty=l2 .................................................
[CV] ........ C=1, penalty=l2, score=0.8486276684225118, total=   0.0s
[CV] C=1, penalty=l2 .................................................
[CV] ........ C=1, penalty=l2, score=0.8484175458078845, total=   0.0s
[CV] C=5, penalty=l1 .................................................




[CV] ......... C=5, penalty=l1, score=0.878880266075388, total=   0.4s
[CV] C=5, penalty=l1 .................................................




[CV] ........ C=5, penalty=l1, score=0.8802328805101192, total=   0.4s
[CV] C=5, penalty=l1 .................................................
[CV] ........ C=5, penalty=l1, score=0.8781232648528595, total=   0.4s
[CV] C=5, penalty=l2 .................................................
[CV] ........ C=5, penalty=l2, score=0.8625277161862528, total=   0.0s
[CV] C=5, penalty=l2 .................................................
[CV] ........ C=5, penalty=l2, score=0.8627668422511783, total=   0.0s
[CV] C=5, penalty=l2 .................................................
[CV] ........ C=5, penalty=l2, score=0.8628539700166574, total=   0.0s
[CV] C=10, penalty=l1 ................................................




[CV] ....... C=10, penalty=l1, score=0.8835920177383592, total=   0.4s
[CV] C=10, penalty=l1 ................................................




[CV] ....... C=10, penalty=l1, score=0.8849459384530081, total=   0.5s
[CV] C=10, penalty=l1 ................................................


[Parallel(n_jobs=1)]: Done  18 out of  18 | elapsed:    3.9s finished


[CV] ....... C=10, penalty=l1, score=0.8806218767351471, total=   0.5s
[CV] C=10, penalty=l2 ................................................
[CV] ....... C=10, penalty=l2, score=0.8647450110864745, total=   0.0s
[CV] C=10, penalty=l2 ................................................
[CV] ....... C=10, penalty=l2, score=0.8652619905738841, total=   0.0s
[CV] C=10, penalty=l2 ................................................
[CV] ....... C=10, penalty=l2, score=0.8650749583564686, total=   0.0s


GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'C': [1, 5, 10], 'penalty': ['l1', 'l2']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=3)

In [26]:
# Print the best parameters and the best score
#print(grid.best_params_) # AttributeError: 'GridSearchCV' object has no attribute 'best_params_'
#print(grid.best_score_) #AttributeError: 'GridSearchCV' object has no attribute 'best_score_'

## Save the Model

In [27]:
# Save fitted model to file
filename = 'LogisticRegression_C_Si_N_Al.sav'
joblib.dump(grid, filename)

['LogisticRegression_C_Si_N_Al.sav']

## Summary:

### Logistic Regression based on carbon, silicon, nitrogen, and aluminum: 88.5% at C=10, penalty=11