# Logistic Regression

In [17]:
# Imports
import joblib
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

## Read the CSV and Perform Basic Data Cleaning

In [9]:
carbon = pd.read_csv('presolargrains_C_only_for_SVM.csv')
carbon.head()

Unnamed: 0,Type,12C/13C
0,X,9455.1
1,X,6793.0
2,X,6227.0
3,X,4250.0
4,X,3993.46


## Create a Train Test Split

In [10]:
# Set `Type` for the y values
y = carbon["Type"]
X = carbon.drop(columns=["Type"])

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

In [11]:
X_train.head()

Unnamed: 0,12C/13C
6579,58.71
12268,41.61
14970,8.04
9689,50.22
10260,48.8


## Pre-processing

In [13]:
# Don't need this when only using one independent variable (carbon)

# Scale the data using the MinMaxScaler
#X_scaler = MinMaxScaler().fit(X_train)

#X_train_scaled = X_scaler.transform(X_train)
#X_test_scaled = X_scaler.transform(X_test)

## Train the Model

In [15]:
model = LogisticRegression()
model.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [16]:
print(f"Training Data Score: {model.score(X_train, y_train)}")
print(f"Testing Data Score: {model.score(X_test, y_test)}")

Training Data Score: 0.8935552710200597
Testing Data Score: 0.8950332821300563


## Hyperparameter Tuning

In [18]:
# Use `GridSearchCV` to tune the model's parameters
param_grid = {'C': [1, 5, 10],
              'penalty': ["l1", "l2"]}
grid = GridSearchCV(model, param_grid, verbose=3)

In [19]:
grid.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.1s remaining:    0.0s


Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV] C=1, penalty=l1 .................................................
[CV] ........ C=1, penalty=l1, score=0.8979017400204709, total=   0.1s
[CV] C=1, penalty=l1 .................................................
[CV] ........ C=1, penalty=l1, score=0.8975934459805428, total=   0.1s
[CV] C=1, penalty=l1 .................................................




[CV] ........ C=1, penalty=l1, score=0.8951550884388618, total=   0.1s
[CV] C=1, penalty=l2 .................................................
[CV] ......... C=1, penalty=l2, score=0.892272262026612, total=   0.1s
[CV] C=1, penalty=l2 .................................................
[CV] ........ C=1, penalty=l2, score=0.8929851510496671, total=   0.1s
[CV] C=1, penalty=l2 .................................................
[CV] ......... C=1, penalty=l2, score=0.891566265060241, total=   0.1s
[CV] C=5, penalty=l1 .................................................




[CV] ......... C=5, penalty=l1, score=0.900460593654043, total=   0.1s
[CV] C=5, penalty=l1 .................................................
[CV] ........ C=5, penalty=l1, score=0.8983614951356886, total=   0.1s
[CV] C=5, penalty=l1 .................................................
[CV] ........ C=5, penalty=l1, score=0.8959241220199948, total=   0.1s
[CV] C=5, penalty=l2 .................................................




[CV] ........ C=5, penalty=l2, score=0.8976458546571137, total=   0.1s
[CV] C=5, penalty=l2 .................................................
[CV] ........ C=5, penalty=l2, score=0.8965693804403482, total=   0.1s
[CV] C=5, penalty=l2 .................................................
[CV] ........ C=5, penalty=l2, score=0.8951550884388618, total=   0.1s
[CV] C=10, penalty=l1 ................................................




[CV] ........ C=10, penalty=l1, score=0.900460593654043, total=   0.1s
[CV] C=10, penalty=l1 ................................................
[CV] ....... C=10, penalty=l1, score=0.8983614951356886, total=   0.1s
[CV] C=10, penalty=l1 ................................................
[CV] ....... C=10, penalty=l1, score=0.8964368110740836, total=   0.1s
[CV] C=10, penalty=l2 ................................................




[CV] ....... C=10, penalty=l2, score=0.8994370522006141, total=   0.1s
[CV] C=10, penalty=l2 ................................................
[CV] ....... C=10, penalty=l2, score=0.8983614951356886, total=   0.1s
[CV] C=10, penalty=l2 ................................................
[CV] ....... C=10, penalty=l2, score=0.8956677774929506, total=   0.1s


[Parallel(n_jobs=1)]: Done  18 out of  18 | elapsed:    1.4s finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'C': [1, 5, 10], 'penalty': ['l1', 'l2']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=3)

In [20]:
# Print the best parameters and the best score
print(grid.best_params_)
print(grid.best_score_)

{'C': 10, 'penalty': 'l1'}
0.8984208279982928


## Save the Model

In [21]:
# Save fitted model to file
filename = 'LogisticRegression.sav'
joblib.dump(grid, filename)

['LogisticRegression.sav']

## Summary:

### Logistic Regression based on carbon only: 89.8%