In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

In [3]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Load the autoreload extension and enable mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

Making the project's helper modules (data loading utilities) importable

In [4]:
import sys

# Put the parent directory on the import path so that project-local packages
# living one level above this notebook can be imported.
# The membership check keeps the cell idempotent: re-running it does not keep
# appending duplicate ".." entries to sys.path.
if ".." not in sys.path:
    sys.path.append("..")

In [5]:
from helpers.picklers import *
from helpers.data_utils import *

Data path

In [6]:
# Relative path to the descriptor CSV that is read with pd.read_csv further down.
data_path = "../data/descriptor/DescriptorData.csv"

Prepare training data for our models

In [7]:
# Read the descriptor CSV from disk into a DataFrame
df = pd.read_csv(filepath_or_buffer=data_path)

In [8]:
# Features: the 14 columns labelled "0" through "13", as a NumPy matrix.
X = df[[f"{i}" for i in range(14)]].to_numpy()
# Target: boolean flag that is True where the band_gap column is positive.
y = df["band_gap"].to_numpy() > 0

In [9]:
# Display the shapes of the feature matrix and the label vector
tuple(arr.shape for arr in (X, y))

((18962, 14), (18962,))

In [10]:
# Seed NumPy's global random number generator so repeated runs are reproducible
np.random.seed(seed=42)

In [11]:
# Shuffle the samples, then split them into training and test sets.
# NOTE(review): shuffle_data and split are not defined in this notebook —
# presumably they come from the star-imported helpers modules. The meaning of
# the third argument (None), of the returned `ind`, and the split ratio are
# not visible here; confirm against the helper code.
X_shuffled, y_shuffled, ind = shuffle_data(X, y, None)
X_train, y_train, X_test, y_test = split(X_shuffled, y_shuffled)

In [12]:
# Display the shapes of the training and test arrays
tuple(arr.shape for arr in (X_train, y_train, X_test, y_test))

((15169, 14), (15169,), (3793, 14), (3793,))

Applying Logistic Regression to the Data

In [13]:
# Base classifier. The C given here is only a starting value: the grid searches
# in this notebook replace it with each candidate from their parameter grid.
estimator = LogisticRegression(C=10.0, max_iter=5000)

# Candidate values for C (inverse regularization strength):
# 33 logarithmically spaced points from 1e-6 to 1e5.
grid_parameters = dict(C=np.logspace(-6, 5, num=33))

In [14]:
# 5-fold cross-validated search over the coarse C grid, run on 10 parallel workers.
clf = GridSearchCV(
    estimator=estimator,
    param_grid=grid_parameters,
    cv=5,
    n_jobs=10,
    verbose=True,
)
# fit() is left as the last expression so the fitted search object is displayed.
clf.fit(X_train, y_train)

Fitting 5 folds for each of 33 candidates, totalling 165 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    6.8s
[Parallel(n_jobs=10)]: Done 165 out of 165 | elapsed:   52.5s finished


GridSearchCV(cv=5, error_score=nan,
             estimator=LogisticRegression(C=10.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=5000, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=10,
             param_grid={'C': array([1.00000000e-06, 2.2...
       1.33352143e-02, 2.94272718e-02, 6.49381632e-02, 1.43301257e-01,
       3.16227766e-01, 6.97830585e-01, 1.53992653e+00, 3.39820833e+00,
       7.49894209e+00, 1.65481710e+01, 3.65174127e+01, 8.05842188e+01,
       1.77827941e+02, 3.92418976e+02, 8.65964323e+02, 1.91095297e+03,
       4

In [15]:
# Score the selected model on the held-out test set
# (no custom scorer was passed, so this is the classifier's default accuracy).
clf.score(X=X_test, y=y_test)

0.8594779857632481

In [16]:
# Fraction of samples labelled 0 in each split, i.e. the accuracy a constant
# "always predict 0" baseline would reach; useful context for the score above.
(y_train == 0).mean(), (y_test == 0).mean()

(0.752389742237458, 0.7595570788294226)

In [17]:
# Display the estimator selected by the grid search (shows the chosen C).
clf.best_estimator_

LogisticRegression(C=1.539926526059492, class_weight=None, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=5000, multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

Apply Standard Scaler

In [18]:
# Learn the per-feature mean and standard deviation from the training split only,
# then apply that same transformation to both splits.
# Note: X_train and X_test are rebound to their standardized versions here.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

Retraining on the standardized features

In [19]:
# Repeat the same 5-fold search over the coarse C grid, using whatever
# X_train currently holds (the standardized features once the scaling cell has run).
clf = GridSearchCV(
    estimator=estimator,
    param_grid=grid_parameters,
    cv=5,
    n_jobs=10,
    verbose=True,
)
# fit() is left as the last expression so the fitted search object is displayed.
clf.fit(X_train, y_train)

Fitting 5 folds for each of 33 candidates, totalling 165 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    5.7s
[Parallel(n_jobs=10)]: Done 165 out of 165 | elapsed:   12.1s finished


GridSearchCV(cv=5, error_score=nan,
             estimator=LogisticRegression(C=10.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=5000, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=10,
             param_grid={'C': array([1.00000000e-06, 2.2...
       1.33352143e-02, 2.94272718e-02, 6.49381632e-02, 1.43301257e-01,
       3.16227766e-01, 6.97830585e-01, 1.53992653e+00, 3.39820833e+00,
       7.49894209e+00, 1.65481710e+01, 3.65174127e+01, 8.05842188e+01,
       1.77827941e+02, 3.92418976e+02, 8.65964323e+02, 1.91095297e+03,
       4

In [20]:
# Score the model selected by the most recent search on the held-out test set
# (default scorer, i.e. accuracy for a classifier).
clf.score(X=X_test, y=y_test)

0.8600052728710783

In [21]:
# Display the estimator selected by the most recent grid search (shows the chosen C).
clf.best_estimator_

LogisticRegression(C=0.6978305848598664, class_weight=None, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=5000, multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

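Making a new, finer search for C on a linear grid from 2 to 4. Note that the coarse search on the standardized features selected C ≈ 0.70, which lies below this range.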

In [22]:
# Finer grid for C: 10 linearly spaced values from 2 to 4.
# NOTE(review): the preceding coarse search reported a best C of about 0.70,
# which is outside this range — confirm that 2..4 is the intended interval.
new_grid_parameters = dict(C=np.linspace(2, 4, num=10))

In [23]:
# 5-fold cross-validated search over the finer C grid, run on 10 parallel workers.
clf = GridSearchCV(
    estimator=estimator,
    param_grid=new_grid_parameters,
    cv=5,
    n_jobs=10,
    verbose=True,
)
# fit() is left as the last expression so the fitted search object is displayed.
clf.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    6.3s
[Parallel(n_jobs=10)]: Done  50 out of  50 | elapsed:    7.2s finished


GridSearchCV(cv=5, error_score=nan,
             estimator=LogisticRegression(C=10.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=5000, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=10,
             param_grid={'C': array([2.        , 2.22222222, 2.44444444, 2.66666667, 2.88888889,
       3.11111111, 3.33333333, 3.55555556, 3.77777778, 4.        ])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=True)

In [24]:
# Score the model selected by the refined search on the held-out test set
# (default scorer, i.e. accuracy for a classifier).
clf.score(X=X_test, y=y_test)

0.8594779857632481

In [25]:
# Display the estimator selected by the refined grid search (shows the chosen C).
clf.best_estimator_

LogisticRegression(C=2.2222222222222223, class_weight=None, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=5000, multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [26]:
# Use the estimator selected by the grid search to produce both hard class
# predictions and per-class probabilities for the test set.
best_model = clf.best_estimator_
y_pred = best_model.predict(X_test)
y_prob = best_model.predict_proba(X_test)

In [27]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score

In [28]:
# Confusion matrix on the test set (rows: true class, columns: predicted class)
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[2681,  200],
       [ 333,  579]])

In [29]:
# Area under the ROC curve, scored with the second predict_proba column
# (the probability assigned to the positive class).
auc = roc_auc_score(y_true=y_test, y_score=y_prob[:, 1])
auc

0.9049721557451421

In [30]:
# Put the predicted positive-class probabilities next to the true test labels
# and write them to a CSV file (without the index column).
# A dedicated name is used instead of rebinding `df`: `df` holds the raw
# descriptor table loaded earlier, and overwriting it would make the
# feature-extraction cell fail if it were re-run after this one.
logreg_predictions = pd.DataFrame(
    {"Pred_LogReg": y_prob[:, 1], "Real_LogReg": y_test}
)
logreg_predictions.to_csv("logRegData.csv", index=False)