In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
import seaborn as sns
import matplotlib.pyplot as plt 

In [2]:
# Load the training data; the path is relative to the current working directory.
df = pd.read_csv('data/PHY_TRAIN.csv')

In [14]:
# Separate the label from the predictors. 'exampleid' is dropped together
# with 'target' so that it is not used as a model input.
X = df.drop(columns=['target', 'exampleid'])
y = df['target']
# Column labels are kept so they can be re-attached after imputation.
feature_names = X.columns.tolist()

In [4]:
#STEP 1: Data Exploration
# Per-column summary statistics of the predictors (count, mean, std, min, quartiles, max).
X.describe()

Unnamed: 0,feat1,feat2,feat3,feat4,feat5,feat6,feat7,feat8,feat9,feat10,...,feat69,feat70,feat71,feat72,feat73,feat74,feat75,feat76,feat77,feat78
count,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,...,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0
mean,0.155606,0.084876,-0.050354,-6e-05,0.126569,0.049887,-0.038344,0.00286,0.848353,0.673485,...,0.00812,0.000478,0.003198,0.052807,0.066944,-0.014101,-0.00146,0.09448,0.002843,0.066545
std,0.414875,0.295335,0.253748,0.392916,0.400694,0.223713,0.214168,0.322077,0.453585,0.511087,...,0.769302,0.446978,0.381329,0.18071,0.283114,0.176896,0.295939,0.315841,0.019081,0.223091
min,0.0,0.0,-1.0,-1.0,0.0,0.0,-1.0,-1.0,0.0,0.0,...,-1.0,-0.999998,-0.908001,0.0,0.0,-1.0,-1.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.522596,0.250301,...,-1.0,-1.3e-05,-0.001582,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.787572,0.599672,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.105687,1.018601,...,1.0,1.5e-05,0.003002,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,2.63902,3.42959,0.999954,1.0,2.719006,3.054644,0.999274,1.0,6.699783,5.283748,...,1.0,0.999999,0.907744,0.999953,3.42959,0.999869,1.0,3.0,0.385513,1.0


In [5]:
# Data exploration for the target variable: summary statistics of y.
y.describe()

count    50000.000000
mean         0.497220
std          0.499997
min          0.000000
25%          0.000000
50%          0.000000
75%          1.000000
max          1.000000
Name: target, dtype: float64

In [6]:
# Class balance check: number of rows per target label.
y.value_counts()

# Disabled bar-chart version of the same check.
# NOTE(review): `plt.show` below has no call parentheses, so re-enabling these
# lines as written would not actually call it.
#sns.countplot(x = y, data=y)
#plt.show

0    25139
1    24861
Name: target, dtype: int64

In [7]:
# Display the target series (the rendered output abbreviates the middle rows).
y

0        0
1        0
2        1
3        0
4        0
        ..
49995    0
49996    1
49997    1
49998    1
49999    0
Name: target, Length: 50000, dtype: int64

In [8]:
#STEP 2: Missing value indicator
from sklearn.impute import MissingIndicator

# features='missing-only' is used here; features='all' is the alternative
# that was tried and left disabled.
indicator = MissingIndicator(features='missing-only').fit(X)
# Boolean mask: True indicates a missing value, False indicates a present one.
missing = indicator.transform(X)
# Show the mask as 0/1 integers. The cast is for display only; `missing`
# itself stays boolean because the result is not assigned.
missing.astype(int)

array([[1, 1, 1, ..., 0, 0, 1],
       [0, 0, 0, ..., 1, 1, 1],
       [0, 0, 0, ..., 0, 0, 1],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]])

In [9]:
#STEP 3: Replace missing values with mean
from sklearn.impute import SimpleImputer

# 'mean' is SimpleImputer's default strategy; it is spelled out for readability.
mean_imputer = SimpleImputer(strategy='mean').fit(X)
# Wrap the imputed values back into a DataFrame carrying the original column labels.
# NOTE(review): the imputer is fitted on all rows before cross-validation, so
# the validation folds contribute to the imputed means -- confirm this is acceptable.
df1 = pd.DataFrame(data=mean_imputer.transform(X), columns=feature_names)


In [10]:
# Convert the target to a NumPy array for the modelling cells below.
# The array is derived from df['target'] instead of from y itself so the cell
# can be re-run safely: once y is an ndarray it no longer has .to_numpy(),
# and `y = y.to_numpy()` would raise AttributeError on a second execution.
y = df['target'].to_numpy()

In [11]:
#STEP 4 Ignored because we have not taken a data prep course
#STEP 5 Modeling
# WARNING: n_jobs=-1 WILL USE UP A LOT OF MEMORY on your computer and may make
# it VERY SLOW at doing anything else while the cell is running.
from time import time
from sklearn.model_selection import cross_val_score

#Logistic Regression w/o Interaction Terms

# t1 - t0 only brackets the constructor call (no fitting happens there);
# t2 - t1 brackets the whole cross-validation run.
t0 = time()
logitModel = LogisticRegression(n_jobs=-1)
t1 = time()
# cv is left at its default; the recorded run below used 5 folds.
logitValues = cross_val_score(
    estimator=logitModel,
    X=df1,
    y=y,
    scoring="accuracy",
    n_jobs=-1,
    verbose=1,
)
t2 = time()
logitMean = logitValues.mean()
print("Time to run the model", t1 - t0)
print("Time to run 5-Folds", t2 - t1)
print("Mean accuracy through 5-folds", logitMean)
# TODO: might want to try Recursive Feature Elimination

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.4s finished


Time to run the model 0.0
Time to run 5-Folds 7.656734466552734
Mean accuracy through 5-folds 0.6976199999999999


In [12]:
# Reference score for the interaction search: accuracy of the plain logistic
# model on the same data it was fitted on. fit() returns the estimator itself,
# which is why the two calls can be chained.
baseline = logitModel.fit(df1, y).score(df1, y)
baseline

0.6982

In [22]:
# Logistic regression with at least three two-way interactions
# (original note: this part might not work for multiple features n)
#
# Every unordered pair of features is tried on its own: the product of the two
# columns is added as one extra 'interaction' column, the model is refitted,
# and the pair is recorded when its score beats `baseline`.
# NOTE(review): both the baseline and the candidate scores are measured on the
# data the model was fitted on, so an improvement here may not hold out of
# sample -- confirm with cross-validation before keeping an interaction.
# One model fit per pair makes this cell slow when there are many features.
score = 0
interactions = list()
# Work from the original predictor columns only. df1 is never modified, so the
# later modelling cells keep seeing the plain feature set, and a leftover
# 'interaction' column from an earlier (interrupted) run cannot leak in.
base_features = df1[feature_names]
for feature_A in feature_names:
    for feature_B in feature_names:
        # The string comparison keeps one ordering of each pair and skips self-pairs.
        if feature_A > feature_B:
            print(feature_A, flush=True)
            print("\t",feature_B, flush=True)
            # assign() returns a new frame instead of writing into the shared one.
            candidate = base_features.assign(
                interaction=base_features[feature_A] * base_features[feature_B]
            )
            logitModel.fit(candidate, y)
            score = logitModel.score(candidate, y)
            if score > baseline:
                interactions.append((feature_A, feature_B, round(score,8)))
# Note: logitModel is left fitted on the last candidate frame (features + 'interaction').
# score() on a classifier reports mean accuracy, so the label says accuracy rather than R2.
print('Baseline accuracy: %.3f' % baseline)
print('Top 10 interactions: %s' % sorted(interactions ,key = lambda x:x[2], reverse=True)[:10])

feat2
	 feat1
feat2
	 feat10
feat2
	 feat11
feat2
	 feat12
feat2
	 feat13
feat2
	 feat14
feat2
	 feat15
feat2
	 feat16
feat2
	 feat17


KeyboardInterrupt: 

In [None]:
# ---------Random Forest----------
import sklearn
from sklearn.ensemble import RandomForestClassifier
# sorted(sklearn.metrics.SCORERS.keys())

# Gini
# t1 - t0 only brackets the constructor call; t2 - t1 brackets cross-validation.
t0 = time()
rfGini = RandomForestClassifier(
    n_estimators=100,
    criterion="gini",
    random_state=42,
    n_jobs=-1,
    verbose=1,
)
t1 = time()
dtGiniValues = cross_val_score(
    estimator=rfGini,
    X=df1,
    y=y,
    scoring="accuracy",
    n_jobs=-1,
    verbose=1,
)
t2 = time()
dtGiniMean = dtGiniValues.mean()
print("Time to run the model", t1 - t0)
print("Time to run 5-Folds", t2 - t1)
print("Mean accuracy through 5-folds", dtGiniMean)

In [None]:
# Entropy
# Same forest configuration as the Gini run; only the split criterion differs.
t0 = time()
rfEntropy = RandomForestClassifier(
    n_estimators=100,
    criterion="entropy",
    random_state=42,
    n_jobs=-1,
    verbose=1,
)
t1 = time()
dtEntropyValues = cross_val_score(
    estimator=rfEntropy,
    X=df1,
    y=y,
    scoring="accuracy",
    n_jobs=-1,
    verbose=1,
)
t2 = time()
dtEntropyMean = dtEntropyValues.mean()
print("Time to run the model", t1 - t0)
print("Time to run 5-Folds", t2 - t1)
print("Mean accuracy through 5-folds", dtEntropyMean)

In [None]:
#Gradient Boosting Classifier
from sklearn.ensemble import GradientBoostingClassifier

# The classifier is built with its default hyper-parameters.
# t1 - t0 only brackets the constructor call; t2 - t1 brackets cross-validation.
t0 = time()
gb_clf = GradientBoostingClassifier()
t1 = time()
gbclfScores = cross_val_score(
    estimator=gb_clf,
    X=df1,
    y=y,
    scoring="accuracy",
    n_jobs=-1,
    verbose=1,
)
t2 = time()
gbclfMean = gbclfScores.mean()
print("Time to run the model", t1 - t0)
print("Time to run 5-Folds", t2 - t1)
print("Mean accuracy through 5-folds", gbclfMean)

In [None]:
#Step 6: Comparisons of fitted models using c-statistics, i.e., AUC of the ROC curve