In [None]:
#############################################################################################
#  RUT-SOM-DATA-PT-06-2020-U-C                                                 Douglas High #
#   Machine-Learning-Challenge                                             November 7, 2020 #
#      > model0_compare.ipynb                                                               #
#   - run variety of machine learning methods without scaling and with standardscaler(ss)   #
#     as well as minmax (mm) scaling.                                                       #
#   - save scores to dataframe for comparison.                                              # 
#############################################################################################

In [None]:
#############################################################################
#00.UI     Update/Install                                                   #
#    a- install/check for update sklearn (to prevent version mismatches).   #
#    b- install joblib - used to save model (uncomment to install).         #
#       *** Restart Kernal after joblib install ***                         #
#############################################################################

#a
!pip install sklearn --upgrade

#b
# !pip install joblib

In [1]:
###########################
#00   I/O                 #
#   - import libraries.   #
#   - associate files.    #
###########################

import pandas as pd
import warnings; warnings.simplefilter('ignore')

data_in = "../data_in/exoplanet_data.csv"

In [2]:
###################################################
#10     Read and remove                           #
#   - read data into df.                          #
#   - drop null columns if all values are null.   #
#   - drop null rows.                             #
#   - split data to work with known results (confirmed or false positive).   #
#      saving candidate records for predictions.                             #
###################################################

df = pd.read_csv(data_in)

df = df.dropna(axis='columns', how='all')

df = df.dropna()

#
known_df = df[df["koi_disposition"] != "CANDIDATE"]
unknown_df = df[df["koi_disposition"] == "CANDIDATE"]



In [3]:
################################################
#20     Set Features                           #
#   - select columns to use as features.       #
#     (This will also be used as your x values)#
################################################

# feature_names = data.columns


X = known_df.drop("koi_disposition", axis=1)
y = known_df["koi_disposition"]

y = pd.get_dummies(y)
y = y.drop("FALSE POSITIVE", axis=1)
# y

In [4]:
########################################################
#30     Test/train split                               #
#   - split known_df into training and testing data.   #
########################################################

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [5]:
############################################
#40     Scale                              #
#   a- create standard scaler (ss) data.   #
#   b- create minmax scaled (mm) data.     #
############################################

#a
from sklearn.preprocessing import StandardScaler
X_scaler_ss = StandardScaler().fit(X_train)
X_train_scaled_ss = X_scaler_ss.transform(X_train)
X_test_scaled_ss = X_scaler_ss.transform(X_test)

#b
from sklearn.preprocessing import  MinMaxScaler
X_scaler_mm = MinMaxScaler().fit(X_train)
X_train_scaled_mm = X_scaler_mm.transform(X_train)
X_test_scaled_mm = X_scaler_mm.transform(X_test)

In [6]:
######################################################
#50      Compare df                                  #
#   - create df to hold metrics of various models.   #
#   model = model used (ie. linearRegression).       #
#   scale code from 40-Scale above, 00 = none.       #
#   train_score = r2 score from training data.       #
#   test_score = r2 score from test data.            #
#   avg = average train/test score.                  #
######################################################

compare_df = pd.DataFrame({"model":[], "scale_code":[], "train_score":[], "test_score":[], "avg":[]})
# row = ["1","2","","3"]
# compare_df = compare_df.append(pd.DataFrame([row], columns=compare_df.columns))

In [7]:
################################################
#100.00    Linear Regression no scale              #
#   a- 
################################################

#a 
from sklearn.linear_model import LinearRegression
model_100 = LinearRegression()
model_100.fit(X_train, y_train)

predictions_100 = model_100.predict(X_test)

#b
model = "Linear Regression"
scale = "00"
train = model_100.score(X_train, y_train)
test = model_100.score(X_test, y_test)
avg = (train+test)/2
row = [model, scale, train, test, avg]
compare_df = compare_df.append(pd.DataFrame([row], columns=compare_df.columns))

compare_df

Unnamed: 0,model,scale_code,train_score,test_score,avg
0,Linear Regression,0,0.770991,0.750459,0.760725


In [8]:
################################################
#100.ss    Linear Regression ss scale              #
#   a- 
################################################

#a 
# from sklearn.linear_model import LinearRegression
model_100S = LinearRegression()
model_100S.fit(X_train_scaled_ss, y_train)

predictions_100S = model_100S.predict(X_test_scaled_ss)

#b
model = "Linear Regression"
scale = "ss"
train = model_100.score(X_train_scaled_ss, y_train)
test = model_100.score(X_test_scaled_ss, y_test)
avg = (train+test)/2
row = [model, scale, train, test, avg]
compare_df = compare_df.append(pd.DataFrame([row], columns=compare_df.columns))

compare_df

Unnamed: 0,model,scale_code,train_score,test_score,avg
0,Linear Regression,00,0.770991,0.750459,0.760725
0,Linear Regression,ss,-7.925801,-6.058446,-6.992123


In [9]:
################################################
#100.mm    Linear Regression mm scale              #
#   a- 
################################################

#a 
# from sklearn.linear_model import LinearRegression
model_100m = LinearRegression()
model_100m.fit(X_train_scaled_mm, y_train)

predictions_100m = model_100m.predict(X_test_scaled_mm)

#b
model = "Linear Regression"
scale = "mm"
train = model_100m.score(X_train_scaled_mm, y_train)
test = model_100m.score(X_test_scaled_mm, y_test)
avg = (train+test)/2
row = [model, scale, train, test, avg]
compare_df = compare_df.append(pd.DataFrame([row], columns=compare_df.columns))

compare_df

Unnamed: 0,model,scale_code,train_score,test_score,avg
0,Linear Regression,00,0.770991,0.750459,0.760725
0,Linear Regression,ss,-7.925801,-6.058446,-6.992123
0,Linear Regression,mm,0.770991,0.750459,0.760725


In [10]:
################################################
#110.00    Lasso no scale              #
#   a- 
################################################

#a 
from sklearn.linear_model import Lasso
lasso_110 = Lasso(alpha=.01).fit(X_train, y_train)
predictions_110 = lasso_110.predict(X_test)

#b
model = "Lasso"
scale = "00"
train = lasso_110.score(X_train, y_train)
test = lasso_110.score(X_test, y_test)
avg = (train+test)/2
row = [model, scale, train, test, avg]
compare_df = compare_df.append(pd.DataFrame([row], columns=compare_df.columns))

compare_df

Unnamed: 0,model,scale_code,train_score,test_score,avg
0,Linear Regression,00,0.770991,0.750459,0.760725
0,Linear Regression,ss,-7.925801,-6.058446,-6.992123
0,Linear Regression,mm,0.770991,0.750459,0.760725
0,Lasso,00,0.749769,0.737326,0.743548


In [11]:
################################################
#110.ss    Lasso ss scale              #
#   a- 
################################################

#a 
# from sklearn.linear_model import Lasso
lasso_110s = Lasso(alpha=.01).fit(X_train_scaled_ss, y_train)
predictions_110s = lasso_110s.predict(X_test_scaled_ss)

#b
model = "Lasso"
scale = "ss"
train = lasso_110s.score(X_train_scaled_ss, y_train)
test = lasso_110s.score(X_test_scaled_ss, y_test)
avg = (train+test)/2
row = [model, scale, train, test, avg]
compare_df = compare_df.append(pd.DataFrame([row], columns=compare_df.columns))

compare_df

Unnamed: 0,model,scale_code,train_score,test_score,avg
0,Linear Regression,00,0.770991,0.750459,0.760725
0,Linear Regression,ss,-7.925801,-6.058446,-6.992123
0,Linear Regression,mm,0.770991,0.750459,0.760725
0,Lasso,00,0.749769,0.737326,0.743548
0,Lasso,ss,0.76185,0.748023,0.754936


In [12]:
################################################
#110.mm    Lasso mm scale              #
#   a- 
################################################

#a 
# from sklearn.linear_model import Lasso
lasso_110m = Lasso(alpha=.01).fit(X_train_scaled_mm, y_train)
predictions_110m = lasso_110m.predict(X_test_scaled_mm)

#b
model = "Lasso"
scale = "mm"
train = lasso_110m.score(X_train_scaled_mm, y_train)
test = lasso_110m.score(X_test_scaled_mm, y_test)
avg = (train+test)/2
row = [model, scale, train, test, avg]
compare_df = compare_df.append(pd.DataFrame([row], columns=compare_df.columns))

compare_df

Unnamed: 0,model,scale_code,train_score,test_score,avg
0,Linear Regression,00,0.770991,0.750459,0.760725
0,Linear Regression,ss,-7.925801,-6.058446,-6.992123
0,Linear Regression,mm,0.770991,0.750459,0.760725
0,Lasso,00,0.749769,0.737326,0.743548
0,Lasso,ss,0.76185,0.748023,0.754936
0,Lasso,mm,0.704809,0.688559,0.696684


In [13]:
################################################
#120.00    Ridge no scale              #
#   a- 
################################################

#a 
from sklearn.linear_model import Ridge
ridge_120 = Ridge(alpha=.01).fit(X_train, y_train)
predictions_120 = ridge_120.predict(X_test)

#b
model = "Ridge"
scale = "00"
train = ridge_120.score(X_train, y_train)
test = ridge_120.score(X_test, y_test)
avg = (train+test)/2
row = [model, scale, train, test, avg]
compare_df = compare_df.append(pd.DataFrame([row], columns=compare_df.columns))

compare_df

Unnamed: 0,model,scale_code,train_score,test_score,avg
0,Linear Regression,00,0.770991,0.750459,0.760725
0,Linear Regression,ss,-7.925801,-6.058446,-6.992123
0,Linear Regression,mm,0.770991,0.750459,0.760725
0,Lasso,00,0.749769,0.737326,0.743548
0,Lasso,ss,0.76185,0.748023,0.754936
0,Lasso,mm,0.704809,0.688559,0.696684
0,Ridge,00,0.77099,0.750437,0.760714


In [14]:
################################################
#120.00    Ridge ss scale              #
#   a- 
################################################

#a 
# from sklearn.linear_model import Ridge
ridge_120s = Ridge(alpha=.01).fit(X_train_scaled_ss, y_train)
predictions_120s = ridge_120s.predict(X_test_scaled_ss)

#b
model = "Ridge"
scale = "ss"
train = ridge_120s.score(X_train_scaled_ss, y_train)
test = ridge_120s.score(X_test_scaled_ss, y_test)
avg = (train+test)/2
row = [model, scale, train, test, avg]
compare_df = compare_df.append(pd.DataFrame([row], columns=compare_df.columns))

compare_df

Unnamed: 0,model,scale_code,train_score,test_score,avg
0,Linear Regression,00,0.770991,0.750459,0.760725
0,Linear Regression,ss,-7.925801,-6.058446,-6.992123
0,Linear Regression,mm,0.770991,0.750459,0.760725
0,Lasso,00,0.749769,0.737326,0.743548
0,Lasso,ss,0.76185,0.748023,0.754936
0,Lasso,mm,0.704809,0.688559,0.696684
0,Ridge,00,0.77099,0.750437,0.760714
0,Ridge,ss,0.770991,0.75046,0.760726


In [15]:
################################################
#120.00    Ridge mm scale              #
#   a- 
################################################

#a 
# from sklearn.linear_model import Ridge
ridge_120m = Ridge(alpha=.01).fit(X_train_scaled_mm, y_train)
predictions_120m = ridge_120m.predict(X_test_scaled_mm)

#b
model = "Ridge"
scale = "mm"
train = ridge_120m.score(X_train_scaled_mm, y_train)
test = ridge_120m.score(X_test_scaled_mm, y_test)
avg = (train+test)/2
row = [model, scale, train, test, avg]
compare_df = compare_df.append(pd.DataFrame([row], columns=compare_df.columns))

compare_df

Unnamed: 0,model,scale_code,train_score,test_score,avg
0,Linear Regression,00,0.770991,0.750459,0.760725
0,Linear Regression,ss,-7.925801,-6.058446,-6.992123
0,Linear Regression,mm,0.770991,0.750459,0.760725
0,Lasso,00,0.749769,0.737326,0.743548
0,Lasso,ss,0.76185,0.748023,0.754936
0,Lasso,mm,0.704809,0.688559,0.696684
0,Ridge,00,0.77099,0.750437,0.760714
0,Ridge,ss,0.770991,0.75046,0.760726
0,Ridge,mm,0.770937,0.750284,0.760611


In [16]:
################################################
#130.00    ElasticNet no scale              #
#   a- 
################################################

#a 
from sklearn.linear_model import ElasticNet
elasticnet_130 = ElasticNet(alpha=.01).fit(X_train, y_train)
predictions_130 = elasticnet_130.predict(X_test)

#b
model = "ElasticNet"
scale = "00"
train = elasticnet_130.score(X_train, y_train)
test = elasticnet_130.score(X_test, y_test)
avg = (train+test)/2
row = [model, scale, train, test, avg]
compare_df = compare_df.append(pd.DataFrame([row], columns=compare_df.columns))

compare_df

Unnamed: 0,model,scale_code,train_score,test_score,avg
0,Linear Regression,00,0.770991,0.750459,0.760725
0,Linear Regression,ss,-7.925801,-6.058446,-6.992123
0,Linear Regression,mm,0.770991,0.750459,0.760725
0,Lasso,00,0.749769,0.737326,0.743548
0,Lasso,ss,0.76185,0.748023,0.754936
0,Lasso,mm,0.704809,0.688559,0.696684
0,Ridge,00,0.77099,0.750437,0.760714
0,Ridge,ss,0.770991,0.75046,0.760726
0,Ridge,mm,0.770937,0.750284,0.760611
0,ElasticNet,00,0.759048,0.746144,0.752596


In [17]:
#############################################
#130.ss    ElasticNet ss scale              #
#############################################

#a 
# from sklearn.linear_model import ElasticNet
elasticnet_130s = ElasticNet(alpha=.01).fit(X_train_scaled_ss, y_train)
predictions_130s = elasticnet_130s.predict(X_test_scaled_ss)

#b
model = "ElasticNet"
scale = "ss"
train = elasticnet_130s.score(X_train_scaled_ss, y_train)
test = elasticnet_130s.score(X_test_scaled_ss, y_test)
avg = (train+test)/2
row = [model, scale, train, test, avg]
compare_df = compare_df.append(pd.DataFrame([row], columns=compare_df.columns))

compare_df

Unnamed: 0,model,scale_code,train_score,test_score,avg
0,Linear Regression,00,0.770991,0.750459,0.760725
0,Linear Regression,ss,-7.925801,-6.058446,-6.992123
0,Linear Regression,mm,0.770991,0.750459,0.760725
0,Lasso,00,0.749769,0.737326,0.743548
0,Lasso,ss,0.76185,0.748023,0.754936
0,Lasso,mm,0.704809,0.688559,0.696684
0,Ridge,00,0.77099,0.750437,0.760714
0,Ridge,ss,0.770991,0.75046,0.760726
0,Ridge,mm,0.770937,0.750284,0.760611
0,ElasticNet,00,0.759048,0.746144,0.752596


In [18]:
#############################################
#130.00    ElasticNet mm scale              #
#############################################

#a 
# from sklearn.linear_model import ElasticNet
elasticnet_130m = ElasticNet(alpha=.01).fit(X_train_scaled_mm, y_train)
predictions_130m = elasticnet_130m.predict(X_test_scaled_mm)

#b
model = "ElasticNet"
scale = "mm"
train = elasticnet_130m.score(X_train_scaled_mm, y_train)
test = elasticnet_130m.score(X_test_scaled_mm, y_test)
avg = (train+test)/2
row = [model, scale, train, test, avg]
compare_df = compare_df.append(pd.DataFrame([row], columns=compare_df.columns))

compare_df

Unnamed: 0,model,scale_code,train_score,test_score,avg
0,Linear Regression,00,0.770991,0.750459,0.760725
0,Linear Regression,ss,-7.925801,-6.058446,-6.992123
0,Linear Regression,mm,0.770991,0.750459,0.760725
0,Lasso,00,0.749769,0.737326,0.743548
0,Lasso,ss,0.76185,0.748023,0.754936
0,Lasso,mm,0.704809,0.688559,0.696684
0,Ridge,00,0.77099,0.750437,0.760714
0,Ridge,ss,0.770991,0.75046,0.760726
0,Ridge,mm,0.770937,0.750284,0.760611
0,ElasticNet,00,0.759048,0.746144,0.752596


In [19]:
################################################
#140.00    LogisticRegression no scale         #
################################################

#a 
from sklearn.linear_model import LogisticRegression
classifier_140 = LogisticRegression()
classifier_140.fit(X_train, y_train)
predictions_140 = classifier_140.predict(X_test)
#print(classifier140)

#b
model = "LogisticRegression"
scale = "00"
train = classifier_140.score(X_train, y_train)
test = classifier_140.score(X_test, y_test)
avg = (train+test)/2
row = [model, scale, train, test, avg]
compare_df = compare_df.append(pd.DataFrame([row], columns=compare_df.columns))

compare_df

Unnamed: 0,model,scale_code,train_score,test_score,avg
0,Linear Regression,00,0.770991,0.750459,0.760725
0,Linear Regression,ss,-7.925801,-6.058446,-6.992123
0,Linear Regression,mm,0.770991,0.750459,0.760725
0,Lasso,00,0.749769,0.737326,0.743548
0,Lasso,ss,0.76185,0.748023,0.754936
0,Lasso,mm,0.704809,0.688559,0.696684
0,Ridge,00,0.77099,0.750437,0.760714
0,Ridge,ss,0.770991,0.75046,0.760726
0,Ridge,mm,0.770937,0.750284,0.760611
0,ElasticNet,00,0.759048,0.746144,0.752596


In [20]:
################################################
#140.ss    LogisticRegression ss scale         #
################################################

#a 
# from sklearn.linear_model import LogisticRegression
classifier_140s = LogisticRegression()
classifier_140s.fit(X_train_scaled_ss, y_train)
predictions_140s = classifier_140s.predict(X_test_scaled_ss)

#b
model = "LogisticRegression"
scale = "ss"
train = classifier_140s.score(X_train_scaled_ss, y_train)
test = classifier_140s.score(X_test_scaled_ss, y_test)
avg = (train+test)/2
row = [model, scale, train, test, avg]
compare_df = compare_df.append(pd.DataFrame([row], columns=compare_df.columns))

compare_df

Unnamed: 0,model,scale_code,train_score,test_score,avg
0,Linear Regression,00,0.770991,0.750459,0.760725
0,Linear Regression,ss,-7.925801,-6.058446,-6.992123
0,Linear Regression,mm,0.770991,0.750459,0.760725
0,Lasso,00,0.749769,0.737326,0.743548
0,Lasso,ss,0.76185,0.748023,0.754936
0,Lasso,mm,0.704809,0.688559,0.696684
0,Ridge,00,0.77099,0.750437,0.760714
0,Ridge,ss,0.770991,0.75046,0.760726
0,Ridge,mm,0.770937,0.750284,0.760611
0,ElasticNet,00,0.759048,0.746144,0.752596


In [21]:
################################################
#140.mm    LogisticRegression mm scale         #
################################################

#a 
# from sklearn.linear_model import LogisticRegression
classifier_140m = LogisticRegression()
classifier_140m.fit(X_train_scaled_mm, y_train)
predictions_140m = classifier_140m.predict(X_test_scaled_mm)

#b
model = "LogisticRegression"
scale = "mm"
train = classifier_140m.score(X_train_scaled_mm, y_train)
test = classifier_140m.score(X_test_scaled_mm, y_test)
avg = (train+test)/2
row = [model, scale, train, test, avg]
compare_df = compare_df.append(pd.DataFrame([row], columns=compare_df.columns))

compare_df

Unnamed: 0,model,scale_code,train_score,test_score,avg
0,Linear Regression,00,0.770991,0.750459,0.760725
0,Linear Regression,ss,-7.925801,-6.058446,-6.992123
0,Linear Regression,mm,0.770991,0.750459,0.760725
0,Lasso,00,0.749769,0.737326,0.743548
0,Lasso,ss,0.76185,0.748023,0.754936
0,Lasso,mm,0.704809,0.688559,0.696684
0,Ridge,00,0.77099,0.750437,0.760714
0,Ridge,ss,0.770991,0.75046,0.760726
0,Ridge,mm,0.770937,0.750284,0.760611
0,ElasticNet,00,0.759048,0.746144,0.752596


In [22]:
################################################
#150.00    DecisionTree no scale         #
################################################

#a 
from sklearn import tree
clf_150 = tree.DecisionTreeClassifier()
clf_150 = clf_150.fit(X_train, y_train)
predictions_150 = clf_150.predict(X_test)

#b
model = "DecisionTree"
scale = "00"
train = clf_150.score(X_train, y_train)
test = clf_150.score(X_test, y_test)
avg = (train+test)/2
row = [model, scale, train, test, avg]
compare_df = compare_df.append(pd.DataFrame([row], columns=compare_df.columns))

compare_df

Unnamed: 0,model,scale_code,train_score,test_score,avg
0,Linear Regression,00,0.770991,0.750459,0.760725
0,Linear Regression,ss,-7.925801,-6.058446,-6.992123
0,Linear Regression,mm,0.770991,0.750459,0.760725
0,Lasso,00,0.749769,0.737326,0.743548
0,Lasso,ss,0.76185,0.748023,0.754936
0,Lasso,mm,0.704809,0.688559,0.696684
0,Ridge,00,0.77099,0.750437,0.760714
0,Ridge,ss,0.770991,0.75046,0.760726
0,Ridge,mm,0.770937,0.750284,0.760611
0,ElasticNet,00,0.759048,0.746144,0.752596


In [23]:
################################################
#150.ss    DecisionTree ss scale         #
################################################

#a 
# from sklearn import tree
clf_150s = tree.DecisionTreeClassifier()
clf_150s = clf_150s.fit(X_train_scaled_ss, y_train)
predictions_150s = clf_150s.predict(X_test_scaled_ss)

#b
model = "DecisionTree"
scale = "ss"
train = clf_150s.score(X_train_scaled_ss, y_train)
test = clf_150s.score(X_test_scaled_ss, y_test)
avg = (train+test)/2
row = [model, scale, train, test, avg]
compare_df = compare_df.append(pd.DataFrame([row], columns=compare_df.columns))

compare_df

Unnamed: 0,model,scale_code,train_score,test_score,avg
0,Linear Regression,00,0.770991,0.750459,0.760725
0,Linear Regression,ss,-7.925801,-6.058446,-6.992123
0,Linear Regression,mm,0.770991,0.750459,0.760725
0,Lasso,00,0.749769,0.737326,0.743548
0,Lasso,ss,0.76185,0.748023,0.754936
0,Lasso,mm,0.704809,0.688559,0.696684
0,Ridge,00,0.77099,0.750437,0.760714
0,Ridge,ss,0.770991,0.75046,0.760726
0,Ridge,mm,0.770937,0.750284,0.760611
0,ElasticNet,00,0.759048,0.746144,0.752596


In [24]:
################################################
#150.mm    DecisionTree mm scale         #
################################################

#a 
# from sklearn import tree
clf_150m = tree.DecisionTreeClassifier()
clf_150m = clf_150m.fit(X_train_scaled_mm, y_train)
predictions_150m = clf_150m.predict(X_test_scaled_mm)

#b
model = "DecisionTree"
scale = "mm"
train = clf_150m.score(X_train_scaled_mm, y_train)
test = clf_150m.score(X_test_scaled_mm, y_test)
avg = (train+test)/2
row = [model, scale, train, test, avg]
compare_df = compare_df.append(pd.DataFrame([row], columns=compare_df.columns))

compare_df

Unnamed: 0,model,scale_code,train_score,test_score,avg
0,Linear Regression,00,0.770991,0.750459,0.760725
0,Linear Regression,ss,-7.925801,-6.058446,-6.992123
0,Linear Regression,mm,0.770991,0.750459,0.760725
0,Lasso,00,0.749769,0.737326,0.743548
0,Lasso,ss,0.76185,0.748023,0.754936
0,Lasso,mm,0.704809,0.688559,0.696684
0,Ridge,00,0.77099,0.750437,0.760714
0,Ridge,ss,0.770991,0.75046,0.760726
0,Ridge,mm,0.770937,0.750284,0.760611
0,ElasticNet,00,0.759048,0.746144,0.752596


In [25]:
################################################
#160.00    RandomForest no scale         #
################################################

#a 
from sklearn.ensemble import RandomForestClassifier
rf_160 = RandomForestClassifier(n_estimators=200)
rf_160 = rf_160.fit(X_train, y_train)
predictions_160 = rf_160.predict(X_test)

#b
model = "RandomForest"
scale = "00"
train = rf_160.score(X_train, y_train)
test = rf_160.score(X_test, y_test)
avg = (train+test)/2
row = [model, scale, train, test, avg]
compare_df = compare_df.append(pd.DataFrame([row], columns=compare_df.columns))

compare_df

Unnamed: 0,model,scale_code,train_score,test_score,avg
0,Linear Regression,00,0.770991,0.750459,0.760725
0,Linear Regression,ss,-7.925801,-6.058446,-6.992123
0,Linear Regression,mm,0.770991,0.750459,0.760725
0,Lasso,00,0.749769,0.737326,0.743548
0,Lasso,ss,0.76185,0.748023,0.754936
0,Lasso,mm,0.704809,0.688559,0.696684
0,Ridge,00,0.77099,0.750437,0.760714
0,Ridge,ss,0.770991,0.75046,0.760726
0,Ridge,mm,0.770937,0.750284,0.760611
0,ElasticNet,00,0.759048,0.746144,0.752596


In [26]:
################################################
#160.ss    RandomForest ss scale         #
################################################

#a 
# from sklearn.ensemble import RandomForestClassifier
rf_160s = RandomForestClassifier(n_estimators=200)
rf_160s = rf_160s.fit(X_train_scaled_ss, y_train)
predictions_160s = rf_160s.predict(X_test_scaled_ss)

#b
model = "RandomForest"
scale = "ss"
train = rf_160s.score(X_train_scaled_ss, y_train)
test = rf_160s.score(X_test_scaled_ss, y_test)
avg = (train+test)/2
row = [model, scale, train, test, avg]
compare_df = compare_df.append(pd.DataFrame([row], columns=compare_df.columns))

compare_df

Unnamed: 0,model,scale_code,train_score,test_score,avg
0,Linear Regression,00,0.770991,0.750459,0.760725
0,Linear Regression,ss,-7.925801,-6.058446,-6.992123
0,Linear Regression,mm,0.770991,0.750459,0.760725
0,Lasso,00,0.749769,0.737326,0.743548
0,Lasso,ss,0.76185,0.748023,0.754936
0,Lasso,mm,0.704809,0.688559,0.696684
0,Ridge,00,0.77099,0.750437,0.760714
0,Ridge,ss,0.770991,0.75046,0.760726
0,Ridge,mm,0.770937,0.750284,0.760611
0,ElasticNet,00,0.759048,0.746144,0.752596


In [27]:
################################################
#160.mm    RandomForest mm scale         #
################################################

#a 
# from sklearn.ensemble import RandomForestClassifier
rf_160m = RandomForestClassifier(n_estimators=200)
rf_160m = rf_160m.fit(X_train_scaled_mm, y_train)
predictions_160m = rf_160m.predict(X_test_scaled_mm)

#b
model = "RandomForest"
scale = "mm"
train = rf_160m.score(X_train_scaled_mm, y_train)
test = rf_160m.score(X_test_scaled_mm, y_test)
avg = (train+test)/2
row = [model, scale, train, test, avg]
compare_df = compare_df.append(pd.DataFrame([row], columns=compare_df.columns))

compare_df

Unnamed: 0,model,scale_code,train_score,test_score,avg
0,Linear Regression,00,0.770991,0.750459,0.760725
0,Linear Regression,ss,-7.925801,-6.058446,-6.992123
0,Linear Regression,mm,0.770991,0.750459,0.760725
0,Lasso,00,0.749769,0.737326,0.743548
0,Lasso,ss,0.76185,0.748023,0.754936
0,Lasso,mm,0.704809,0.688559,0.696684
0,Ridge,00,0.77099,0.750437,0.760714
0,Ridge,ss,0.770991,0.75046,0.760726
0,Ridge,mm,0.770937,0.750284,0.760611
0,ElasticNet,00,0.759048,0.746144,0.752596


In [28]:
##################################################################
#170.00    KNN no scale                                          #
#    - using neighbors = 33 as that is where test/train cross.   #
##################################################################

#a 
from sklearn.neighbors import KNeighborsClassifier
knn_170 = KNeighborsClassifier(n_neighbors=33)
knn_170.fit(X_train, y_train)
predictions_170 = knn_170.predict(X_train)

#b
model = "KNN"
scale = "00"
train = knn_170.score(X_train, y_train)
test = knn_170.score(X_test, y_test)
avg = (train+test)/2
row = [model, scale, train, test, avg]
compare_df = compare_df.append(pd.DataFrame([row], columns=compare_df.columns))

compare_df

Unnamed: 0,model,scale_code,train_score,test_score,avg
0,Linear Regression,00,0.770991,0.750459,0.760725
0,Linear Regression,ss,-7.925801,-6.058446,-6.992123
0,Linear Regression,mm,0.770991,0.750459,0.760725
0,Lasso,00,0.749769,0.737326,0.743548
0,Lasso,ss,0.76185,0.748023,0.754936
0,Lasso,mm,0.704809,0.688559,0.696684
0,Ridge,00,0.77099,0.750437,0.760714
0,Ridge,ss,0.770991,0.75046,0.760726
0,Ridge,mm,0.770937,0.750284,0.760611
0,ElasticNet,00,0.759048,0.746144,0.752596


In [29]:
################################################
#170.ss    KNN ss scale                        #
################################################

#a 
# from sklearn.neighbors import KNeighborsClassifier
knn_170s = KNeighborsClassifier(n_neighbors=33)
knn_170s.fit(X_train_scaled_ss, y_train)
predictions_170s = knn_170s.predict(X_train_scaled_ss)

#b
model = "KNN"
scale = "ss"
train = knn_170s.score(X_train_scaled_ss, y_train)
test = knn_170s.score(X_test_scaled_ss, y_test)
avg = (train+test)/2
row = [model, scale, train, test, avg]
compare_df = compare_df.append(pd.DataFrame([row], columns=compare_df.columns))

compare_df

Unnamed: 0,model,scale_code,train_score,test_score,avg
0,Linear Regression,00,0.770991,0.750459,0.760725
0,Linear Regression,ss,-7.925801,-6.058446,-6.992123
0,Linear Regression,mm,0.770991,0.750459,0.760725
0,Lasso,00,0.749769,0.737326,0.743548
0,Lasso,ss,0.76185,0.748023,0.754936
0,Lasso,mm,0.704809,0.688559,0.696684
0,Ridge,00,0.77099,0.750437,0.760714
0,Ridge,ss,0.770991,0.75046,0.760726
0,Ridge,mm,0.770937,0.750284,0.760611
0,ElasticNet,00,0.759048,0.746144,0.752596


In [30]:
################################################
#170.mm    KNN mm scale                        #
################################################

#a 
# from sklearn.neighbors import KNeighborsClassifier
knn_170m = KNeighborsClassifier(n_neighbors=33)
knn_170m.fit(X_train_scaled_mm, y_train)
predictions_170m = knn_170m.predict(X_train_scaled_mm)

#b
model = "KNN"
scale = "mm"
train = knn_170m.score(X_train_scaled_mm, y_train)
test = knn_170m.score(X_test_scaled_mm, y_test)
avg = (train+test)/2
row = [model, scale, train, test, avg]
compare_df = compare_df.append(pd.DataFrame([row], columns=compare_df.columns))

compare_df

Unnamed: 0,model,scale_code,train_score,test_score,avg
0,Linear Regression,00,0.770991,0.750459,0.760725
0,Linear Regression,ss,-7.925801,-6.058446,-6.992123
0,Linear Regression,mm,0.770991,0.750459,0.760725
0,Lasso,00,0.749769,0.737326,0.743548
0,Lasso,ss,0.76185,0.748023,0.754936
0,Lasso,mm,0.704809,0.688559,0.696684
0,Ridge,00,0.77099,0.750437,0.760714
0,Ridge,ss,0.770991,0.75046,0.760726
0,Ridge,mm,0.770937,0.750284,0.760611
0,ElasticNet,00,0.759048,0.746144,0.752596


In [32]:
################################################
#180.00   SVC no scale                         #
#   ** this routine takes ~15 minutes to run   #
#       and returns the following row...       #
#  SVC	00	0.957014	0.950226	0.953620   #
################################################

# #a 
# from sklearn.svm import SVC 
# model_180 = SVC(kernel='linear')
# model_180.fit(X_train, y_train)
# predictions_180 = model_180.predict(X_train)

# #b
# model = "SVC"
# scale = "00"
# train = model_180.score(X_train, y_train)
# test = model_180.score(X_test, y_test)
# avg = (train+test)/2
# row = [model, scale, train, test, avg]
# compare_df = compare_df.append(pd.DataFrame([row], columns=compare_df.columns))

# compare_df

Unnamed: 0,model,scale_code,train_score,test_score,avg
0,Linear Regression,00,0.770991,0.750459,0.760725
0,Linear Regression,ss,-7.925801,-6.058446,-6.992123
0,Linear Regression,mm,0.770991,0.750459,0.760725
0,Lasso,00,0.749769,0.737326,0.743548
0,Lasso,ss,0.76185,0.748023,0.754936
0,Lasso,mm,0.704809,0.688559,0.696684
0,Ridge,00,0.77099,0.750437,0.760714
0,Ridge,ss,0.770991,0.75046,0.760726
0,Ridge,mm,0.770937,0.750284,0.760611
0,ElasticNet,00,0.759048,0.746144,0.752596


In [33]:
##########################################
#180.ss    SVC ss scale                 #
##########################################

#a 
from sklearn.svm import SVC 
model_180s = SVC(kernel='linear')
model_180s.fit(X_train_scaled_ss, y_train)
predictions_180s = model_180s.predict(X_train_scaled_ss)

#b
model = "SVC"
scale = "ss"
train = model_180s.score(X_train_scaled_ss, y_train)
test = model_180s.score(X_test_scaled_ss, y_test)
avg = (train+test)/2
row = [model, scale, train, test, avg]
compare_df = compare_df.append(pd.DataFrame([row], columns=compare_df.columns))

compare_df

Unnamed: 0,model,scale_code,train_score,test_score,avg
0,Linear Regression,00,0.770991,0.750459,0.760725
0,Linear Regression,ss,-7.925801,-6.058446,-6.992123
0,Linear Regression,mm,0.770991,0.750459,0.760725
0,Lasso,00,0.749769,0.737326,0.743548
0,Lasso,ss,0.76185,0.748023,0.754936
0,Lasso,mm,0.704809,0.688559,0.696684
0,Ridge,00,0.77099,0.750437,0.760714
0,Ridge,ss,0.770991,0.75046,0.760726
0,Ridge,mm,0.770937,0.750284,0.760611
0,ElasticNet,00,0.759048,0.746144,0.752596


In [34]:
##########################################
#180.mm    SVC mm scale                 #
##########################################

#a 
# from sklearn.svm import SVC 
model_180m = SVC(kernel='linear')
model_180m.fit(X_train_scaled_mm, y_train)
predictions_180m = model_180m.predict(X_train_scaled_mm)

#b
model = "SVC"
scale = "00"
train = model_180m.score(X_train_scaled_mm, y_train)
test = model_180m.score(X_test_scaled_mm, y_test)
avg = (train+test)/2
row = [model, scale, train, test, avg]
compare_df = compare_df.append(pd.DataFrame([row], columns=compare_df.columns))

compare_df

Unnamed: 0,model,scale_code,train_score,test_score,avg
0,Linear Regression,00,0.770991,0.750459,0.760725
0,Linear Regression,ss,-7.925801,-6.058446,-6.992123
0,Linear Regression,mm,0.770991,0.750459,0.760725
0,Lasso,00,0.749769,0.737326,0.743548
0,Lasso,ss,0.76185,0.748023,0.754936
0,Lasso,mm,0.704809,0.688559,0.696684
0,Ridge,00,0.77099,0.750437,0.760714
0,Ridge,ss,0.770991,0.75046,0.760726
0,Ridge,mm,0.770937,0.750284,0.760611
0,ElasticNet,00,0.759048,0.746144,0.752596


In [36]:
compare_df.to_csv("../data_out/methods_compare.csv", index=False, header=True)

In [39]:
compare_df.sort_values(["avg"], ascending = False, inplace=True)  
compare_df

Unnamed: 0,model,scale_code,train_score,test_score,avg
0,DecisionTree,mm,1.0,0.984917,0.992459
0,DecisionTree,ss,1.0,0.984163,0.992081
0,DecisionTree,00,1.0,0.984163,0.992081
0,SVC,ss,0.992459,0.986425,0.989442
0,RandomForest,ss,1.0,0.978884,0.989442
0,LogisticRegression,ss,0.992961,0.985671,0.989316
0,LogisticRegression,mm,0.991956,0.985671,0.988813
0,KNN,mm,0.991956,0.985671,0.988813
0,SVC,00,0.991956,0.985671,0.988813
0,RandomForest,mm,1.0,0.977376,0.988688
