# Jude's Automated System (JUDAS)

Author: Jude Michael 2019, MSDS 2019

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import seaborn as sns

%matplotlib inline

In [2]:
# df = pd.read_excel('titanic.xls')
# df.head()

In [3]:
df = sns.load_dataset('titanic')
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


## Data Cleaning

In [4]:
# df_clean = df.drop(['name', 'cabin', 'ticket'], axis=1)
df_clean = df.drop(['embark_town', 'alive'], axis=1)
df_clean = df_clean.dropna(axis=0)

In [5]:
from collections import Counter

# state_counts = Counter(df_clean['target'])
state_counts = Counter(df_clean['survived'])
df_state = pd.DataFrame.from_dict(state_counts, orient='index')

num=(df_state[0]/df_state[0].sum())**2
print("Population per class: {}\n".format(df_state))
print("1.25 * Proportion Chance Criterion: {}%".format(1.25*100*num.sum()))

Population per class:      0
1  123
0   59

1.25 * Proportion Chance Criterion: 70.22853520106267%


In [6]:
df_transformed = df_clean.copy()
log_features = ['fare']
for feature in log_features:
    df_transformed[feature] = np.log(df_transformed[feature]+1)
    
df_transformed.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,alone
1,1,1,female,38.0,1,0,4.280593,C,First,woman,False,C,False
3,1,1,female,35.0,1,0,3.990834,S,First,woman,False,C,False
6,0,1,male,54.0,0,0,3.967694,S,First,man,True,E,True
10,1,3,female,4.0,1,1,2.873565,S,Third,child,False,G,False
11,1,1,female,58.0,0,0,3.316003,S,First,woman,False,C,True


In [7]:
X = pd.get_dummies(df_transformed.drop('survived', axis=1), drop_first=True)
y = df_clean['survived']
X.shape

(182, 20)

## Machine Learning

### Normalization

In [8]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

X_norm = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
X_norm.head()

Unnamed: 0,pclass,age,sibsp,parch,fare,adult_male,alone,sex_male,embarked_Q,embarked_S,class_Second,class_Third,who_man,who_woman,deck_B,deck_C,deck_D,deck_E,deck_F,deck_G
0,0.0,0.468892,0.333333,0.0,0.685892,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.0,0.430956,0.333333,0.0,0.639463,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.0,0.671219,0.0,0.0,0.635755,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,1.0,0.038948,0.333333,0.25,0.460439,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.721801,0.0,0.0,0.531333,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0


### Classification

In [9]:
from judas.classification.automate import Judas as JudasClassifier

trials = 20
judasc = JudasClassifier()
params = [
    ('knn', trials, range(1,30)),
    ('logistic', 'l1', trials),
    ('logistic', 'l2', trials),
    ('svm', 'l1', trials),
    ('svm', 'l2', trials),
    ('nsvm-rbf', trials),
    ('ensemble-decisiontree', trials, range(1,20)),
    ('ensemble-randomforest', trials, range(1,20)),
    ('ensemble-gbm', trials, range(1,10)),
]

judasc.automate(X_norm,y,params)

knn, n neighbors=20
Computing: [########################################] Seed 20/20 | Execution time: 11.06s
logistic, reg=l1
Computing: [########################################] Seed 20/20 | Execution time: 5.10s
logistic, reg=l2
Computing: [########################################] Seed 20/20 | Execution time: 2.34s
svm, reg=l1
Computing: [########################################] Seed 20/20 | Execution time: 3.32s
svm, reg=l2
Computing: [########################################] Seed 20/20 | Execution time: 3.97s
nsvm-rbf
Computing: [########################################] Seed 20/20 | Execution time: 3.86s
ensemble-decisiontree, max depth=range(1, 20)
Computing: [########################################] Seed 20/20 | Execution time: 2.43s
ensemble-randomforest, n estimators=range(1, 20)
Computing: [########################################] Seed 20/20 | Execution time: 9.00s
ensemble-gbm, max depth=range(1, 10)
Computing: [########################################] Seed 20/20 | E

In [10]:
judasc.score()

Unnamed: 0,Machine Learning Method,Test Accuracy,Best Parameter,Top Predictor Variable
0,kNN,76.41%,N_Neighbor = 24,
1,Logistic (l1),74.89%,C = 0.4,who_woman
2,Logistic (l2),74.35%,C = 1,who_woman
3,Linear SVM (l1),75.65%,C = 0.2,fare
4,Linear SVM (l2),75.22%,C = 0.1,fare
5,Nonlinear SVM (RBF),77.17%,gamma = 0.01,
6,Decision Trees,79.02%,depth = 1,adult_male
7,Random Forest,75.65%,n-estimator = 10,age
8,Gradient Boosting Method,75.87%,depth = 5,fare


### Regression

I know that this is a classification problem, but let's use the regression library to show that the library works.

In [11]:
from judas.regression.automate import Judas as JudasRegressor

trials = 5
judasr = JudasRegressor()
params = [
    ('knn', trials, range(1,30)),
    ('linear', trials),
    ('lasso', trials),
    ('ridge', trials),
    ('svm', trials), # too slow
    ('svm-rbf', trials), # too slow
    ('svm-poly', trials), # too slow
    ('ensemble-decisiontree', trials, range(1,20)),
    ('ensemble-randomforest', trials, range(1,20)),
    ('ensemble-gbm', trials, range(1,10)),
]

judasr.automate(X_norm,y,params)

knn, n neighbors=5
Computing: [########################################] Seed 5/5 | Execution time: 1.44s
linear
Computing: [########################################] Seed 5/5 | Execution time: 0.06s
lasso
Computing: [########################################] Seed 5/5 | Execution time: 0.63s
ridge
Computing: [########################################] Seed 5/5 | Execution time: 0.72s
svm
Computing: [########################################] Seed 5/5 | Execution time: 68.76s
svm-rbf
Computing: [########################################] Seed 5/5 | Execution time: 1.24s
svm-poly
Computing: [########################################] Seed 5/5 | Execution time: 0.74s
ensemble-decisiontree, max depth=range(1, 20)
Computing: [########################################] Seed 5/5 | Execution time: 0.58s
ensemble-randomforest, n estimators=range(1, 20)
Computing: [########################################] Seed 5/5 | Execution time: 2.12s
ensemble-gbm, max depth=range(1, 10)
Computing: [#############

In [12]:
judasr.score()

Unnamed: 0,Machine Learning Method,Test Accuracy,Best Parameter,Top Predictor Variable
0,kNN,22.43%,N_Neighbor = 7,
1,Linear,18.39%,,pclass
2,Lasso,21.73%,alpha = 0.01,pclass
3,Ridge,23.61%,alpha = 20,pclass
4,SVR (linear),12.12%,C = 0.01,adult_male
5,SVR (rbf),14.00%,C = 0.1,
6,SVR (poly),18.66%,C = 3,
7,Decision Trees,20.03%,depth = 1,adult_male
8,Random Forest,16.95%,n-estimator = 16,fare
9,Gradient Boosting Method,23.36%,depth = 2,adult_male


### Test again using another 'dataset'

In [13]:
X1 = np.arange(100)
X2 = np.random.uniform(0,100,100)
X = [X1, X2]
y = X1 + X2
# y = pd.Series(y,name='Target')

X = np.concatenate([X, [np.zeros(100)]*4], axis=0).T
X = pd.DataFrame(X, columns=['col{}'.format(i) for i in range(X.shape[1])])
X.head()

Unnamed: 0,col0,col1,col2,col3,col4,col5
0,0.0,45.627404,0.0,0.0,0.0,0.0
1,1.0,82.114706,0.0,0.0,0.0,0.0
2,2.0,44.463121,0.0,0.0,0.0,0.0
3,3.0,17.514817,0.0,0.0,0.0,0.0
4,4.0,58.984383,0.0,0.0,0.0,0.0


In [14]:
scaler = MinMaxScaler()

X_norm = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
X_norm.head()

Unnamed: 0,col0,col1,col2,col3,col4,col5
0,0.0,0.455486,0.0,0.0,0.0,0.0
1,0.010101,0.821298,0.0,0.0,0.0,0.0
2,0.020202,0.443813,0.0,0.0,0.0,0.0
3,0.030303,0.173637,0.0,0.0,0.0,0.0
4,0.040404,0.589399,0.0,0.0,0.0,0.0


In [15]:
trials = 50
judasr = JudasRegressor()
params = [
    ('knn', trials, range(1,30)),
    ('linear', trials),
    ('lasso', trials),
    ('ridge', trials),
    ('svm', trials), # too slow
    ('svm-rbf', trials), # too slow
    ('svm-poly', trials), # too slow
    ('ensemble-decisiontree', trials, range(1,20)),
    ('ensemble-randomforest', trials, range(1,20)),
    ('ensemble-gbm', trials, range(1,10)),
]

judasr.automate(X_norm,y,params)

knn, n neighbors=50
Computing: [########################################] Seed 50/50 | Execution time: 10.13s
linear
Computing: [########################################] Seed 50/50 | Execution time: 0.52s
lasso
Computing: [########################################] Seed 50/50 | Execution time: 5.21s
ridge
Computing: [########################################] Seed 50/50 | Execution time: 3.57s
svm
Computing: [########################################] Seed 50/50 | Execution time: 4.55s
svm-rbf
Computing: [########################################] Seed 50/50 | Execution time: 7.62s
svm-poly
Computing: [########################################] Seed 50/50 | Execution time: 6.04s
ensemble-decisiontree, max depth=range(1, 20)
Computing: [########################################] Seed 50/50 | Execution time: 5.30s
ensemble-randomforest, n estimators=range(1, 20)
Computing: [########################################] Seed 50/50 | Execution time: 18.60s
ensemble-gbm, max depth=range(1, 10)
Compu

In [16]:
judasr.score()

Unnamed: 0,Machine Learning Method,Test Accuracy,Best Parameter,Top Predictor Variable
0,kNN,97.24%,N_Neighbor = 2,
1,Linear,100.00%,,col0
2,Lasso,100.00%,alpha = 1e-08,col0
3,Ridge,100.00%,alpha = 1e-08,col0
4,SVR (linear),100.00%,C = 300,col1
5,SVR (rbf),100.00%,C = 5000,
6,SVR (poly),84.78%,C = 5000,
7,Decision Trees,91.80%,depth = 9,col0
8,Random Forest,95.67%,n-estimator = 19,col0
9,Gradient Boosting Method,79.40%,depth = 7,col0
