# ML Pocket Reference Book

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import (ensemble,model_selection,preprocessing,tree)
from sklearn.metrics import (auc,confusion_matrix,roc_auc_score,roc_curve)
from sklearn.model_selection import (train_test_split,StratifiedKFold)
from yellowbrick.classifier import (ConfusionMatrix,ROCAUC)
from yellowbrick.model_selection import (LearningCurve)

In [2]:
df  = pd.read_excel('titanic3.xlsx')
orig_df = df

In [3]:
 display (df.head())

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [4]:
#### AMAZING REPORT about the dataset ####
# import pandas_profiling
# pandas_profiling.ProfileReport(df)

In [5]:
df.shape

(1309, 14)

In [6]:
df.describe().iloc[:, : 3] # notice iloc - all rows, and columns 0 to 2

Unnamed: 0,pclass,survived,age
count,1309.0,1309.0,1046.0
mean,2.294882,0.381971,29.881135
std,0.837836,0.486055,14.4135
min,1.0,0.0,0.1667
25%,2.0,0.0,21.0
50%,3.0,0.0,28.0
75%,3.0,1.0,39.0
max,3.0,1.0,80.0


In [7]:
df.loc[df['age'] > 18, 'age']

0       29.0
3       30.0
4       25.0
5       48.0
6       63.0
        ... 
1299    27.0
1301    45.5
1306    26.5
1307    27.0
1308    29.0
Name: age, Length: 853, dtype: float64

In [8]:
# find missing value and their percentage for each variable
df.isnull().mean()

pclass       0.000000
survived     0.000000
name         0.000000
sex          0.000000
age          0.200917
sibsp        0.000000
parch        0.000000
ticket       0.000000
fare         0.000764
cabin        0.774637
embarked     0.001528
boat         0.628724
body         0.907563
home.dest    0.430863
dtype: float64

In [9]:
# find missing value and their total number for each variable
df.isnull().sum()

pclass          0
survived        0
name            0
sex             0
age           263
sibsp           0
parch           0
ticket          0
fare            1
cabin        1014
embarked        2
boat          823
body         1188
home.dest     564
dtype: int64

In [10]:
# body variable is leaks data/info because obviously body is an indication if someone died and we should not teach the model
# that.  We need to remove this variable
# likewise about the boat column -is LEAKS data because a passenger survived if they are on a boat


In [11]:
mask = df.isnull().any(axis=1) # finding some rows with missing data
mask

0       True
1       True
2       True
3       True
4       True
        ... 
1304    True
1305    True
1306    True
1307    True
1308    True
Length: 1309, dtype: bool

In [12]:
df.loc[:,'age']

0       29.0000
1        0.9167
2        2.0000
3       30.0000
4       25.0000
         ...   
1304    14.5000
1305        NaN
1306    26.5000
1307    27.0000
1308    29.0000
Name: age, Length: 1309, dtype: float64

In [13]:
display (df); df.describe(); df.isnull().mean()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0000,0,0,24160,211.3375,B5,S,2,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.5500,C22 C26,S,11,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0000,1,2,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0000,1,2,113781,151.5500,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0000,1,2,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,3,0,"Zabour, Miss. Hileni",female,14.5000,1,0,2665,14.4542,,C,,328.0,
1305,3,0,"Zabour, Miss. Thamine",female,,1,0,2665,14.4542,,C,,,
1306,3,0,"Zakarian, Mr. Mapriededer",male,26.5000,0,0,2656,7.2250,,C,,304.0,
1307,3,0,"Zakarian, Mr. Ortin",male,27.0000,0,0,2670,7.2250,,C,,,


pclass       0.000000
survived     0.000000
name         0.000000
sex          0.000000
age          0.200917
sibsp        0.000000
parch        0.000000
ticket       0.000000
fare         0.000764
cabin        0.774637
embarked     0.001528
boat         0.628724
body         0.907563
home.dest    0.430863
dtype: float64

In [14]:
df.isnull().mean()

pclass       0.000000
survived     0.000000
name         0.000000
sex          0.000000
age          0.200917
sibsp        0.000000
parch        0.000000
ticket       0.000000
fare         0.000764
cabin        0.774637
embarked     0.001528
boat         0.628724
body         0.907563
home.dest    0.430863
dtype: float64

In [15]:
df = df.drop(
    columns=[
        "name",
        "ticket",
        "home.dest",
        "boat",
        "body",
        "cabin",
    ]
)
""" # droping unwanted columns that do not add value or they LEAK"""

' # droping unwanted columns that do not add value or they LEAK'

In [16]:
df = pd.get_dummies(df)

In [17]:
df.drop(columns='sex_female') # drop the perfectly inverse column - not good for statistics analysis
# you can also use to remove invesrse col. : df= pd.get_dummies(df, drop_first = True)

Unnamed: 0,pclass,survived,age,sibsp,parch,fare,sex_male,embarked_C,embarked_Q,embarked_S
0,1,1,29.0000,0,0,211.3375,0,0,0,1
1,1,1,0.9167,1,2,151.5500,1,0,0,1
2,1,0,2.0000,1,2,151.5500,0,0,0,1
3,1,0,30.0000,1,2,151.5500,1,0,0,1
4,1,0,25.0000,1,2,151.5500,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...
1304,3,0,14.5000,1,0,14.4542,0,1,0,0
1305,3,0,,1,0,14.4542,0,1,0,0
1306,3,0,26.5000,0,0,7.2250,1,1,0,0
1307,3,0,27.0000,0,0,7.2250,1,1,0,0


In [18]:
# dataframe of the features and a series with the lables
X = df.drop(columns="survived")
y = df.survived
display (y)

0       1
1       1
2       0
3       0
4       0
       ..
1304    0
1305    0
1306    0
1307    0
1308    0
Name: survived, Length: 1309, dtype: int64

In [19]:
# splitting the data to train and test sets
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.3, random_state=42)
display (X_train)

Unnamed: 0,pclass,age,sibsp,parch,fare,sex_female,sex_male,embarked_C,embarked_Q,embarked_S
1214,3,,0,0,8.6625,0,1,0,0,1
677,3,26.0,0,0,7.8958,0,1,0,0,1
534,2,19.0,0,0,26.0000,1,0,0,0,1
1174,3,,8,2,69.5500,1,0,0,0,1
864,3,28.0,0,0,7.7750,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...
1095,3,,0,0,7.6292,1,0,0,1,0
1130,3,18.0,0,0,7.7750,1,0,0,0,1
1294,3,28.5,0,0,16.1000,0,1,0,0,1
860,3,26.0,0,0,7.9250,1,0,0,0,1


In [20]:
#### We know need to IMPUTE the data- fill in the missing data points ####
# using regression to estimate missing values
from sklearn.experimental import (enable_iterative_imputer) 
from sklearn import impute
num_cols = ["pclass", "age", "sibsp", "parch", "fare", "sex_female"]
imputer = impute.IterativeImputer()
imputed = imputer.fit_transform(X_train[num_cols])  # the fit is only for the training data !!!!
X_train.loc[:, num_cols] = imputed
imputed = imputer.transform(X_test[num_cols])  # only transform (not fit) cause it is a testing set
X_test.loc[:, num_cols] = imputed
display (X_train)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


Unnamed: 0,pclass,age,sibsp,parch,fare,sex_female,sex_male,embarked_C,embarked_Q,embarked_S
1214,3.0,26.984491,0.0,0.0,8.6625,0.0,1,0,0,1
677,3.0,26.000000,0.0,0.0,7.8958,0.0,1,0,0,1
534,2.0,19.000000,0.0,0.0,26.0000,1.0,0,0,0,1
1174,3.0,0.437798,8.0,2.0,69.5500,1.0,0,0,0,1
864,3.0,28.000000,0.0,0.0,7.7750,1.0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...
1095,3.0,25.011616,0.0,0.0,7.6292,1.0,0,0,1,0
1130,3.0,18.000000,0.0,0.0,7.7750,1.0,0,0,0,1
1294,3.0,28.500000,0.0,0.0,16.1000,0.0,1,0,0,1
860,3.0,26.000000,0.0,0.0,7.9250,1.0,0,0,0,1


### Standardize the data
Many models perform better after this is done, especially those that depend on a distance metric to determine similarity.  Tree models don't. To standardize is translating the data so that it has mean value zero and a SD = 1.
This way models do not treat variables with larger scales as more important than smaller scaled variables.

In [21]:
X_train

Unnamed: 0,pclass,age,sibsp,parch,fare,sex_female,sex_male,embarked_C,embarked_Q,embarked_S
1214,3.0,26.984491,0.0,0.0,8.6625,0.0,1,0,0,1
677,3.0,26.000000,0.0,0.0,7.8958,0.0,1,0,0,1
534,2.0,19.000000,0.0,0.0,26.0000,1.0,0,0,0,1
1174,3.0,0.437798,8.0,2.0,69.5500,1.0,0,0,0,1
864,3.0,28.000000,0.0,0.0,7.7750,1.0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...
1095,3.0,25.011616,0.0,0.0,7.6292,1.0,0,0,1,0
1130,3.0,18.000000,0.0,0.0,7.7750,1.0,0,0,0,1
1294,3.0,28.500000,0.0,0.0,16.1000,0.0,1,0,0,1
860,3.0,26.000000,0.0,0.0,7.9250,1.0,0,0,0,1


In [22]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train.iloc[:,0:5]= scaler.fit_transform(X_train.iloc[ : , 0:5])
#### Important - we do only below transform here on the testing set cause we got the fit parameters from the training set which will 
### will be exactly the same for the testing set (hence no need to fit again- not to get different fit parameters)
X_test.iloc[:,0:5]= scaler.transform(X_test.iloc[ : , 0:5])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [23]:
display (X_test)

Unnamed: 0,pclass,age,sibsp,parch,fare,sex_female,sex_male,embarked_C,embarked_Q,embarked_S
1148,0.825248,0.496095,-0.498616,-0.432553,-0.502719,0.0,1,0,0,1
1049,0.825248,-0.673461,0.425943,0.686709,-0.339518,0.0,1,1,0,0
982,0.825248,-0.128451,-0.498616,-0.432553,-0.488120,0.0,1,0,0,1
808,0.825248,-0.128537,-0.498616,-0.432553,-0.485199,0.0,1,0,0,1
1195,0.825248,-0.128370,-0.498616,-0.432553,-0.490881,0.0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...
325,-0.363317,0.106243,-0.498616,-0.432553,-0.391446,0.0,1,0,0,1
919,0.825248,-0.790416,-0.498616,-0.432553,-0.500745,0.0,1,1,0,0
532,-0.363317,0.963917,-0.498616,-0.432553,-0.391446,0.0,1,0,0,1
1159,0.825248,-0.282937,-0.498616,-0.432553,-0.485199,1.0,0,0,0,1


In [24]:
# creating a base dummy model to compare smarter models
from sklearn.dummy import DummyClassifier
bm = DummyClassifier()
bm.fit(X_train, y_train)
print ("The accuracy of the base-dummy model is:")
bm.score(X_test, y_test)  # accuracy

The accuracy of the base-dummy model is:


0.5114503816793893

In [25]:
from sklearn import metrics
metrics.precision_score(y_test, bm.predict(X_test))

0.4161073825503356

In [26]:
X = pd.concat([X_train, X_test])
y = pd.concat([y_train, y_test])

In [27]:
from sklearn import model_selection
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import (LogisticRegression)
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import (KNeighborsClassifier)
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import (RandomForestClassifier)
# import xgboost as xgb

In [28]:
for model in [DummyClassifier,LogisticRegression,DecisionTreeClassifier,
    KNeighborsClassifier,GaussianNB,SVC,RandomForestClassifier#,xgboost.XGBClassifier
             ]:
    cls = model()
    kfold = model_selection.KFold(n_splits=10, random_state=42)
    s = model_selection.cross_val_score(cls, X, y, scoring="roc_auc", cv=kfold)
    print(f"{model.__name__:22}  AUC: "f"{s.mean():.3f} STD: {s.std():.2f}")



DummyClassifier         AUC: 0.497 STD: 0.05
LogisticRegression      AUC: 0.843 STD: 0.03
DecisionTreeClassifier  AUC: 0.762 STD: 0.04
KNeighborsClassifier    AUC: 0.836 STD: 0.04
GaussianNB              AUC: 0.822 STD: 0.04




SVC                     AUC: 0.841 STD: 0.04
RandomForestClassifier  AUC: 0.833 STD: 0.04


