In [1]:
# Refactoring: Start refactoring and remove code smells one by one. See comments with [refactoring] prefix

In [2]:
cd ..

/home/clean-code-ml


In [3]:
#source: https://www.kaggle.com/bhaveshsk/getting-started-with-titanic-dataset/data
#data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd

#data visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

#machine learning packages
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics

from src.model_training import train_model

In [4]:
train_df = pd.read_csv("./input/train.csv")
test_df = pd.read_csv("./input/test.csv")
df = pd.concat([train_df,test_df])

df

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket
0,22.0,,S,7.2500,"Braund, Mr. Owen Harris",0,1,3,male,1,0.0,A/5 21171
1,38.0,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,female,1,1.0,PC 17599
2,26.0,,S,7.9250,"Heikkinen, Miss. Laina",0,3,3,female,0,1.0,STON/O2. 3101282
3,35.0,C123,S,53.1000,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,female,1,1.0,113803
4,35.0,,S,8.0500,"Allen, Mr. William Henry",0,5,3,male,0,0.0,373450
5,,,Q,8.4583,"Moran, Mr. James",0,6,3,male,0,0.0,330877
6,54.0,E46,S,51.8625,"McCarthy, Mr. Timothy J",0,7,1,male,0,0.0,17463
7,2.0,,S,21.0750,"Palsson, Master. Gosta Leonard",1,8,3,male,3,0.0,349909
8,27.0,,S,11.1333,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",2,9,3,female,0,1.0,347742
9,14.0,,C,30.0708,"Nasser, Mrs. Nicholas (Adele Achem)",0,10,2,female,1,1.0,237736


In [5]:
# process for refactoring

# identify function which i want to extract
# write a test
# - create src/tests/test_preprocessing.py and src/preprocessing.py
# - run tests: nosetests --with-watch --rednose --nologcapture
# - think about interface of this function:
#   - what's the input (a df with a name column)
#   - what's the output (a df with a new column - title)
# - write a test case
#   - define a test class (import unittest, etc)
#   - create input dataframe (can copy data from notebook)
#   - create expected dataframe
#   - from src.my_module import my_func
# - make test pass
#   - since the implementation in the notebook is already working, we can copy that over
# - replace code in notebook with the functions we just defined
# - comment out old code 
# - restart and run notebook just to make sure everything still works (manual "integration" test)
# - remove dead code

In [6]:
# # extract title
# df['Title'] = df.Name.str.extract(' ([A-Za-z]+)\.', expand=False)

# # codify rare titles
# df['Title'] = df['Title'].replace(['Lady', 'Countess','Capt', 'Col',\
#                                     'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')

# # standardize titles
# df['Title'] = df['Title'].replace('Mlle', 'Miss')
# df['Title'] = df['Title'].replace('Ms', 'Miss')
# df['Title'] = df['Title'].replace('Mme', 'Mrs')

# # encode titles
# title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}

# df['Title'] = df['Title'].map(title_mapping)
# df['Title'] = df['Title'].fillna(0)

from src.preprocessing import add_derived_title, encode_title

df = add_derived_title(df)
df = encode_title(df)
# refactored! woohoo!

In [7]:
df['Sex'] = df['Sex'].map( {'female': 1, 'male': 0} ).astype(int)

In [8]:
# guess_ages = np.zeros((2,3))

# for i in range(0, 2):
#     for j in range(0, 3):
#         guess_df = df[(df['Sex'] == i) & \
#                               (df['Pclass'] == j+1)]['Age'].dropna()

#         # age_mean = guess_df.mean()
#         # age_std = guess_df.std()
#         # age_guess = rnd.uniform(age_mean - age_std, age_mean + age_std)

#         age_guess = guess_df.median()

#         # Convert random age float to nearest .5 age
#         guess_ages[i,j] = int( age_guess/0.5 + 0.5 ) * 0.5

# for i in range(0, 2):
#     for j in range(0, 3):
#         df.loc[ (df.Age.isnull()) & (df.Sex == i) & (df.Pclass == j+1),\
#                 'Age'] = guess_ages[i,j]

# df['Age'] = df['Age'].astype(int)

In [9]:
# [refactoring] Smells: Age is not an accurate name for this column + we're mutating the original dataframe
# df.loc[ df['Age'] <= 16, 'Age'] = 0
# df.loc[(df['Age'] > 16) & (df['Age'] <= 32), 'Age'] = 1
# df.loc[(df['Age'] > 32) & (df['Age'] <= 48), 'Age'] = 2
# df.loc[(df['Age'] > 48) & (df['Age'] <= 64), 'Age'] = 3

from src.preprocessing import categorize_column

df['AgeGroup'] = categorize_column(df['Age'], num_bins=5)

In [10]:
# [refactoring] Smell - 'FamilySize' is an intermediate column for deriving 'IsAlone'. We can reduce complexity for the reader/developer by encapsulating (not exposing) this implementation detail
# df['FamilySize'] = df['SibSp'] + df['Parch'] + 1

# df['IsAlone'] = 0
# df.loc[df['FamilySize'] == 1, 'IsAlone'] = 1

from src.preprocessing import add_is_alone_column

df = add_is_alone_column(df)

In [11]:
df = df.drop(['Parch', 'SibSp', 'Name', 'PassengerId', 'Ticket', 'Cabin'], axis=1)

In [12]:
df['AgeGroup*Class'] = df.AgeGroup * df.Pclass

In [13]:
# [code smell] fillna logic is scattered across multiple cells (this one and next one)
# freq_port = df.Embarked.dropna().mode()[0]
# df['Embarked_2'] = df['Embarked'].fillna(freq_port)

from src.preprocessing import impute_nans
df = impute_nans(df, categorical_columns=['Embarked'], continuous_columns=['Fare', 'Age'])

# encode column
df['Embarked'] = df['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int)


In [14]:
# df['Fare'] = df['Fare'].fillna(df['Fare'].dropna().median())

In [15]:
# [refactoring] dead code, because 'FareBand' is not used. we can remove the lines that use 'FareBand'
# df['FareBand'] = pd.qcut(df['Fare'], 4)

In [16]:
# [refactoring] code smells - magic numbers. should use the output of `pd.qcut(df['Fare'], 4)` directly in here.
# df.loc[ df['Fare'] <= 7.91, 'Fare'] = 0
# df.loc[(df['Fare'] > 7.91) & (df['Fare'] <= 14.454), 'Fare'] = 1
# df.loc[(df['Fare'] > 14.454) & (df['Fare'] <= 31), 'Fare']   = 2
# df.loc[ df['Fare'] > 31, 'Fare'] = 3
# df['Fare'] = df['Fare'].astype(int)

# df = df.drop(['FareBand'], axis=1)

# refactored. wahoo! our earlier refactoring for calculating AgeBand pays off now!
df['FareBand'] = categorize_column(df['Fare'], num_bins=4)

In [17]:
train_df = df[-df['Survived'].isna()]
test_df = df[df['Survived'].isna()]
test_df = test_df.drop('Survived', axis=1)

In [18]:
X_train = train_df.drop("Survived", axis=1)
Y_train = train_df["Survived"]
X_test  = test_df.copy()

In [19]:
# old way of training model and getting accuracy score
# logreg = LogisticRegression()
# logreg.fit(X_train, Y_train)
# Y_pred = logreg.predict(X_test)
# acc_log = round(logreg.score(X_train, Y_train) * 100, 2)
# acc_log

# refactored: extracted duplication (in training model and getting accuracy score) into a function
logreg, acc_log = train_model(LogisticRegression, X_train, Y_train, solver='lbfgs')
logreg, acc_log

accuracy (LogisticRegression): 81.03




(LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                    intercept_scaling=1, l1_ratio=None, max_iter=100,
                    multi_class='warn', n_jobs=None, penalty='l2',
                    random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                    warm_start=False), 81.03)

In [20]:
coeff_df = pd.DataFrame(train_df.columns.delete(0))
coeff_df.columns = ['Feature']
coeff_df["Correlation"] = pd.Series(logreg.coef_[0])

coeff_df.sort_values(by='Correlation', ascending=False)

Unnamed: 0,Feature,Correlation
4,Survived,2.140353
7,IsAlone,0.491005
5,Title,0.430124
1,Fare,0.293428
9,FareBand,0.167547
8,AgeGroup*Class,0.058457
2,Pclass,0.001548
0,Embarked,-0.024123
6,AgeGroup,-0.229339
3,Sex,-1.190982


In [21]:
_, acc_svc           = train_model(SVC, X_train, Y_train, gamma='auto')
_, acc_knn           = train_model(KNeighborsClassifier, X_train, Y_train, n_neighbors=3)
_, acc_gaussian      = train_model(GaussianNB, X_train, Y_train)
_, acc_perceptron    = train_model(Perceptron, X_train, Y_train)
_, acc_linear_svc    = train_model(LinearSVC, X_train, Y_train)
_, acc_sgd           = train_model(SGDClassifier, X_train, Y_train)
_, acc_decision_tree = train_model(DecisionTreeClassifier, X_train, Y_train)
_, acc_random_forest = train_model(RandomForestClassifier, X_train, Y_train, n_estimators=100)



accuracy (SVC): 90.57
accuracy (KNeighborsClassifier): 84.85
accuracy (GaussianNB): 76.32
accuracy (Perceptron): 59.15
accuracy (LinearSVC): 77.67
accuracy (SGDClassifier): 73.51
accuracy (DecisionTreeClassifier): 98.43
accuracy (RandomForestClassifier): 98.43


In [22]:
models = pd.DataFrame({
    'Model': ['Support Vector Machines', 'KNN', 'Logistic Regression', 
              'Random Forest', 'Naive Bayes', 'Perceptron', 
              'Stochastic Gradient Decent', 'Linear SVC', 
              'Decision Tree'],
    'Score': [acc_svc, acc_knn, acc_log, 
              acc_random_forest, acc_gaussian, acc_perceptron, 
              acc_sgd, acc_linear_svc, acc_decision_tree]})
models.sort_values(by='Score', ascending=False)

Unnamed: 0,Model,Score
3,Random Forest,98.43
8,Decision Tree,98.43
0,Support Vector Machines,90.57
1,KNN,84.85
2,Logistic Regression,81.03
7,Linear SVC,77.67
4,Naive Bayes,76.32
6,Stochastic Gradient Decent,73.51
5,Perceptron,59.15
