In [42]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import r2_score, mean_squared_error

In [2]:
df = pd.read_csv("./data/train.csv")

In [3]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [5]:
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [6]:
df.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [7]:
for col in df.columns:
    print(col, len(set(df[col])))

PassengerId 891
Survived 2
Pclass 3
Name 891
Sex 2
Age 265
SibSp 7
Parch 7
Ticket 681
Fare 248
Cabin 148
Embarked 4


In [8]:
df['Embarked'].unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [9]:
def formatting_embarked(df):
    dic_embarked = {'S':1, 'C':2, 'Q':3}
    try:
        for e in dic_embarked.keys():
            df.loc[df['Embarked']==e, "Embarked"]=dic_embarked[e]
        df['Embarked'] = df['Embarked'].astype('float')
        return df
    except NameError:
        raise("Embarked is not in df.columns")

In [10]:
df_format = formatting_embarked(df)

df_format.Embarked.unique()

array([ 1.,  2.,  3., nan])

In [11]:
df_format['Sex'].unique()

array(['male', 'female'], dtype=object)

In [12]:
def formatting_sex(df):
    dic_sex = {'male':1, 'female':2}
    try:
        for e in dic_sex.keys():
            df.loc[df['Sex']==e, "Sex"] = dic_sex[e]
        df['Sex']=df['Sex'].astype('float')
        return df
    except NameError:
        raise('Sex is not in df.columns')

In [13]:
df_format = formatting_sex(df_format)

In [14]:
df_format['Sex'].unique()

array([1., 2.])

In [15]:
df_format.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex            float64
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked       float64
dtype: object

In [16]:
df_format.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [17]:
df_format.shape

(891, 12)

In [18]:
df_format = df_format.drop(['Cabin', 'Name', 'Ticket'], axis=1)

In [19]:
age_mean = round(float(np.mean(df_format[['Age']])),2)

def formatting_age(df):
    try:
        df = df.fillna(age_mean)
        return df
    except:
        raise NameError("df_columns does not contain 'Age'")

In [20]:
df_format = formatting_age(df_format)

In [21]:
df_format['Age'].isnull().sum()

0

In [22]:
print(sorted(list(df_format['Age'].unique())))

[0.42, 0.67, 0.75, 0.83, 0.92, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 14.5, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 20.5, 21.0, 22.0, 23.0, 23.5, 24.0, 24.5, 25.0, 26.0, 27.0, 28.0, 28.5, 29.0, 29.7, 30.0, 30.5, 31.0, 32.0, 32.5, 33.0, 34.0, 34.5, 35.0, 36.0, 36.5, 37.0, 38.0, 39.0, 40.0, 40.5, 41.0, 42.0, 43.0, 44.0, 45.0, 45.5, 46.0, 47.0, 48.0, 49.0, 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 55.5, 56.0, 57.0, 58.0, 59.0, 60.0, 61.0, 62.0, 63.0, 64.0, 65.0, 66.0, 70.0, 70.5, 71.0, 74.0, 80.0]


In [23]:
df_format.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked       0
dtype: int64

In [24]:
df_format = df_format[df_format['Embarked'].isnull()==False]

In [25]:
df_format.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Sex            float64
Age            float64
SibSp            int64
Parch            int64
Fare           float64
Embarked       float64
dtype: object

In [26]:
df_format['Embarked'].unique()

array([ 1. ,  2. ,  3. , 29.7])

In [27]:
df_format.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Sex            float64
Age            float64
SibSp            int64
Parch            int64
Fare           float64
Embarked       float64
dtype: object

In [28]:
df_format.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked       0
dtype: int64

In [59]:
x_cols = list(df_format.columns)

x_cols.remove('Survived')

X = df_format[x_cols]

y = df_format[['Survived']]

In [60]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

In [62]:
logReg = LogisticRegression()
tree = DecisionTreeClassifier()
randFor = RandomForestClassifier()

models = [logReg, tree, randFor]

for mod in models:
    mod.fit(X_train, y_train)

  return f(*args, **kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  mod.fit(X_train, y_train)


In [87]:
def evaluate_model(model):
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    acc = accuracy_score()
    
    print('   MSE: {}\n   RMSE: {}'.format(mse, rmse))

In [68]:
dict_models = {'LogisticRegression':logReg, 'TreeClassifier':tree, 'Random Forest Classifier':randFor}

for e in dict_models.keys():
    print(e)
    evaluate_model(dict_models[e])

LogisticRegression
   MSE: 0.19029850746268656
   RMSE: 0.436232171512701
TreeClassifier
   MSE: 0.27238805970149255
   RMSE: 0.5219080950718168
Random Forest Classifier
   MSE: 0.19402985074626866
   RMSE: 0.44048819592160315


In [93]:
test_df = pd.read_csv('./data/test.csv')

In [52]:
def cleaning_dataset(df):
    df = formatting_embarked(df)
    df = formatting_age(df)
    df = formatting_sex(df)
    
    df = df.drop(['Cabin', 'Name', 'Ticket'], axis=1)
    df = df[df['Embarked'].isnull()==False]
    
    return df

In [136]:
cleant_test_df = cleaning_dataset(test_df)

In [118]:
print(X.columns, len(X.columns))
print(cleant_test_df.columns, len(cleant_test_df.columns))

Index(['PassengerId', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare',
       'Embarked'],
      dtype='object') 8
Index(['PassengerId', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare',
       'Embarked'],
      dtype='object') 8


In [137]:
import os
import shutil

if os.path.exists('predictions')==False:
    os.mkdir('./predictions')
def save_predictions(model, df):
    filename = e+'_pred.csv'
    path = filename
    new_path = './predictions/'+path
    
    pred = dict_models[e].predict(df)
    df['Survived'] = pred
    
    submission = df[['PassengerId', 'Survived']].set_index(df['PassengerId']).drop(['PassengerId'], axis=1)
    submission.to_csv(new_path)
    print('Loaded '+filename)
    display(submission.head())
    
    
for e in dict_models.keys():
    save_predictions(e, cleant_test_df)
    cleant_test_df = cleant_test_df.drop(['Survived'], axis = 1)

Loaded LogisticRegression_pred.csv


Unnamed: 0_level_0,Survived
PassengerId,Unnamed: 1_level_1
892,0
893,0
894,0
895,0
896,1


Loaded TreeClassifier_pred.csv


Unnamed: 0_level_0,Survived
PassengerId,Unnamed: 1_level_1
892,0
893,0
894,0
895,0
896,0


Loaded Random Forest Classifier_pred.csv


Unnamed: 0_level_0,Survived
PassengerId,Unnamed: 1_level_1
892,0
893,0
894,0
895,0
896,0
