# Building Predictive Models

In [227]:
# importing the necessary libraries...
import pandas as pd
import os
import numpy as np

In [228]:
# set the data path for further use...
processed_data_path = os.path.join(os.path.pardir, "data", "processed")
train_file_path = os.path.join(processed_data_path, "train.csv")
test_file_path = os.path.join(processed_data_path, "test.csv")

In [229]:
# creating the two processed data sets
train_df = pd.read_csv(train_file_path, index_col = "PassengerId")
test_df = pd.read_csv(test_file_path, index_col = "PassengerId")

In [230]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 33 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Survived            891 non-null    int64  
 1   Age                 891 non-null    float64
 2   Fare                891 non-null    float64
 3   FamilySize          891 non-null    int64  
 4   IsMother            891 non-null    int64  
 5   IsMale              891 non-null    int64  
 6   Deck_A              891 non-null    int64  
 7   Deck_B              891 non-null    int64  
 8   Deck_C              891 non-null    int64  
 9   Deck_D              891 non-null    int64  
 10  Deck_E              891 non-null    int64  
 11  Deck_F              891 non-null    int64  
 12  Deck_G              891 non-null    int64  
 13  Deck_Z              891 non-null    int64  
 14  Pclass_1            891 non-null    int64  
 15  Pclass_2            891 non-null    int64  
 16  Pclass_3

In [231]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 892 to 1309
Data columns (total 32 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Age                 418 non-null    float64
 1   Fare                418 non-null    float64
 2   FamilySize          418 non-null    int64  
 3   IsMother            418 non-null    int64  
 4   IsMale              418 non-null    int64  
 5   Deck_A              418 non-null    int64  
 6   Deck_B              418 non-null    int64  
 7   Deck_C              418 non-null    int64  
 8   Deck_D              418 non-null    int64  
 9   Deck_E              418 non-null    int64  
 10  Deck_F              418 non-null    int64  
 11  Deck_G              418 non-null    int64  
 12  Deck_Z              418 non-null    int64  
 13  Pclass_1            418 non-null    int64  
 14  Pclass_2            418 non-null    int64  
 15  Pclass_3            418 non-null    int64  
 16  Title

In [232]:
# creating the matrices with numerical data for models with to_numpy()
# ravel() creates a flattened one-d array
X = train_df.loc[:, "Age":].to_numpy().astype("float")
y = train_df["Survived"].ravel()

In [233]:
print(X.shape)
print(y.shape)


(891, 32)
(891,)


In [387]:
# train test splitting (cross validation)
# using scikit
from sklearn.model_selection import train_test_split

# test_size = 0.2 tells us that validation occurs with 20% of the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0) 

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(668, 32)
(668,)
(223, 32)
(223,)


In [235]:
# average survival in train and test data
print(f"The mean survival in train : {np.mean(y_train)}")
print(f"The mean survival in test : {np.mean(y_test)}")


The mean survival in train : 0.3884430176565008
The mean survival in test : 0.373134328358209


### Check Scikit learn version

In [236]:
import sklearn

In [237]:
sklearn.__version__

'0.22.1'

## Baseline model

In [168]:
# importing the functions
from sklearn.dummy import DummyClassifier

In [169]:
# create model
model_dummy = DummyClassifier(strategy = "most_frequent", random_state = 0)

In [170]:
# training the model
model_dummy.fit(X_train, y_train)

DummyClassifier(constant=None, random_state=0, strategy='most_frequent')

In [171]:
# using validation and getting the score of the model...
print(f"The score for the baseline validation values is {model_dummy.score(X_test, y_test)}")

The score for the baseline validation values is 0.6083916083916084


In [172]:
# performance metrics
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score

In [173]:
# accuracy score
print(f"The accuracy score is {accuracy_score(y_test, model_dummy.predict(X_test))}")

The accuracy score is 0.6083916083916084


In [174]:
# confusion matrix
print(f"The confusion matrix is \n {confusion_matrix(y_test, model_dummy.predict(X_test))}")

The confusion matrix is 
 [[87  0]
 [56  0]]


In [175]:
# precision score
print(f"The precision score is {precision_score(y_test, model_dummy.predict(X_test))}")

# recall score
print(f"The recall score is {recall_score(y_test, model_dummy.predict(X_test))}")

The precision score is 0.0
The recall score is 0.0


  _warn_prf(average, modifier, msg_start, len(result))


### First Kaggle Submission

In [176]:
# converting into a matrix
test_X = test_df.to_numpy().astype("float")

In [177]:
# getting the predictions
predictions = model_dummy.predict(test_X)

In [178]:
# creating the submissions
df_submission = pd.DataFrame({"PassengerId" : test_df.index, "Survived" : predictions})

In [179]:
df_submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0


In [180]:
# creating the data path for the submissions
submissions_data_path = os.path.join(os.path.pardir, "data", "external")
submissions_file_path = os.path.join(submissions_data_path, "01_dummy.csv")

In [181]:
# writing to the file...
df_submission.to_csv(submissions_file_path, index = False)

In [182]:
def get_submission_file(model, filename):
    
    # converting to a matrix notation
    test_X = test_df.to_numpy().astype("float")
    
    # make predictions
    predictions = model.predict(test_X)
    
    #submissions data frame
    df_submission = pd.DataFrame({"PassengerId" : test_df.index, "Survived" : predictions})
    
    # creating the data path for the submissions
    submissions_data_path = os.path.join(os.path.pardir, "data", "external")
    submissions_file_path = os.path.join(submissions_data_path, filename)
    
    # writing to the file...
    df_submission.to_csv(submissions_file_path, index = False)

In [183]:
# get the submission file
get_submission_file(model_dummy, "01_dummy.csv")

# Logistic Regression model

In [204]:
# import functions
from sklearn.linear_model import LogisticRegression

In [205]:
# create the model
model_lr_1 = LogisticRegression(random_state = 0)

In [206]:
# train the model
model_lr_1.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=0, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [207]:
# evaluate the model score
print(f"The score for the model is {model_lr_1.score(X_test, y_test)}")

The score for the model is 0.8461538461538461


In [208]:
# model coefficients
model_lr_1.coef_

array([[-0.0286439 ,  0.00402935, -0.49196778,  0.35088369, -0.77652067,
         0.12623432,  0.02144286, -0.27613352,  0.40547212,  0.84751814,
         0.18776659, -0.00555472, -0.24467608,  0.89707008,  0.51474192,
        -0.34974228,  0.07474945,  0.87690073,  0.56438787, -1.45676181,
         1.19945307, -0.06861235, -0.12804724,  0.1663737 ,  0.11290373,
         0.3011729 ,  0.48161939,  0.54199083,  0.36054634,  0.15953255,
         0.2638616 ,  0.79820812]])

### Second Kaggle submission

In [209]:
# get the submission file...
get_submission_file(model_lr_1, "01_lr.csv")
# second submission completed.

## Optimizing the model

In [210]:
# base model
model_lr = LogisticRegression(random_state = 0)

In [211]:
# getting the grid search
from sklearn.model_selection import GridSearchCV

In [212]:
# giving the hyper-parameters some range of values in order to optimise the model...
parameters = {"C" : [1.0, 10.0, 50.0, 100.0, 1000,0], "penalty" : ["l1", "l2"]}

# the gridsearchcv() object, i.e clf is created
clf = GridSearchCV(model_lr, param_grid = parameters, cv = 3)

In [213]:
clf.fit(X_train, y_train)

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative so

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

ZeroDivisionError: float division by zero

ZeroDivisionError: float division by zero

ZeroDivisionError: float division by zero

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


GridSearchCV(cv=3, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=0, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [1.0, 10.0, 50.0, 100.0, 1000, 0],
                         'penalty': ['l1', 'l2']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [214]:
# we now obtain the best values for the parameters
clf.best_params_

{'C': 50.0, 'penalty': 'l2'}

In [215]:
# now, the best score
print(f"The best score : {clf.best_score_}")

The best score : 0.8302222222222223


In [216]:
# getting the score for our test data(partitioned from the original training data)
print(f"The score for logistic regression model-2 is : {clf.score(X_test, y_test)}")

The score for logistic regression model-2 is : 0.8461538461538461


### Third kaggle submission

In [217]:
# get submission file
#get_submission_file(clf, "02_lr.csv")

### Feature Normalisation and Standardisation

In [238]:
# getting the necessary functions
from sklearn.preprocessing import MinMaxScaler, StandardScaler

#### Feature Normalisation

In [239]:
# we scale all the values to (0.0, 1.0) range
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)

In [240]:
X_train_scaled[:, 0].min(), X_train_scaled[:, 0].max()

(0.0, 1.0)

In [241]:
#normalise the test data
X_test_scaled = scaler.transform(X_test)

#### Feature standardisation

In [242]:
# feature standardisation
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

#### Create model after standardisation

In [223]:
# base model
model_lr = LogisticRegression(random_state = 0)
parameters = {"C" : [1.0, 10.0, 50.0, 100.0, 1000,0], "penalty" : ["l1", "l2"]}
clf = GridSearchCV(model_lr, param_grid = parameters, cv = 5)
clf.fit(X_train_scaled, y_train)

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

ZeroDivisionError: float division by zero

ZeroDivisionError: float division by zero

ZeroDivisionError: float division by zero

ZeroDivisionError: float division by zero

ZeroDivisionError: float division by zero



GridSearchCV(cv=5, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=0, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [1.0, 10.0, 50.0, 100.0, 1000, 0],
                         'penalty': ['l1', 'l2']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [224]:
# getting the best scores corresponding to the parameter tweaking
clf.best_score_

0.8195704697986577

In [225]:
# now we get the value for the test score...
print(f"The test score : {clf.score(X_test_scaled, y_test)}")

The test score : 0.8601398601398601


In [226]:
#get_submission_file(clf, "03_lr.csv")

# Random Forest Classifier

In [388]:
# this works using a bunch of decision trees and taking a majority vote on the class they have predicted...

from sklearn.ensemble import RandomForestClassifier

In [428]:
# we wanna now create the model
ran_clf = RandomForestClassifier(n_estimators = 100, random_state = 0)

# now we wanna train the model
ran_clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=40,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [433]:
# we have 100 decision trees inside the forest having bootstrapped data in each of them
# we can tweak the number of decision trees to get better predictions
ran_clf.estimators_

[DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=None, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=209652396, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=None, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=398764591, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=None, ma

In [431]:
# now we wanna score it...
print(ran_clf.score(X_test, y_test))

0.8565022421524664


In [432]:
get_submission_file(ran_clf, "random_forest_1.csv")