**Step 1) Importing and Instantiating Classes**

In [88]:
pip install sweetviz

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Applications/Xcode.app/Contents/Developer/usr/bin/python3 -m pip install --upgrade pip' command.
Note: you may need to restart the kernel to use updated packages.


In [89]:
from utils import open_csv_file, analyze_dataframe, create_report

from correlation_matrix import Correlation
from data import DataFrame
from machine_learning import MachineLearning
from sklearn import metrics, svm
from sklearn.metrics import classification_report
import pandas as pd

import numpy as np

from graph import Graph
import seaborn as sns

# Read train.csv through the project's open_csv_file helper and wrap the
# result in the project's DataFrame class used by every later step.
training_data = open_csv_file('train.csv')
data_frame_instance = DataFrame(training_data)

# Summarise the columns (counts and names of categorical / numerical columns,
# as shown in the printed output) and print the summary under a heading.
analysis_result = analyze_dataframe(data_frame_instance)
print("Analysis Result:", analysis_result, sep="\n")

Analysis Result:
{'total_categorical_columns': 3, 'total_numerical_columns': 7, 'categorical_column_names': ['Name', 'Sex', 'Ticket'], 'numerical_column_names': ['PassengerId', 'Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']}


**Step 2) Inspecting and Cleaning Data**

In [90]:
# Replace missing values with median
# Applied to the 'Age' column through the project DataFrame helper; 'Age' is
# bucketed in a later step and the raw column is then removed.
data_frame_instance.replace_missing_value_with_median('Age')


**Step 3) Transforming Data**

In [91]:
# Create column for last name
# Derived from the 'Name' column. The following cells operate on a
# 'Last_name' column, presumably the one produced here (confirm in the
# project DataFrame helper).
data_frame_instance.create_last_name_column('Name')


In [92]:
# Label-encode the surname column; the next cell scales the resulting
# 'Last_name_encoded' column.
data_frame_instance.label_encoding('Last_name')

# One-hot encode the remaining categorical columns, in the same order as before.
for categorical_column in ('Embarked', 'Sex'):
    data_frame_instance.hot_encoding(categorical_column)

In [93]:
# Min-max scale the label-encoded surname column. A 'Last_name_encoded_scaled'
# column appears in the correlation table further down, presumably created by
# this call (confirm in the project DataFrame helper).
data_frame_instance.min_max_scaler('Last_name_encoded')


In [94]:
# Inspect the 'quantile' bin edges for 'Fare' before bucketing; the returned
# array is shown as this cell's output.
data_frame_instance.get_bin_edges('Fare','quantile')

array([array([  0.    ,   7.8542,  10.5   ,  21.6792,  39.6875, 512.3292])],
      dtype=object)

In [95]:
# Bucket the continuous columns with the project's quantile bucketing helper,
# 'Age' first and then 'Fare'. 'Age_bucket' and 'Fare_bucket' columns appear
# in the correlation table further down.
for continuous_column in ('Age', 'Fare'):
    data_frame_instance.bucket_quantile(continuous_column)



In [96]:
# Identifier, free-text and raw source columns that are not passed on to the
# feature-selection and modelling steps.
columns_to_drop = [
    'PassengerId',
    'Name',
    'Cabin',
    'Age',
    'Fare',
    'Ticket',
    'Last_name',
]
data_frame_instance.remove_columns(columns_to_drop)

**Step 4) Feature Selection**

In [97]:
# Feature selection: absolute pairwise correlations between the prepared columns.
corr_mat_type_two = data_frame_instance.get_df().corr().abs()

# Boolean mask that is True on the diagonal and above it, built from an
# all-True matrix with the same shape as the correlation matrix.
mask = np.triu(np.ones(corr_mat_type_two.shape, dtype=bool))

# Blank out the masked cells so each pair of columns is shown once
# (strictly lower triangle), then display the result.
mask_df = corr_mat_type_two.mask(mask)
mask_df

Unnamed: 0,Survived,Pclass,SibSp,Parch,Embarked_C,Embarked_Q,Embarked_S,Embarked_nan,Sex_female,Sex_male,Last_name_encoded_scaled,Age_bucket,Fare_bucket
Survived,,,,,,,,,,,,,
Pclass,0.338481,,,,,,,,,,,,
SibSp,0.035322,0.083081,,,,,,,,,,,
Parch,0.081629,0.018443,0.414838,,,,,,,,,,
Embarked_C,0.16824,0.243292,0.059528,0.011069,,,,,,,,,
Embarked_Q,0.00365,0.221009,0.026354,0.081228,0.148258,,,,,,,,
Embarked_S,0.15566,0.08172,0.070941,0.063036,0.778359,0.496624,,,,,,,
Embarked_nan,0.060095,0.074282,0.022508,0.022467,0.022864,0.014588,0.076588,,,,,,
Sex_female,0.543351,0.1319,0.114631,0.245489,0.082853,0.074115,0.125722,0.064296,,,,,
Sex_male,0.543351,0.1319,0.114631,0.245489,0.082853,0.074115,0.125722,0.064296,1.0,,,,


**Step 5) Machine Learning and Evaluation**

In [98]:
# Split the prepared frame into features and target by column NAME rather than
# by position, so the selection stays correct if the number or order of the
# encoded columns changes (e.g. a different set of one-hot columns).
TARGET_COLUMN = 'Survived'
prepared_df = data_frame_instance.get_df()
data_to_use = prepared_df.drop(columns=[TARGET_COLUMN])
data_to_target = prepared_df[TARGET_COLUMN]

ml = MachineLearning(data_to_use, data_to_target)

# Split the data
ml.split_data()

# Fit the KNN classifier
ml.fit_knn_classification()

# Kept as a notebook-level name in case other cells rely on it.
X_test = ml.X_test

# Predict with the fitted KNN model and run the project-defined evaluation.
y_pred_knn = ml.predict_model(ml.knn_model)

ml.evaluate_model(y_pred_knn)

print('Accuracy', metrics.accuracy_score(ml.y_test, y_pred_knn))



Accuracy 0.7910447761194029




In [99]:
# Fit the three decision-tree variants (gini, depth, entropy) in their
# original order; each call is a project-defined fit on `ml`.
for fit_tree in (
    ml.decision_tree_gini_classifier,
    ml.decision_tree_depth_classifier,
    ml.decision_tree_entropy_classifier,
):
    fit_tree()


In [100]:
# Predict with each decision-tree model stored on `ml`, keeping the
# gini -> depth -> entropy order of the original cell.
y_pred_gini, y_pred_depth, y_pred_entropy = (
    ml.predict_model(tree_model)
    for tree_model in (
        ml.decision_tree_gini_model,
        ml.decision_tree_depth_model,
        ml.decision_tree_entropy_model,
    )
)


In [101]:
# Project-defined evaluation of the gini tree's predictions, followed by the
# accuracy against ml.y_test.
ml.evaluate_model(y_pred_gini)

gini_accuracy = metrics.accuracy_score(ml.y_test, y_pred_gini)
print('Accuracy', gini_accuracy)

Accuracy 0.75




In [102]:
# Project-defined evaluation of the depth tree's predictions, followed by the
# accuracy against ml.y_test.
ml.evaluate_model(y_pred_depth)

depth_accuracy = metrics.accuracy_score(ml.y_test, y_pred_depth)
print('Accuracy', depth_accuracy)

Accuracy 0.7761194029850746




In [103]:
# Project-defined evaluation of the entropy tree's predictions, followed by
# the accuracy against ml.y_test.
ml.evaluate_model(y_pred_entropy)

entropy_accuracy = metrics.accuracy_score(ml.y_test, y_pred_entropy)
print('Accuracy', entropy_accuracy)

Accuracy 0.7313432835820896




In [104]:
# Fit the three SVM variants (linear, sigmoid, rbf) in their original order.
for fit_svm in (
    ml.svm_clf_linear_classifier,
    ml.svm_clf_sigmoid_classifier,
    ml.svm_clf_rbf_classifier,
):
    fit_svm()

# Predict with each SVM model stored on `ml`, again linear -> sigmoid -> rbf.
y_pred_linear, y_pred_sigmoid, y_pred_rbf = (
    ml.predict_model(svm_model)
    for svm_model in (
        ml.svm_clf_linear_model,
        ml.svm_clf_sigmoid_model,
        ml.svm_clf_rbf_model,
    )
)

In [105]:
# Project-defined evaluation of the linear SVM's predictions, followed by the
# accuracy against ml.y_test.
ml.evaluate_model(y_pred_linear)

linear_accuracy = metrics.accuracy_score(ml.y_test, y_pred_linear)
print('Accuracy', linear_accuracy)

Accuracy 0.7873134328358209




In [106]:
# Project-defined evaluation of the sigmoid SVM's predictions, followed by the
# accuracy against ml.y_test.
ml.evaluate_model(y_pred_sigmoid)

sigmoid_accuracy = metrics.accuracy_score(ml.y_test, y_pred_sigmoid)
print('Accuracy', sigmoid_accuracy)

Accuracy 0.6194029850746269




In [107]:
# Project-defined evaluation of the rbf SVM's predictions, followed by the
# accuracy against ml.y_test.
ml.evaluate_model(y_pred_rbf)

rbf_accuracy = metrics.accuracy_score(ml.y_test, y_pred_rbf)
print('Accuracy', rbf_accuracy)

Accuracy 0.7873134328358209




In [108]:
# Build the KNN classification report once and reuse it for both views,
# instead of recomputing the same metrics with identical arguments.
knn_report = classification_report(ml.y_test, y_pred_knn, output_dict=True)

# Raw dict first, then the same data rendered as a table (cell output).
print(knn_report)
pd.DataFrame(knn_report)

{'0': {'precision': 0.8375, 'recall': 0.8170731707317073, 'f1-score': 0.8271604938271605, 'support': 164.0}, '1': {'precision': 0.7222222222222222, 'recall': 0.75, 'f1-score': 0.7358490566037735, 'support': 104.0}, 'accuracy': 0.7910447761194029, 'macro avg': {'precision': 0.7798611111111111, 'recall': 0.7835365853658536, 'f1-score': 0.781504775215467, 'support': 268.0}, 'weighted avg': {'precision': 0.7927653399668325, 'recall': 0.7910447761194029, 'f1-score': 0.7917262047553985, 'support': 268.0}}


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.8375,0.722222,0.791045,0.779861,0.792765
recall,0.817073,0.75,0.791045,0.783537,0.791045
f1-score,0.82716,0.735849,0.791045,0.781505,0.791726
support,164.0,104.0,0.791045,268.0,268.0
