<a href="https://colab.research.google.com/github/dunefro/machine_learning/blob/main/kaggle/titanic/Titanic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data-preprocessing

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set() # setting seaborn as default mechanism for plots

## Importing the dataset

In [2]:
# Load the Kaggle Titanic training and test splits (paths are relative to the working directory)
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))



## Feature Extraction

In [3]:
# Preview the first rows of the training data to see the available columns
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# Count the missing values in each column of the training data
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [5]:
# Preview the first rows of the test data (same columns as train, without 'Survived')
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


## Extracting important features

### Handling missing data

In [6]:
## Combining both dataframes
# Stack train on top of test so imputation and encoding are applied to both in one pass.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0.
# test has no 'Survived' column, so its rows receive NaN in that column.
dataset = pd.concat([train, test], ignore_index=True)

In [7]:
# Replace missing ages with the mean age of the combined (train + test) data
from sklearn.impute import SimpleImputer

# From here on the data is handled as a plain object array addressed by column position
train_test_data = dataset.values

imputer = SimpleImputer(strategy='mean', missing_values=np.nan)

# Column 5 is Age; the 5:6 slice (rather than a bare 5) keeps it two-dimensional for the imputer
train_test_data[:, 5:6] = imputer.fit_transform(train_test_data[:, 5:6])

In [8]:
# Embarked is the last column of the array. Valid entries are strings, so a float in that
# position is a NaN placeholder. Those rows are dropped instead of imputed because they
# make up less than 1 % of the data.
embarked_missing = np.array([isinstance(row[-1], float) for row in train_test_data], dtype=bool)
index_list = np.flatnonzero(embarked_missing).tolist()

# Selecting rows with a boolean mask keeps the array two-dimensional
train_test_data = train_test_data[~embarked_missing]

### Removing Unnecessary columns

In [9]:
# Inspect the combined array; rows that came from test.csv show nan in the Survived position
train_test_data

array([[1, 0.0, 3, ..., 7.25, nan, 'S'],
       [2, 1.0, 1, ..., 71.2833, 'C85', 'C'],
       [3, 1.0, 3, ..., 7.925, nan, 'S'],
       ...,
       [1307, nan, 3, ..., 7.25, nan, 'S'],
       [1308, nan, 3, ..., 8.05, nan, 'S'],
       [1309, nan, 3, ..., 22.3583, nan, 'C']], dtype=object)

In [10]:
# Keep Survived, Pclass, Sex, Age, SibSp, Parch and Embarked (in that order);
# PassengerId, Name, Ticket, Fare and Cabin are discarded.
kept_columns = [1, 2, 4, 5, 6, 7, 11]
train_test_data = train_test_data[:, kept_columns]

In [11]:
# Check the first row to confirm which columns remain after the selection
print(train_test_data[0])

[0.0 3 'male' 22.0 1 0 'S']


### Encoding data

In [12]:
# One-hot encode the Embarked column (last position); every other column passes through unchanged
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

embarked_step = ('encoder', OneHotEncoder(), [-1])
ct = ColumnTransformer(transformers=[embarked_step], remainder='passthrough')

# The encoded indicator columns are placed in front of the passthrough columns
train_test_data = np.array(ct.fit_transform(train_test_data))
print(train_test_data[0])

# Drop the first indicator column: it is implied by the remaining two, so keeping it would be redundant
train_test_data = train_test_data[:, 1:]

[0.0 0.0 1.0 0.0 3 'male' 22.0 1 0]


In [13]:
# Show the array after one-hot encoding Embarked and dropping the first indicator column
print(train_test_data)

[[0.0 1.0 0.0 ... 22.0 1 0]
 [0.0 0.0 1.0 ... 38.0 1 0]
 [0.0 1.0 1.0 ... 26.0 0 0]
 ...
 [0.0 1.0 nan ... 38.5 0 0]
 [0.0 1.0 nan ... 29.881137667304014 0 0]
 [0.0 0.0 nan ... 29.881137667304014 1 1]]


In [14]:
# Turn the gender strings in column 4 into integer codes
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
le.fit(train_test_data[:, 4])
train_test_data[:, 4] = le.transform(train_test_data[:, 4])

In [15]:
# Show the array after label-encoding the gender column
print(train_test_data)

[[0.0 1.0 0.0 ... 22.0 1 0]
 [0.0 0.0 1.0 ... 38.0 1 0]
 [0.0 1.0 1.0 ... 26.0 0 0]
 ...
 [0.0 1.0 nan ... 38.5 0 0]
 [0.0 1.0 nan ... 29.881137667304014 0 0]
 [0.0 0.0 nan ... 29.881137667304014 1 1]]


## Separating the prediction dataset

In [16]:
# Rows that came from test.csv carry no label, so their Survived value (column 2) is NaN.
# Counting those NaNs gives the number of rows that belong to the prediction set.
survived_column = train_test_data[:, 2].astype(float)
count = int(np.isnan(survived_column).sum())

print(count)



418


In [17]:
# The labelled rows come first and the last `count` rows are the unlabelled prediction set
n_labelled = len(train_test_data) - count

# Every column except Survived (index 2) is a feature
feature_columns = [0, 1, 3, 4, 5, 6, 7]

y = train_test_data[:n_labelled, 2]
X = train_test_data[:n_labelled, feature_columns]
X_pred = train_test_data[n_labelled:, feature_columns]

In [18]:
# The labels are floats inside an object array; convert them to an integer array
y = y.astype(int)

In [19]:
# Inspect the feature matrix of the labelled rows
X

array([[0.0, 1.0, 3, ..., 22.0, 1, 0],
       [0.0, 0.0, 1, ..., 38.0, 1, 0],
       [0.0, 1.0, 3, ..., 26.0, 0, 0],
       ...,
       [0.0, 1.0, 3, ..., 29.881137667304014, 1, 2],
       [0.0, 0.0, 1, ..., 26.0, 0, 0],
       [1.0, 0.0, 3, ..., 32.0, 0, 0]], dtype=object)

In [20]:
# Inspect the feature matrix of the unlabelled rows that will be predicted
X_pred

array([[1.0, 0.0, 3, ..., 34.5, 0, 0],
       [0.0, 1.0, 3, ..., 47.0, 1, 0],
       [1.0, 0.0, 2, ..., 62.0, 0, 0],
       ...,
       [0.0, 1.0, 3, ..., 38.5, 0, 0],
       [0.0, 1.0, 3, ..., 29.881137667304014, 0, 0],
       [0.0, 0.0, 3, ..., 29.881137667304014, 1, 1]], dtype=object)

## Feature Scaling

In [21]:
from sklearn.preprocessing import StandardScaler

# Columns 2 and 4 of X hold Pclass and Age; the other columns are 0/1 indicators or small counts
scaled_columns = [2, 4]

# `sc` keeps the fitted statistics so the same scaling can later be applied to X_pred
sc = StandardScaler()
X[:, scaled_columns] = sc.fit_transform(X[:, scaled_columns])
# NOTE(review): the scaler is fitted on all labelled rows before they are split into
# train and test, so the held-out rows influence the scaling statistics.

# Training different models on the training set

## Splitting the labelled data into train and hold-out test sets to compare models

In [22]:
from sklearn.model_selection import train_test_split

# Hold out 20 % of the labelled rows for comparing models; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [23]:
# Confirm the number of training rows and feature columns
X_train.shape

(711, 7)

In [24]:
# Inspect the training labels (0/1 integers)
y_train

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0,
       0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1,
       1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,

## Logistic Regression

In [25]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression classifier on the training split
lr = LogisticRegression(random_state=0)
lr.fit(X=X_train, y=y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=0, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

### Predicting the test set using logistic regression

In [26]:
# Predict labels for the held-out rows with the logistic regression model
y_pred_lr = lr.predict(X=X_test)

In [48]:
# Inspect the logistic regression predictions for the held-out rows
y_pred_lr

array([1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0,
       0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1,
       0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0,
       1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1,
       0, 0])

### Accuracy score

In [27]:
from sklearn.metrics import accuracy_score

# Fraction of held-out rows that logistic regression classified correctly
print(accuracy_score(y_true=y_test, y_pred=y_pred_lr))

0.8426966292134831


## K-NN

In [28]:
from sklearn.neighbors import KNeighborsClassifier

# Fit a k-nearest-neighbours classifier with the library's default settings
knnc = KNeighborsClassifier()
knnc.fit(X=X_train, y=y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

### Predicting the test set using K-NN classification

In [29]:
# Predict labels for the held-out rows with the K-NN model
y_pred_knn = knnc.predict(X=X_test)

In [47]:
# Inspect the K-NN predictions for the held-out rows
y_pred_knn

array([1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1,
       0, 0])

### Accuracy score

In [30]:
# Fraction of held-out rows that K-NN classified correctly
print(accuracy_score(y_true=y_test, y_pred=y_pred_knn))


0.7865168539325843


## SVM

In [31]:
from sklearn.svm import SVC

# Fit a support vector classifier with a linear kernel
svc = SVC(random_state=0, kernel='linear')
svc.fit(X=X_train, y=y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=0, shrinking=True, tol=0.001,
    verbose=False)

In [32]:
# Predict labels for the held-out rows with the linear SVM
y_pred_svm = svc.predict(X=X_test)

In [33]:
# Fraction of held-out rows that the linear SVM classified correctly
print(accuracy_score(y_true=y_test, y_pred=y_pred_svm))

0.8314606741573034


## Kernel SVM

In [34]:
from sklearn.svm import SVC

# Fit a support vector classifier with an RBF kernel
ksvc = SVC(random_state=0, kernel='rbf')
ksvc.fit(X=X_train, y=y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=0, shrinking=True, tol=0.001,
    verbose=False)

In [35]:
# Predict labels for the held-out rows with the RBF-kernel SVM
y_pred_ksvc = ksvc.predict(X=X_test)

In [36]:
# Fraction of held-out rows that the RBF-kernel SVM classified correctly
print(accuracy_score(y_true=y_test, y_pred=y_pred_ksvc))

0.8426966292134831


## Random Forest Classifier (RFC)

In [37]:
from sklearn.ensemble import RandomForestClassifier

# Fit a random forest of 10 trees that uses entropy as the split criterion
classifier = RandomForestClassifier(criterion='entropy', n_estimators=10, random_state=0)
classifier.fit(X=X_train, y=y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [38]:
# Predict the held-out rows with the random forest and report the fraction classified correctly
y_pred_rfc = classifier.predict(X=X_test)
print(accuracy_score(y_true=y_test, y_pred=y_pred_rfc))

0.8089887640449438


## Naive-Bayes

In [41]:
from sklearn.naive_bayes import GaussianNB

# Fit a Gaussian naive Bayes classifier with the library's default settings
nbc = GaussianNB()
nbc.fit(X=X_train, y=y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [42]:
# Predict the held-out rows with naive Bayes and report the fraction classified correctly
y_pred_nb = nbc.predict(X=X_test)
print(accuracy_score(y_true=y_test, y_pred=y_pred_nb))

0.8033707865168539


## Decision Tree Classifier (DTC)

In [43]:
from sklearn.tree import DecisionTreeClassifier

# Fit a single decision tree that uses entropy as the split criterion
dtc = DecisionTreeClassifier(criterion='entropy', random_state=0)
dtc.fit(X=X_train, y=y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=0, splitter='best')

In [44]:
# Predict the held-out rows with the decision tree and report the fraction classified correctly
y_pred_dtc = dtc.predict(X=X_test)
print(accuracy_score(y_true=y_test, y_pred=y_pred_dtc))

0.7640449438202247


In [46]:
# Inspect the prediction features before the scaler is applied to them
X_pred

array([[1.0, 0.0, 3, ..., 34.5, 0, 0],
       [0.0, 1.0, 3, ..., 47.0, 1, 0],
       [1.0, 0.0, 2, ..., 62.0, 0, 0],
       ...,
       [0.0, 1.0, 3, ..., 38.5, 0, 0],
       [0.0, 1.0, 3, ..., 29.881137667304014, 0, 0],
       [0.0, 0.0, 3, ..., 29.881137667304014, 1, 1]], dtype=object)

In [52]:
# Rebuild X_pred from the unscaled rows of train_test_data before scaling, so that
# re-running this cell cannot apply the scaler to values that are already scaled.
# This is the same slice that first defined X_pred: the unlabelled rows, every column except Survived.
X_pred = train_test_data[(len(train_test_data)-count) : ,  [0,1,3, 4, 5, 6, 7] ]
# Apply the scaler that was fitted on the labelled features (columns 2 and 4: Pclass and Age)
X_pred[: , [2,4]] = sc.transform(X_pred[ : , [2,4]])

In [60]:
# Write the logistic regression predictions for X_pred, one label per line.
# The file is opened with 'w' rather than 'a' so that re-running this cell replaces
# the previous contents instead of appending a second copy of the predictions.
with open('newfile.txt','w') as f:
  f.writelines(str(label)+"\n" for label in lr.predict(X_pred))