# Principal Component Analysis

In [1]:
import numpy as np
import pandas as pd

from matplotlib import pyplot as plt 
import seaborn as sns

import cufflinks as cf

In [2]:
cf.go_offline()

In [3]:
titanic = sns.load_dataset('titanic')

In [19]:
titanic.isnull().sum()

survived       0
pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           0
embarked       0
class          0
who            0
adult_male     0
embark_town    0
alive          0
alone          0
dtype: int64

In [18]:
#titanic.drop(columns=['deck'], inplace=True)
#titanic.age = titanic.age.fillna(titanic.age.mean())
#titanic.dropna(inplace=True)

In [20]:
unlabelled  = titanic.drop(['survived', 'alive'], axis=1)

In [21]:
unlabelled = pd.get_dummies(unlabelled, drop_first=True)

In [22]:
from sklearn.preprocessing import StandardScaler

In [23]:
scaler = StandardScaler()

In [24]:
scaled = scaler.fit(unlabelled)

In [25]:
scaled = scaler.transform(unlabelled)

In [26]:
xscaled = pd.DataFrame(scaled)

In [27]:
from sklearn.decomposition import PCA

In [28]:
pca = PCA(n_components=3)

In [29]:
pca.fit(scaled)

PCA(n_components=3)

In [30]:
eigen = pca.transform(scaled)

In [31]:
xeigen = pd.DataFrame(eigen)

In [32]:
#xeigen

In [33]:
pd.concat([xeigen, titanic.survived, titanic.alive], axis=1).iplot(kind='scatter3d', x=0, y=1, z=2, categories='alive')

In [34]:
from sklearn.model_selection import train_test_split

In [35]:
feat_train, feat_test, lab_train, lab_test = train_test_split(xeigen, titanic.survived, test_size=0.2, random_state=101)

In [36]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

In [37]:
knn = KNeighborsClassifier(n_neighbors=7)
knn.fit(feat_train, lab_train)
kpred = knn.predict(feat_test)

In [38]:
dtc = DecisionTreeClassifier()
dtc.fit(feat_train, lab_train)
dpred = dtc.predict(feat_test)

In [39]:
svc = SVC()
svc.fit(feat_train, lab_train)
spred = svc.predict(feat_test)

In [40]:
rfc = RandomForestClassifier(n_estimators=500)
rfc.fit(feat_train, lab_train)
rpred = rfc.predict(feat_test)

In [41]:
logit = LogisticRegression()
logit.fit(feat_train, lab_train)
lpred = logit.predict(feat_test)

In [42]:
xgb = XGBClassifier()
xgb.fit(feat_train, lab_train)
xpred = xgb.predict(feat_test)

In [43]:
from sklearn.metrics import classification_report

In [45]:
print(classification_report(kpred, lab_test))

              precision    recall  f1-score   support

           0       0.94      0.80      0.87       126
           1       0.65      0.88      0.75        52

    accuracy                           0.83       178
   macro avg       0.80      0.84      0.81       178
weighted avg       0.86      0.83      0.83       178



In [46]:
print(classification_report(spred, lab_test))

              precision    recall  f1-score   support

           0       0.93      0.80      0.86       125
           1       0.65      0.87      0.74        53

    accuracy                           0.82       178
   macro avg       0.79      0.83      0.80       178
weighted avg       0.85      0.82      0.83       178



In [47]:
print(classification_report(dpred, lab_test))

              precision    recall  f1-score   support

           0       0.73      0.78      0.75       100
           1       0.69      0.63      0.66        78

    accuracy                           0.71       178
   macro avg       0.71      0.70      0.71       178
weighted avg       0.71      0.71      0.71       178



In [48]:
print(classification_report(rpred, lab_test))

              precision    recall  f1-score   support

           0       0.83      0.82      0.82       109
           1       0.72      0.74      0.73        69

    accuracy                           0.79       178
   macro avg       0.78      0.78      0.78       178
weighted avg       0.79      0.79      0.79       178



In [49]:
print(classification_report(lpred, lab_test))

              precision    recall  f1-score   support

           0       0.92      0.82      0.87       119
           1       0.70      0.85      0.77        59

    accuracy                           0.83       178
   macro avg       0.81      0.84      0.82       178
weighted avg       0.85      0.83      0.83       178



In [50]:
print(classification_report(xpred, lab_test))

              precision    recall  f1-score   support

           0       0.83      0.82      0.82       109
           1       0.72      0.74      0.73        69

    accuracy                           0.79       178
   macro avg       0.78      0.78      0.78       178
weighted avg       0.79      0.79      0.79       178



# Performance (based on accuracy)
1. Logistic regression - 83%
2. K-Nearest Neighbor - 83%
3. Support Vector Machine - 82%
4. XGBoot - 79%
5. Random Forest - 79%
6. Decision tree - 71% 