# Titanic Clustering

Connect to Google Drive.

In [None]:
# Mount Google Drive at /content/drive so that files stored there can be
# opened by path from this notebook.
from google.colab import drive
drive.mount('/content/drive')

Import pandas, seaborn, matplotlib.  Also import k-means clustering and PCA from sklearn.

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

Read data.

In [None]:
# Load the training CSV from the mounted Drive folder.
mydata = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/320/train.csv",
    sep=",",      # fields are comma-separated
    header=0,     # column names come from the first row
    index_col=0,  # the first column supplies the row ids
)
# Preview the first rows.
mydata.head()

Select the columns named Survived, Sex, Age, Fare, and Embarked.

In [None]:
# Keep only the columns used for clustering.  Take an explicit copy so that
# the column assignments made further down act on an independent frame and
# pandas does not emit SettingWithCopyWarning.
mydata = mydata[["Survived", "Sex", "Age", "Fare", "Embarked"]].copy()
# Summarize every column, including the non-numeric ones.
mydata.describe(include="all")

There are missing values for age, so let's replace the missing values with the median.

In [None]:
# Replace missing ages with the median age.  assign() returns a new frame
# rather than writing into the existing column selection in place.
mydata = mydata.assign(Age=mydata["Age"].fillna(mydata["Age"].median()))
# Only the last expression of a notebook cell is rendered, so print the
# summary explicitly; as a bare expression it would be computed and discarded.
print(mydata.describe(include="all"))
mydata.head()

Create dummy variables for Sex and Embarked.  Note that there are missing values in Embarked.

In [None]:
# Replace the non-numeric columns with 0/1 indicator columns.
# drop_first=True omits the indicator for the first level of each variable.
# NOTE: with the default dummy_na=False, a missing value gets no indicator of
# its own, so such rows have every indicator of that variable unset.
mydata = pd.get_dummies(data=mydata, drop_first=True)
mydata.head()

Normalize the data by subtracting the column means and dividing by column standard deviations.

In [None]:
# Standardize column by column: subtract each column's mean, then divide by
# that column's (sample) standard deviation.
mydata = mydata.sub(mydata.mean(), axis="columns").div(mydata.std(), axis="columns")
mydata.head()

Cluster with k-means.

In [None]:
# Partition the rows of mydata into three clusters.
# random_state pins the centroid initialization so the labels are identical
# on every run; n_init is given explicitly because the library default has
# changed between scikit-learn releases.
mykmeans = KMeans(n_clusters=3, n_init=10, random_state=0).fit(mydata)
# Cluster index assigned to each row.
mykmeans.labels_

Evaluate clusters using PCA.

In [None]:
# Project the data onto its first two principal components so the clusters
# can be inspected in a 2-D scatter plot.
mypca = PCA(n_components=2)
myscores = mypca.fit_transform(mydata)

myscores = pd.DataFrame(myscores, columns=["PC1", "PC2"])
# Color by cluster through seaborn's `hue` semantic instead of the raw
# matplotlib `c` keyword: the labels are categories, so they should get
# discrete colors and a legend rather than a continuous colormap.
sns.scatterplot(data=myscores,
                x="PC1",
                y="PC2",
                hue=pd.Categorical(mykmeans.labels_))

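Describe the clusters.  The PCA loadings below show how strongly each variable contributes to PC1 and PC2, which helps interpret where the clusters fall in the plot.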

In [None]:
# Loadings table: one row per input column, one column per principal
# component.  Built with components as rows and then transposed.
myloadings = pd.DataFrame(mypca.components_,
                          index=["PC1", "PC2"],
                          columns=mydata.columns).T
myloadings