# Data science

## Titanic

### Setup

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm

%matplotlib inline

### Util functions

In [None]:
def count_bins(arr1, arr2):
    """Return how many distinct values each of the two sequences holds.

    The result is a 2-tuple ``(distinct_in_arr1, distinct_in_arr2)``. The
    notebook passes it as ``bins`` to its 2-D histograms so that each
    distinct value gets its own bin along the matching axis.
    """
    distinct_first = set(arr1)
    distinct_second = set(arr2)
    return len(distinct_first), len(distinct_second)

### Import data from CSV

In [None]:
# Parse the comma-separated file into a numeric table, skipping its first
# (header) line.
titanic_data = np.genfromtxt('titanic.dat', skip_header=1, delimiter=',')

# Give the first four columns of the table the names used by the plots below.
classes, ages, sexes, survived = (titanic_data[:, col] for col in range(4))

### Class vs Age vs Sex

In [None]:
from mpl_toolkits.mplot3d import Axes3D

# 3-D scatter plot of the three passenger features against each other.
fig_3d = plt.figure()
# Ask the figure for a 3-D axes instead of calling ``Axes3D(fig_3d)``:
# the constructor's implicit "add myself to the figure" behaviour was
# deprecated in Matplotlib 3.4 and later releases leave the axes detached,
# which renders an empty figure.
ax = fig_3d.add_subplot(projection='3d')
ax.set_xlabel('class')
ax.set_ylabel('age')
ax.set_zlabel('sex')
ax.set_title('class vs age vs sex')
_ = ax.scatter(classes, ages, sexes)


### Class vs Age

In [None]:
# 2-D histogram of class against age: one bin per distinct value on each
# axis, counts shown on a logarithmic colour scale.
fig_class_age = plt.figure()
_ = plt.hist2d(
    classes,
    ages,
    bins=count_bins(classes, ages),
    norm=LogNorm(),
)
plt.title('class vs age')
plt.xlabel('class')
plt.ylabel('age')
_ = plt.colorbar()

### Class vs Sex

In [None]:
# 2-D histogram of class against sex, counts shown on a logarithmic colour
# scale.
fig_class_sex = plt.figure(3)
plt.xlabel('class')
plt.ylabel('sex')
plt.title('class vs sex')
# Use one bin per distinct value, like the other two histograms in this
# notebook; without ``bins`` hist2d falls back to a 10x10 grid.
_ = plt.hist2d(classes, sexes, norm=LogNorm(), bins=count_bins(classes, sexes))
_ = plt.colorbar()

### Age vs Sex

In [None]:
# 2-D histogram of age against sex: one bin per distinct value on each
# axis, counts shown on a logarithmic colour scale.
fig_age_sex = plt.figure(4)
_ = plt.hist2d(
    ages,
    sexes,
    bins=count_bins(ages, sexes),
    norm=LogNorm(),
)
plt.title('age vs sex')
plt.xlabel('age')
plt.ylabel('sex')
_ = plt.colorbar()

### Finding the optimal K

In [None]:
from scipy.spatial import distance
from sklearn.cluster import KMeans

# Candidate cluster counts; also used as the x-axis of the plots below.
K = list(range(1, 50))

# The clustering input is the first three columns of the table (bound to
# ``classes``, ``ages`` and ``sexes`` above). Slice once, outside the loop.
features = titanic_data[:, :3]

# mean_dists[i] is the mean Euclidean distance between every row and the
# centre of the cluster it was assigned to, for K[i] clusters.
mean_dists = []
for k in K:
    kmeans = KMeans(n_clusters=k, random_state=0)
    kmeans.fit(features)

    # Index the centres by label so row i lines up with its own centre, then
    # take all row-wise norms in one vectorized call instead of a Python loop.
    assigned_centers = kmeans.cluster_centers_[kmeans.labels_]
    dists = np.linalg.norm(features - assigned_centers, axis=1)

    mean_dist = dists.mean()
    mean_dists.append(mean_dist)

In [None]:
# Mean distance to the assigned cluster centre against K, linear y-axis.
linear_axes = plt.axes()
linear_axes.set_yscale('linear')
_ = linear_axes.plot(K, mean_dists)

In [None]:
# Mean distance to the assigned cluster centre against K, logarithmic y-axis.
log_axes = plt.axes()
log_axes.set_yscale('log')
_ = log_axes.plot(K, mean_dists)

## Part 2

### Load breast cancer and wine datasets

In [None]:
from sklearn import datasets
from pprint import pprint

# Load the two scikit-learn datasets used in this part; the cells below
# read their 'data' and 'target' entries.
breast_cancer, wine = datasets.load_breast_cancer(), datasets.load_wine()

### Normalization of data

In [None]:
from sklearn import preprocessing



### Learning

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier

# Fit the three classifiers on the complete breast-cancer dataset. Each
# estimator is built with its default constructor arguments.
bc_features, bc_labels = breast_cancer['data'], breast_cancer['target']
breast_cancer_kneighbors = KNeighborsClassifier()
breast_cancer_kneighbors.fit(bc_features, bc_labels)
breast_cancer_decisiontree = DecisionTreeClassifier()
breast_cancer_decisiontree.fit(bc_features, bc_labels)
breast_cancer_MLPC = MLPClassifier()
breast_cancer_MLPC.fit(bc_features, bc_labels)

# Same three classifiers, fitted on the complete wine dataset.
wine_features, wine_labels = wine['data'], wine['target']
wine_kneighbors = KNeighborsClassifier()
wine_kneighbors.fit(wine_features, wine_labels)
wine_decisiontree = DecisionTreeClassifier()
wine_decisiontree.fit(wine_features, wine_labels)
wine_MLPC = MLPClassifier()
wine_MLPC.fit(wine_features, wine_labels)


### 5-fold cross-validation