# Data science

## Partie 1: Titanic

### Setup

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm

%matplotlib inline

### Fonctions utilitaires

In [None]:
def count_bins(arr1, arr2):
    '''Return how many distinct values each of the two inputs holds.

    The resulting pair ``(n_distinct_arr1, n_distinct_arr2)`` is intended to
    be passed as the ``bins`` argument of a 2d histogram, giving one bin count
    per dimension.
    '''
    distinct_first = set(arr1)
    distinct_second = set(arr2)
    return len(distinct_first), len(distinct_second)

### Import des données depuis un fichier CSV

In [None]:
# Parse the comma-separated file, skipping its single header line.
titanic_data = np.genfromtxt('titanic.dat', skip_header=1, delimiter=',')

# Give a name to each of the first four columns, in file order.
classes, ages, sexes, survived = (titanic_data[:, col] for col in range(4))

### Nettoyage des données

#### Ages

Il y a deux valeurs possibles dans la variable `ages` :

In [None]:
# Distinct values present in `ages`, then the number of rows holding each one.
ages_values = [*set(ages)]
ages_count_per_value = {
    value: (ages == value).sum()
    for value in ages_values
}
ages_count_per_value

Il parait probable que la première valeur, attribuée à 2092 personnes, désigne les adultes, tandis que l'autre valeur, attribuée à 109 personnes, désigne les enfants, car il y a probablement plus d'adultes que d'enfants sur le bateau.

On peut binariser le tableau `ages` pour en faire un tableau `is_adult`, qui contient 1 si la personne est adulte et 0 si la personne est un enfant:

In [None]:
# 1 where the `ages` entry is negative, 0 everywhere else.
is_adult = np.asarray(ages < 0, dtype=int)
is_adult

#### Sexes

Il y a deux valeurs possibles pour les sexes:

In [None]:
# Distinct values present in `sexes`, then the number of rows holding each one.
sexes_values = [*set(sexes)]
sexes_count_per_value = {
    value: (sexes == value).sum()
    for value in sexes_values
}
sexes_count_per_value

Il parait raisonnable de penser que sur un bateau de 1912, il y avait plus d'hommes que de femmes, soit 1731 hommes pour 470 femmes.

On peut binariser le tableau `sexes` en `is_male`:

In [None]:
# 1 where the `sexes` entry is positive, 0 everywhere else.
is_male = np.asarray(sexes > 0, dtype=int)
is_male

#### Classes

Il y a 4 valeurs possibles dans classes:

In [None]:
# Distinct values present in `classes`, then the number of rows holding each one.
classes_values = [*set(classes)]
classes_count_per_value = {
    value: (classes == value).sum()
    for value in classes_values
}
classes_count_per_value


En se renseignant un peu sur le Titanic, on peut apprendre qu'il y avait 3 classes sur le bateau, plus l'équipage. Selon Wikipédia, les effectifs sont les suivants:

- Première classe: 325 personnes
- Seconde classe: 285 personnes
- Troisième classe: 706 personnes
- Equipage: 908 personnes

Notre jeu de données ne semble pas contenir tout l'équipage, mais les autres chiffres correspondent. 

On peut extraire 4 nouvelles variables booléennes de la variable `classes`:

In [None]:
# One 0/1 indicator per category, selected by the open interval the encoded
# `classes` value falls into. Values exactly equal to -1, 0 or 0.1 match none.
# NOTE(review): which encoded value stands for which class is inferred from
# head-counts only; confirm against the dataset description.
is_first_class = np.less(classes, -1).astype(int)
is_second_class = np.logical_and(classes > -1, classes < 0).astype(int)
is_third_class = np.logical_and(classes > 0, classes < 0.1).astype(int)
is_crew = np.greater(classes, 0.1).astype(int)

### Class vs Age vs Sex

In [None]:
from mpl_toolkits.mplot3d import Axes3D

fig_3d = plt.figure()
# Create the 3D axes through the figure so that they are attached to it.
# Recent matplotlib releases no longer add a directly constructed
# Axes3D(fig) to the figure, which leaves the rendered plot empty.
# The Axes3D import above is still what registers the '3d' projection on
# older matplotlib releases.
ax = fig_3d.add_subplot(111, projection='3d')

# x: encoded class value as read from the file.
ax.set_xlabel('class')

# y: binarised age, hence exactly two ticks.
ax.set_ylabel('age')
ax.set_yticks([0, 1])
ax.set_yticklabels(["child", "adult"])

# z: encoded sex value as read from the file.
ax.set_zlabel('sex')

ax.set_title('class vs age vs sex')
_ = ax.scatter(classes, is_adult, sexes)


### Class vs Age

In [None]:
# 2D histogram of class against age, coloured on a logarithmic scale,
# with one bin per distinct value along each axis.
fig_class_age = plt.figure()
ax_class_age = fig_class_age.gca()
ax_class_age.set_title('class vs age')
ax_class_age.set_xlabel('class')
ax_class_age.set_ylabel('age')
class_age_bins = count_bins(classes, ages)
*_, class_age_image = ax_class_age.hist2d(classes, ages, bins=class_age_bins, norm=LogNorm())
_ = fig_class_age.colorbar(class_age_image)

### Class vs Sex

In [None]:
# 2D histogram of class against sex, coloured on a logarithmic scale.
fig_class_sex = plt.figure(3)
plt.xlabel('class')
plt.ylabel('sex')
plt.title('class vs sex')
# Use one bin per distinct value on each axis, as the class/age and age/sex
# histograms do, instead of falling back to the default 10x10 grid.
_ = plt.hist2d(classes, sexes, norm=LogNorm(), bins=count_bins(classes, sexes))
_ = plt.colorbar()

### Age vs Sex

In [None]:
# 2D histogram of age against sex, coloured on a logarithmic scale,
# with one bin per distinct value along each axis.
fig_age_sex = plt.figure(4)
ax_age_sex = fig_age_sex.gca()
ax_age_sex.set_title('age vs sex')
ax_age_sex.set_xlabel('age')
ax_age_sex.set_ylabel('sex')
age_sex_bins = count_bins(ages, sexes)
*_, age_sex_image = ax_age_sex.hist2d(ages, sexes, bins=age_sex_bins, norm=LogNorm())
_ = fig_age_sex.colorbar(age_sex_image)

### Recherche de K optimal

In [None]:
from scipy.spatial import distance
from sklearn.cluster import KMeans

# Candidate cluster counts and, for each of them, the mean distance between
# a row and the centre of the cluster it was assigned to.
K = list(range(1, 50))
mean_dists = []

# Only the first three columns are clustered. Slice them once here rather
# than once per row inside the loop.
titanic_features = titanic_data[:, :3]

for k in K:
    kmeans = KMeans(n_clusters=k, random_state=0)
    kmeans.fit(titanic_features)

    # Look up the centre assigned to every row, then compute all Euclidean
    # row-to-centre distances in a single vectorised call.
    assigned_centers = kmeans.cluster_centers_[kmeans.labels_]
    dists = np.linalg.norm(titanic_features - assigned_centers, axis=1)

    mean_dist = np.mean(dists)
    mean_dists.append(mean_dist)

In [None]:
# Mean row-to-centre distance against K, on a linear y-axis.
ax_elbow_linear = plt.axes()
ax_elbow_linear.set_yscale('linear')
# Label the figure so it can be read on its own.
ax_elbow_linear.set_xlabel('K (number of clusters)')
ax_elbow_linear.set_ylabel('mean distance to cluster centre')
ax_elbow_linear.set_title('mean distance vs K (linear scale)')
_ = ax_elbow_linear.plot(K, mean_dists)

In [None]:
# Mean row-to-centre distance against K, on a logarithmic y-axis.
ax_elbow_log = plt.axes()
ax_elbow_log.set_yscale('log')
# Label the figure so it can be read on its own.
ax_elbow_log.set_xlabel('K (number of clusters)')
ax_elbow_log.set_ylabel('mean distance to cluster centre')
ax_elbow_log.set_title('mean distance vs K (log scale)')
_ = ax_elbow_log.plot(K, mean_dists)

## Partie 2

### Load breast cancer and wine datasets

In [None]:
from sklearn import datasets
from pprint import pprint

# Load the two datasets through the sklearn `datasets` loaders.
breast_cancer, wine = datasets.load_breast_cancer(), datasets.load_wine()

### Normalization of data

In [None]:
from sklearn import preprocessing



### Learning

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier

def _fit_default_classifiers(dataset):
    '''Fit a k-NN, a decision tree and an MLP classifier on one dataset.

    ``dataset`` must expose the feature matrix under ``'data'`` and the labels
    under ``'target'``. The three fitted estimators are returned in the order
    (k-neighbors, decision tree, MLP).

    The decision tree and the MLP are seeded with ``random_state=0``, as the
    KMeans models of this notebook are, so that re-running the cell gives the
    same fitted models.
    '''
    features, target = dataset['data'], dataset['target']
    return (
        KNeighborsClassifier().fit(features, target),
        DecisionTreeClassifier(random_state=0).fit(features, target),
        MLPClassifier(random_state=0).fit(features, target),
    )


# NOTE(review): the features are used unscaled here; the normalization section
# of the notebook is still empty -- confirm whether scaling was intended.
breast_cancer_kneighbors, breast_cancer_decisiontree, breast_cancer_MLPC = _fit_default_classifiers(breast_cancer)

wine_kneighbors, wine_decisiontree, wine_MLPC = _fit_default_classifiers(wine)


### Validation croisée à 5 segments

## Partie 3

In [None]:
# data from https://archive.ics.uci.edu/ml/datasets/online+news+popularity
# Read the CSV without its header line, then bind every column to a name.
# NOTE(review): genfromtxt is called with its default dtype, so the textual
# `url` column presumably comes back as NaN -- confirm before using it.
news_data = np.genfromtxt('onlineNewsPopularity.csv', skip_header=1, delimiter=',')
nb_columns = news_data.shape[1]

# Iterating over the transpose yields the columns one by one, in file order;
# the unpacking fails unless there are exactly as many columns as names below.
(
    url,  # URL of the article (non-predictive)
    timedelta,  # Days between the article publication and the dataset acquisition (non-predictive)
    n_tokens_title,  # Number of words in the title
    n_tokens_content,  # Number of words in the content
    n_unique_tokens,  # Rate of unique words in the content
    n_non_stop_words,  # Rate of non-stop words in the content
    n_non_stop_unique_tokens,  # Rate of unique non-stop words in the content
    num_hrefs,  # Number of links
    num_self_hrefs,  # Number of links to other articles published by Mashable
    num_imgs,  # Number of images
    num_videos,  # Number of videos
    average_token_length,  # Average length of the words in the content
    num_keywords,  # Number of keywords in the metadata
    data_channel_is_lifestyle,  # Is data channel 'Lifestyle'?
    data_channel_is_entertainment,  # Is data channel 'Entertainment'?
    data_channel_is_bus,  # Is data channel 'Business'?
    data_channel_is_socmed,  # Is data channel 'Social Media'?
    data_channel_is_tech,  # Is data channel 'Tech'?
    data_channel_is_world,  # Is data channel 'World'?
    kw_min_min,  # Worst keyword (min. shares)
    kw_max_min,  # Worst keyword (max. shares)
    kw_avg_min,  # Worst keyword (avg. shares)
    kw_min_max,  # Best keyword (min. shares)
    kw_max_max,  # Best keyword (max. shares)
    kw_avg_max,  # Best keyword (avg. shares)
    kw_min_avg,  # Avg. keyword (min. shares)
    kw_max_avg,  # Avg. keyword (max. shares)
    kw_avg_avg,  # Avg. keyword (avg. shares)
    self_reference_min_shares,  # Min. shares of referenced articles in Mashable
    self_reference_max_shares,  # Max. shares of referenced articles in Mashable
    self_reference_avg_sharess,  # Avg. shares of referenced articles in Mashable
    weekday_is_monday,  # Was the article published on a Monday?
    weekday_is_tuesday,  # Was the article published on a Tuesday?
    weekday_is_wednesday,  # Was the article published on a Wednesday?
    weekday_is_thursday,  # Was the article published on a Thursday?
    weekday_is_friday,  # Was the article published on a Friday?
    weekday_is_saturday,  # Was the article published on a Saturday?
    weekday_is_sunday,  # Was the article published on a Sunday?
    is_weekend,  # Was the article published on the weekend?
    LDA_00,  # Closeness to LDA topic 0
    LDA_01,  # Closeness to LDA topic 1
    LDA_02,  # Closeness to LDA topic 2
    LDA_03,  # Closeness to LDA topic 3
    LDA_04,  # Closeness to LDA topic 4
    global_subjectivity,  # Text subjectivity
    global_sentiment_polarity,  # Text sentiment polarity
    global_rate_positive_words,  # Rate of positive words in the content
    global_rate_negative_words,  # Rate of negative words in the content
    rate_positive_words,  # Rate of positive words among non-neutral tokens
    rate_negative_words,  # Rate of negative words among non-neutral tokens
    avg_positive_polarity,  # Avg. polarity of positive words
    min_positive_polarity,  # Min. polarity of positive words
    max_positive_polarity,  # Max. polarity of positive words
    avg_negative_polarity,  # Avg. polarity of negative words
    min_negative_polarity,  # Min. polarity of negative words
    max_negative_polarity,  # Max. polarity of negative words
    title_subjectivity,  # Title subjectivity
    title_sentiment_polarity,  # Title polarity
    abs_title_subjectivity,  # Absolute subjectivity level
    abs_title_sentiment_polarity,  # Absolute polarity level
    shares,  # Number of shares (target)
) = news_data.T