# Python for Data Scientist
## From data manipulation to machine learning

# scikit-learn

* classification
* regression
* clustering
* preprocessing
* ...

In [1]:
import seaborn as sns

from sklearn import svm
from sklearn.model_selection import train_test_split

### Carico il dataset dei passeggeri del titanic da seaborn

In [2]:
original_data = sns.load_dataset("titanic")
original_data

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


### Creo una copia del dataset in modo da mantenere i dati orginali

In [3]:
data = original_data.copy()

In [4]:
data.describe()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


### Converto alcune colonne di tipo categorico in tipo numerico

In [5]:
# Nota: se eseguita due volte, questa operazione rende tutta la colonna NaN
data.sex = data.sex.map({"male": -1, "female": 1})
data.sex.unique()

array([-1,  1])

In [6]:
# nota: class è già presente in formato numerico come classe pclass
data["class"] = data["class"].map({"First": 1, "Second": 2, "Third": 3})
data["class"].unique()

[3, 1, 2]
Categories (3, int64): [1, 2, 3]

### Elimino alcuni valori NaN dalla colonna `age` impostandoli al valor medio
Questo non è ne l'unico ne l'ottimale approccio per "eliminare" i valori NaN. Una strategia di pulitura dei dati mancanti va sempre stabilita in base al tipo di dataset e ai propri obiettivi e con una buona dose di buon senso. 

**Nota**: la funzione `count()` ritorna il numero di valori diversi da **NaN**/**None**

In [7]:
data.age.count?
# len(data.age[data.age.isna()])
data.age.count()

714

In [8]:
mean_age = data.age[data.age.notna()].mean()
data.age.fillna(mean_age, inplace=True)
data.age.count()

891

### Seleziono e normalizzo le colonne che daremo in pasto alla SVM 
Buona norma quando si usano algoritmi come le SVM è quello di normalizzare i dati in modo che la scala dei numeri passati non vada a intaccare l'algoritmo

Saltare questo passaggio causa un peggioramento visibile delle performance nel caso di una SVM

In [9]:
columns = ["pclass", "age", "sibsp", "parch", "fare", "sex"]

for column in columns:
    col = data[column]
    data[column] = (col - col.mean()) / col.std()

In [10]:
data[columns].describe()

Unnamed: 0,pclass,age,sibsp,parch,fare,sex
count,891.0,891.0,891.0,891.0,891.0,891.0
mean,-7.575933000000001e-17,2.27278e-16,3.5886e-17,4.5854330000000005e-17,-1.1962000000000002e-17,1.5949330000000003e-17
std,1.0,1.0,1.0,1.0,1.0,1.0
min,-1.565228,-2.251891,-0.4742788,-0.4734077,-0.6480577,-0.737281
25%,-0.3691575,-0.592148,-0.4742788,-0.4734077,-0.4888737,-0.737281
50%,0.8269128,0.0,-0.4742788,-0.4734077,-0.3571902,-0.737281
75%,0.8269128,0.407697,0.4325504,-0.4734077,-0.02423274,1.354813
max,0.8269128,3.868699,6.780355,6.970233,9.66174,1.354813


### Separo i dati preparati in due insiemi
#### un insieme di train (usato per addestrare il modello)
#### un insieme di test (usato per validare le performance del mio modello)

In [11]:
train, test = train_test_split(data, test_size=0.2)

len(train), len(test)

(712, 179)

### Creo il mio modello SVM di tipo classificatore (SVC) e lo addestro sull'insieme di train

In [12]:
classifier = svm.SVC()

classifier.fit(train[columns], train.survived)

### Provo a predire la sopravvivenza dei passeggeri nell'insieme di test
Per convenienza la aggiungo come colonna del dataset

In [13]:
test["predict"] = classifier.predict(test[columns])

In [14]:
test

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,predict
102,0,-1.565228,-0.737281,-0.669059,-0.474279,0.767199,0.907228,S,1,man,True,D,Southampton,no,False,0
335,0,0.826913,-0.737281,0.000000,-0.474279,-0.473408,-0.489167,S,3,man,True,,Southampton,no,True,0
638,0,0.826913,1.354813,0.869164,-0.474279,5.729626,0.150589,S,3,woman,False,,Southampton,no,False,0
506,1,-0.369158,1.354813,0.253875,-0.474279,2.007806,-0.124850,S,2,woman,False,,Southampton,yes,False,1
242,0,-0.369158,-0.737281,-0.053770,-0.474279,-0.473408,-0.436762,S,2,man,True,,Southampton,no,True,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
844,0,0.826913,-0.737281,-0.976704,-0.474279,-0.473408,-0.473739,S,3,man,True,,Southampton,no,True,0
99,0,-0.369158,-0.737281,0.330786,0.432550,-0.473408,-0.124850,S,2,man,True,,Southampton,no,False,0
621,1,-1.565228,-0.737281,0.946075,0.432550,-0.473408,0.409511,S,1,man,True,D,Southampton,yes,False,0
408,0,0.826913,-0.737281,-0.669059,-0.474279,-0.473408,-0.491598,S,3,man,True,,Southampton,no,True,0


In [15]:
print(f"Predict correctly: {len(test[test.predict == test.survived])}/{len(test)}")
test[test.predict != test.survived]

Predict correctly: 155/179


Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,predict
701,1,-1.565228,-0.737281,0.407697,-0.474279,-0.473408,-0.119064,S,1,man,True,E,Southampton,yes,True,0
402,0,0.826913,1.354813,-0.669059,0.43255,-0.473408,-0.450345,S,3,woman,False,,Southampton,no,False,1
744,1,0.826913,-0.737281,0.100052,-0.474279,-0.473408,-0.48858,S,3,man,True,,Southampton,yes,True,0
205,0,0.826913,1.354813,-2.130371,-0.474279,0.767199,-0.437517,S,3,child,False,G,Southampton,no,False,1
328,1,0.826913,1.354813,0.100052,0.43255,0.767199,-0.235025,S,3,woman,False,,Southampton,yes,False,0
25,1,0.826913,1.354813,0.63843,0.43255,5.729626,-0.016435,S,3,woman,False,,Southampton,yes,False,0
81,1,0.826913,-0.737281,-0.05377,-0.474279,-0.473408,-0.456886,S,3,man,True,,Southampton,yes,True,0
449,1,-1.565228,-0.737281,1.715187,-0.474279,-0.473408,-0.034294,S,1,man,True,C,Southampton,yes,True,0
547,1,-0.369158,-0.737281,0.0,-0.474279,-0.473408,-0.369097,C,2,man,True,,Cherbourg,yes,True,0
38,0,0.826913,1.354813,-0.899793,1.33938,-0.473408,-0.285837,S,3,woman,False,,Southampton,no,False,1
