# Régression logistique

## Les données Titanic

### Importer les données Titanic ".csv"

In [1]:
import pandas as pd

In [2]:
# Read the Titanic training set from its ".csv" file
path = "~/Documents/PDS/Cours/Data/train.csv"
train = pd.read_csv(path)  # comma is read_csv's default separator

# Display the first five rows
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### Ajout d'un Index

In [3]:
# Ajouter l'index
# train.set_index("PassengerId", inplace=True, drop=True)
# train.head()

### Exemples de commandes usuelles sur les variables

In [4]:
# Data type of each column
train.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [5]:
# Print a summary of the DataFrame: index range, non-null count and dtype
# of each column, and memory usage
train.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [6]:
# Count the non-null values in each column
train.count()

PassengerId    891
Survived       891
Pclass         891
Name           891
Sex            891
Age            714
SibSp          891
Parch          891
Ticket         891
Fare           891
Cabin          204
Embarked       889
dtype: int64

In [7]:
# Show the column names
train.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [8]:
# Count the number of columns
len(train.columns)

12

In [9]:
# Count the number of rows: two equivalent ways.
# Only the last expression of a cell is displayed, so the first call's
# result is not shown.
len(train)
len(train.index)

891

## Machine Learning

In [None]:
# On veut prédire le nombre de survivants du bateau Titanic

# Sibsp : Sibling = brother, sister, stepbrother, stepsister
#         Spouse = husband, wife (mistresses and fiancés were ignored)
# Parch : Parent = mother, father
#         Child = daughter, son, stepdaughter, stepson
#         Some children travelled only with a nanny, therefore parch=0 for them.

### Création du modèle général

In [10]:
# Création du modèle général 
def parse_model(X, use_columns):
    """Split a DataFrame into selected feature columns and the target.

    Parameters
    ----------
    X : DataFrame that must contain a "Survived" column.
    use_columns : column labels to keep as features.

    Returns
    -------
    (features, target) where ``features`` is ``X[use_columns]`` and
    ``target`` is ``X["Survived"]``.

    Raises
    ------
    ValueError
        If ``X`` has no "Survived" column.
    """
    has_target = "Survived" in X.columns
    if not has_target:
        raise ValueError("target column survived should belong to df")

    features = X[use_columns]
    labels = X["Survived"]
    return features, labels

# First attempt, using the 'SibSp', 'Parch' and 'Fare' columns as features
model_cols1 = ['SibSp', 'Parch', 'Fare']
# parse_model receives a copy of train
X, y = parse_model(X=train.copy(), use_columns=model_cols1)

In [11]:
X
# Note: for integer/None inputs, if the estimator is a classifier and y is either
# binary or multiclass, StratifiedKFold is used.
# In all other cases, KFold is used.

Unnamed: 0,SibSp,Parch,Fare
0,1,0,7.2500
1,1,0,71.2833
2,0,0,7.9250
3,1,0,53.1000
4,0,0,8.0500
5,0,0,8.4583
6,0,0,51.8625
7,3,1,21.0750
8,0,2,11.1333
9,1,0,30.0708


### Application de la régression logistique

In [12]:
# cross_val_score and train_test_split live in sklearn.model_selection; the
# former sklearn.cross_validation module was deprecated in scikit-learn 0.18
# and removed in 0.20.
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.linear_model import LogisticRegression
import numpy as np



In [13]:
# A single train/test split depends too much on the sample that is drawn:
# holding out 5% of the data for testing gives a score of about 75%,
# whereas holding out 95% gives about 61%.

lr = LogisticRegression()

# Test fractions 0.05, 0.10, ..., 0.95
list_test_size = [a / 20.0 for a in range(1, 20)]
scores = []

for ts in list_test_size:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=ts, random_state=0
    )
    # fit() returns the fitted estimator itself, so clf is lr
    clf = lr.fit(X_train, y_train)
    scores.append(clf.score(X_test, y_test))

scores
# Mean score over all split sizes (the only value the cell displays)
np.mean(scores)

0.68687891525611244

### Calcul du score

In [14]:
# On utilise donc la moyenne de plusieurs validation croisées pour augmenter
# la significativité de la validation
def compute_score(clf, X, y, cv=5):
    """Cross-validate a classifier and report its mean score.

    Prints the mean of the per-fold scores together with twice their
    standard deviation, then returns the per-fold scores.

    Parameters
    ----------
    clf : classifier passed to ``cross_val_score``.
    X : features.
    y : target.
    cv : cross-validation setting forwarded to ``cross_val_score``
        (default 5).

    Returns
    -------
    The scores returned by ``cross_val_score``.
    """
    # Forward the caller's ``cv``; it used to be ignored in favour of a
    # hard-coded 5, so compute_score(..., cv=10) silently ran 5 folds.
    xval = cross_val_score(clf, X, y, cv=cv)
    print("Accurancy: %0.2f (+/- %0.2f)" % (xval.mean(), xval.std() * 2))
    return xval

# Cross-validate the logistic regression on the selected features
compute_score(lr,X, y)

Accurancy: 0.67 (+/- 0.09)


array([ 0.59776536,  0.74301676,  0.66853933,  0.6741573 ,  0.68926554])