## Traiter les données manquantes

In [35]:
import pandas as pd
import numpy

# Load the passenger data from a CSV file expected in the current working directory.
titanic_survival = pd.read_csv("titanic_survival.csv")

### Trouver les valeurs manquantes

In [5]:
# Build a boolean mask that is True wherever the "sex" column has no value.
sex = titanic_survival["sex"]
sex_is_null = pd.isnull(sex)  # same mask as sex.isnull()
sex_is_null

0       False
1       False
2       False
3       False
4       False
        ...  
1305    False
1306    False
1307    False
1308    False
1309     True
Name: sex, Length: 1310, dtype: bool

In [6]:
# Keep only the entries of "sex" that the mask flags as missing.
sex_null = sex.loc[sex_is_null]
sex_null

1309    NaN
Name: sex, dtype: object

In [12]:
# Count the rows whose "age" value is missing.
age = titanic_survival["age"]
age_is_null = pd.isnull(age)
age_null = age[age_is_null]
age_null_count = age_null.shape[0]
print(age_null_count)

264


In [13]:
# (number of rows, number of columns) of the full dataset
titanic_survival.shape

(1310, 14)

### Problème avec les valeurs manquantes

In [18]:
# Naive mean with the built-in sum(): the recorded result is nan because the
# "age" column contains missing values, which is the problem this section illustrates.
mean_age = sum(titanic_survival["age"]) / titanic_survival.shape[0]
print(mean_age)

nan


In [25]:
# Remove the missing ages first, then compute the mean by hand.
# dropna() replaces the `age_is_null == False` comparison, so this cell no longer
# depends on a mask built in an earlier cell.
good_ages = titanic_survival["age"].dropna()

mean_age = sum(good_ages) / len(good_ages)
mean_age

29.8811345124283

In [27]:
# Series.mean() ignores missing values; skipna=True is the default, spelled out here.
mean_age = titanic_survival["age"].mean(skipna=True)
mean_age

29.8811345124283

In [28]:
# Mean of the "fare" column; missing values are skipped (skipna=True is the default).
mean_fare = titanic_survival["fare"].mean(skipna=True)
mean_fare

33.29547928134572

### Calculer des statistiques de prix

In [33]:
# Mean fare per passenger class, computed by filtering the rows one class at a time.
passagers_classes = [1,2,3]

fare_by_class = {
    pclass_value: titanic_survival.loc[titanic_survival["pclass"] == pclass_value, "fare"].mean()
    for pclass_value in passagers_classes
}
print(fare_by_class)

{1: 87.50899164086687, 2: 21.1791963898917, 3: 13.302888700564957}


### Introduction aux tables pivot (tableaux croisés dynamiques)

In [37]:
# DataFrame.pivot_table(): aggfunc defaults to "mean".
# The aggregation is passed by name; recent pandas versions warn when a NumPy
# callable such as numpy.mean is passed instead.
fare_by_class = titanic_survival.pivot_table(index="pclass", values="fare", aggfunc="mean")
print(fare_by_class)

             fare
pclass           
1.0     87.508992
2.0     21.179196
3.0     13.302889


In [38]:
# Mean age per passenger class.
# The aggregation is passed by name; recent pandas versions warn when a NumPy
# callable such as numpy.mean is passed instead.
age_by_class = titanic_survival.pivot_table(index="pclass", values="age", aggfunc="mean")
print(age_by_class)

              age
pclass           
1.0     39.159918
2.0     29.506705
3.0     24.816367


In [39]:
# Mean of the "survived" column per passenger class.
# The aggregation is passed by name; recent pandas versions warn when a NumPy
# callable such as numpy.mean is passed instead.
survived_by_class = titanic_survival.pivot_table(index="pclass", values="survived", aggfunc="mean")
print(survived_by_class)

        survived
pclass          
1.0     0.619195
2.0     0.429603
3.0     0.255289


In [42]:
# Sum of the "fare" and "survived" columns for each port of embarkation.
# The aggregation is passed by name; recent pandas versions warn when a NumPy
# callable such as numpy.sum is passed instead.
ports_stats = titanic_survival.pivot_table(index="embarked", values=["fare","survived"], aggfunc="sum")
print(ports_stats)

                fare  survived
embarked                      
C         16830.7922     150.0
Q          1526.3085      44.0
S         25033.3862     304.0


### Éliminer les valeurs manquantes

In [44]:
# DataFrame.dropna(axis=0 or axis=1, subset=[cols])
#   axis=0 drops the rows that contain a missing value,
#   axis=1 drops the columns that contain a missing value.
# The recorded result below has no rows: every row has at least one missing field.
dropna_rows = titanic_survival.dropna(axis=0)
dropna_rows

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest


In [45]:
# Drop every column that contains at least one missing value.
dropna_cols = titanic_survival.dropna(axis="columns")
dropna_cols

0
1
2
3
4
...
1305
1306
1307
1308
1309


In [48]:
# Drop only the rows whose "name" is missing; gaps in other columns are kept.
titanic_survival.dropna(subset=["name"], axis="index")

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1.0,1.0,"Allen, Miss. Elisabeth Walton",female,29.0000,0.0,0.0,24160,211.3375,B5,S,2,,"St Louis, MO"
1,1.0,1.0,"Allison, Master. Hudson Trevor",male,0.9167,1.0,2.0,113781,151.5500,C22 C26,S,11,,"Montreal, PQ / Chesterville, ON"
2,1.0,0.0,"Allison, Miss. Helen Loraine",female,2.0000,1.0,2.0,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1.0,0.0,"Allison, Mr. Hudson Joshua Creighton",male,30.0000,1.0,2.0,113781,151.5500,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1.0,0.0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0000,1.0,2.0,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,3.0,0.0,"Zabour, Miss. Hileni",female,14.5000,1.0,0.0,2665,14.4542,,C,,328.0,
1305,3.0,0.0,"Zabour, Miss. Thamine",female,,1.0,0.0,2665,14.4542,,C,,,
1306,3.0,0.0,"Zakarian, Mr. Mapriededer",male,26.5000,0.0,0.0,2656,7.2250,,C,,304.0,
1307,3.0,0.0,"Zakarian, Mr. Ortin",male,27.0000,0.0,0.0,2670,7.2250,,C,,,


In [49]:
# Keep the rows that have both an age and a sex (rows are dropped by default, axis=0).
new_titanic_survival = titanic_survival.dropna(subset=["age", "sex"])
new_titanic_survival.shape

(1046, 14)

### loc et iloc pour accéder à des lignes

In [50]:
new_titanic_survival.head()
# loc[]  : label-based, selects rows by index label (here the original row number kept by dropna)
# iloc[] : position-based, selects rows by integer position in the DataFrame's current order

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1.0,1.0,"Allen, Miss. Elisabeth Walton",female,29.0,0.0,0.0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1.0,1.0,"Allison, Master. Hudson Trevor",male,0.9167,1.0,2.0,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1.0,0.0,"Allison, Miss. Helen Loraine",female,2.0,1.0,2.0,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1.0,0.0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1.0,2.0,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1.0,0.0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1.0,2.0,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [55]:
# iloc slices by integer position: the first ten rows in the current order.
first_ten_rows = new_titanic_survival.iloc[0:10]
# Position 5 is zero-based, i.e. the sixth row.
# NOTE(review): the variable name says "fifth"; confirm whether iloc[4] was intended.
row_position_fifth = new_titanic_survival.iloc[5]
# loc looks the row up by its index label (25), not by its position.
row_index_25 = new_titanic_survival.loc[25]
row_index_25

pclass                         1
survived                       0
name         Birnbaum, Mr. Jakob
sex                         male
age                           25
sibsp                          0
parch                          0
ticket                     13905
fare                          26
cabin                        NaN
embarked                       C
boat                         NaN
body                         148
home.dest      San Francisco, CA
Name: 25, dtype: object

### Les index de colonnes

In [60]:
# loc[row_label, column_label]: the "age" value of the row labelled 1100.
row_index_1100_age = new_titanic_survival.loc[1100,"age"]
row_index_1100_age

29.0

In [61]:
# loc[row_label, column_label]: the "survived" value of the row labelled 25.
row_index_25_survived = new_titanic_survival.loc[25,"survived"]
row_index_25_survived

0.0

In [62]:
# iloc[row_positions, column_positions]: first five rows and first three columns, by position.
five_rows_three_cols = new_titanic_survival.iloc[0:5,0:3]
five_rows_three_cols

Unnamed: 0,pclass,survived,name
0,1.0,1.0,"Allen, Miss. Elisabeth Walton"
1,1.0,1.0,"Allison, Master. Hudson Trevor"
2,1.0,0.0,"Allison, Miss. Helen Loraine"
3,1.0,0.0,"Allison, Mr. Hudson Joshua Creighton"
4,1.0,0.0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)"


### Réindexer les lignes d'un dataframe

In [63]:
# Display the filtered frame: the index still carries the original row labels,
# so it has gaps (e.g. 1301, 1304, 1306 in the recorded output).
new_titanic_survival

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1.0,1.0,"Allen, Miss. Elisabeth Walton",female,29.0000,0.0,0.0,24160,211.3375,B5,S,2,,"St Louis, MO"
1,1.0,1.0,"Allison, Master. Hudson Trevor",male,0.9167,1.0,2.0,113781,151.5500,C22 C26,S,11,,"Montreal, PQ / Chesterville, ON"
2,1.0,0.0,"Allison, Miss. Helen Loraine",female,2.0000,1.0,2.0,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1.0,0.0,"Allison, Mr. Hudson Joshua Creighton",male,30.0000,1.0,2.0,113781,151.5500,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1.0,0.0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0000,1.0,2.0,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1301,3.0,0.0,"Youseff, Mr. Gerious",male,45.5000,0.0,0.0,2628,7.2250,,C,,312.0,
1304,3.0,0.0,"Zabour, Miss. Hileni",female,14.5000,1.0,0.0,2665,14.4542,,C,,328.0,
1306,3.0,0.0,"Zakarian, Mr. Mapriededer",male,26.5000,0.0,0.0,2656,7.2250,,C,,304.0,
1307,3.0,0.0,"Zakarian, Mr. Ortin",male,27.0000,0.0,0.0,2670,7.2250,,C,,,


In [66]:
# DataFrame.reset_index(): renumber the rows from 0; drop=True discards the old
# index labels instead of keeping them as a new column.
titanic_reindexed = new_titanic_survival.reset_index(drop = True)
five_rows_three_cols = titanic_reindexed.iloc[0:5,0:3]
five_rows_three_cols

Unnamed: 0,pclass,survived,name
0,1.0,1.0,"Allen, Miss. Elisabeth Walton"
1,1.0,1.0,"Allison, Master. Hudson Trevor"
2,1.0,0.0,"Allison, Miss. Helen Loraine"
3,1.0,0.0,"Allison, Mr. Hudson Joshua Creighton"
4,1.0,0.0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)"


### Appliquer des fonctions sur un dataframe

In [72]:
# dataframe.apply()
def series_na(serie):
    """Count the missing entries of a pandas Series.

    Parameters
    ----------
    serie : pandas.Series
        Values to inspect (one DataFrame column when called through
        ``DataFrame.apply``).

    Returns
    -------
    int
        Number of entries for which ``isnull()`` is true.
    """
    missing_mask = serie.isnull()
    # Each True in the mask counts as 1, so the sum is the number of missing entries.
    return int(missing_mask.sum())

# apply() calls series_na once per column (default axis=0) and collects the
# per-column missing-value counts into a Series indexed by column name.
column_null_count = titanic_survival.apply(series_na)
column_null_count

pclass          1
survived        1
name            1
sex             1
age           264
sibsp           1
parch           1
ticket          1
fare            2
cabin        1015
embarked        3
boat          824
body         1189
home.dest     565
dtype: int64

### Appliquer une fonction à une ligne

In [80]:
# dataframe.apply(fun, axis = 1)
def notice_age(row):
    """Label one passenger row by age group.

    Parameters
    ----------
    row : mapping or pandas.Series
        A record that exposes an ``"age"`` entry.

    Returns
    -------
    str
        ``'Unknown'`` when the age is missing, ``'minor'`` when it is below 18,
        ``'major'`` otherwise.
    """
    passenger_age = row["age"]
    # The missing-value check must come first: comparing NaN with 18 is always False.
    if pd.isnull(passenger_age):
        return 'Unknown'
    return 'minor' if passenger_age < 18 else 'major'
    
# Apply notice_age to every row; axis="columns" is the named form of axis=1.
age_labels = titanic_survival.apply(notice_age, axis="columns")
age_labels

0         major
1         minor
2         minor
3         major
4         major
         ...   
1305    Unknown
1306      major
1307      major
1308      major
1309    Unknown
Length: 1310, dtype: object

### Cas pratique : % de survie par groupe d'âge

In [81]:
# Attach the labels as a new "age_labels" column.
# NOTE(review): this mutates titanic_survival in place, so any earlier cell that is
# re-run afterwards will see the extra column.
titanic_survival["age_labels"] = age_labels

In [83]:
# Mean of the "survived" column for each age label.
# The aggregation is passed by name; recent pandas versions warn when a NumPy
# callable such as numpy.mean is passed instead.
age_group_survival = titanic_survival.pivot_table(index="age_labels", values="survived", aggfunc="mean")
age_group_survival

Unnamed: 0_level_0,survived
age_labels,Unnamed: 1_level_1
Unknown,0.277567
major,0.387892
minor,0.525974
