___

<a href='https://oxiane-institut.com/'> <img src='../oxiane.jpg' /></a>
___

## DataFrame


DataFrames are the cornerstone of pandas and are directly inspired by the R programming language. 

We can think of a DataFrame as a collection of Series objects grouped together to share the same index. 

Let's use pandas to explore this topic!


In [294]:
import pandas as pd
import numpy as np


# Widen pandas' display limits for the rest of the notebook.
pd.set_option('display.max_colwidth', 100)  # pandas default: 50
pd.set_option('display.max_rows', 100)      # pandas default: 60

# Check the installed pandas version
pd.__version__

'2.2.2'

### From Python data

In [295]:
# str.split() with no argument splits on whitespace and returns a list of labels
"A B C D E".split()

['A', 'B', 'C', 'D', 'E']

In [296]:
from numpy.random import randn

# Seed NumPy's global RNG so the random values below are reproducible.
np.random.seed(101)

indices = "A B C D E".split()    # row labels
colonnes = "W X Y Z".split()     # column labels

# One random value per (row label, column label) pair: a 5 x 4 array.
data = randn(len(indices), len(colonnes))

# Build the DataFrame from raw values plus explicit row and column labels.
df = pd.DataFrame(data=data, index=indices, columns=colonnes)
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


### Let's load a more complete dataset, from CSV

In [297]:
# Load the heart dataset from a CSV file (the path is relative to the working directory).
# Note: this rebinds `df`, so the random 5x4 DataFrame built earlier is no longer reachable under that name.
df = pd.read_csv("data/heart.csv")

In [298]:
# Check the type of df
type(df)

pandas.core.frame.DataFrame

In [299]:
# shape is a (number_of_rows, number_of_columns) tuple
df.shape

(270, 13)

In [300]:
# Preview the first rows (head() returns 5 rows by default)
df.head()

Unnamed: 0,age,sexe,type_douleur,pression,cholester,sucre,electro,taux_max,angine,depression,pic,vaisseau,coeur
0,70,masculin,D,130,322,A,C,109,non,24,2,D,presence
1,67,feminin,C,115,564,A,C,160,non,16,2,A,absence
2,57,masculin,B,124,261,A,A,141,non,3,1,A,presence
3,64,masculin,D,128,263,A,A,105,oui,2,2,B,absence
4,74,feminin,B,120,269,A,C,121,oui,2,1,B,absence


In [301]:
# Column labels, stored as a pandas Index
df.columns

Index(['age', 'sexe', 'type_douleur', 'pression', 'cholester', 'sucre',
       'electro', 'taux_max', 'angine', 'depression', 'pic', 'vaisseau',
       'coeur'],
      dtype='object')

In [302]:
# One dtype per column; the text columns are reported as `object`
df.dtypes

age              int64
sexe            object
type_douleur    object
pression         int64
cholester        int64
sucre           object
electro         object
taux_max         int64
angine          object
depression       int64
pic              int64
vaisseau        object
coeur           object
dtype: object

In [303]:
# Display the whole frame: with 270 rows (more than display.max_rows=100),
# pandas only shows the first and last rows, separated by "..."
df

Unnamed: 0,age,sexe,type_douleur,pression,cholester,sucre,electro,taux_max,angine,depression,pic,vaisseau,coeur
0,70,masculin,D,130,322,A,C,109,non,24,2,D,presence
1,67,feminin,C,115,564,A,C,160,non,16,2,A,absence
2,57,masculin,B,124,261,A,A,141,non,3,1,A,presence
3,64,masculin,D,128,263,A,A,105,oui,2,2,B,absence
4,74,feminin,B,120,269,A,C,121,oui,2,1,B,absence
...,...,...,...,...,...,...,...,...,...,...,...,...,...
265,52,masculin,C,172,199,B,A,162,non,5,1,A,absence
266,44,masculin,B,120,263,A,A,173,non,0,1,A,absence
267,56,feminin,B,140,294,A,C,153,non,13,2,A,absence
268,57,masculin,D,140,192,A,A,148,non,4,2,A,absence


## Indexing / accessing the data

### Single column

In [304]:
# Bracket access with a single column label returns that column as a Series
df['sexe']

0      masculin
1       feminin
2      masculin
3      masculin
4       feminin
         ...   
265    masculin
266    masculin
267     feminin
268    masculin
269    masculin
Name: sexe, Length: 270, dtype: object

In [305]:
# Attribute access returns the same Series; it only works when the column name is a
# valid Python identifier that does not clash with an existing DataFrame attribute/method
df.sexe

0      masculin
1       feminin
2      masculin
3      masculin
4       feminin
         ...   
265    masculin
266    masculin
267     feminin
268    masculin
269    masculin
Name: sexe, Length: 270, dtype: object

### Multiple columns

In [306]:
df[["sexe", "age"]]

# /!\ The column names must be wrapped in a list (hence the double brackets).
# df["sexe", "age"]  # -> raises KeyError: the tuple ("sexe", "age") is looked up as a single column label

Unnamed: 0,sexe,age
0,masculin,70
1,feminin,67
2,masculin,57
3,masculin,64
4,feminin,74
...,...,...
265,masculin,52
266,masculin,44
267,feminin,56
268,masculin,57


### Create and delete columns

In [307]:
# Assigning to a new label creates the column; its value is the element-wise sum of two columns.
# This modifies df in place.
df["new_column"] = df["depression"] + df["pic"]
df

Unnamed: 0,age,sexe,type_douleur,pression,cholester,sucre,electro,taux_max,angine,depression,pic,vaisseau,coeur,new_column
0,70,masculin,D,130,322,A,C,109,non,24,2,D,presence,26
1,67,feminin,C,115,564,A,C,160,non,16,2,A,absence,18
2,57,masculin,B,124,261,A,A,141,non,3,1,A,presence,4
3,64,masculin,D,128,263,A,A,105,oui,2,2,B,absence,4
4,74,feminin,B,120,269,A,C,121,oui,2,1,B,absence,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
265,52,masculin,C,172,199,B,A,162,non,5,1,A,absence,6
266,44,masculin,B,120,263,A,A,173,non,0,1,A,absence,1
267,56,feminin,B,140,294,A,C,153,non,13,2,A,absence,15
268,57,masculin,D,140,192,A,A,148,non,4,2,A,absence,6


In [308]:
# axis=1 targets columns. drop() returns a new DataFrame without "new_column";
# the result is only displayed here, not assigned, so df itself is unchanged.
df.drop("new_column", axis=1)

Unnamed: 0,age,sexe,type_douleur,pression,cholester,sucre,electro,taux_max,angine,depression,pic,vaisseau,coeur
0,70,masculin,D,130,322,A,C,109,non,24,2,D,presence
1,67,feminin,C,115,564,A,C,160,non,16,2,A,absence
2,57,masculin,B,124,261,A,A,141,non,3,1,A,presence
3,64,masculin,D,128,263,A,A,105,oui,2,2,B,absence
4,74,feminin,B,120,269,A,C,121,oui,2,1,B,absence
...,...,...,...,...,...,...,...,...,...,...,...,...,...
265,52,masculin,C,172,199,B,A,162,non,5,1,A,absence
266,44,masculin,B,120,263,A,A,173,non,0,1,A,absence
267,56,feminin,B,140,294,A,C,153,non,13,2,A,absence
268,57,masculin,D,140,192,A,A,148,non,4,2,A,absence


In [309]:
# drop() is not in-place unless inplace=True is passed: df still contains "new_column"
df

Unnamed: 0,age,sexe,type_douleur,pression,cholester,sucre,electro,taux_max,angine,depression,pic,vaisseau,coeur,new_column
0,70,masculin,D,130,322,A,C,109,non,24,2,D,presence,26
1,67,feminin,C,115,564,A,C,160,non,16,2,A,absence,18
2,57,masculin,B,124,261,A,A,141,non,3,1,A,presence,4
3,64,masculin,D,128,263,A,A,105,oui,2,2,B,absence,4
4,74,feminin,B,120,269,A,C,121,oui,2,1,B,absence,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
265,52,masculin,C,172,199,B,A,162,non,5,1,A,absence,6
266,44,masculin,B,120,263,A,A,173,non,0,1,A,absence,1
267,56,feminin,B,140,294,A,C,153,non,13,2,A,absence,15
268,57,masculin,D,140,192,A,A,148,non,4,2,A,absence,6


In [310]:
# inplace=True mutates df directly (the call itself returns None), so the column is really gone.
# Re-running this cell raises KeyError because "new_column" no longer exists.
df.drop("new_column", axis=1, inplace=True)
df

Unnamed: 0,age,sexe,type_douleur,pression,cholester,sucre,electro,taux_max,angine,depression,pic,vaisseau,coeur
0,70,masculin,D,130,322,A,C,109,non,24,2,D,presence
1,67,feminin,C,115,564,A,C,160,non,16,2,A,absence
2,57,masculin,B,124,261,A,A,141,non,3,1,A,presence
3,64,masculin,D,128,263,A,A,105,oui,2,2,B,absence
4,74,feminin,B,120,269,A,C,121,oui,2,1,B,absence
...,...,...,...,...,...,...,...,...,...,...,...,...,...
265,52,masculin,C,172,199,B,A,162,non,5,1,A,absence
266,44,masculin,B,120,263,A,A,173,non,0,1,A,absence
267,56,feminin,B,140,294,A,C,153,non,13,2,A,absence
268,57,masculin,D,140,192,A,A,148,non,4,2,A,absence


In [311]:
# axis=0 targets rows: returns a new DataFrame without the row labelled 0 (df is not modified)
df.drop(0, axis=0)

Unnamed: 0,age,sexe,type_douleur,pression,cholester,sucre,electro,taux_max,angine,depression,pic,vaisseau,coeur
1,67,feminin,C,115,564,A,C,160,non,16,2,A,absence
2,57,masculin,B,124,261,A,A,141,non,3,1,A,presence
3,64,masculin,D,128,263,A,A,105,oui,2,2,B,absence
4,74,feminin,B,120,269,A,C,121,oui,2,1,B,absence
5,65,masculin,D,120,177,A,A,140,non,4,1,A,absence
...,...,...,...,...,...,...,...,...,...,...,...,...,...
265,52,masculin,C,172,199,B,A,162,non,5,1,A,absence
266,44,masculin,B,120,263,A,A,173,non,0,1,A,absence
267,56,feminin,B,140,294,A,C,153,non,13,2,A,absence
268,57,masculin,D,140,192,A,A,148,non,4,2,A,absence


## DataFrame object

In [316]:
# sort_values() returns a new DataFrame ordered by ascending age; df keeps its original order.
# Rows keep their original index labels, so the index is no longer in 0..n-1 order.
df_sorted_by_age = df.sort_values(by="age")

df_sorted_by_age.head()

Unnamed: 0,age,sexe,type_douleur,pression,cholester,sucre,electro,taux_max,angine,depression,pic,vaisseau,coeur
214,29,masculin,B,130,204,A,C,202,non,0,1,A,absence
174,34,masculin,A,118,182,A,C,174,non,0,1,A,absence
138,34,feminin,B,118,210,A,A,192,non,7,1,A,absence
224,35,feminin,D,138,183,A,A,182,non,14,1,A,absence
81,35,masculin,D,120,198,A,A,130,oui,16,2,A,presence


In [317]:
# reset_index() returns a new DataFrame with a fresh 0..n-1 index;
# the previous index labels are kept in a new column named "index".
df_reset_index = df_sorted_by_age.reset_index()
df_reset_index

Unnamed: 0,index,age,sexe,type_douleur,pression,cholester,sucre,electro,taux_max,angine,depression,pic,vaisseau,coeur
0,214,29,masculin,B,130,204,A,C,202,non,0,1,A,absence
1,174,34,masculin,A,118,182,A,C,174,non,0,1,A,absence
2,138,34,feminin,B,118,210,A,A,192,non,7,1,A,absence
3,224,35,feminin,D,138,183,A,A,182,non,14,1,A,absence
4,81,35,masculin,D,120,198,A,A,130,oui,16,2,A,presence
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
265,15,71,feminin,D,112,149,A,A,125,non,16,2,A,absence
266,255,71,feminin,B,160,302,A,A,162,non,4,1,C,absence
267,4,74,feminin,B,120,269,A,C,121,oui,2,1,B,absence
268,73,76,feminin,C,140,197,A,B,116,non,11,2,A,absence


In [318]:
# rename() returns a renamed copy. The result is not assigned,
# so df_reset_index itself still has a column called "index".
df_reset_index.rename(columns={"index": "old index"})

Unnamed: 0,old index,age,sexe,type_douleur,pression,cholester,sucre,electro,taux_max,angine,depression,pic,vaisseau,coeur
0,214,29,masculin,B,130,204,A,C,202,non,0,1,A,absence
1,174,34,masculin,A,118,182,A,C,174,non,0,1,A,absence
2,138,34,feminin,B,118,210,A,A,192,non,7,1,A,absence
3,224,35,feminin,D,138,183,A,A,182,non,14,1,A,absence
4,81,35,masculin,D,120,198,A,A,130,oui,16,2,A,presence
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
265,15,71,feminin,D,112,149,A,A,125,non,16,2,A,absence
266,255,71,feminin,B,160,302,A,A,162,non,4,1,C,absence
267,4,74,feminin,B,120,269,A,C,121,oui,2,1,B,absence
268,73,76,feminin,C,140,197,A,B,116,non,11,2,A,absence


In [319]:
# set_index() returns a new DataFrame that uses the "age" column as its index.
# Index labels are not required to be unique: several rows share the same age.
df_age_index = df_reset_index.set_index("age")
df_age_index

Unnamed: 0_level_0,index,sexe,type_douleur,pression,cholester,sucre,electro,taux_max,angine,depression,pic,vaisseau,coeur
age,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
29,214,masculin,B,130,204,A,C,202,non,0,1,A,absence
34,174,masculin,A,118,182,A,C,174,non,0,1,A,absence
34,138,feminin,B,118,210,A,A,192,non,7,1,A,absence
35,224,feminin,D,138,183,A,A,182,non,14,1,A,absence
35,81,masculin,D,120,198,A,A,130,oui,16,2,A,presence
...,...,...,...,...,...,...,...,...,...,...,...,...,...
71,15,feminin,D,112,149,A,A,125,non,16,2,A,absence
71,255,feminin,B,160,302,A,A,162,non,4,1,C,absence
74,4,feminin,B,120,269,A,C,121,oui,2,1,B,absence
76,73,feminin,C,140,197,A,B,116,non,11,2,A,absence


In [320]:
# .loc selects by index label; label 71 is not unique, so every matching row is returned as a DataFrame
df_age_index.loc[71]

Unnamed: 0_level_0,index,sexe,type_douleur,pression,cholester,sucre,electro,taux_max,angine,depression,pic,vaisseau,coeur
age,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
71,29,feminin,C,110,265,B,C,130,non,0,1,B,absence
71,15,feminin,D,112,149,A,A,125,non,16,2,A,absence
71,255,feminin,B,160,302,A,A,162,non,4,1,C,absence
