In [1]:
""" Data Transformation """

' Data Transformation '

In [2]:
import pandas as pd

In [3]:
import numpy as np

In [4]:
""" Retirer les duplicats """

' Retirer les duplicats '

In [5]:
# Seven-row example frame; the last row (index 6) repeats row 5 exactly,
# which gives the duplicate-detection examples something to find.
data = pd.DataFrame(
    {
        'k1': ['one', 'two', 'one', 'two', 'one', 'two', 'two'],
        'k2': [1, 1, 2, 3, 3, 4, 4],
    }
)

In [6]:
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [7]:
# Return a boolean Series flagging each row that repeats an earlier row
data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [8]:
# Return a new DataFrame with the duplicated rows removed
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [9]:
# Add a column holding 0..6, so every row now carries a distinct value
data['v1'] = range(7)

In [10]:
data

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5
6,two,4,6


In [11]:
# Drop duplicates judging only by column 'k1'; the first row seen for each
# distinct 'k1' value is the one that is kept.
data.drop_duplicates(subset=['k1'])

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1


In [12]:
""" Transforming Data Using a Function or Mapping """

' Transforming Data Using a Function or Mapping '

In [13]:
# Sample data for the map() examples: food names (with inconsistent
# capitalisation) and a weight in ounces for each entry.
data = pd.DataFrame(
    {
        'food': [
            'bacon', 'pulled pork', 'bacon', 'Pastrami', 'corned beef',
            'Bacon', 'pastrami', 'honey ham', 'nova lox',
        ],
        'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6],
    }
)

In [14]:
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [15]:
# Lookup table from (lower-case) meat name to the animal it comes from;
# used to derive an 'animal' column from the 'food' column.
# NOTE(review): 'pastrami' is mapped to 'pig' here, but pastrami is usually
# made from beef - confirm whether 'cow' was intended.
meat_to_animal = {
    'bacon': 'pig',
    'pulled pork': 'pig',
    'pastrami': 'pig',
    'corned beef': 'cow',
    'honey ham': 'pig',
    'nova lox': 'salmon',
}

In [16]:
# Build a Series of the food names converted to lower case, so that
# 'Bacon' / 'Pastrami' match the lower-case keys of meat_to_animal
lowercased = data['food'].str.lower()

In [17]:
lowercased

0          bacon
1    pulled pork
2          bacon
3       pastrami
4    corned beef
5          bacon
6       pastrami
7      honey ham
8       nova lox
Name: food, dtype: object

In [18]:
# Add an 'animal' column by looking up each lower-cased food name in meat_to_animal
data['animal'] = lowercased.map(meat_to_animal)


In [19]:
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,pig
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,pig
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [20]:
# Same lookup done with a function instead of a dict: lower-case each food
# name, then index meat_to_animal. Because this indexes the dict directly,
# a food missing from meat_to_animal raises KeyError.
data['food'].map(lambda x: meat_to_animal[x.lower()])

0       pig
1       pig
2       pig
3       pig
4       cow
5       pig
6       pig
7       pig
8    salmon
Name: food, dtype: object

In [21]:
""" Remplacer des valeurs"""

' Remplacer des valeurs'

In [22]:
# Float Series containing the placeholder values -999 and -1000 that the
# replace() examples below substitute.
data = pd.Series([1.0, -999.0, 2.0, -999.0, -1000.0, 3.0])

In [23]:
data

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

In [24]:
# Replace one target value: every -999 becomes NaN (returns a new Series)
data.replace(-999,np.nan)

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

In [25]:
# Replace several target values at once: -999 and -1000 both become NaN
data.replace([-999.,-1000],np.nan)

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

In [26]:
# Pairwise replacement: the two lists are matched by position,
# so -999 -> NaN and -1000 -> 0
data.replace([-999,-1000],[np.nan,0])

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

In [27]:
# The same replacement expressed as an {old: new} dict
data.replace({-999 : np.nan, -1000:0})

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

In [28]:
""" Renommer l'index des axes """

" Renommer l'index des axes "

In [29]:
# 3x4 frame of the integers 0..11, with state names as row labels and
# ordinal words as column labels.
data = pd.DataFrame(
    np.arange(12).reshape((3, 4)),
    index=['Ohio', 'Colorado', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)

In [30]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [31]:
# Label transformer applied to the index with .map().
# Written as a def rather than a lambda bound to a name (PEP 8), so the
# function has a real name in reprs/tracebacks and can carry a docstring.
def transform(x):
    """Return the first four characters of ``x``, upper-cased.

    Strings shorter than four characters are upper-cased whole. No stripping
    is done, so 'New York' becomes 'NEW ' (with the trailing space).
    """
    return x[:4].upper()

In [32]:
transform

<function __main__.<lambda>>

In [33]:
# Preview the transformed labels; this returns a new Index and leaves data unchanged
data.index.map(transform)

Index(['OHIO', 'COLO', 'NEW '], dtype='object')

In [34]:
# Assign the transformed labels back, replacing the row index of data
data.index = data.index.map(transform)

In [35]:
data.index

Index(['OHIO', 'COLO', 'NEW '], dtype='object')

In [36]:
data

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [37]:
# Build a relabelled copy without modifying data: rename() applies
# str.title to each index label and str.upper to each column label
data.rename(index=str.title,columns=str.upper)

Unnamed: 0,ONE,TWO,THREE,FOUR
Ohio,0,1,2,3
Colo,4,5,6,7
New,8,9,10,11


In [38]:
# rename() also accepts dicts, relabelling only the listed index/column entries
data.rename(index={'OHIO':'INDIANA'},columns={'three':'peekaboo'})

Unnamed: 0,one,two,peekaboo,four
INDIANA,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [39]:
""" Discretization and Binning : """

' Discretization and Binning : '

In [40]:
# Sample ages to be grouped into age brackets
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]

In [41]:
# Bin edges describing the age brackets the values are sorted into:
# 18-25, 25-35, 35-60 and 60-100
bins = [18, 25, 35, 60, 100]

In [42]:
# Assign every age to the interval between consecutive bin edges that contains it
cats = pd.cut(ages,bins)

In [43]:
cats  # shows which interval each value falls into

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [44]:
cats.codes  # integer code of each element, i.e. the position of its interval among the categories

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [45]:
cats.categories  # the categories themselves (the intervals)

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]]
              closed='right',
              dtype='interval[int64]')

In [46]:
# Number of elements per category, most frequent first.
# The top-level pd.value_counts() is deprecated in recent pandas; wrapping
# the Categorical in a Series and calling .value_counts() is the supported
# equivalent.
pd.Series(cats).value_counts()

(18, 25]     5
(35, 60]     3
(25, 35]     3
(60, 100]    1
dtype: int64

In [47]:
# Human-readable labels for the four age bins, in bin order
group_names = [
    'Youth',
    'YoungAdult',
    'MiddleAges',
    'Senior',
]

In [48]:
group_names

['Youth', 'YoungAdult', 'MiddleAges', 'Senior']

In [49]:
# Same binning, but each bin is labelled with the matching name instead of its interval
pd.cut(ages,bins,labels=group_names)

[Youth, Youth, Youth, YoungAdult, Youth, ..., YoungAdult, Senior, MiddleAges, MiddleAges, YoungAdult]
Length: 12
Categories (4, object): [Youth < YoungAdult < MiddleAges < Senior]

In [50]:
# Seed NumPy's global RNG so this sample - and the bins computed from it -
# is identical on every Restart & Run All.
np.random.seed(12345)
data = np.random.rand(20)  # 20 uniform draws from [0, 1)

In [51]:
data

array([ 0.44325794,  0.29099175,  0.2236473 ,  0.27617865,  0.85873892,
        0.74911754,  0.59776129,  0.95739466,  0.99440243,  0.75839179,
        0.08063299,  0.41630174,  0.88859305,  0.24872178,  0.9005196 ,
        0.86904634,  0.71342446,  0.51550962,  0.28661166,  0.44759989])

In [52]:
# An integer bin count asks for that many equal-width bins over the data range;
# precision sets the number of decimals used in the bin labels
pd.cut(data,4,precision=2)

[(0.31, 0.54], (0.08, 0.31], (0.08, 0.31], (0.08, 0.31], (0.77, 0.99], ..., (0.77, 0.99], (0.54, 0.77], (0.31, 0.54], (0.08, 0.31], (0.31, 0.54]]
Length: 20
Categories (4, interval[float64]): [(0.08, 0.31] < (0.31, 0.54] < (0.54, 0.77] < (0.77, 0.99]]

In [53]:
# Seed NumPy's global RNG so the quantile edges computed from this sample
# are identical on every Restart & Run All.
np.random.seed(12345)
data = np.random.randn(1000)  # 1000 draws from the standard normal distribution

In [54]:
data

array([  4.48437330e-01,  -2.82321399e-01,  -8.15769776e-01,
         4.63516284e-01,   9.24076284e-01,   1.44440993e+00,
         5.01171649e-01,   1.58359420e-01,   9.66596811e-01,
         1.12434865e-01,  -1.61027006e+00,   3.08346792e-01,
         6.27273956e-01,   8.08964915e-01,  -4.64932459e-01,
        -1.12062110e+00,   5.65107822e-01,   9.16962911e-01,
        -8.79396336e-01,   1.48270766e+00,  -1.70939451e+00,
        -3.28766830e-01,   2.92768857e-02,   7.96883537e-01,
        -1.04766245e+00,  -9.21215873e-01,  -1.15920098e+00,
         7.66262001e-01,   8.10009119e-01,   9.67307667e-01,
        -3.50288225e-01,   2.13123087e-01,   5.45833869e-01,
         1.51614335e+00,   4.80875271e-01,   6.74179761e-01,
         6.49000600e-01,  -2.07567384e+00,   2.10975399e-01,
        -9.32351929e-01,   3.98604368e-01,  -1.00594926e+00,
        -1.23529634e+00,   4.16597721e-01,  -1.07725672e+00,
         7.42932769e-01,  -1.15772530e-01,  -3.29125843e-01,
        -2.24411354e-01,

In [55]:
cats = pd.qcut(data,4)  # split into quartiles: four bins with equal numbers of elements

In [56]:
cats

[(0.0732, 0.802], (-0.682, 0.0732], (-3.518, -0.682], (0.0732, 0.802], (0.802, 2.889], ..., (0.0732, 0.802], (0.802, 2.889], (0.802, 2.889], (-3.518, -0.682], (0.0732, 0.802]]
Length: 1000
Categories (4, interval[float64]): [(-3.518, -0.682] < (-0.682, 0.0732] < (0.0732, 0.802] < (0.802, 2.889]]

In [57]:
# Count the elements in each quantile bin.
# The top-level pd.value_counts() is deprecated in recent pandas; wrapping
# the Categorical in a Series and calling .value_counts() is the supported
# equivalent.
pd.Series(cats).value_counts()

(0.802, 2.889]      250
(0.0732, 0.802]     250
(-0.682, 0.0732]    250
(-3.518, -0.682]    250
dtype: int64

In [58]:
# Define custom quantiles: pass the quantile edges (fractions between 0 and 1)
# instead of a bin count
pd.qcut(data,[0,0.1,0.5,0.9,1.])

[(0.0732, 1.433], (-1.286, 0.0732], (-1.286, 0.0732], (0.0732, 1.433], (0.0732, 1.433], ..., (0.0732, 1.433], (0.0732, 1.433], (1.433, 2.889], (-1.286, 0.0732], (0.0732, 1.433]]
Length: 1000
Categories (4, interval[float64]): [(-3.518, -1.286] < (-1.286, 0.0732] < (0.0732, 1.433] < (1.433, 2.889]]

In [59]:
""" Detecting and Filtering outliers """

' Detecting and Filtering outliers '