# Nettoyage des valeurs non significatives

In [2]:
import pandas as pd
import numpy as np

In [3]:
ma_dict = ({'banane' : [10, 20], 'ananas': [23, 16], 'pomme' : [35, 50]})

In [4]:
fruits = pd.DataFrame.from_dict(ma_dict, orient = 'index')

In [5]:
fruits

Unnamed: 0,0,1
banane,10,20
ananas,23,16
pomme,35,50


In [6]:
fr2 =  pd.DataFrame.from_dict(ma_dict, orient = 'index')

In [7]:
frame = [fruits,fr2]

In [8]:
fruits = pd.concat(frame)

In [9]:
fruits

Unnamed: 0,0,1
banane,10,20
ananas,23,16
pomme,35,50
banane,10,20
ananas,23,16
pomme,35,50


###  Détection des lignes dupliquées avec pandas

In [10]:
fruits.duplicated()

banane    False
ananas    False
pomme     False
banane     True
ananas     True
pomme      True
dtype: bool

In [11]:
fruits.drop_duplicates(inplace= True)

In [12]:
fruits

Unnamed: 0,0,1
banane,10,20
ananas,23,16
pomme,35,50


### Génération d'une regression

In [13]:
from sklearn.datasets import make_regression
X, y = make_regression(n_samples=20, n_features=4,n_informative = 2,  noise=3, random_state=42)

In [14]:
X.shape

(20, 4)

In [15]:
list(zip(X.flat, y))

[(0.11092258970986608, -48.23041762248722),
 (-1.1509935774223028, -81.34100869867015),
 (-0.5443827245251827, -50.57710158219891),
 (0.37569801834567196, -0.8158567825974703),
 (0.3142473325952739, -31.507801841574196),
 (-0.9080240755212109, -28.6426383127758),
 (-1.0128311203344238, 107.56746992868294),
 (-1.4123037013352915, -13.941901556315575),
 (0.6116762888408679, 16.034777640238858),
 (1.030999522495951, -47.33341554260981),
 (-0.6769220003059587, 46.06697178891597),
 (0.9312801191161986, -27.242176285645005),
 (-0.29900735046586746, -27.301641745729718),
 (0.0917607765355023, -60.24385000345572),
 (0.08704706823817122, 128.37519412767696),
 (-1.9875689146008928, 69.1866698125634),
 (0.5425600435859647, -82.98463225525909),
 (-0.46341769281246226, -153.11384211685123),
 (-0.4694743859349521, -26.991358741246103),
 (-0.46572975357025687, 31.507787053024916)]

In [16]:
data = pd.DataFrame(X, columns=['x1', 'x2', 'x3', 'x4'])

In [17]:
data['y'] = y


In [18]:
data.insert(4, 'x5', 0.2)

### Utilisation de la méthode nunique de pandas pour compter les valeurs differentes

In [19]:
data

Unnamed: 0,x1,x2,x3,x4,x5,y
0,0.110923,-1.150994,-0.544383,0.375698,0.2,-48.230418
1,0.314247,-0.908024,-1.012831,-1.412304,0.2,-81.341009
2,0.611676,1.031,-0.676922,0.93128,0.2,-50.577102
3,-0.299007,0.091761,0.087047,-1.987569,0.2,-0.815857
4,0.54256,-0.463418,-0.469474,-0.46573,0.2,-31.507802
5,-0.234137,1.579213,-0.234153,0.767435,0.2,-28.642638
6,1.35624,-0.07201,0.812526,1.003533,0.2,107.56747
7,-1.76304,0.324084,0.343618,-0.385082,0.2,-13.941902
8,-0.64512,0.361396,0.361636,1.538037,0.2,16.034778
9,-0.185659,-1.106335,-0.479174,-1.196207,0.2,-47.333416


In [20]:
data.nunique()

x1    20
x2    20
x3    20
x4    20
x5     1
y     20
dtype: int64

In [21]:
data.drop('x5',axis = 1,  inplace = True)

In [22]:
data.nunique()

x1    20
x2    20
x3    20
x4    20
y     20
dtype: int64

### Utilisation de sklearn pour supprimer des colonnes à faible variance 

In [27]:
from sklearn.feature_selection import VarianceThreshold
from sklearn import set_config
# nouveau dans sklearn sorite vers Pandas
#set_config(transform_output = "pandas")
variance = VarianceThreshold().set_output(transform= 'pandas')
dt2 = variance.fit_transform(data.iloc[:,:4])
print(dt2.shape)

(20, 4)


In [28]:
dt2

Unnamed: 0,x1,x2,x3,x4
0,0.110923,-1.150994,-0.544383,0.375698
1,0.314247,-0.908024,-1.012831,-1.412304
2,0.611676,1.031,-0.676922,0.93128
3,-0.299007,0.091761,0.087047,-1.987569
4,0.54256,-0.463418,-0.469474,-0.46573
5,-0.234137,1.579213,-0.234153,0.767435
6,1.35624,-0.07201,0.812526,1.003533
7,-1.76304,0.324084,0.343618,-0.385082
8,-0.64512,0.361396,0.361636,1.538037
9,-0.185659,-1.106335,-0.479174,-1.196207


##  Connaitre la version de sklearn

In [29]:
import sklearn
print(sklearn.__version__)

1.2.2
