## REGRESSION

### Importation des données

In [1]:
import pickle
import pandas as pd

with open('dataframe.pkl', 'rb') as file:
    df = pickle.load(file)

# Maintenant, df contient la DataFrame importée depuis le fichier
df

Unnamed: 0,No disposition,Date mutation,Nature mutation,Valeur fonciere,Type de voie,Voie,Code postal,Commune,Code commune,Section,No plan,1er lot,Surface Carrez du 1er lot,Nombre de lots,Type local,Surface reelle bati,Nombre pieces principales,Nature culture,Surface terrain,col_concat
3,1,04/01/2021,Vente,204332.0,ALL,DES ECUREUILS,1310.0,BUELLAS,65,B,1325,,,0,Maison,88.0,4.0,S,866.0,04/01/2021ALL7.00276DES ECUREUILS1310.0BUELLAS
14,2,04/01/2021,Vente,226700.0,CHE,DU MOULIN DE POLAIZE,1310.0,POLLIAT,301,AA,289,,,0,Maison,96.0,3.0,,,04/01/2021CHE173.00164DU MOULIN DE POLAIZE1310...
15,1,08/01/2021,Vente,185000.0,RUE,DES GRANGES BONNET,1960.0,PERONNAS,289,AD,31,,,0,Maison,100.0,4.0,S,703.0,08/01/2021RUE46.00161DES GRANGES BONNET1960.0P...
16,1,07/01/2021,Vente,114500.0,RUE,DE LA MAIRIE,1340.0,FOISSIAT,163,AB,302,,,0,Maison,85.0,2.0,S,87.0,07/01/2021RUE179.00110DE LA MAIRIE1340.0FOISSIAT
19,1,08/01/2021,Vente,145000.0,IMP,DE CHAMANDRE,1340.0,FOISSIAT,163,WC,215,,,0,Maison,92.0,1.0,S,2480.0,08/01/2021IMP8.00255DE CHAMANDRE1340.0FOISSIAT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4649204,1,12/03/2021,Vente,383000.0,RUE,DES TOURNELLES,75004.0,PARIS 04,104,AO,117,14,28.86,2,Appartement,30.0,2.0,,,12/03/2021RUE8.09382DES TOURNELLES75004.0PARIS 04
4649205,1,17/03/2021,Vente,260000.0,RUE,SAINTE FOY,75002.0,PARIS 02,102,AP,128,304,,2,Appartement,37.0,2.0,,,17/03/2021RUE8.08561SAINTE FOY75002.0PARIS 02
4649206,1,19/03/2021,Vente,38000.0,RUE,DES GUILLEMITES,75004.0,PARIS 04,104,AH,68,121,,1,Dépendance,0.0,0.0,,,19/03/2021RUE6.04398DES GUILLEMITES75004.0PARI...
4649207,1,30/03/2021,Vente,38000.0,RUE,DU PETIT MUSC,75004.0,PARIS 04,104,AP,5,399,,1,Dépendance,0.0,0.0,,,30/03/2021RUE30.07338DU PETIT MUSC75004.0PARIS 04


### Echantillonage

In [12]:
variables_explicatives = ['Type local', 'Nombre pieces principales', 'Nombre de lots']

X = df[variables_explicatives]
X = pd.get_dummies(data=X, columns=['Type local'])
X

Unnamed: 0,Nombre pieces principales,Nombre de lots,Type local_Appartement,Type local_Dépendance,Type local_Local industriel. commercial ou assimilé,Type local_Maison
3,4.0,0,0,0,0,1
14,3.0,0,0,0,0,1
15,4.0,0,0,0,0,1
16,2.0,0,0,0,0,1
19,1.0,0,0,0,0,1
...,...,...,...,...,...,...
4649204,2.0,2,1,0,0,0
4649205,2.0,2,1,0,0,0
4649206,0.0,1,0,1,0,0
4649207,0.0,1,0,1,0,0


In [16]:
# notre target
Y = df['Valeur fonciere']
Y

3          204332.0
14         226700.0
15         185000.0
16         114500.0
19         145000.0
             ...   
4649204    383000.0
4649205    260000.0
4649206     38000.0
4649207     38000.0
4649208    690000.0
Name: Valeur fonciere, Length: 656022, dtype: float64

In [40]:
# on remplace les NA dans la colonne 'Nombre pieces principales' par la moyenne
X['Nombre pieces principales'].fillna(X['Nombre pieces principales'].mean(), inplace=True)
X['Nombre pieces principales'].isna().value_counts()

False    656022
Name: Nombre pieces principales, dtype: int64

In [38]:
pourcentage_manquant = (X.isna().sum() / len(X)) * 100
pourcentage_manquant

Nombre pieces principales                              0.0
Nombre de lots                                         0.0
Type local_Appartement                                 0.0
Type local_Dépendance                                  0.0
Type local_Local industriel. commercial ou assimilé    0.0
Type local_Maison                                      0.0
dtype: float64

# Pour ne pas donner plus d'importance aux variables explicatives à forte variance, il est essentiel de centrer et réduire les données en amont. On centre et réduit également afin de les ramener à la même échelle 

In [47]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
# Appliquez la standardisation aux données de X
X_stand = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
X_stand

Unnamed: 0,Nombre pieces principales,Nombre de lots,Type local_Appartement,Type local_Dépendance,Type local_Local industriel. commercial ou assimilé,Type local_Maison
0,0.597106,-0.520261,-0.509915,-0.444745,-0.273491,0.888357
1,0.107892,-0.520261,-0.509915,-0.444745,-0.273491,0.888357
2,0.597106,-0.520261,-0.509915,-0.444745,-0.273491,0.888357
3,-0.381321,-0.520261,-0.509915,-0.444745,-0.273491,0.888357
4,-0.870535,-0.520261,-0.509915,-0.444745,-0.273491,0.888357
...,...,...,...,...,...,...
656017,-0.381321,1.469638,1.961112,-0.444745,-0.273491,-1.125673
656018,-0.381321,1.469638,1.961112,-0.444745,-0.273491,-1.125673
656019,-1.359748,0.474689,-0.509915,2.248480,-0.273491,-1.125673
656020,-1.359748,0.474689,-0.509915,2.248480,-0.273491,-1.125673


In [41]:
from sklearn.model_selection import train_test_split
#70% des données pour l’apprentissage
#30% des données pour l'échantillon test
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.30, random_state = 42)

### Régression linéaire multiple

#### Apprentissage

On lance l'apprentissage du modèle sur l'échantillon d'entrainement

In [43]:
from sklearn.linear_model import LinearRegression
lm = LinearRegression()
model_LinearRegression = lm.fit(X_train,y_train)

### Test

In [45]:
y_pred = model_LinearRegression.predict(X_test)

array([471781.5, 183433. , 747142. , ..., 255740. , 402941.5, 150398. ])

In [46]:
from sklearn.metrics import mean_squared_error
#on multiplie par 0.3 pour avoir un résultat en metre
mean_squared_error(y_test, y_pred, squared=False)

3902113.364803185