## Importation données

In [2]:
import pandas as pd

In [3]:
annonces = pd.read_csv("data.tsv",sep="\t")

In [4]:
annonces.head()

Unnamed: 0,Id,Genre,Neuf,Surface,Pieces,Quartier,Prix
0,annonce-138905473-376235,Appartement,0,90.0,3.0,cathédrale,374400.0
1,annonce-140620177-376235,Appartement,0,146.27,5.0,sud,499200.0
2,annonce-140620179-376235,Appartement,0,110.0,5.0,prébendes,499200.0
3,annonce-133494153-376235,Maison,0,132.0,6.0,prébendes,508000.0
4,annonce-137425993-376235,Maison,0,185.0,7.0,strasbourg,676000.0


In [5]:
annonces.drop(columns="Id",inplace=True)

In [6]:
annonces.head()

Unnamed: 0,Genre,Neuf,Surface,Pieces,Quartier,Prix
0,Appartement,0,90.0,3.0,cathédrale,374400.0
1,Appartement,0,146.27,5.0,sud,499200.0
2,Appartement,0,110.0,5.0,prébendes,499200.0
3,Maison,0,132.0,6.0,prébendes,508000.0
4,Maison,0,185.0,7.0,strasbourg,676000.0


In [8]:
annonces.dtypes

Genre        object
Neuf          int64
Surface     float64
Pieces      float64
Quartier     object
Prix        float64
dtype: object

In [9]:
annonces.isna().sum()

Genre         0
Neuf          0
Surface       8
Pieces        1
Quartier    692
Prix          1
dtype: int64

In [10]:
annonces.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1647 entries, 0 to 1646
Data columns (total 6 columns):
Genre       1647 non-null object
Neuf        1647 non-null int64
Surface     1639 non-null float64
Pieces      1646 non-null float64
Quartier    955 non-null object
Prix        1646 non-null float64
dtypes: float64(3), int64(1), object(2)
memory usage: 77.3+ KB


In [11]:
annonces.dropna(subset=("Surface","Pieces","Prix"),inplace=True) #modifie le tableau
annonces.isna().sum()

Genre         0
Neuf          0
Surface       0
Pieces        0
Quartier    688
Prix          0
dtype: int64

# Suppression des na

In [12]:
annonces_sans_na = annonces.dropna()

In [13]:
annonces_sans_na.isna().sum()

Genre       0
Neuf        0
Surface     0
Pieces      0
Quartier    0
Prix        0
dtype: int64

In [14]:
annonces_sans_na.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 951 entries, 0 to 1643
Data columns (total 6 columns):
Genre       951 non-null object
Neuf        951 non-null int64
Surface     951 non-null float64
Pieces      951 non-null float64
Quartier    951 non-null object
Prix        951 non-null float64
dtypes: float64(3), int64(1), object(2)
memory usage: 52.0+ KB


In [30]:
import numpy as np

In [11]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler,MinMaxScaler

In [66]:
gestion_quartier = OneHotEncoder()

In [67]:
quartiers = gestion_quartier.fit_transform(annonces_sans_na["Quartier"].values.reshape(-1,1))
quartiers.shape                                           

(951, 26)

In [68]:
gestion_genre = OneHotEncoder()
genre = gestion_genre.fit_transform(annonces_sans_na["Genre"].values.reshape(-1,1))
genre.shape

(951, 2)

In [69]:
reste = annonces_sans_na[["Neuf","Surface","Pieces"]].values
reste.shape

(951, 3)

In [70]:
X = np.concatenate((quartiers.toarray(),genre.toarray(),reste),axis=1)
X.shape

(951, 31)

In [71]:
normalisation = StandardScaler()
X = normalisation.fit_transform(X)
X.shape

(951, 31)

In [72]:
X.min(axis=0)

array([-0.13889782, -0.17417193, -0.04590731, -0.19547278, -0.45312602,
       -0.3233104 , -0.0562544 , -0.06499127, -0.07270084, -0.12659242,
       -0.0562544 , -0.24050374, -0.2663568 , -0.09774528, -0.07270084,
       -0.11304668, -0.48979148, -0.10308731, -0.14278048, -0.26180016,
       -0.13889782, -0.03244428, -0.12223462, -0.3233104 , -0.11304668,
       -0.19835388, -1.60473922, -0.62315421, -0.33508313, -1.12353386,
       -1.25962424])

In [73]:
y = annonces_sans_na["Prix"].values
y.shape

(951,)

# Apprentissage

In [9]:
from sklearn.model_selection import (train_test_split,
                                    RandomizedSearchCV,
                                    GridSearchCV)

In [74]:
X_train, X_test, y_train, y_test = train_test_split(X,y)

In [10]:
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor as RFR

In [75]:
svr = GridSearchCV(estimator=SVR(),
                  param_grid=dict(C=[0.01,0.1,1,10,100],
                                 epsilon=[0.001,0.01,0.1,1,10,100]
                                 ),
                   )

In [76]:
svr.fit(X_train,y_train)









GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3,
                           epsilon=0.1, gamma='auto_deprecated', kernel='rbf',
                           max_iter=-1, shrinking=True, tol=0.001,
                           verbose=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [0.01, 0.1, 1, 10, 100],
                         'epsilon': [0.001, 0.01, 0.1, 1, 10, 100]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [77]:
print(svr.best_params_,svr.best_score_)

{'C': 100, 'epsilon': 100} -0.09883733918857102


In [78]:
rfr = GridSearchCV(estimator=RFR(),
                  param_grid=dict(n_estimators=list(range(10,110,10))
                                 )
                  )

In [79]:
rfr.fit(X_train,y_train)



GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=RandomForestRegressor(bootstrap=True, criterion='mse',
                                             max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators='warn', n_jobs=None,
                                             oob_score=False, random_state=None,
                                             verbose=0, warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'n_estimators': [10, 20, 3

In [80]:
print(rfr.best_params_,rfr.best_score_)

{'n_estimators': 40} 0.6297228886924781


In [81]:
predicteur = rfr.best_estimator_

In [82]:
predicteur.score(X_test, y_test)

0.725470343539021

In [8]:
from scipy.stats import uniform

In [84]:
svr_bis = RandomizedSearchCV(estimator=SVR(),
                            param_distributions=dict(
                                C=uniform(1,1000),
                                epsilon=uniform(1,1000)
                            ),
                            n_iter=100)

In [85]:
svr_bis.fit(X_train,y_train)























RandomizedSearchCV(cv='warn', error_score='raise-deprecating',
                   estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3,
                                 epsilon=0.1, gamma='auto_deprecated',
                                 kernel='rbf', max_iter=-1, shrinking=True,
                                 tol=0.001, verbose=False),
                   iid='warn', n_iter=100, n_jobs=None,
                   param_distributions={'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001ED7A978390>,
                                        'epsilon': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001ED7A9781D0>},
                   pre_dispatch='2*n_jobs', random_state=None, refit=True,
                   return_train_score=False, scoring=None, verbose=0)

In [86]:
print(svr_bis.best_params_,svr_bis.best_score_)

{'C': 997.1824731106304, 'epsilon': 254.1940113281409} -0.02373168937409709


In [87]:
svr_choisi = svr_bis.best_estimator_
svr_choisi.score(X_train, y_train)

-0.004596492296293331

In [88]:
from scipy.stats import randint

In [89]:
rfr_bis = RandomizedSearchCV(estimator=RFR(),
                            param_distributions=dict(
                                n_estimators=randint(10,100)
                            ),
                            n_iter=100)
rfr_bis.fit(X_train, y_train)



RandomizedSearchCV(cv='warn', error_score='raise-deprecating',
                   estimator=RandomForestRegressor(bootstrap=True,
                                                   criterion='mse',
                                                   max_depth=None,
                                                   max_features='auto',
                                                   max_leaf_nodes=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   n_estimators='warn',
                                                   n_jobs=None, oob_score=False,
                                                   random_st

In [92]:
print(rfr_bis.best_params_,rfr_bis.best_score_)

{'n_estimators': 71} 0.631165855462475


In [93]:
rfr_choisi = rfr_bis.best_estimator_
rfr_choisi.score(X_train, y_train)

0.942217087859678

In [94]:
predicteur = rfr_choisi
predicteur.score(X_test, y_test)

0.7413745980797574

## Suppression de quartier

In [95]:
annonces_sans_quartier = annonces.drop(columns="Quartier")
annonces_sans_quartier

Unnamed: 0,Genre,Neuf,Surface,Pieces,Prix
0,Appartement,0,90.00,3.0,374400.0
1,Appartement,0,146.27,5.0,499200.0
2,Appartement,0,110.00,5.0,499200.0
3,Maison,0,132.00,6.0,508000.0
4,Maison,0,185.00,7.0,676000.0
...,...,...,...,...,...
1642,Appartement,1,66.30,3.0,254900.0
1643,Appartement,0,42.00,2.0,61500.0
1644,Appartement,0,76.00,3.0,108500.0
1645,Maison,1,84.40,4.0,320000.0


In [96]:
annonces_sans_quartier.isna().sum()

Genre      0
Neuf       0
Surface    0
Pieces     0
Prix       0
dtype: int64

In [97]:
annonces_sans_quartier.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1639 entries, 0 to 1646
Data columns (total 5 columns):
Genre      1639 non-null object
Neuf       1639 non-null int64
Surface    1639 non-null float64
Pieces     1639 non-null float64
Prix       1639 non-null float64
dtypes: float64(3), int64(1), object(1)
memory usage: 76.8+ KB


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X,y)

In [115]:
gestion_genre = OneHotEncoder()
genre = gestion_genre.fit_transform(annonces_sans_quartier["Genre"].values.reshape(-1,1))
reste = annonces_sans_quartier[["Neuf","Surface","Pieces"]].values
X = np.concatenate((genre.toarray(),reste),axis=1)
normalisation = StandardScaler()
X = normalisation.fit_transform(X)
y = annonces_sans_quartier["Prix"].values
svr_bis.fit(X_train,y_train)
rfr_bis.fit(X_train, y_train)
svr_choisi = svr_bis.best_estimator_
rfr_choisi = rfr_bis.best_estimator_























In [116]:
svr_choisi.score(X_train, y_train)

-0.004282980251983259

In [117]:
rfr_choisi.score(X_train, y_train)

0.940944988785174

In [119]:
gestion_genre = OneHotEncoder()
genre = gestion_genre.fit_transform(annonces_sans_quartier["Genre"].values.reshape(-1,1))
reste = annonces_sans_quartier[["Neuf","Surface","Pieces"]].values
X = np.concatenate((genre.toarray(),reste),axis=1)
normalisation = MinMaxScaler()
X = normalisation.fit_transform(X)
y = annonces_sans_quartier["Prix"].values
svr_bis.fit(X_train,y_train)
rfr_bis.fit(X_train, y_train)
svr_choisi = svr_bis.best_estimator_
rfr_choisi = rfr_bis.best_estimator_























In [120]:
svr_choisi.score(X_train, y_train)

-0.005544560098953877

In [121]:
rfr_choisi.score(X_train, y_train)

0.9569893630071403

In [124]:
gestion_genre = OneHotEncoder()
genre = gestion_genre.fit_transform(annonces_sans_quartier["Genre"].values.reshape(-1,1))
reste = annonces_sans_quartier[["Neuf","Surface","Pieces"]].values
X = np.concatenate((genre.toarray(),reste),axis=1)
#normalisation = MinMaxScaler()
#X = fit_transform(X)
y = annonces_sans_quartier["Prix"].values
svr_bis.fit(X_train,y_train)
rfr_bis.fit(X_train, y_train)
svr_choisi = svr_bis.best_estimator_
rfr_choisi = rfr_bis.best_estimator_























In [125]:
svr_choisi.score(X_train, y_train)

-0.004953499131839623

In [126]:
rfr_choisi.score(X_train, y_train)

0.9577417191259

In [127]:
predicteur = rfr_choisi
predicteur.score(X_test, y_test)

0.7371489353804654

In [128]:
from sklearn.linear_model import LogisticRegression

In [129]:
lr = GridSearchCV(LogisticRegression(), 
             {
                 "C" : [2 ** n for n in range(-5, 12)],
                 "penalty" : ["l1", "l2"],
                 "max_iter" : [1000],
                 "solver" : ["liblinear"]
             },
                    cv=5)

In [None]:
lr.fit(X_train,y_train)









In [None]:
lr_choisi = lr.best_estimator_
lr_choisi.score(X_train, y_train)

In [6]:
from sklearn.pipeline import make_pipeline

In [12]:
modeles_rfr = [
    RFR(),
    make_pipeline(StandardScaler(), RFR()),
    make_pipeline(MinMaxScaler(),RFR()),
]
modeles_svr = [
    SVR(),
    make_pipeline(StandardScaler(), SVR()),
    make_pipeline(MinMaxScaler(),SVR()),
]

In [None]:
resultat