# Pipeline et modèle

In [443]:
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_absolute_error

import pickle

In [444]:
df = pd.read_csv("../data/new_AmesHousing.csv")

In [445]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2285 entries, 0 to 2284
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   TotalSF         2285 non-null   float64
 1   Overall Qual    2285 non-null   int64  
 2   Neighborhood    2285 non-null   object 
 3   Bsmt Qual       2285 non-null   object 
 4   Exter Qual      2285 non-null   object 
 5   Kitchen Qual    2285 non-null   object 
 6   Garage Cars     2285 non-null   float64
 7   TotalBathrooms  2285 non-null   float64
 8   Age             2285 non-null   int64  
 9   Foundation      2285 non-null   object 
 10  SalePrice       2285 non-null   int64  
dtypes: float64(3), int64(3), object(5)
memory usage: 196.5+ KB


In [446]:
df

Unnamed: 0,TotalSF,Overall Qual,Neighborhood,Bsmt Qual,Exter Qual,Kitchen Qual,Garage Cars,TotalBathrooms,Age,Foundation,SalePrice
0,19289664.0,6,NAmes,TA,TA,TA,2.0,1.0,50,CBlock,215000
1,7150276.0,5,NAmes,TA,TA,TA,1.0,1.0,49,CBlock,105000
2,15896169.0,6,NAmes,TA,TA,Gd,1.0,1.5,52,CBlock,172000
3,40068900.0,7,NAmes,TA,Gd,Ex,2.0,2.5,42,CBlock,244000
4,17522596.0,5,Gilbert,Gd,TA,TA,2.0,2.5,13,PConc,189900
...,...,...,...,...,...,...,...,...,...,...,...
2280,11410884.0,5,Mitchel,TA,TA,TA,2.0,2.0,29,CBlock,160000
2281,13483584.0,5,Mitchel,TA,TA,TA,2.0,1.0,46,CBlock,131000
2282,9054081.0,6,Mitchel,TA,TA,TA,2.0,1.0,22,CBlock,142500
2283,17363889.0,5,Mitchel,Gd,TA,TA,2.0,1.0,32,CBlock,170000


In [447]:
numeric_features = ["TotalSF", "Overall Qual", "Garage Cars", "TotalBathrooms", "Age"]
ordinal_features = [ "Bsmt Qual",  "Exter Qual", "Kitchen Qual"]
cat_feature = ["Neighborhood", "Foundation"]
all_col = numeric_features.copy()
all_col.extend(ordinal_features)
all_col.extend(cat_feature)

X = df[all_col]
y = df[["SalePrice"]]



In [448]:
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_selector

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=42, 
                                                    shuffle=True, 
                                                    stratify=X[['Overall Qual']]
                                                    )

In [449]:
ordinal_features

['Bsmt Qual', 'Exter Qual', 'Kitchen Qual']

In [450]:
ordinal_features[0]

'Bsmt Qual'

In [451]:
for i in ordinal_features:
    print(f"{i} cat: {list(X[i].unique())}")



Bsmt Qual cat: ['TA', 'Gd', 'Ex', 'Fa', 'Po']
Exter Qual cat: ['TA', 'Gd', 'Ex', 'Fa']
Kitchen Qual cat: ['TA', 'Gd', 'Ex', 'Fa', 'Po']


In [452]:
numeric_transformer = SimpleImputer()

Bsmt_cat = list(X[ordinal_features[0]].unique())
exter_cat = list(X[ordinal_features[0]].unique())
kitchen_cat = list(X[ordinal_features[0]].unique())

ordinal_transformer = OrdinalEncoder(categories=[Bsmt_cat, exter_cat, kitchen_cat])

categorical_transformer = OneHotEncoder()

In [453]:

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('ord', ordinal_transformer, ordinal_features),
        ('cat', categorical_transformer, cat_feature)
    ]
)

In [454]:
reg = Lasso()

In [455]:
pipe = Pipeline([
     ('preprocessor', preprocessor),
     ('reg', reg)
])

pipe.fit(X_train, y_train) 

In [456]:
pipe.score(X_test,y_test)

predict_train  = pipe.predict(X_train)
predict_test  = pipe.predict(X_test)

# Root Mean Squared Error on train and test date
print('MAE on train data: ', mean_absolute_error(y_train, predict_train))
print('MAE on test data: ',  mean_absolute_error(y_test, predict_test))

MAE on train data:  19994.535284007583
MAE on test data:  19034.945608499773


In [457]:
filename = '../real_estate_app/main_app/static/models/finalized_model.pkl'
pickle.dump(pipe, open(filename, 'wb'))

In [458]:
dico = {'Year_Built': 2000, 'Total_Bsmt_SF': 1, '1st_Flr_SF': 1, 'Gr_Liv_Area': 1, 'Garage_Area': 1, 'Overall_Qual': 1, 'Full_Bath': 1, 'Exter_Qual': 'Po', 'Kitchen_Qual': 'Po', 'Neighborhood': 'Gilbert'}

In [459]:
help(pipe.predict)

Help on method predict in module sklearn.pipeline:

predict(X, **predict_params) method of sklearn.pipeline.Pipeline instance
    Transform the data, and apply `predict` with the final estimator.
    
    Call `transform` of each transformer in the pipeline. The transformed
    data are finally passed to the final estimator that calls `predict`
    method. Only valid if the final estimator implements `predict`.
    
    Parameters
    ----------
    X : iterable
        Data to predict on. Must fulfill input requirements of first step
        of the pipeline.
    
    **predict_params : dict of string -> object
        Parameters to the ``predict`` called at the end of all
        transformations in the pipeline. Note that while this may be
        used to return uncertainties from some models with return_std
        or return_cov, uncertainties that are generated by the
        transformations in the pipeline are not propagated to the
        final estimator.
    
        .. versionad

In [460]:
pipe.predict([list(dico.values()),])

ValueError: Specifying the columns using strings is only supported for pandas DataFrames

In [None]:
pipe.score({dico})

TypeError: unhashable type: 'dict'