# Machine Learning Pipeline

In [35]:
import warnings
# NOTE(review): this silences every warning for the rest of the notebook,
# including deprecation warnings from the libraries used below — consider
# narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [36]:
import numpy as np
import pandas as pd

In [37]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [38]:
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline

In [39]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [40]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from sklearn.svm import LinearSVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

In [41]:
# Seed NumPy's global random number generator so repeated runs start from the same state.
np.random.seed(12)

In [42]:
# Read data/train.csv into a DataFrame for a first look at the data.
df = pd.read_csv('data/train.csv')

In [43]:
# Show the first rows to get a feel for the columns and their values.
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [45]:
# (number of rows, number of columns)
df.shape

(1460, 81)

In [58]:
def missing_values(df):
    """List every column of ``df`` that contains missing values, with its count.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    list of tuple
        ``(column_name, number_of_missing_values)`` pairs, in column order,
        restricted to the columns that contain at least one null.
    """
    # Count the nulls once and build both halves of each pair from the same
    # Series, so names and counts cannot get out of step and the frame is not
    # re-scanned for every column.
    null_counts = df.isnull().sum()
    return [(column, count) for column, count in null_counts.items() if count > 0]

In [59]:
# Columns with at least one missing value, paired with their null counts.
missing_values(df)

[('LotFrontage', 259),
 ('Alley', 1369),
 ('MasVnrType', 8),
 ('MasVnrArea', 8),
 ('BsmtQual', 37),
 ('BsmtCond', 37),
 ('BsmtExposure', 38),
 ('BsmtFinType1', 37),
 ('BsmtFinType2', 38),
 ('Electrical', 1),
 ('FireplaceQu', 690),
 ('GarageType', 81),
 ('GarageYrBlt', 81),
 ('GarageFinish', 81),
 ('GarageQual', 81),
 ('GarageCond', 81),
 ('PoolQC', 1453),
 ('Fence', 1179),
 ('MiscFeature', 1406)]

### Sklearn Pipeline

In [12]:
# Every model to evaluate, declared once as (short name, estimator) pairs so the
# names and the estimators cannot get out of step with each other.
_named_regressors = [
    ("linear", LinearRegression(n_jobs=-1)),
    ("ridge", Ridge(alpha=0.003, max_iter=30)),
    ("lasso", Lasso(alpha=.0005)),
    ("svr", SVR(kernel="linear")),
    ("linearsvr", LinearSVR()),
    ("randomforest", RandomForestRegressor(n_jobs=-1, n_estimators=350,
                                           max_depth=12, random_state=1)),
    ("gbm", GradientBoostingRegressor(n_estimators=500, max_depth=2)),
]

# The rest of the notebook consumes two parallel lists with these names.
classifiers = [estimator for _, estimator in _named_regressors]
clf_names = [name for name, _ in _named_regressors]

## Data Cleaning

In [21]:
def clean_data(data, is_train_data=True, null_threshold=500):
    """Drop sparse columns, add the ``TotalSF`` feature and filter training outliers.

    The input frame is left untouched; a new frame is returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw frame. Must contain ``TotalBsmtSF``, ``1stFlrSF``, ``2ndFlrSF``
        and, when ``is_train_data`` is true, ``GrLivArea``.
    is_train_data : bool, default True
        When true, rows with ``GrLivArea >= 4000`` are removed. Rows are never
        removed otherwise, because samples cannot be dropped from a test set.
    null_threshold : int, default 500
        Columns with more than this many missing values are dropped.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame.

    Raises
    ------
    KeyError
        If one of the columns needed for ``TotalSF`` is missing.
    """
    # NOTE(review): the sparse columns are determined from the frame that is
    # passed in, so train and test could end up dropping different columns —
    # confirm both frames lose the same ones.
    null_counts = data.isnull().sum()
    sparse_columns = null_counts[null_counts > null_threshold].index

    # Keyword form on purpose: passing the axis positionally (``drop(col, 1)``)
    # is rejected by pandas 2.x. Not in-place, so the caller's frame survives.
    data = data.drop(columns=sparse_columns)

    data['TotalSF'] = data['TotalBsmtSF'] + data['1stFlrSF'] + data['2ndFlrSF']

    # Training-only steps go here (e.g. outlier removal).
    if is_train_data:
        data = data[data.GrLivArea < 4000]

    return data

### Prepare the Data

In [22]:
def prepare_data(df, is_train_data=True):
    """Split ``df`` into model inputs and (for training data) the log target.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing an ``Id`` column and, for training data, ``SalePrice``.
    is_train_data : bool, default True
        For training data the target is ``log1p(SalePrice)`` and both ``Id``
        and ``SalePrice`` are removed from the features. Otherwise only ``Id``
        is removed and the target is ``None``.

    Returns
    -------
    tuple
        ``(X, y, numerical_columns, categorical_columns)`` where ``X`` holds
        the numerical columns followed by the object-dtype columns.
    """
    # numeric and object columns are handled by separate branches of the pipeline
    numeric_part = df.select_dtypes("number").copy()
    categorical_part = df.select_dtypes("object").copy()

    # "Id" is never a feature; the training frame additionally gives up its target
    unwanted = ["Id"]
    y = None
    if is_train_data == True:
        y = np.log1p(numeric_part.SalePrice)
        unwanted.append("SalePrice")
    numeric_part = numeric_part.drop(columns=unwanted)

    # feature matrix: numerical columns first, then the categorical ones
    X = pd.concat([numeric_part, categorical_part], axis=1)

    return X, y, numeric_part.columns, categorical_part.columns

### Get Pipeline

In [23]:
def get_pipeline(classifier, num_cols, cat_cols):
    """Build a preprocessing + model pipeline around ``classifier``.

    Parameters
    ----------
    classifier : estimator
        Estimator placed as the final step (named ``'classifier'``).
    num_cols : sequence of column labels
        Columns routed to the numeric branch.
    cat_cols : sequence of column labels
        Columns routed to the categorical branch.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Unfitted pipeline with the steps ``'preprocessor'`` and ``'classifier'``.
    """
    # Numeric branch: fill missing values with the column mean, then standardize.
    # The imputer is used directly; wrapping it in a one-step pipeline added nothing.
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())])

    # Categorical branch: fill missing values with the most frequent category,
    # then one-hot encode. No ``fill_value`` is passed because SimpleImputer only
    # uses it with strategy='constant'. Categories unseen during fit are ignored
    # at predict time instead of raising.
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))])

    # Route each group of columns through its own branch.
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, num_cols),
            ('cat', categorical_transformer, cat_cols)])

    # Full pipeline: preprocessing followed by the estimator supplied by the caller.
    return Pipeline(steps=[('preprocessor', preprocessor), ('classifier', classifier)])

### Model Score

In [24]:
def score_models(df):
    """Cross-validate every model in ``classifiers`` and tabulate the mean RMSE.

    The target is already log-transformed by ``prepare_data``, so the root of
    the plain mean squared error is used rather than a log-error metric.

    Parameters
    ----------
    df : pandas.DataFrame
        Cleaned training data.

    Returns
    -------
    pandas.DataFrame
        Columns ``classifier`` and ``rmse``, sorted by RMSE in descending
        order, followed by a final ``mean_all`` row holding the mean over all
        models.
    """
    features, target, numeric_columns, categorical_columns = prepare_data(df)
    metric = "neg_mean_squared_error"

    # 3 shuffled folds with a fixed seed; raise n_splits to 5 or 10 for a more
    # precise estimate of each model's score
    folds = KFold(n_splits=3, shuffle=True, random_state=1)

    def _mean_rmse(estimator):
        # wrap the estimator in its own pipeline, score it on every fold and
        # average the per-fold RMSE
        model = get_pipeline(estimator, numeric_columns, categorical_columns)
        fold_mse = -cross_val_score(model, features, target, cv=folds, scoring=metric)
        return np.sqrt(fold_mse).mean()

    rows = [[name, _mean_rmse(estimator)]
            for name, estimator in zip(clf_names, classifiers)]

    table = pd.DataFrame(rows, columns=["classifier", "rmse"])
    table = table.sort_values("rmse", ascending=False)
    # for good measure: append the mean over all models as an extra row
    table.loc[len(table) + 1, :] = ["mean_all", table.rmse.mean()]
    return table.reset_index(drop=True)

### Fit Model

In [25]:
def train_models(df):
    """Fit one pipeline per entry of ``classifiers`` on the training data.

    Parameters
    ----------
    df : pandas.DataFrame
        Cleaned training data.

    Returns
    -------
    list
        Fitted pipelines, in the same order as ``classifiers``.
    """
    features, target, numeric_columns, categorical_columns = prepare_data(df)

    def _fit(estimator):
        # build the preprocessing + model pipeline and fit it on the full data
        model = get_pipeline(estimator, numeric_columns, categorical_columns)
        model.fit(features, target)
        return model

    return [_fit(estimator) for _, estimator in zip(clf_names, classifiers)]

### Predict Model

In [61]:
def predict_from_models(df_test, pipelines):
    """Predict with every fitted pipeline and map the results back from log scale.

    Parameters
    ----------
    df_test : pandas.DataFrame
        Cleaned test data.
    pipelines : iterable
        Fitted pipelines that were trained on a ``log1p`` target.

    Returns
    -------
    list
        One array of predictions per pipeline, in the order given.
    """
    features = prepare_data(df_test, is_train_data=False)[0]
    # the models were trained on log1p(y), so expm1 undoes the transform
    return [np.expm1(model.predict(features)) for model in pipelines]

In [62]:
# Read the training and test CSVs from the data/ directory.
df = pd.read_csv("data/train.csv")
df_test = pd.read_csv("data/test.csv")

# Clean both frames. The test frame is passed with is_train_data=False so that
# the training-only outlier filter is skipped and no test rows are removed.
df = clean_data(df)
df_test = clean_data(df_test, is_train_data=False)

In [None]:
# We score the models on the preprocessed training data
my_scores = score_models(df)
display(my_scores)

In [29]:
# Fit every model on the full training data and predict the test set.
models = train_models(df)
predictions = predict_from_models(df_test, models)

# Simple ensembling: average the predictions of the selected models.
# "linear" and "ridge" (the first two entries of clf_names) are left out.
# The exclusion is by name, so it keeps working if the model list is reordered.
# NOTE(review): confirm these are the models that should be excluded — compare
# against the cross-validation scores.
EXCLUDED_FROM_ENSEMBLE = {"linear", "ridge"}
ensemble_predictions = [
    preds for name, preds in zip(clf_names, predictions)
    if name not in EXCLUDED_FROM_ENSEMBLE
]
# one row per model -> column-wise mean gives one averaged value per test sample
prediction_final = pd.DataFrame(ensemble_predictions).mean().values

# Write the submission file: one predicted SalePrice per test Id.
submission = pd.DataFrame({'Id': df_test.Id.values, 'SalePrice': prediction_final})
submission.to_csv("submission.csv", index=False)