# Libraries to import

In [43]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.svm import SVR
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline


import warnings
warnings.filterwarnings('ignore')


# Generic functions to get the baseline models, ie with the default parameters

In [31]:
# Function 1: Let's define a function which will instantiate as many models 
#as you wish to

def Reg_getBasedModel():
    basedModels = []
    basedModels.append(('LinearR'   , LinearRegression()))
    basedModels.append(('Ridge'   , Ridge()))
    basedModels.append(('Lasso'   , Lasso()))
    basedModels.append(('DT' , DecisionTreeRegressor()))
    #basedModels.append(('SVM-R'  , SVR()))
    basedModels.append(('RF'   , RandomForestRegressor()))
    return basedModels

In [33]:
# Function 2: Let's define a function that will train 
# each individual model described in GetBasedModel() function

def Reg_basedModels(X_train, y_train,models):
    """
    BasedModels will return the evaluation metric 'r2 after performing
    a CV for each of the models
    input:
    X_train
    y_train
    models = array containing the different instantiated models
    
    output:
    names = names of the diff models tested
    results = results of the diff models
    """
    # Test options and evaluation metric
    num_folds = 10
    
    #num_folds =  k_folds
    scoring = 'r2'

    results = []
    names = []
    
    for name, model in models:
        kfold = StratifiedKFold(n_splits=num_folds)
        cv_results = cross_val_score(model, X_train,
                                     y_train, cv=kfold, scoring=scoring)
        results.append(cv_results.mean())
        names.append(name)
        msg = "%s: Accuracy = %f (std = %f)" % (name, 
                                                cv_results.mean(), 
                                                cv_results.std())
        print(msg)
    scoreDataFrame = pd.DataFrame({'Model':names, 'Score': results})
       
        
    return scoreDataFrame

In [32]:
models = Reg_getBasedModel()

# Pipeline with the scaling methods

In [44]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler, RobustScaler

# We will define a function to apply any preprocessing method to the raw data

def Reg_GetScaledModel(nameOfScaler):
    """
    Function to define whether we want to apply any preprocessing method to the raw data.
    input:
    nameOfScale  = 'standard' (standardize),  'minmax' or 'robustscaler'
    """
    
    if nameOfScaler == 'standard':
        scaler = StandardScaler()
        
    elif nameOfScaler =='minmax':
        scaler = MinMaxScaler()
        
    elif nameOfScaler == 'robustscaler':
        scaler = RobustScaler()

    pipelines = []
    pipelines.append((nameOfScaler+'LinearR'  , 
                      Pipeline([('Scaler', scaler),
                                ('LinearR'   , LinearRegression())])))
    
    pipelines.append((nameOfScaler+'Ridge' , 
                      Pipeline([('Scaler', scaler),('Ridge', Ridge())])))
    
    pipelines.append((nameOfScaler+'Lasso', 
                      Pipeline([('Scaler', scaler),
                                ('Lasso'   , Lasso())])))
    
    pipelines.append((nameOfScaler+'DT' ,
                      Pipeline([('Scaler', scaler),
                                ('DT' , DecisionTreeRegressor())])))
    
    #pipelines.append((nameOfScaler+'SVM' ,
         #             Pipeline([('Scaler', scaler),
        #                        ('SVM-R' , SVR(kernel = 'rbf'))])))
    

    pipelines.append((nameOfScaler+'RF'  , 
                      Pipeline([('Scaler', scaler),
                                ('RF'  , DecisionTreeRegressor())])))
    return pipelines 

# Let's test it with a real dataset!

In [23]:
df = pd.read_csv('../../../_module3_/datasets/3.3.2.1.diamonds.csv')
df

<IPython.core.display.Javascript object>

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,4,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63
4,5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...,...,...
53935,53936,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.50
53936,53937,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61
53937,53938,0.70,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56
53938,53939,0.86,Premium,H,SI2,61.0,58.0,2757,6.15,6.12,3.74


## Define the target and non-target vars, together with the train_test split

In [39]:
X = df[['carat', 'depth', 'x', 'y', 'z']]
y = df['price']

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,
                                                    random_state=42)    

<IPython.core.display.Javascript object>

## Step 1: Define the baseline models

In [51]:
models = Reg_getBasedModel()

# Step 2: Fit the models

In [None]:
Base_model = Reg_basedModels(X_train, y_train,models)

LinearR: Accuracy = 0.843648 (std = 0.037594)
Ridge: Accuracy = 0.843993 (std = 0.036567)
Lasso: Accuracy = 0.849397 (std = 0.020702)
DT: Accuracy = 0.771361 (std = 0.009133)


In [None]:
Base_model

# Step 3: Get the models with the scaled data

In [45]:
models = Reg_GetScaledModel('standard')
models

[('standardLinearR',
  Pipeline(steps=[('Scaler', StandardScaler()), ('LinearR', LinearRegression())])),
 ('standardRidge',
  Pipeline(steps=[('Scaler', StandardScaler()), ('Ridge', Ridge())])),
 ('standardLasso',
  Pipeline(steps=[('Scaler', StandardScaler()), ('Lasso', Lasso())])),
 ('standardDT',
  Pipeline(steps=[('Scaler', StandardScaler()), ('DT', DecisionTreeRegressor())])),
 ('standardRF',
  Pipeline(steps=[('Scaler', StandardScaler()), ('RF', DecisionTreeRegressor())]))]

In [46]:
scaledScoreStandard = Reg_basedModels(X_train, y_train,models)

standardLinearR: Accuracy = 0.843648 (std = 0.037594)
standardRidge: Accuracy = 0.844148 (std = 0.036117)
standardLasso: Accuracy = 0.850016 (std = 0.018980)
standardDT: Accuracy = 0.772538 (std = 0.009335)
standardRF: Accuracy = 0.773599 (std = 0.008283)


<IPython.core.display.Javascript object>

In [47]:
scaledScoreStandard

Unnamed: 0,Model,Score
0,standardLinearR,0.843648
1,standardRidge,0.844148
2,standardLasso,0.850016
3,standardDT,0.772538
4,standardRF,0.773599


In [None]:
compareModels = pd.concat([Base_model,scaledScoreStandard], axis=1)
compareModels