In [31]:
import pandas as pd
df = pd.read_csv("regression_exercise.csv")
df.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [32]:
y = df["Item_Outlet_Sales"]
df = df.drop(["Item_Outlet_Sales","Item_Identifier"],axis = 1)

In [33]:
df_train = df.sample(frac=0.8).sort_index()
y_train = y[y.index.isin(df_train.index.tolist())]

In [34]:
df_test = df[~df.index.isin(df_train.index.tolist())].sort_index()
y_test = y[y.index.isin(df_test.index.tolist())]

In [35]:
cat_feats = df.dtypes[df.dtypes == 'object'].index.tolist()
num_feats = df.dtypes[~df.dtypes.index.isin(cat_feats)].index.tolist()

In [36]:
from sklearn.preprocessing import FunctionTransformer
def numFeat(data):
    return data[num_feats]

def catFeat(data):
    return data[cat_feats]

In [37]:
keep_num = FunctionTransformer(numFeat)
keep_cat = FunctionTransformer(catFeat)

In [38]:
#from sklearn.impute import SimpleImputer
from sklearn.preprocessing import Imputer
null_replace_num = Imputer(strategy="mean")
null_replace_cat = Imputer(strategy="most_frequent")

In [39]:
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder()

In [40]:
from sklearn.decomposition import PCA
pca = PCA(n_components=3)

In [41]:
# copy pasted from the article
class ToDenseTransformer():

    # here you define the operation it should perform
    def transform(self, X, y=None, **fit_params):
        return X.todense()

    # just return self
    def fit(self, X, y=None, **fit_params):
        return self

In [42]:
to_dense = ToDenseTransformer()

In [43]:
from sklearn.feature_selection import SelectKBest
k_best = SelectKBest(k=3)

In [44]:
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

base_model = Ridge()

In [45]:
from sklearn.pipeline import Pipeline, FeatureUnion

In [46]:
num_pipeline = Pipeline([
    ("num_feats", keep_num),
    ("impute_num", null_replace_num),
    ("kBest", k_best)
])

cat_pipeline = Pipeline([
    ("cat_feats", keep_cat),
    ("impute_cat", null_replace_cat),
    ("dummies", ohe),
    ("to_dense", to_dense),
    ("pca", pca)
])

all_features = FeatureUnion([
    ('numeric_features', num_pipeline),
    ('categorical_features', cat_pipeline),
])

main_pipeline = Pipeline([
    ('all_features', all_features),
    ('modeling', base_model)
])

In [47]:
model = main_pipeline.fit(df_train, y_train)

ValueError: could not convert string to float: 'Supermarket Type1'

In [230]:
res = model.predict(df_test)

In [231]:
model.score(df_test,y_test)

0.3957982235890033

# Task II

In [208]:
from sklearn.model_selection import GridSearchCV

In [209]:
models_to_fit = [Ridge(), RandomForestRegressor(), GradientBoostingRegressor()]

In [216]:
params = [
    {'all_features__numeric_features__impute_num__strategy': ['mean','median'],
     'all_features__numeric_features__kBest__k': [2,3,'all'],
     'all_features__categorical_features__pca__n_components': [3,4,5],
     'modeling': [Ridge()],
     'modeling__alpha': [0.1,0.3,0.5,0.7]
    },
    {'all_features__numeric_features__impute_num__strategy': ['mean','median'],
     'all_features__numeric_features__kBest__k': [2,3,'all'],
     'all_features__categorical_features__pca__n_components': [3,4,5],
     'modeling': [RandomForestRegressor()],
     'modeling__n_estimators': [10,20,50],
     'modeling__max_depth': [4,6,8],
    },
    {'all_features__numeric_features__impute_num__strategy': ['mean','median'],
     'all_features__numeric_features__kBest__k': [2,3,'all'],
     'all_features__categorical_features__pca__n_components': [3,4,5],
     'modeling': [GradientBoostingRegressor()],
     'modeling__n_estimators': [10,20,50],
     'modeling__max_depth': [4,6,8],
    },
]

In [217]:
tuned_model = GridSearchCV(main_pipeline, params, verbose=2, n_jobs = -1).fit(df_train, y_train)

Fitting 5 folds for each of 396 candidates, totalling 1980 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   11.3s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   43.0s
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 640 tasks      | elapsed:  4.2min
[Parallel(n_jobs=-1)]: Done 1005 tasks      | elapsed:  6.8min
[Parallel(n_jobs=-1)]: Done 1450 tasks      | elapsed:  9.7min
[Parallel(n_jobs=-1)]: Done 1980 out of 1980 | elapsed: 12.8min finished


In [218]:
tuned_model.best_params_

{'all_features__categorical_features__pca__n_components': 3,
 'all_features__numeric_features__impute_num__strategy': 'median',
 'all_features__numeric_features__kBest__k': 3,
 'modeling': RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                       max_depth=6, max_features='auto', max_leaf_nodes=None,
                       max_samples=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=50, n_jobs=None, oob_score=False,
                       random_state=None, verbose=0, warm_start=False),
 'modeling__max_depth': 6,
 'modeling__n_estimators': 50}

In [219]:
print('Final score is: ', tuned_model.score(df_test, y_test))

Final score is:  0.6241741712069144
