In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import category_encoders as ce
from sklearn import linear_model
from sklearn.decomposition import PCA
from sklearn.ensemble import  RandomForestRegressor
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import Ridge
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Normalizer, StandardScaler, MinMaxScaler, PowerTransformer, MaxAbsScaler
%matplotlib inline

In [None]:
# Load the cleaned dataset from Data/cleaned.csv and display it
data = pd.read_csv('Data/cleaned.csv')
data


In [None]:
# Cast Outlet_Location_Type to string before the encoding step
# (presumably so it is handled as a categorical label — TODO confirm its raw dtype)
data.Outlet_Location_Type = data.Outlet_Location_Type.astype('str')
# List the names of the integer/float columns
data.select_dtypes(['int', 'float']).columns

In [None]:
# Columns holding categorical labels; each one is expanded into indicator columns
categorical_cols = [
    'Item_Fat_Content',
    'Item_Type',
    'Outlet_Identifier',
    'Outlet_Size',
    'Outlet_Location_Type',
    'Outlet_Type',
]
# One-hot encoder limited to those columns; use_cat_names=True puts the category
# value into each generated column name
Encoder = ce.OneHotEncoder(cols=categorical_cols, use_cat_names=True)
# Fit the encoder and replace the frame with its encoded version
data = Encoder.fit_transform(data)

In [None]:
# Numeric feature columns that get standardized by the scaler
num_cols = ['Item_Weight','Item_Visibility','Item_MRP']

In [None]:
# create the StandardScaler
scaler = StandardScaler()
# standardize every column listed in num_cols, overwriting the originals in place
# NOTE(review): the scaler is fit on the full dataset before the train/test split is made,
# so held-out rows contribute to the scaling statistics — consider fitting on the training split only
data[num_cols] = scaler.fit_transform(data[num_cols])

In [None]:
# Preview the first rows after encoding and scaling
data.head()

In [None]:
# helper that selects the predictor/target columns and makes the train/test split
def split_traintest(df, dropvals, yvals = 'Item_Outlet_Sales', test_size = 0.2, random_state = 42):
    """Split ``df`` into train and test predictors and targets.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both the predictors and the target column(s).
    dropvals : list of str
        Columns removed from ``df`` to form the predictors (normally the
        identifier columns plus the target itself).
    yvals : str or list of str, default 'Item_Outlet_Sales'
        Column(s) of ``df`` used as the target.
    test_size : float, default 0.2
        Forwarded to ``train_test_split``.
    random_state : int, default 42
        Forwarded to ``train_test_split`` so the split is reproducible.

    Returns
    -------
    tuple
        ``(train_x, test_x, train_y, test_y)``.
    """
    # separate the predictors and target variable
    X = df.drop(columns=dropvals)
    # read the target from the frame that was passed in; the previous version read the
    # notebook-level `data`, which silently ignored `df` for the target
    Y = df[yvals]

    # randomly split the data
    train_x, test_x, train_y, test_y = train_test_split(
        X, Y, test_size=test_size, random_state=random_state)
    return train_x, test_x, train_y, test_y

# Make the train/test split, dropping the identifier and the target from the predictors
train_x, test_x, train_y, test_y = split_traintest(data, ['Item_Identifier','Item_Outlet_Sales'])

In [None]:
# report the dimensions of each piece of the split, in train_x, test_x, train_y, test_y order
split_shapes = [part.shape for part in (train_x, test_x, train_y, test_y)]
print(*split_shapes)

In [10]:
# baselines: cross-validated R^2 for a handful of untuned regressors
num_folds = 10
seed = 7

# (label, estimator) pairs to compare
models = [
    ('LM', linear_model.LinearRegression()),
    ('L1', linear_model.Lasso()),
    ('L2', linear_model.Ridge()),
    ('BayesRidge', linear_model.BayesianRidge()),
    ('Tweedie', linear_model.TweedieRegressor(link='log', max_iter=5000)),
    ('RF', RandomForestRegressor(max_depth=10, random_state=0)),
]

# the fold splitter does not depend on the model, so it is built once
kfold = KFold(n_splits=num_folds)

# per-model score arrays and their labels, kept for the comparison plot
results = []
names = []
# inspired https://www.kaggle.com/richarde/easy-pipeline-and-model-selection#2.0-Process-the-Data
for name, model in models:
    cv_results = cross_val_score(model, train_x, train_y, cv=kfold, scoring='r2')
    names.append(name)
    results.append(cv_results)
    # label, mean score, standard deviation across folds
    print("%s %f %f " % (name, cv_results.mean(), cv_results.std()))

In [None]:
# compare algorithms: one box per entry of `results`, labelled with `names`
fig, ax = plt.subplots()
fig.suptitle('Comparison')
ax.boxplot(results)
ax.set_xticklabels(names)
plt.show();

# Tune GLM

In [None]:
# Fit a Tweedie GLM once per link function and compare fit quality and coefficients
links = ['auto', 'identity', 'log']
# one sorted coefficient Series per link, collected for the summary table
coefs = []
for link in links:
    model_glm = linear_model.TweedieRegressor(link=link, max_iter=5000)
    # fit the model with the training data
    model_glm.fit(train_x, train_y)
    # predict the target on train and test data
    predict_train = model_glm.predict(train_x)
    predict_test = model_glm.predict(test_x)
    # name the Series after its link so the rows of pd.DataFrame(coefs) can be told apart
    coef = pd.Series(model_glm.coef_, model_glm.feature_names_in_, name=link).sort_values()
    coefs.append(coef)

    print('Link function: ', link)
    print('r2_score on train data: ', r2_score(train_y, predict_train))
    print('r2_score on test data: ',  r2_score(test_y, predict_test))

    # actual vs predicted on both splits, plus the coefficients above 0.1
    fig, (ax_train, ax_test, ax_coef) = plt.subplots(1, 3, figsize=(18, 6))
    ax_train.scatter(train_y, predict_train)
    ax_train.set_title('Link function ' + str(link) + '(Train data)')
    ax_test.scatter(test_y, predict_test)
    ax_test.set_title('Link function ' + str(link) + '(Test data)')
    coef[coef > 0.1].plot(kind='bar', ax=ax_coef, title='Model Coefficients')

    plt.show()


In [None]:
# Inspect the container type of coefs (filled with one coefficient Series per link in the GLM loop)
type(coefs)

In [None]:
# Tabulate the collected coefficient Series: one row per Series, one column per feature
pd.DataFrame(coefs)


# Outlet type seems to be a big factor in the GLM model
TODO: find a way to convert type 1 outlets to type 3?

# Tune RF

In [None]:
# Grid-search RandomForestRegressor hyper-parameters with 5-fold cross-validation
param_grid = {'bootstrap': [True],
              'max_depth': [5, 10, None],
              # 1.0 = consider every feature at each split; this is what 'auto' meant for
              # regressors before that option was removed from scikit-learn
              'max_features': [1.0, 'log2'],
              'n_estimators': [5, 6, 7, 8, 9, 10, 11, 12, 13, 15]
              }

# a fixed random_state makes the search (and therefore best_params_) reproducible
g_search = GridSearchCV(estimator = RandomForestRegressor(random_state=42), param_grid = param_grid,
                        cv = 5, n_jobs = 1, verbose = 0,
                        return_train_score=True)
g_search.fit(train_x, train_y);

print(g_search.best_params_)

In [None]:
# Summarise the grid search: one row per parameter combination with its
# mean cross-validated test score and its rank, then save the table to CSV
cv_res = g_search.cv_results_
df_grid = pd.DataFrame(cv_res["params"]).assign(
    mean_test_score=cv_res["mean_test_score"],
    rank_test_score=cv_res["rank_test_score"],
)
df_grid.to_csv("Data/rf_gridsearch.csv", index=False)


In [None]:
# Show the best hyper-parameter combination found by the grid search
print(g_search.best_params_)

In [None]:

# Refit a random forest with fixed hyper-parameters and score it on both splits
# NOTE(review): the values are presumably copied from an earlier grid-search run —
# confirm against g_search.best_params_
# max_features=1.0 considers every feature at each split, which is what 'auto' meant for
# regressors before that option was removed from scikit-learn
model_rf = RandomForestRegressor(bootstrap=True, max_depth=5, max_features=1.0,
                                 n_estimators=13, random_state=42)
# fit the model with the training data
model_rf.fit(train_x, train_y)
# predict the target on train and test data
predict_train = model_rf.predict(train_x)
predict_test = model_rf.predict(test_x)

print('r2_score on train data: ', r2_score(train_y, predict_train))
print('r2_score on test data: ',  r2_score(test_y, predict_test))


In [None]:
# Importance of each feature in the fitted forest, sorted ascending
feature_impt = pd.Series(
    data=model_rf.feature_importances_,
    index=model_rf.feature_names_in_,
    name='value',
).sort_values()
# Bar chart restricted to features whose importance exceeds 0.01
feature_impt.loc[feature_impt > 0.01].plot(kind='bar')


In [None]:
# Save every feature with a strictly positive importance to CSV
nonzero_impt = feature_impt[feature_impt > 0]
nonzero_impt.to_frame().to_csv('Data/rf_featureimpt.csv')

In [None]:
# Re-create the train/test split so the PCA experiments below start from a fresh split
train_x, test_x, train_y, test_y = split_traintest(data, ['Item_Identifier','Item_Outlet_Sales'])

In [None]:
# Manual scale -> PCA -> ridge fit.
# The transformed predictors get their own names: overwriting train_x here made the
# pipeline below train on already-projected data while being scored on raw test_x,
# and made this cell give a different result each time it was re-run.
scaler = StandardScaler()
pca = PCA()
ridge = Ridge()
train_x_scaled = scaler.fit_transform(train_x)
train_x_pca = pca.fit_transform(train_x_scaled)
ridge.fit(train_x_pca, train_y)

# The same preprocessing expressed as a pipeline, with a Tweedie GLM as the final step;
# it is meant to be fit on the untransformed train_x
pipe = Pipeline([
        ('scaler', StandardScaler()),
        ('reduce_dim', PCA()),
        ('regressor', linear_model.TweedieRegressor(max_iter=3000))
        ])

In [None]:
# Fit the pipeline on train_x/train_y, then print its score on test_x/test_y
pipe = pipe.fit(train_x, train_y)
print('Testing score: ', pipe.score(test_x, test_y))

In [None]:
# Sweep the ElasticNet regularisation strength and record the score on each split.
# A dedicated split is made here so that the train and test predictors are guaranteed to be
# in the same feature space, whatever earlier cells did to train_x.
enet_train_x, enet_test_x, enet_train_y, enet_test_y = split_traintest(
    data, ['Item_Identifier', 'Item_Outlet_Sales'])

alphas = np.logspace(-5, 1, 10)
enet = linear_model.ElasticNet(l1_ratio=0.7, max_iter=10000)
# NOTE: despite their names these lists hold enet.score() values (R^2, higher is better);
# the names are kept because the plotting cell reads them
train_errors = list()
test_errors = list()
for alpha in alphas:
    enet.set_params(alpha=alpha)
    enet.fit(enet_train_x, enet_train_y)
    train_errors.append(enet.score(enet_train_x, enet_train_y))
    test_errors.append(enet.score(enet_test_x, enet_test_y))

# pick the alpha with the highest score on the test split
i_alpha_optim = np.argmax(test_errors)
alpha_optim = alphas[i_alpha_optim]
print("Optimal regularization parameter : %s" % alpha_optim)

# Estimate the coef_ on full data with optimal regularization parameter.
# The full data is rebuilt from the two splits: the X / Y names used previously only exist
# inside split_traintest, so referencing them here raised a NameError.
enet.set_params(alpha=alpha_optim)
full_x = pd.concat([enet_train_x, enet_test_x])
full_y = pd.concat([enet_train_y, enet_test_y])
coef_ = enet.fit(full_x, full_y).coef_

In [None]:
# Two stacked panels: scores against alpha on top, fitted coefficients below
fig, (ax_score, ax_coef) = plt.subplots(2, 1)

# Top panel: train and test score for every alpha, on a log-scaled x axis
ax_score.semilogx(alphas, train_errors, label="Train")
ax_score.semilogx(alphas, test_errors, label="Test")
# vertical marker at the selected alpha, from the current lower y-limit up to the best test score
ax_score.vlines(
    alpha_optim,
    ax_score.get_ylim()[0],
    np.max(test_errors),
    color="k",
    linewidth=3,
    label="Optimum on test",
)
ax_score.legend(loc="lower left")
ax_score.set_ylim([0, 1.2])
ax_score.set_xlabel("Regularization parameter")
ax_score.set_ylabel("Performance")

# Bottom panel: coefficients of the model refitted at the selected alpha
ax_coef.plot(coef_, label="Estimated coef")
ax_coef.legend()
fig.subplots_adjust(0.09, 0.04, 0.94, 0.94, 0.26, 0.26)
plt.show()