# Goals:
-	Build a model to predict the number of sales (Item_Outlet_Sales) using the available features.
-	Gain insights about the number of sales.


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
import category_encoders as ce
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import  RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
%matplotlib inline

# Overview

In [None]:
# Read the training data. A forward slash keeps the path portable across
# operating systems and avoids the invalid "\B" escape in the string literal.
data = pd.read_csv('Data/BigMart.csv')
# Report the number of missing values per column. It is printed explicitly
# because a notebook cell only echoes its final expression.
print(data.isna().sum())
# Drop the exported row-number column
data.drop(columns="Unnamed: 0", inplace=True)


In [None]:
# Histogram of outlet sizes (swap the column for 'Item_Weight' to inspect
# item weights instead).
data['Outlet_Size'].hist()

In [None]:
# Summary statistics of the numeric columns. Printed explicitly: an expression
# in the middle of a cell is evaluated but never displayed.
print(data.describe())
# Keep only the non-numeric columns; 'number' matches every numeric dtype.
df_ = data.select_dtypes(exclude='number')
for col in df_.columns:
    # Distinct labels only; use df_[col].value_counts() for per-label counts.
    print(df_[col].unique())

## Data cleaning

In [None]:
# Collapse the inconsistent spellings of the fat-content labels onto the two
# canonical values.
fat_content_aliases = {
    'LF': 'Low Fat',
    'low fat': 'Low Fat',
    'reg': 'Regular',
}
data['Item_Fat_Content'] = data['Item_Fat_Content'].replace(fat_content_aliases)

In [6]:
# Impute missing item weights with the column mean. The result is assigned
# back to the column: calling fillna(inplace=True) on the attribute-accessed
# Series is chained assignment and does not update `data` under pandas
# Copy-on-Write.
data['Item_Weight'] = data['Item_Weight'].fillna(data['Item_Weight'].mean())

# During EDA in R, NAs in Outlet_Size were found for Grocery Store and
# Supermarket Type1 outlets. Assume grocery stores are small and fill
# Supermarket Type1 outlets with 'Medium'.
# Inspection results are printed because a cell only echoes its last expression.
print(data[['Outlet_Type', 'Outlet_Size']].drop_duplicates())  # combinations before imputation
data.loc[data['Outlet_Type'].eq('Grocery Store') & data['Outlet_Size'].isna(), 'Outlet_Size'] = 'Small'
print(data['Outlet_Size'].mode())  # most frequent outlet size after the grocery fill
data.loc[data['Outlet_Type'].eq('Supermarket Type1') & data['Outlet_Size'].isna(), 'Outlet_Size'] = 'Medium'
print(data[['Outlet_Type', 'Outlet_Size']].drop_duplicates())  # combinations after imputation
print(data[['Item_Type']].drop_duplicates())  # distinct item types
# Store the location type as strings
data['Outlet_Location_Type'] = data['Outlet_Location_Type'].astype('str')
# Save the cleaned data
data.to_csv("Data/cleaned.csv", index=False)


In [None]:
# Columns holding categorical labels that are expanded into indicator columns
categorical_cols = [
    'Item_Fat_Content',
    'Item_Type',
    'Outlet_Identifier',
    'Outlet_Size',
    'Outlet_Location_Type',
    'Outlet_Type',
]
# use_cat_names=True labels each new column with the category name
Encoder = ce.OneHotEncoder(cols=categorical_cols, use_cat_names=True)
# Fit the encoder and replace `data` with the encoded frame
data = Encoder.fit_transform(data)

In [None]:
# Feature-engineering ideas (currently disabled): group non-edible item types and merge Supermarket Type2 into Type1

# data.loc[data['Item_Type'].isin(['Household', 'Health and Hygiene']), 'Item_Type'] = 'Noneedible'
# data.loc[data['Outlet_Type'].eq('Supermarket Type2'), 'Outlet_Type'] = 'Supermarket Type1'

In [None]:
# Count missing values per column. Printed explicitly, since only the last
# expression of a cell is echoed.
print(data.isna().sum())
# Summary statistics, echoed as the cell output
data.describe()



In [None]:
# Continuous features to standardise
num_cols = ['Item_Weight', 'Item_Visibility', 'Item_MRP']

# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split, so test rows contribute to the scaling statistics.
scaler = StandardScaler()
scaler.fit(data[num_cols])

# Overwrite the raw columns with their standardised values
data[num_cols] = scaler.transform(data[num_cols])


In [None]:
# Target column and predictor matrix (the item identifier is not a feature)
train_Y = data['Item_Outlet_Sales']
train_X = data.drop(columns=['Item_Identifier', 'Item_Outlet_Sales'])

# Hold out 20% of the rows; the fixed seed keeps the split repeatable
train_x, test_x, train_y, test_y = train_test_split(
    train_X,
    train_Y,
    test_size=0.2,
    random_state=0,
)

# Shapes of the four pieces, echoed as the cell output
tuple(part.shape for part in (train_x, test_x, train_y, test_y))

In [None]:
# Learning curve for plain linear regression
from sklearn.linear_model import RidgeCV
from yellowbrick.model_selection import LearningCurve

# Score the linear model by R^2 inside the visualizer
model = LinearRegression()
visualizer = LearningCurve(model, scoring='r2')
visualizer.fit(train_X, train_Y)

# Finalise the figure and write it to the reports folder
visualizer.show(outpath='Reports/lmlearningcurve.png')




In [None]:
from yellowbrick.model_selection import ValidationCurve

# Tree depths 1..10 to evaluate for the random forest
depth_grid = np.arange(1, 11)

viz = ValidationCurve(
    RandomForestRegressor(),
    param_name="max_depth",
    param_range=depth_grid,
    cv=10,
    scoring="r2",
)

# Fit the visualizer, then write the figure to the reports folder
viz.fit(train_X, train_Y)
viz.show(outpath='Reports/rf_maxdepth.png')

In [None]:

from yellowbrick.model_selection import LearningCurve
from sklearn import linear_model

# Learning curve (scored by R^2) for a Tweedie GLM with a log link
glm_settings = {'link': 'log', 'max_iter': 1000}
model = linear_model.TweedieRegressor(**glm_settings)
visualizer = LearningCurve(model, scoring='r2')

# Fit the visualizer on the full feature matrix; the figure is shown in a later cell
visualizer.fit(train_X, train_Y)


In [None]:
# Side-by-side table of observed and predicted sales on the held-out split.
# The names `y_test` and `predi` are not defined anywhere in the visible
# notebook, so the held-out target `test_y` is used and the predictions are
# produced here by fitting the current `model` on the training split.
# NOTE(review): assumes `model` is the estimator created in the preceding cell - confirm.
predi = model.fit(train_x, train_y).predict(test_x)
error2 = pd.DataFrame({'Actual-Values': np.array(test_y).flatten(), 'Predicted-Values': predi.flatten()})
error2.head(10)

In [None]:
# Finalise the most recent learning-curve figure and write it to the reports folder
glm_curve_path = 'Reports/glmLearningCurve.png'
visualizer.show(outpath=glm_curve_path)

In [None]:
# Linear regression on the full feature set.
# NOTE(review): fitted without an intercept - presumably because the complete
# set of one-hot columns already spans a constant term; confirm.
model_LR = LinearRegression(fit_intercept=False)

# Fit on the training split
model_LR.fit(train_x, train_y)

# Predictions for both splits
predict_train = model_LR.predict(train_x)
predict_test  = model_LR.predict(test_x)

# mean_squared_error returns the MSE; take the square root so the printed
# value really is the RMSE named in the label.
print('RMSE on train data: ', mean_squared_error(train_y, predict_train) ** 0.5)
print('RMSE on test data: ',  mean_squared_error(test_y, predict_test) ** 0.5)
print('MAE on train data: ', mean_absolute_error(train_y, predict_train))
print('MAE on test data: ',  mean_absolute_error(test_y, predict_test))
print('r2_score on train data: ', r2_score(train_y, predict_train))
print('r2_score on test data: ',  r2_score(test_y, predict_test))

# Observed vs. predicted sales: training split on the left, test split on the right
plt.figure(figsize=(18, 6))
plt.subplot(1, 2, 1)
plt.scatter(train_y, predict_train)
plt.subplot(1, 2, 2)
plt.scatter(test_y, predict_test)

In [None]:
# Pair every feature name seen during fitting with its fitted coefficient
d = dict(
    featurename=model_LR.feature_names_in_,
    featurecoef=model_LR.coef_,
)

# Echo the table as the cell output
pd.DataFrame.from_dict(d)


In [None]:
# Write observed vs. predicted sales for the test split to disk
lm_test_results = pd.DataFrame({'obs': test_y, 'pred': predict_test})
lm_test_results.to_csv('Data/lmtestrs.csv', index=False)

In [None]:
# Random forest on the full feature set, with tree depth capped at 10
model_RFR = RandomForestRegressor(max_depth=10)

# Fit on the training split
model_RFR.fit(train_x, train_y)

# Predictions for both splits
predict_train = model_RFR.predict(train_x)
predict_test = model_RFR.predict(test_x)

# mean_squared_error returns the MSE; take the square root so the printed
# value really is the RMSE named in the label.
print('RMSE on train data: ', mean_squared_error(train_y, predict_train) ** 0.5)
print('RMSE on test data: ',  mean_squared_error(test_y, predict_test) ** 0.5)
print('MAE on train data: ', mean_absolute_error(train_y, predict_train))
print('MAE on test data: ',  mean_absolute_error(test_y, predict_test))
print('r2_score on train data: ', r2_score(train_y, predict_train))
print('r2_score on test data: ',  r2_score(test_y, predict_test))

# RF is better than LR so far, but it is overfitting.

# Try a lasso regression to drop some features?

In [None]:
from sklearn import linear_model

# Lasso settings: strong penalty, non-negative coefficients, no intercept
lasso_settings = dict(
    alpha=100,
    positive=True,
    fit_intercept=False,
    max_iter=1000,
    tol=0.0001,
)
regressor = linear_model.Lasso(**lasso_settings)

# Fit on the training split; the fitted estimator is echoed as the cell output
regressor.fit(train_x, train_y)

In [None]:
import eli5

# Label the lasso weights with the training column names
lasso_feature_names = list(train_x.columns)
eli5.show_weights(regressor, top=-1, feature_names=lasso_feature_names)


## Three features lead the way? 
Not sure if it's going to work

In [None]:
# Echo the current training feature matrix for a quick visual check
train_x

In [None]:
# Restrict the predictors to a hand-picked subset of columns
selected_features = [
    'Item_MRP',
    'Outlet_Identifier_OUT027',
    'Outlet_Type_Supermarket Type1',
    'Outlet_Size_Medium',
]
train_X = data[selected_features]
train_Y = data['Item_Outlet_Sales']

# Same 80/20 split and seed as the full-feature experiment
train_x, test_x, train_y, test_y = train_test_split(
    train_X,
    train_Y,
    test_size=0.2,
    random_state=0,
)

# Shapes of the four pieces, echoed as the cell output
tuple(part.shape for part in (train_x, test_x, train_y, test_y))

In [None]:

# Linear regression (with intercept) on the current training split
model_LR = LinearRegression()

# Fit on the training split
model_LR.fit(train_x, train_y)

# Predictions for both splits
predict_train = model_LR.predict(train_x)
predict_test  = model_LR.predict(test_x)

# mean_squared_error returns the MSE; take the square root so the printed
# value really is the RMSE named in the label.
print('RMSE on train data: ', mean_squared_error(train_y, predict_train) ** 0.5)
print('RMSE on test data: ',  mean_squared_error(test_y, predict_test) ** 0.5)
print('MAE on train data: ', mean_absolute_error(train_y, predict_train))
print('MAE on test data: ',  mean_absolute_error(test_y, predict_test))
print('r2_score on train data: ', r2_score(train_y, predict_train))
print('r2_score on test data: ',  r2_score(test_y, predict_test))

In [None]:
# Random forest (depth capped at 10) on the current training split
model_RFR = RandomForestRegressor(max_depth=10)
model_RFR.fit(train_x, train_y)

# Predictions for both splits
predict_train = model_RFR.predict(train_x)
predict_test = model_RFR.predict(test_x)

# Report every metric for the train split first, then the test split
evaluation_splits = (
    ('train', train_y, predict_train),
    ('test', test_y, predict_test),
)
evaluation_metrics = (
    # RMSE is the square root of the mean squared error
    ('RMSE', lambda observed, predicted: mean_squared_error(observed, predicted)**(0.5)),
    ('MAE', mean_absolute_error),
    ('r2_score', r2_score),
)
for metric_name, metric_fn in evaluation_metrics:
    for split_name, observed, predicted in evaluation_splits:
        print(f'{metric_name} on {split_name} data: ', metric_fn(observed, predicted))

### Not doing great
Need a pipeline and model selection


In [None]:
# Display the processed DataFrame (nothing is written to disk in this cell;
# the cleaned CSV is saved earlier via data.to_csv)
data