**Steps of Sales Prediction Problem**

* **Hypothesis Generation** – understanding the problem better by brainstorming possible factors that can impact the outcome
* **Data Exploration** – looking at categorical and continuous feature summaries and making inferences about the data.
* **Data Cleaning** – imputing missing values in the data and checking for outliers
* **Feature Engineering** – modifying existing variables and creating new ones for analysis
* **Model Building** – making predictive models on the data





In [None]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import files
from google.colab import drive
#readfiles
#


In [None]:
# Mount Google Drive at /content/drive so that files stored in the Drive
# can be opened by path from this Colab session.
drive.mount('/content/drive')

In [None]:
# Read the train and test CSVs from the mounted Drive folder.
# (files.upload() is the alternative when the data is not stored on Drive.)
train_csv = "/content/drive/My Drive/Colab Notebooks/data-big-mart/Train.csv"
test_csv = "/content/drive/My Drive/Colab Notebooks/data-big-mart/Test.csv"
train = pd.read_csv(train_csv)
test = pd.read_csv(test_csv)

In [None]:
# Dimensions of each split, followed by five randomly sampled rows from each
train_dims, test_dims = train.shape, test.shape
print(train_dims, test_dims)
train_rows, test_rows = train.sample(5), test.sample(5)
print(train_rows, test_rows)

In [None]:
# Tag every row with the split it came from, then stack both splits into one
# frame so that cleaning and feature engineering are applied uniformly.
for frame, origin in ((train, 'train'), (test, 'test')):
    frame['source'] = origin

data = pd.concat([train, test], sort=False, ignore_index=True)
print(train.shape, test.shape, data.shape)

In [None]:
# Number of missing values in every column, largest first
data.isna().sum().sort_values(ascending=False)

`Item_Outlet_Sales` is our target variable; its missing values all come from the test dataset.

In [None]:
# Count the distinct values in every column (a missing value counts as one value)
data.nunique(dropna=False)

In [None]:
# Summary statistics of the combined frame (pandas' default describe() output)
data.describe()

In [None]:
# Percentage of missing values per column, largest first
missing_values_count = (data.isnull().mean() * 100).sort_values(ascending=False)
top_missing = missing_values_count[:10]

plt.figure(figsize=(15,10))
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally
ax = sns.barplot(x=top_missing.index.values, y=top_missing.values, palette="hls")
# Labels are set on the returned Axes so they apply to the plot that was just drawn
ax.set_xlabel('Features', fontsize=15)
ax.set_ylabel('No of missing values in %ge', fontsize=15)
ax.set_title('Top 10 Variables with missing data', fontsize=15)



So we can ignore `Item_Outlet_Sales`, since its missing values come from the test set. We should impute the missing values in the `Outlet_Size` and `Item_Weight` features.

In [None]:
# Treating missing values:
# fill the missing item weights with the mean of the observed weights
# (the mean is taken over the combined train + test rows).
observed_weight_mean = data['Item_Weight'].mean()
data['Item_Weight'] = data['Item_Weight'].fillna(observed_weight_mean)


In [None]:
# Outlet size: fill the missing values with the most frequent size
most_common_size = data['Outlet_Size'].mode()[0]
data['Outlet_Size'] = data['Outlet_Size'].fillna(most_common_size)

# Verify both imputed columns; each count is expected to be 0 at this point
print (data['Item_Weight'].isnull().sum())
print (data['Outlet_Size'].isnull().sum())



In [None]:
# Collect the object-dtype (categorical) columns, leaving out the two
# identifier columns and the train/test marker column.
excluded_columns = ['Item_Identifier', 'Outlet_Identifier', 'source']
object_columns = data.dtypes[data.dtypes == "object"].index
categorical_feats = [name for name in object_columns if name not in excluded_columns]
print (categorical_feats)

In [None]:
# Show how often each category occurs, one categorical feature at a time
for feature in categorical_feats:
    frequencies = data[feature].value_counts()
    print ("\n Frequency of Categories for variable: %s" % feature)
    print (frequencies)

In [None]:
# A visibility of exactly 0 is treated as a missing value and replaced with the
# mean visibility. The mean is taken over the non-zero rows only, so the
# placeholder zeros being replaced do not pull the imputed value down.
zero_visibility = data['Item_Visibility'] == 0
visibility_mean = data.loc[~zero_visibility, 'Item_Visibility'].mean()
data.loc[zero_visibility, 'Item_Visibility'] = visibility_mean
print ('0 Values after modification %s' % (data['Item_Visibility'] == 0).sum())


In [None]:
# Derive a broad item category from the first two characters of the item ID
item_prefix = data['Item_Identifier'].str.slice(0, 2)
prefix_to_category = {
    'FD': 'Food',
    'NC': 'Non-Consumable',
    'DR': 'Drinks',
}
data['Item_Type_Combined'] = item_prefix.map(prefix_to_category)
data['Item_Type_Combined'].value_counts()

In [None]:
# Outlet age in years, measured against the fixed reference year 2013
reference_year = 2013
data['Outlet_Years'] = reference_year - data['Outlet_Establishment_Year']
data['Outlet_Years'].describe()


In [None]:
# Collapse the inconsistent spellings of the fat-content labels
fat_content_aliases = {
    'LF': 'Low Fat',
    'reg': 'Regular',
    'low fat': 'Low Fat',
}

print ('Original Categories:')
print (data['Item_Fat_Content'].value_counts())

data['Item_Fat_Content'] = data['Item_Fat_Content'].replace(fat_content_aliases)

print ('\nModified Categories:')
print (data['Item_Fat_Content'].value_counts())

In [None]:
# Give non-consumable items their own fat-content category
non_consumable = data['Item_Type_Combined'] == "Non-Consumable"
data.loc[non_consumable, 'Item_Fat_Content'] = "Non-Edible"
data['Item_Fat_Content'].value_counts()

In [None]:
#Step 6: numerical and one-hot encoding
# First turn every categorical column into integer codes with sklearn's LabelEncoder.
from sklearn.preprocessing import LabelEncoder

# fit_transform() refits the encoder on each call, so one instance serves all columns
le = LabelEncoder()

# New integer-coded column derived from the outlet identifier
data['Outlet'] = le.fit_transform(data['Outlet_Identifier'])

var_mod = [
    'Item_Fat_Content',
    'Outlet_Location_Type',
    'Outlet_Size',
    'Item_Type_Combined',
    'Outlet_Type',
    'Outlet',
]
for column in var_mod:
    data[column] = le.fit_transform(data[column])

In [None]:
# One-hot encoding: every listed column is replaced by indicator columns
# named <column>_<code>.
encoded_columns = [
    'Item_Fat_Content',
    'Outlet_Location_Type',
    'Outlet_Size',
    'Item_Type_Combined',
    'Outlet_Type',
    'Outlet',
]
data = pd.get_dummies(data, columns=encoded_columns)

data.dtypes


In [None]:
# Preview the first ten rows of the indicator columns created for Item_Fat_Content
data[['Item_Fat_Content_0','Item_Fat_Content_1','Item_Fat_Content_2']].head(10)


In [None]:
#Step 7: Convert data back into train and test set
# Drop the raw columns that have been replaced by engineered features
data.drop(['Item_Type','Outlet_Establishment_Year'],axis=1,inplace=True)

# Split by the marker column. drop() without inplace returns new frames, so
# train/test are independent of `data` and later column assignments on them
# do not raise SettingWithCopyWarning.
train = data.loc[data['source'] == 'train'].drop(columns=['source'])
# The test rows carry no real target values, so that column is removed as well
test = data.loc[data['source'] == 'test'].drop(columns=['Item_Outlet_Sales', 'source'])

# Export the modified versions
train.to_csv("train_modified.csv",index=False)
test.to_csv("test_modified.csv",index=False)

In [None]:
#Model Building
# Start with a baseline that needs no predictive model: an informed guess that
# predicts the overall average sales for every row.

mean_sales = train['Item_Outlet_Sales'].mean()

# Submission frame holding the ID columns; .copy() makes it independent of
# `test`, so adding a column below does not raise SettingWithCopyWarning.
base1 = test[['Item_Identifier','Outlet_Identifier']].copy()
base1['Item_Outlet_Sales'] = mean_sales




In [None]:
# Setup for a generic helper that takes an algorithm and the data, fits the
# model, performs cross-validation and generates a submission file.
# Define the target column and the ID columns.

# Name of the column to predict
target = 'Item_Outlet_Sales'
# Identifier columns: copied into submissions, excluded from the predictors
IDcol = ['Item_Identifier','Outlet_Identifier']

from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score



def modelfit(alg, dtrain, dtest, predictors, target, IDcol, filename, cv=20):
    """Fit ``alg``, report training and cross-validated RMSE, and write a submission CSV.

    Parameters
    ----------
    alg : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.
    dtrain : pandas.DataFrame
        Training rows; must contain ``predictors`` and ``target``.
    dtest : pandas.DataFrame
        Rows to predict; must contain ``predictors`` and the ``IDcol`` columns.
        Side effect: the predictions are stored in ``dtest[target]``.
    predictors : list of str
        Feature column names.
    target : str
        Name of the column to predict.
    IDcol : list of str
        Identifier columns copied into the submission. The list is not modified.
    filename : str
        Path of the CSV file to write.
    cv : int, default 20
        Number of cross-validation folds.
    """
    X_train = dtrain[predictors]
    y_train = dtrain[target]

    # Fit on the full training data and predict it back for the training RMSE
    alg.fit(X_train, y_train)
    dtrain_predictions = alg.predict(X_train)

    # The scorer yields negated MSE per fold; convert each fold to an RMSE
    cv_score = cross_val_score(alg, X_train, y_train, cv=cv, scoring='neg_mean_squared_error')
    cv_score = np.sqrt(np.abs(cv_score))

    print ("\nModel Report:")
    print ("RMSE: %.4g"%np.sqrt(metrics.mean_squared_error(y_train.values,dtrain_predictions)))
    print ("CV Score: Mean - {0} | Std - {1} | Min - {2} | Max - {3}".format(np.mean(cv_score),np.std(cv_score),np.min(cv_score),np.max(cv_score)))

    # Predict on the test data
    dtest[target] = alg.predict(dtest[predictors])

    # Build the column list locally instead of appending to the caller's IDcol,
    # which used to grow by one entry on every call. dict.fromkeys drops
    # duplicates (e.g. an IDcol that already contains the target) in order.
    submission_columns = list(dict.fromkeys([*IDcol, target]))
    submission = pd.DataFrame({name: dtest[name] for name in submission_columns})
    submission.to_csv(filename, index=False)
  
  
  
#linear_reg = LinearRegression()
#linear_reg.fit(X_train,y)
#mean_sq_er = cross_val_score(linear_reg,X_train,y,scoring='neg_mean_squared_error',cv=5)
#rmse_lin = rmse_cross_val(linear_reg)
#print("Mean Squared Error",mean_sq_er.mean())
#print("Root Mean Squared Error",rmse_lin.mean())
  
  

**Linear Regression Model**

In [None]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
predictors = [x for x in train.columns if x not in [target]+IDcol]

# The `normalize` argument was deprecated in scikit-learn 1.0 and removed in
# 1.2, where passing it raises TypeError. Ordinary least-squares predictions do
# not depend on feature scaling, so the plain estimator is used instead.
alg1 = LinearRegression()
modelfit(alg1, train, test, predictors, target, IDcol,'1-lr.csv' )

# Coefficients per predictor, sorted from most negative to most positive
coef1 = pd.Series(alg1.coef_, predictors).sort_values()
plt.figure(figsize=(15,10))
coef1.plot(kind='bar', title="Linear Model Coefficients")


In [None]:
#Ridge Regression Model:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

predictors = [x for x in train.columns if x not in [target]+IDcol]

# Ridge(normalize=True) was deprecated in scikit-learn 1.0 and removed in 1.2.
# The replacement given in the deprecation notice is a StandardScaler pipeline
# with alpha multiplied by the number of samples.
ridge_alpha = 0.05
alg2 = make_pipeline(
    StandardScaler(with_mean=False),
    Ridge(alpha=ridge_alpha * len(train)),
)
modelfit(alg2, train, test, predictors, target, IDcol,'2-ridge.csv' )

# The pipeline's coefficients refer to the scaled features; dividing by the
# scaler's per-feature scale expresses them in the original feature units.
fitted_scaler = alg2.named_steps['standardscaler']
fitted_ridge = alg2.named_steps['ridge']
coef2 = pd.Series(fitted_ridge.coef_ / fitted_scaler.scale_, predictors).sort_values()
plt.figure(figsize=(15,10))
coef2.plot(kind='bar', title="Ridge Model Coefficients")

In [None]:
#Decision Tree Model
from sklearn.tree import DecisionTreeRegressor

# Every column except the target and the identifiers is a predictor
excluded_columns = [target] + IDcol
predictors = [column for column in train.columns if column not in excluded_columns]

alg3 = DecisionTreeRegressor(min_samples_leaf=100, max_depth=15)
modelfit(alg3, train, test, predictors, target, IDcol, '3-decision-tree.csv')

# Feature importances, most important first
coef3 = pd.Series(alg3.feature_importances_, index=predictors).sort_values(ascending=False)
plt.figure(figsize=(15, 10))
coef3.plot.bar(title="Decision Tree Feature Importances")


Here you can see that the RMSE is 1058 and the mean CV error is 1091. This tells us that the model is slightly overfitting. Let's try making a decision tree with just the top 4 variables, a max_depth of 8 and a min_samples_leaf of 150.



In [None]:
# Smaller, shallower tree restricted to a hand-picked set of predictors
predictors = [
    'Item_MRP',
    'Outlet_Type_0',
    'Outlet_Type_3',
    'Outlet_5',
    'Outlet_Years',
]
alg4 = DecisionTreeRegressor(min_samples_leaf=150, max_depth=8)
modelfit(alg4, train, test, predictors, target, IDcol, '4-decision-tree.csv')

# Feature importances, most important first
coef4 = pd.Series(alg4.feature_importances_, index=predictors).sort_values(ascending=False)
plt.figure(figsize=(15, 10))
coef4.plot.bar(title="Decision Tree Feature Importances")



In [None]:
#Random Forest Model
from sklearn.ensemble import RandomForestRegressor

# Every column except the target and the identifiers is a predictor
excluded_columns = [target] + IDcol
predictors = [column for column in train.columns if column not in excluded_columns]

alg5 = RandomForestRegressor(
    n_estimators=200,
    max_depth=5,
    min_samples_leaf=100,
    n_jobs=4,
)
modelfit(alg5, train, test, predictors, target, IDcol, '5-random-forest.csv')

# Feature importances, most important first
coef5 = pd.Series(alg5.feature_importances_, index=predictors).sort_values(ascending=False)
plt.figure(figsize=(15, 10))
coef5.plot.bar(title="Random Forest Feature Importances")




In [None]:
#Random Forest Model - revised (more trees, one level deeper)
from sklearn.ensemble import RandomForestRegressor

# Every column except the target and the identifiers is a predictor
excluded_columns = [target] + IDcol
predictors = [column for column in train.columns if column not in excluded_columns]

alg6 = RandomForestRegressor(
    n_estimators=400,
    max_depth=6,
    min_samples_leaf=100,
    n_jobs=4,
)
modelfit(alg6, train, test, predictors, target, IDcol, '6-random-forest.csv')

# Feature importances, most important first
coef6 = pd.Series(alg6.feature_importances_, index=predictors).sort_values(ascending=False)
plt.figure(figsize=(15, 10))
coef6.plot.bar(title="Random Forest Feature Importances")

