<a href="https://colab.research.google.com/github/gabriel51israel/MachineLearning/blob/main/Regression_BigMart_SalesPrice_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

**Data Investigation**

In [None]:
# Read the training CSV (path is relative to the working directory) and preview it.
train_csv_path = 'Train.csv'
bigmart_data = pd.read_csv(train_csv_path)
bigmart_data.head()

In [None]:
# (rows, columns) of the loaded frame.
bigmart_data.shape

In [None]:
# Per-column dtype and non-null count summary.
bigmart_data.info()

In [None]:
# Distinct values per column; dropna=False counts NaN as a value, as x.unique() does.
bigmart_data.nunique(dropna=False)

In [None]:
# Missing values per column.
bigmart_data.isna().sum()

In [None]:
# Collect the names of every object-dtype (categorical) column.
cat_col = [name for name, dtype in bigmart_data.dtypes.items() if dtype == "object"]

cat_col

In [None]:
#Remove Identifier inside this object list
cat_col.remove('Item_Identifier')
cat_col.remove('Outlet_Identifier')

In [None]:
# Categorical columns remaining after the identifier columns were removed.
cat_col

In [None]:
# Print the frequency table of each categorical column, separated by a blank line.
for column in cat_col:
    frequencies = bigmart_data[column].value_counts()
    print(column)
    print(frequencies, "\n")

**Data Cleaning**

In [None]:
# Missing values per column, checked again before imputing.
bigmart_data.isna().sum()

In [None]:
# Impute missing item weights with the column mean.
mean_item_weight = bigmart_data['Item_Weight'].mean()
bigmart_data['Item_Weight'] = bigmart_data['Item_Weight'].fillna(mean_item_weight)

In [None]:
# Impute missing outlet sizes with the most frequent size (first mode).
most_common_size = bigmart_data['Outlet_Size'].mode()[0]
bigmart_data['Outlet_Size'] = bigmart_data['Outlet_Size'].fillna(most_common_size)

In [None]:
# Number of rows whose visibility is exactly zero.
(bigmart_data['Item_Visibility'] == 0).sum()

In [None]:
# Replace zero visibility values with the column mean (computed with the zeros
# still included, as before).
# The result is assigned back explicitly: calling replace(inplace=True) on a
# `.loc` selection operates on an intermediate object and is not guaranteed to
# write through to the frame.
visibility_mean = bigmart_data['Item_Visibility'].mean()
bigmart_data['Item_Visibility'] = bigmart_data['Item_Visibility'].replace(0, visibility_mean)

In [None]:
# Collapse the alternate spellings of the fat-content labels into 'Low Fat' / 'Regular'.
# (The original mapping listed the 'low fat' key twice; the duplicate is dropped.)
fat_content_aliases = {'LF': 'Low Fat', 'low fat': 'Low Fat', 'reg': 'Regular'}
bigmart_data['Item_Fat_Content'] = bigmart_data['Item_Fat_Content'].replace(fat_content_aliases)

In [None]:
# Label counts after the alternate spellings were consolidated.
bigmart_data['Item_Fat_Content'].value_counts()

In [None]:
# New feature: the first two characters of each item identifier.
bigmart_data['New_Item_type'] = bigmart_data['Item_Identifier'].map(
    lambda identifier: identifier[:2]
)

In [None]:
# Translate the two-letter prefixes into readable category names.
prefix_to_category = {'FD':'Food','NC':'Non-Consumable','DR':'Drinks'}
bigmart_data['New_Item_type'] = bigmart_data['New_Item_type'].map(prefix_to_category)

In [None]:
# Row counts per derived item category.
bigmart_data['New_Item_type'].value_counts()

In [None]:
# Preview the frame with the New_Item_type column added.
bigmart_data.head()

In [None]:
# Non-consumable items get their own fat-content label.
is_non_consumable = bigmart_data['New_Item_type'] == "Non-Consumable"
bigmart_data.loc[is_non_consumable, "Item_Fat_Content"] = 'Non-Edible'

In [None]:
# Label counts again, now including the 'Non-Edible' label.
bigmart_data['Item_Fat_Content'].value_counts()

In [None]:
# Outlet age in years, measured relative to 2013.
reference_year = 2013
bigmart_data['Outlet_years'] = reference_year - bigmart_data['Outlet_Establishment_Year']

In [None]:
# Preview the frame with the Outlet_years column added.
bigmart_data.head()

**Visualization**

*Integer/Float datatype*

In [None]:
# Histogram with a KDE overlay for Item_Weight.
# sns.distplot is deprecated; histplot(kde=True, stat='density') is its replacement.
sns.histplot(x=bigmart_data['Item_Weight'], kde=True, stat='density');

In [None]:
# Histogram with a KDE overlay for Item_Visibility.
# sns.distplot is deprecated; histplot(kde=True, stat='density') is its replacement.
sns.histplot(x=bigmart_data['Item_Visibility'], kde=True, stat='density');

In [None]:
# Histogram with a KDE overlay for Item_MRP.
# sns.distplot is deprecated; histplot(kde=True, stat='density') is its replacement.
sns.histplot(x=bigmart_data['Item_MRP'], kde=True, stat='density');

In [None]:
# Histogram with a KDE overlay for Outlet_years.
# sns.distplot is deprecated; histplot(kde=True, stat='density') is its replacement.
sns.histplot(x=bigmart_data['Outlet_years'], kde=True, stat='density');

In [None]:
# Histogram with a KDE overlay for the sales target.
# sns.distplot is deprecated; histplot(kde=True, stat='density') is its replacement.
sns.histplot(x=bigmart_data['Item_Outlet_Sales'], kde=True, stat='density');

**Normalizing/Log Transforming the target variable**

In [None]:
# Log-transform the sales column: log1p(x) == log(1 + x), computed in the
# numerically preferred form.
# NOTE: the column is overwritten in place, so running this cell twice applies the
# transform twice. Re-run from the data-loading cell instead.
bigmart_data['Item_Outlet_Sales'] = np.log1p(bigmart_data['Item_Outlet_Sales'])

In [None]:
# Histogram with a KDE overlay for the sales column in its current state.
# sns.distplot is deprecated; histplot(kde=True, stat='density') is its replacement.
sns.histplot(x=bigmart_data['Item_Outlet_Sales'], kde=True, stat='density');

**Visualization**

*Categorical Datatype*

In [None]:
# Bar chart of row counts per fat-content label.
# The Series is passed as x= explicitly: a positional argument binds to `data`
# rather than `x` in newer seaborn releases.
sns.countplot(x=bigmart_data['Item_Fat_Content']);

In [None]:
# Bar chart of row counts per item type, on a wide figure so the labels fit.
# The Series is passed as x= explicitly: a positional argument binds to `data`
# rather than `x` in newer seaborn releases.
plt.figure(figsize=(30,10))
sns.countplot(x=bigmart_data['Item_Type']);

In [None]:
# Bar chart of row counts per outlet, on a wide figure so the labels fit.
# The Series is passed as x= explicitly: a positional argument binds to `data`
# rather than `x` in newer seaborn releases.
plt.figure(figsize=(30,10))
sns.countplot(x=bigmart_data['Outlet_Identifier']);

In [None]:
# Bar chart of row counts per outlet size.
# The Series is passed as x= explicitly: a positional argument binds to `data`
# rather than `x` in newer seaborn releases.
sns.countplot(x=bigmart_data['Outlet_Size']);

In [None]:
# Bar chart of row counts per outlet type, on a wide figure so the labels fit.
# The Series is passed as x= explicitly: a positional argument binds to `data`
# rather than `x` in newer seaborn releases.
plt.figure(figsize=(30,10))
sns.countplot(x=bigmart_data['Outlet_Type']);

In [None]:
# Bar chart of row counts per derived item category.
# The Series is passed as x= explicitly: a positional argument binds to `data`
# rather than `x` in newer seaborn releases.
sns.countplot(x=bigmart_data['New_Item_type']);

In [None]:
# Bar chart of row counts per establishment year.
# The Series is passed as x= explicitly: a positional argument binds to `data`
# rather than `x` in newer seaborn releases.
sns.countplot(x=bigmart_data['Outlet_Establishment_Year']);

**Check for Correlation**

In [None]:
# Pairwise correlation of the numeric columns, drawn as an annotated heatmap.
# The frame still holds object columns at this point, and DataFrame.corr() raises
# on those in pandas >= 2.0, so the numeric columns are selected first.
numeric_cols = bigmart_data.select_dtypes(include='number')
corr = numeric_cols.corr()
sns.heatmap(corr,cmap='coolwarm',annot=True);

In [None]:
# Preview the frame before the categorical columns are encoded.
bigmart_data.head()

**Apply Label Encoding/OneHot Encoding**

In [None]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode the outlet identifier (into a new 'Outlet' column) and the item
# type (in place). The single encoder is re-fitted for each column.
le = LabelEncoder()
for encoded_col, source_col in (('Outlet', 'Outlet_Identifier'), ('Item_Type', 'Item_Type')):
    bigmart_data[encoded_col] = le.fit_transform(bigmart_data[source_col])
bigmart_data.head()

In [None]:
# One-hot encode the remaining categorical columns, dropping the first level of each.
one_hot_cols = [
    'Item_Fat_Content',
    'Outlet_Size',
    'Outlet_Location_Type',
    'Outlet_Type',
    'New_Item_type',
]
bigmart_data = pd.get_dummies(bigmart_data, columns=one_hot_cols, drop_first=True)
bigmart_data.head()

**Model Building**

In [None]:
# Split the frame into the feature matrix X and the target y.
target_col = 'Item_Outlet_Sales'
non_feature_cols = ['Item_Identifier', 'Outlet_Establishment_Year', 'Outlet_Identifier', target_col]
X = bigmart_data.drop(columns=non_feature_cols)
y = bigmart_data[target_col]

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [None]:
# Print the shapes of the four split pieces, in the order train/test for X, then y.
for split_part in (X_train, X_test, y_train, y_test):
    print(split_part.shape)

In [None]:
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

# Baseline: ordinary least squares fitted on the training split.
model = LinearRegression()
model.fit(X_train,y_train)
y_pred = model.predict(X_test)

# R2 score (closer to 1 is better). The training score alone is optimistic, so
# the held-out score is reported next to it.
r_sq = model.score(X_train, y_train)
r_sq_test = model.score(X_test, y_test)
print('coefficient of determination/R2 value (train):', r_sq)
print('coefficient of determination/R2 value (test):', r_sq_test)
print("Coefficients/Slope : \n",model.coef_,"\n")
print("Intercepts :\n",model.intercept_)

# Bar chart of the fitted coefficients, sorted ascending.
coeff = pd.Series(model.coef_,X_train.columns).sort_values()
plt.figure(figsize =(10,8))
coeff.plot(kind='bar',title="Model Coefficients")

# Model report: test-set MSE plus 5-fold cross-validated MSE on the training split.
# y is the log-transformed sales column, so the errors are in log units.
print("Model Report")
print("MSE: ",mean_squared_error(y_test,y_pred))
cv_score = cross_val_score(model,X_train,y_train,scoring = 'neg_mean_squared_error',cv=5)
print("CV Score (negated MSE per fold): ", cv_score)
print("CV Score (mean MSE): ", np.abs(np.mean(cv_score)))

In [None]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Ridge(normalize=True) was deprecated in scikit-learn 1.0 and later removed, so
# the old call fails on current releases. The deprecation guidance gives this
# replacement: scale with StandardScaler(with_mean=False) in a pipeline and
# multiply alpha by the number of training samples.
# NOTE(review): confirm against the release notes of the installed version.
scaler = StandardScaler(with_mean=False)
ridge = Ridge(alpha=0.002 * X_train.shape[0])
model = make_pipeline(scaler, ridge)
model.fit(X_train,y_train)
y_pred = model.predict(X_test)

# R2 score (closer to 1 is better). The training score alone is optimistic, so
# the held-out score is reported next to it.
r_sq = model.score(X_train, y_train)
r_sq_test = model.score(X_test, y_test)
print('coefficient of determination/R2 value (train):', r_sq)
print('coefficient of determination/R2 value (test):', r_sq_test)

# The ridge step is fitted on scaled features; dividing by the scaler's per-feature
# scale expresses the coefficients in the original feature units.
ridge_coef = ridge.coef_ / scaler.scale_
print("Coefficients/Slope : \n",ridge_coef,"\n")
print("Intercepts :\n",ridge.intercept_)

# Bar chart of the coefficients, sorted ascending.
coeff = pd.Series(ridge_coef,X_train.columns).sort_values()
plt.figure(figsize =(10,8))
coeff.plot(kind='bar',title="Model Coefficients")

# Model report: test-set MSE plus 5-fold cross-validated MSE on the training split.
# y is the log-transformed sales column, so the errors are in log units.
print("Model Report")
print("MSE: ",mean_squared_error(y_test,y_pred))
cv_score = cross_val_score(model,X_train,y_train,scoring = 'neg_mean_squared_error',cv=5)
print("CV Score (negated MSE per fold): ", cv_score)
print("CV Score (mean MSE): ", np.abs(np.mean(cv_score)))

In [None]:
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

# L1-regularised linear regression fitted on the training split.
model = Lasso(alpha=0.001)
model.fit(X_train,y_train)
y_pred = model.predict(X_test)

# R2 score (closer to 1 is better). The training score alone is optimistic, so
# the held-out score is reported next to it.
r_sq = model.score(X_train, y_train)
r_sq_test = model.score(X_test, y_test)
print('coefficient of determination/R2 value (train):', r_sq)
print('coefficient of determination/R2 value (test):', r_sq_test)
print("Coefficients/Slope : \n",model.coef_,"\n")
print("Intercepts :\n",model.intercept_)

# Bar chart of the fitted coefficients, sorted ascending.
coeff = pd.Series(model.coef_,X_train.columns).sort_values()
plt.figure(figsize =(10,8))
coeff.plot(kind='bar',title="Model Coefficients")

# Model report: test-set MSE plus 5-fold cross-validated MSE on the training split.
# y is the log-transformed sales column, so the errors are in log units.
print("Model Report")
print("MSE: ",mean_squared_error(y_test,y_pred))
cv_score = cross_val_score(model,X_train,y_train,scoring = 'neg_mean_squared_error',cv=5)
print("CV Score (negated MSE per fold): ", cv_score)
print("CV Score (mean MSE): ", np.abs(np.mean(cv_score)))

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor

# Single decision tree; the fixed seed makes the fit and the scores reproducible.
model = DecisionTreeRegressor(random_state=0)
model.fit(X_train,y_train)
y_pred = model.predict(X_test)

# R2 score (closer to 1 is better). The training score alone is optimistic, so
# the held-out score is reported next to it.
r_sq = model.score(X_train, y_train)
r_sq_test = model.score(X_test, y_test)
print('coefficient of determination/R2 value (train):', r_sq)
print('coefficient of determination/R2 value (test):', r_sq_test)

# Bar chart of the feature importances, sorted ascending (a tree has no coefficients).
coeff = pd.Series(model.feature_importances_,X_train.columns).sort_values()
plt.figure(figsize =(10,8))
coeff.plot(kind='bar',title="Feature Importances")

# Model report: test-set MSE plus 5-fold cross-validated MSE on the training split.
# y is the log-transformed sales column, so the errors are in log units.
print("Model Report")
print("MSE: ",mean_squared_error(y_test,y_pred))
cv_score = cross_val_score(model,X_train,y_train,scoring = 'neg_mean_squared_error',cv=5)
print("CV Score (negated MSE per fold): ", cv_score)
print("CV Score (mean MSE): ", np.abs(np.mean(cv_score)))

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

# Random forest; the fixed seed makes the fit and the scores reproducible.
model = RandomForestRegressor(random_state=0)
model.fit(X_train,y_train)
y_pred = model.predict(X_test)

# R2 score (closer to 1 is better). The training score alone is optimistic, so
# the held-out score is reported next to it.
r_sq = model.score(X_train, y_train)
r_sq_test = model.score(X_test, y_test)
print('coefficient of determination/R2 value (train):', r_sq)
print('coefficient of determination/R2 value (test):', r_sq_test)

# Bar chart of the feature importances, sorted ascending (a forest has no coefficients).
coeff = pd.Series(model.feature_importances_,X_train.columns).sort_values()
plt.figure(figsize =(10,8))
coeff.plot(kind='bar',title="Feature Importances")

# Model report: test-set MSE plus 5-fold cross-validated MSE on the training split.
# y is the log-transformed sales column, so the errors are in log units.
print("Model Report")
print("MSE: ",mean_squared_error(y_test,y_pred))
cv_score = cross_val_score(model,X_train,y_train,scoring = 'neg_mean_squared_error',cv=5)
print("CV Score (negated MSE per fold): ", cv_score)
print("CV Score (mean MSE): ", np.abs(np.mean(cv_score)))

In [None]:
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

# Extremely randomised trees; the fixed seed makes the fit and the scores reproducible.
model = ExtraTreesRegressor(random_state=0)
model.fit(X_train,y_train)
y_pred = model.predict(X_test)

# R2 score (closer to 1 is better). The training score alone is optimistic, so
# the held-out score is reported next to it.
r_sq = model.score(X_train, y_train)
r_sq_test = model.score(X_test, y_test)
print('coefficient of determination/R2 value (train):', r_sq)
print('coefficient of determination/R2 value (test):', r_sq_test)

# Bar chart of the feature importances, sorted ascending (the ensemble has no coefficients).
coeff = pd.Series(model.feature_importances_,X_train.columns).sort_values()
plt.figure(figsize =(10,8))
coeff.plot(kind='bar',title="Feature Importances")

# Model report: test-set MSE plus 5-fold cross-validated MSE on the training split.
# y is the log-transformed sales column, so the errors are in log units.
print("Model Report")
print("MSE: ",mean_squared_error(y_test,y_pred))
cv_score = cross_val_score(model,X_train,y_train,scoring = 'neg_mean_squared_error',cv=5)
print("CV Score (negated MSE per fold): ", cv_score)
print("CV Score (mean MSE): ", np.abs(np.mean(cv_score)))