In [1]:
import numpy as np
import pandas as pd
from dateutil import parser
from datetime import datetime
import matplotlib.pyplot as plt 
import seaborn  as sns
%matplotlib inline

In [2]:
# Load the labelled training split and the unlabelled test split from CSV
# files located in the current working directory.
train = pd.read_csv('Train-Set.csv')
test = pd.read_csv('Test-Set.csv')

In [3]:
# Preview the first rows of the test frame (it has no OutletSales column).
test.head()

Unnamed: 0,ProductID,Weight,FatContent,ProductVisibility,ProductType,MRP,OutletID,EstablishmentYear,OutletSize,LocationType,OutletType
0,FDW58,20.75,Low Fat,0.007565,Snack Foods,107.8622,OUT049,1999,Medium,Tier 1,Supermarket Type1
1,FDW14,8.3,reg,0.038428,Dairy,87.3198,OUT017,2007,,Tier 2,Supermarket Type1
2,NCN55,14.6,Low Fat,0.099575,Others,241.7538,OUT010,1998,,Tier 3,Grocery Store
3,FDQ58,7.315,Low Fat,0.015388,Snack Foods,155.034,OUT017,2007,,Tier 2,Supermarket Type1
4,FDY38,,Regular,0.118599,Dairy,234.23,OUT027,1985,Medium,Tier 3,Supermarket Type3


In [4]:
# Preview the first rows of the training frame, including the OutletSales target.
train.head()

Unnamed: 0,ProductID,Weight,FatContent,ProductVisibility,ProductType,MRP,OutletID,EstablishmentYear,OutletSize,LocationType,OutletType,OutletSales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [5]:
# Outline of the steps this notebook walks through (kept as a plain list of
# topic strings for reference).
workflow = [
    '1. Get data ready',
    '2. Pick a model(estimator) or algorithm to suit your problem',
    '3. Fit the model to the data and make a prediction',
    '4. Evaluate the model',
    '5. Improve the model',
    '6. Save and reload a trained model',
    '7. Putting it all together',
]

# **1. Getting the data ready**

In [6]:
# Count missing values per column to decide which columns need imputation.
train.isna().sum()

ProductID               0
Weight               1463
FatContent              0
ProductVisibility       0
ProductType             0
MRP                     0
OutletID                0
EstablishmentYear       0
OutletSize           2410
LocationType            0
OutletType              0
OutletSales             0
dtype: int64

# **2. Fixing NaN values: do we drop or fill? ==> let's fill**

In [7]:
# Impute missing product weights with the column mean.
# The result is assigned back to the frame rather than calling
# `train.Weight.fillna(..., inplace=True)`: that chained in-place form acts on
# an intermediate Series, emits a FutureWarning on pandas 2.x and silently
# leaves `train` unchanged once Copy-on-Write is enabled.
train['Weight'] = train['Weight'].fillna(train['Weight'].mean())

In [8]:
# Treat an unknown outlet size as its own category, 'missing'.
# Assigned back explicitly (instead of chained `fillna(..., inplace=True)`)
# so the frame is actually updated under pandas Copy-on-Write and the cell
# stays idempotent on re-run.
train['OutletSize'] = train['OutletSize'].fillna('missing')

In [9]:
# Inspect column dtypes: the `object` columns are the ones that must be
# encoded numerically before modelling.
train.dtypes

ProductID             object
Weight               float64
FatContent            object
ProductVisibility    float64
ProductType           object
MRP                  float64
OutletID              object
EstablishmentYear      int64
OutletSize            object
LocationType          object
OutletType            object
OutletSales          float64
dtype: object

In [10]:
# Separate the training frame into the target and the predictors.
y = train['OutletSales']               # target / label
X = train.drop(columns='OutletSales')  # feature matrix (every other column)

In [21]:
# Display the target series (OutletSales) as a sanity check.
y

0       3735.1380
1        443.4228
2       2097.2700
3        732.3800
4        994.7052
          ...    
8518    2778.3834
8519     549.2850
8520    1193.1136
8521    1845.5976
8522     765.6700
Name: OutletSales, Length: 8523, dtype: float64

# **3. Converting non numeric to numeric**

In [16]:
# Turn the categorical (object) columns into numeric one-hot columns.
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Columns to one-hot encode; every other column is passed through unchanged.
# NOTE(review): ProductID is high-cardinality and appears to account for most
# of the encoded columns -- consider dropping or re-encoding it.
categorical_features = [
    'ProductID',
    'FatContent',
    'ProductType',
    'OutletID',
    'OutletSize',
    'LocationType',
    'OutletType',
]

# handle_unknown='ignore' encodes categories not seen during fit as all-zero
# rows instead of raising, so the fitted transformer can also be applied to
# the separate test file. The encoding of the data it is fitted on is the same
# as with the default setting.
one_hot = OneHotEncoder(handle_unknown='ignore')

# ColumnTransformer takes a list of (name, transformer, columns) tuples.
transformer = ColumnTransformer(
    [("one_hot", one_hot, categorical_features)],
    remainder='passthrough',
)

In [17]:
# Fit the column transformer on the feature matrix and encode it in one step.
# The bare name on the last line displays the result's repr (a sparse matrix).
transformed_X = transformer.fit_transform(X)
transformed_X

<8523x1605 sparse matrix of type '<class 'numpy.float64'>'
	with 93227 stored elements in Compressed Sparse Row format>

In [18]:
# Build a baseline model: hold out part of the data, then fit a random forest.
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Seed NumPy's global RNG; both the split and the forest draw from it because
# neither is given an explicit random_state.
np.random.seed(42)

# Keep 20% of the rows aside for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    transformed_X,
    y,
    test_size=0.2,
)

# Random forest regressor with default hyper-parameters, fitted on the
# training portion only.
model = RandomForestRegressor()
model.fit(X_train, y_train)

RandomForestRegressor()

In [22]:
# Evaluate on the held-out split; for a regressor, `score` returns R^2.
model.score(X_test, y_test)

0.5613873724063994