In [None]:
% pylab inline

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.metrics import accuracy_score, r2_score, mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

In [None]:
#Read train files:
# Load the training CSV (path is relative to the notebook's working directory)
# and print its (rows, columns) shape.
train = pd.read_csv("../input/Train.csv")
print(train.shape)

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Number of distinct values per column. dropna=False counts NaN as a value of
# its own, which is what len(Series.unique()) does as well.
train.nunique(dropna=False)


### Finding the frequency of each category in the categorical variables using a loop


In [None]:
# Pick the object-dtype columns, leaving out the identifier columns and 'source',
# then print how often each category occurs in every remaining column.
categorical_columns = [
    name
    for name, dtype in train.dtypes.items()
    if dtype == 'object'
    and name not in ('Item_Identifier', 'Outlet_Identifier', 'source')
]
for col in categorical_columns:
    print ('\n\nFrequency of Categories for varible %s'%col)
    print (train[col].value_counts())

## Data Cleaning

In [None]:
# Summary statistics of the training frame.
train.describe()

In [None]:
# Histogram of Item_Visibility using 20 bins.
train['Item_Visibility'].hist(bins=20)

In [None]:
# Frequency of each raw Item_Fat_Content label (before the spellings are unified).
train['Item_Fat_Content'].value_counts()

In [None]:
# Frequency of each Outlet_Size label; value_counts() leaves missing entries out by default.
train['Outlet_Size'].value_counts()

In [None]:
# Box plot of Item_MRP, one box per Outlet_Size group.
train.boxplot(column='Item_MRP', by='Outlet_Size')

In [None]:
# Box plot of Item_Visibility, one box per Outlet_Type group.
train.boxplot(column='Item_Visibility', by='Outlet_Type')

In [None]:
#Filling missing values

In [None]:
# Most frequent Outlet_Size label (mode() returns a Series; [0] takes its first entry).
train['Outlet_Size'].mode()[0]

In [None]:
# Replace missing Outlet_Size entries with the column's most frequent label.
most_common_outlet_size = train['Outlet_Size'].mode()[0]
train['Outlet_Size'] = train['Outlet_Size'].fillna(most_common_outlet_size)

In [None]:
# Replace missing Item_Weight entries with the mean of the observed weights.
mean_item_weight = train['Item_Weight'].mean()
train['Item_Weight'] = train['Item_Weight'].fillna(mean_item_weight)

In [None]:
# Box plot of Item_Visibility, drawn to inspect the spread before outliers are removed.
train.boxplot(column='Item_Visibility')

In [None]:
# Remove Item_Visibility outliers with the 1.5 * IQR rule: keep only the rows
# whose visibility lies inside [Q1 - 1.5*IQR, Q3 + 1.5*IQR], both ends inclusive.

Q1 = train['Item_Visibility'].quantile(0.25)
Q3 = train['Item_Visibility'].quantile(0.75)
IQR = Q3 - Q1
# .copy() makes filt_train an independent frame, so adding columns to it later
# does not raise SettingWithCopyWarning.
filt_train = train.query('(@Q1 - 1.5 * @IQR) <= Item_Visibility <= (@Q3 + 1.5 * @IQR)').copy()

In [None]:
# Shapes after and before the outlier filter, to see how many rows were dropped.
filt_train.shape,  train.shape


In [None]:
# Continue with the outlier-filtered rows: `train` now names the same object as filt_train.
train = filt_train
train.shape

In [None]:
#Feature engineering

In [None]:
# Split Item_Visibility into three ordered bands: 'Low Viz', 'Viz', 'High Viz'.
# include_lowest=True places a visibility of exactly 0 in 'Low Viz', and the
# open-ended top edge keeps values above 0.2 in 'High Viz' rather than leaving
# them unbinned (they would otherwise be relabelled 'Low Viz' by the fallback).
train['Item_Visibility_bins'] = pd.cut(
    train['Item_Visibility'],
    [0.000, 0.065, 0.13, np.inf],
    labels=['Low Viz', 'Viz', 'High Viz'],
    include_lowest=True,
)
# Anything still unbinned (e.g. a missing visibility) falls back to 'Low Viz'.
# fillna is used instead of replace(NaN, ...) because the bare name `NaN` only
# exists through %pylab's star import and is no longer exported by NumPy 2.
train['Item_Visibility_bins'] = train['Item_Visibility_bins'].fillna('Low Viz')

In [None]:
# Unify the alternative spellings of the fat-content labels in one pass
# (the previous version repeated the 'low fat'/'LF' replacement twice).
train['Item_Fat_Content'] = train['Item_Fat_Content'].replace(
    {'low fat': 'Low Fat', 'LF': 'Low Fat', 'reg': 'Regular'}
)

In [None]:
#label-encode the fat content, item visibility bins, outlet size and outlet location type with LabelEncoder (outlet type is one-hot encoded further down)

In [None]:
# Single encoder instance reused for every column: each fit_transform call refits it,
# so it only remembers the classes of the column it encoded most recently.
le = LabelEncoder()

In [None]:
# Check which Item_Fat_Content labels remain before encoding them.
train['Item_Fat_Content'].unique()

In [None]:
# Label-encode the categorical columns in place. One loop replaces four
# copy-pasted cells; the encoder is refitted for each column in turn.
for encoded_col in ['Item_Fat_Content', 'Item_Visibility_bins', 'Outlet_Size', 'Outlet_Location_Type']:
    train[encoded_col] = le.fit_transform(train[encoded_col])

In [None]:
# create dummies for outlet type
# The indicator frame has to exist before it can be appended, so it is built
# first and concatenated second; this keeps the notebook runnable top to bottom.
dummy = pd.get_dummies(train['Outlet_Type'])
train = pd.concat([train, dummy], axis=1)
dummy.head()

In [None]:
# Strongly correlated predictors should not be fed to a linear regression
# together, so show every pair of distinct columns with |correlation| > 0.85.
# numeric_only=True skips the string columns that are still in the frame
# (pandas >= 2.0 raises on them otherwise); the matrix is computed once
# instead of four times.
corr_matrix = train.corr(numeric_only=True)
corr_matrix[((corr_matrix < -0.85) | (corr_matrix > 0.85)) & (corr_matrix != 1)]

In [None]:
# Preview the frame after encoding and adding the indicator columns.
train.head()

In [None]:
# Column dtypes, to spot the columns that are still non-numeric.
train.dtypes

In [None]:
# Remove the identifier and raw string columns so only numeric features remain.
non_numeric_columns = ['Item_Identifier', 'Item_Type', 'Outlet_Identifier', 'Outlet_Type']
train = train.drop(columns=non_numeric_columns)

In [None]:
# Columns that remain after dropping the string features.
train.columns

In [None]:
# Separate the target (Item_Outlet_Sales) from the feature matrix used by the models.
y = train['Item_Outlet_Sales']
X = train.drop(columns='Item_Outlet_Sales')

## Same operations for the test dataset
### I am doing these operations separately because the test and train datasets do not have the same columns

In [None]:
# Load the test CSV (path is relative to the notebook's working directory).
test = pd.read_csv("../input/Test.csv")

In [None]:
# Fill missing Outlet_Size with the literal 'Medium'.
# NOTE(review): the training side fills with the column's mode instead of a literal —
# presumably the same label; confirm so both frames are imputed identically.
test['Outlet_Size'] = test['Outlet_Size'].fillna('Medium')

In [None]:
# Split Item_Visibility into three ordered bands: 'Low Viz', 'Viz', 'High Viz'.
# The test frame is not outlier-filtered, so visibilities above 0.2 may occur:
# the open-ended top edge keeps them in 'High Viz' instead of leaving them
# unbinned, and include_lowest=True places an exact 0 in 'Low Viz'.
test['Item_Visibility_bins'] = pd.cut(
    test['Item_Visibility'],
    [0.000, 0.065, 0.13, np.inf],
    labels=['Low Viz', 'Viz', 'High Viz'],
    include_lowest=True,
)

In [None]:
# Fill missing item weights with the mean weight computed on the test frame itself.
test['Item_Weight'] = test['Item_Weight'].fillna(test['Item_Weight'].mean())

In [None]:
# Rows that did not land in any visibility band fall back to 'Low Viz'.
# fillna is used instead of replace(NaN, ...) because the bare name `NaN` only
# exists through %pylab's star import and is no longer exported by NumPy 2.
test['Item_Visibility_bins'] = test['Item_Visibility_bins'].fillna('Low Viz')
test['Item_Visibility_bins'].head()

In [None]:
# Unify the alternative spellings of the fat-content labels in one pass
# (the same replacements were previously repeated across three cells).
test['Item_Fat_Content'] = test['Item_Fat_Content'].replace(
    {'low fat': 'Low Fat', 'LF': 'Low Fat', 'reg': 'Regular'}
)


In [None]:

# Label-encode the categorical columns of the test frame in place. One loop
# replaces four copy-pasted cells; the encoder is refitted for each column.
# NOTE(review): refitting on the test frame gives the same integer codes as on
# the training frame only if both contain the same set of labels per column — confirm.
for encoded_col in ['Item_Fat_Content', 'Item_Visibility_bins', 'Outlet_Size', 'Outlet_Location_Type']:
    test[encoded_col] = le.fit_transform(test[encoded_col])

In [None]:
# One indicator column per Outlet_Type value, appended to the test frame column-wise.
dummy = pd.get_dummies(test['Outlet_Type'])
test = pd.concat([test, dummy], axis='columns')

In [None]:
# Preview the test frame after preprocessing.
test.head()

In [None]:
# Drop the identifier and raw string columns, then align the result to the
# training feature matrix X so the fitted models see the same features in the
# same order. A column present in X but missing from the test frame (e.g. an
# Outlet_Type indicator that never occurs in it) is added as all zeros.
X_test = (
    test
    .drop(['Item_Identifier', 'Item_Type', 'Outlet_Identifier', 'Outlet_Type'], axis=1)
    .reindex(columns=X.columns, fill_value=0)
)



In [None]:
# Sanity check: shapes of the training features/target and of the test frames.
X.shape,y.shape,test.shape,X_test.shape

In [None]:
# Preview the training feature matrix.
X.head()

In [None]:
# Preview the test feature matrix; its columns should line up with X.
X_test.head()

In [None]:
# Ordinary least-squares regressor with default settings.
lin = LinearRegression()

In [None]:
# Fit the linear model on the training data and predict sales for the test rows.
# fit() returns the estimator itself, so the two calls can be chained.
predictions = lin.fit(X, y).predict(X_test)

In [None]:
# decision tree
dtree_class = DecisionTreeClassifier(criterion='gini', max_depth=25)
y = y.astype(int)

In [None]:
# Fit the tree on the full training set with the integer-cast target.
dtree_class.fit(X, y)

In [None]:
# Accuracy measured on the same rows the tree was fitted on (not a held-out estimate).
accuracy_score(y, dtree_class.predict(X))

In [None]:
# R^2 measured on the same rows the tree was fitted on (not a held-out estimate).
r2_score(y, dtree_class.predict(X))

In [None]:
# Predict for the test feature matrix and display the resulting array.
pred = dtree_class.predict(X_test)
pred

In [None]:
# # create submission file
# submission = pd.DataFrame(data=[], columns=['Item_Identifier', 'Outlet_Identifier', 'Item_Outlet_Sales'])
# submission['Item_Identifier'] = test['Item_Identifier']
# submission['Outlet_Identifier'] = test['Outlet_Identifier']
# submission['Item_Outlet_Sales'] = pred
# submission.to_csv('submission.csv', index=False)
# submission.head()

In [None]:
# Random forest with 100 trees of depth at most 10. random_state pins the
# bootstrap sampling and feature selection so reruns give the same model.
rmf = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)