In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import f1_score, accuracy_score
from imblearn.over_sampling import SMOTE
import lightgbm as lgb

Using TensorFlow backend.


In [37]:
# Set file paths for train and predict datasets
train_file = 'DataSet/Data_Train.xlsx'
predict_file = 'DataSet/Data_Test.xlsx'

In [38]:
# Extract train/predict data from spreadsheet into pandas dataframes
train_df = pd.read_excel(train_file)
predict_df = pd.read_excel(predict_file)

In [39]:
# Get numeric value from "Delivery_Time" field in train data
train_df['Del_Time'] = train_df['Delivery_Time'].apply(lambda x: pd.to_numeric(x.split('minutes')[0].strip()))
train_df['Del_Time_enc'] = train_df['Del_Time'].factorize()[0]
train_df.head()

Unnamed: 0,Restaurant,Location,Cuisines,Average_Cost,Minimum_Order,Rating,Votes,Reviews,Delivery_Time,Del_Time,Del_Time_enc
0,ID_6321,"FTI College, Law College Road, Pune","Fast Food, Rolls, Burger, Salad, Wraps",200,50,3.5,12,4,30 minutes,30,0
1,ID_2882,"Sector 3, Marathalli","Ice Cream, Desserts",100,50,3.5,11,4,30 minutes,30,0
2,ID_1595,Mumbai Central,"Italian, Street Food, Fast Food",150,50,3.6,99,30,65 minutes,65,1
3,ID_5929,"Sector 1, Noida","Mughlai, North Indian, Chinese",250,99,3.7,176,95,30 minutes,30,0
4,ID_6123,"Rmz Centennial, I Gate, Whitefield","Cafe, Beverages",200,99,3.2,521,235,65 minutes,65,1


In [40]:
train_df.groupby(['Del_Time', 'Del_Time_enc']).size().reset_index().rename(columns={0:'count'})

Unnamed: 0,Del_Time,Del_Time_enc,count
0,10,3,4
1,20,4,20
2,30,0,7406
3,45,2,2665
4,65,1,923
5,80,6,14
6,120,5,62


In [41]:
# Extract "Del_Time" field from train_df into NumPy array
train_y = np.array([train_df['Del_Time_enc'].values]).T
train_df.drop(['Del_Time', 'Delivery_Time', 'Del_Time_enc'], inplace=True, axis=1)
print("train_y: {}".format(train_y.shape))
print("Sample train_y data: \n{}".format(train_y[0:10,:]))

train_y: (11094, 1)
Sample train_y data: 
[[0]
 [0]
 [1]
 [0]
 [1]
 [0]
 [0]
 [2]
 [0]
 [1]]


In [42]:
# Combine the train and predict dataframes
combined_df = train_df.append(predict_df, sort=False, ignore_index=True)
print("combined_df: {}".format(combined_df.shape))

combined_df: (13868, 8)


In [43]:
# Read location analysis data
loc_analysis_df = pd.read_excel("Location_Analysis.xlsx")
loc_analysis_df.head()

Unnamed: 0,Location,State,City,Area
0,"FTI College, Law College Road, Pune",Maharashtra,Pune,"FTI College, Law College Road"
1,"Sector 3, Marathalli",Karnataka,Bangalore,"Sector 3, Marathalli"
2,Mumbai Central,Maharashtra,Mumbai,Mumbai Central
3,"Sector 1, Noida",New Delhi,Noida,Sector 1
4,"Rmz Centennial, I Gate, Whitefield",Karnataka,Bangalore,"Rmz Centennial, I Gate, Whitefield"


In [44]:
# Derive "State", "City" and "Area" based on "Location" field
combined_df['State'] = combined_df['Location'].apply(lambda x: loc_analysis_df[loc_analysis_df['Location'] == x]['State'].max(axis=0))
combined_df['City'] = combined_df['Location'].apply(lambda x: loc_analysis_df[loc_analysis_df['Location'] == x]['City'].max(axis=0))
combined_df['Area'] = combined_df['Location'].apply(lambda x: loc_analysis_df[loc_analysis_df['Location'] == x]['Area'].max(axis=0))
combined_df.drop(['Location'], inplace=True, axis=1)
print("combined_df: {}".format(combined_df.shape))

combined_df: (13868, 10)


In [45]:
# Convert "State" field to one-hot encoding
dummy_df = pd.get_dummies(combined_df['State'], prefix='State')
combined_df = pd.concat([combined_df, dummy_df], axis=1)
combined_df.drop(['State'], inplace=True, axis=1)
print("combined_df: {}".format(combined_df.shape))

combined_df: (13868, 14)


In [46]:
# Convert "City" field to one-hot encoding
dummy_df = pd.get_dummies(combined_df['City'], prefix='City')
combined_df = pd.concat([combined_df, dummy_df], axis=1)
combined_df.drop(['City'], inplace=True, axis=1)
print("combined_df: {}".format(combined_df.shape))

combined_df: (13868, 24)


In [47]:
# Column encoding for "Area" field
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, ngram_range=(1, 1), stop_words='english', max_features=10000)
features = tfidf.fit_transform(combined_df.Area).toarray()
features_df = pd.DataFrame(features, columns=tfidf.get_feature_names())
combined_df = pd.merge(combined_df, features_df, left_index=True, right_index=True)
combined_df.drop(['Area'], inplace=True, axis=1)
print("combined_df: {}".format(combined_df.shape))

combined_df: (13868, 94)


In [48]:
# Convert "Cuisines" field into categorical encoded individual fields
combined_df['Cuisine_List'] = combined_df['Cuisines'].apply(lambda x: x.split(', '))
combined_df = combined_df.drop('Cuisine_List', 1).join(combined_df.Cuisine_List.str.join('|').str.get_dummies())
combined_df.drop(['Cuisines'], inplace=True, axis=1)
print("combined_df: {}".format(combined_df.shape))

combined_df: (13868, 194)


In [49]:
# Fill invalid string value in "Average_Cost" field with mean value
combined_df['Avg_Cost'] = combined_df['Average_Cost'].apply(lambda x: np.nan if x == 'for' else x)
mean_cost = combined_df['Avg_Cost'].mean()
combined_df['Avg_Cost'] = combined_df['Avg_Cost'].fillna(mean_cost)
combined_df.drop(['Average_Cost'], inplace=True, axis=1)
print("combined_df: {}".format(combined_df.shape))

combined_df: (13868, 194)


In [50]:
# Fill invalid string value in "Rating" field based on below rule:
# 1) If ['NEW','Opening Soon','Temporarily Closed'] then 0
# 2) If '-', then NaN
combined_df['modified_rating'] = combined_df['Rating'].apply(lambda x: np.nan if x == '-' else (0 if x in ['NEW','Opening Soon','Temporarily Closed'] else pd.to_numeric(x)))
mean_rating = combined_df['modified_rating'].mean()
combined_df['modified_rating'] = combined_df['modified_rating'].fillna(mean_rating)
combined_df.drop(['Rating'], inplace=True, axis=1)
print("combined_df: {}".format(combined_df.shape))

combined_df: (13868, 194)


In [51]:
# Fill invalid string value in "Votes" field with mean value
combined_df['modified_votes'] = combined_df['Votes'].apply(lambda x: np.nan if x == '-' else pd.to_numeric(x))
mean_cost = combined_df['modified_votes'].mean()
combined_df['modified_votes'] = combined_df['modified_votes'].fillna(mean_cost)
combined_df.drop(['Votes'], inplace=True, axis=1)
print("combined_df: {}".format(combined_df.shape))

combined_df: (13868, 194)


In [52]:
# Fill invalid string value in "Reviews" field with mean value
combined_df['modified_reviews'] = combined_df['Reviews'].apply(lambda x: np.nan if x == '-' else pd.to_numeric(x))
mean_cost = combined_df['modified_reviews'].mean()
combined_df['modified_reviews'] = combined_df['modified_reviews'].fillna(mean_cost)
combined_df.drop(['Reviews'], inplace=True, axis=1)
print("combined_df: {}".format(combined_df.shape))

combined_df: (13868, 194)


In [53]:
combined_df.drop(['Restaurant'], inplace=True, axis=1)
print("combined_df: {}".format(combined_df.shape))

combined_df: (13868, 193)


In [54]:
# Check if any column has NaN value in dataframe
print("Column with NaN value: {}".format(combined_df.columns[combined_df.isnull().any()].tolist()))

Column with NaN value: []


In [55]:
# Segregate combined_df into train/predict datasets
train_x = combined_df[:11094]
predict_x = combined_df[11094:]

In [56]:
print(train_x.shape)
print(predict_x.shape)

(11094, 193)
(2774, 193)


In [57]:
# Scale the train_x/predict_x datasets
scaler_x = RobustScaler().fit(train_x)
train_x = scaler_x.transform(train_x)
predict_x = scaler_x.transform(predict_x)

In [58]:
# Split training data into train/validation/test datasets
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=1)
for train_index, validation_index in sss.split(train_x, train_y):
    train_x, validation_x = train_x[train_index], train_x[validation_index]
    train_y, validation_y = train_y[train_index], train_y[validation_index]

sss = StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=1)
for train_index, test_index in sss.split(train_x, train_y):
    train_x, test_x = train_x[train_index], train_x[test_index]
    train_y, test_y = train_y[train_index], train_y[test_index]

In [59]:
print("------------------------- Training Dataset -------------------------")
print("train_x shape: {}".format(train_x.shape))
print("train_y shape: {}".format(train_y.shape))

print("\n------------------------- Validation Dataset -------------------------")
print("validation_x shape: {}".format(validation_x.shape))
print("validation_y shape: {}".format(validation_y.shape))

print("\n------------------------- Test Dataset -------------------------")
print("test_x shape: {}".format(test_x.shape))
print("test_y shape: {}".format(test_y.shape))

------------------------- Training Dataset -------------------------
train_x shape: (8985, 193)
train_y shape: (8985, 1)

------------------------- Validation Dataset -------------------------
validation_x shape: (1110, 193)
validation_y shape: (1110, 1)

------------------------- Test Dataset -------------------------
test_x shape: (999, 193)
test_y shape: (999, 1)


In [60]:
temp_df = pd.DataFrame(validation_y, columns=['Class'])
temp_df.groupby(['Class']).size().reset_index().rename(columns={0:'count'})

Unnamed: 0,Class,count
0,0,741
1,1,92
2,2,267
3,4,2
4,5,6
5,6,2


In [61]:
temp_df = pd.DataFrame(test_y, columns=['Class'])
temp_df.groupby(['Class']).size().reset_index().rename(columns={0:'count'})

Unnamed: 0,Class,count
0,0,667
1,1,83
2,2,240
3,4,2
4,5,6
5,6,1


In [62]:
# Handling class imbalance
sm = SMOTE(k_neighbors=3)
sm_x, sm_y = sm.fit_sample(train_x, train_y.ravel())
train_x = sm_x
train_y = np.array([sm_y]).T
print("Class balancing done.")
print("train_x shape: {}".format(train_x.shape))
print("train_y shape: {}".format(train_y.shape))

Class balancing done.
train_x shape: (41986, 193)
train_y shape: (41986, 1)


In [63]:
temp_df = pd.DataFrame(train_y, columns=['Class'])
temp_df.groupby(['Class']).size().reset_index().rename(columns={0:'count'})

Unnamed: 0,Class,count
0,0,5998
1,1,5998
2,2,5998
3,3,5998
4,4,5998
5,5,5998
6,6,5998


In [64]:
print("------------------------- Training Dataset -------------------------")
print("train_x shape: {}".format(train_x.shape))
print("train_y shape: {}".format(train_y.shape))

print("\n------------------------- Validation Dataset -------------------------")
print("validation_x shape: {}".format(validation_x.shape))
print("validation_y shape: {}".format(validation_y.shape))

print("\n------------------------- Test Dataset -------------------------")
print("test_x shape: {}".format(test_x.shape))
print("test_y shape: {}".format(test_y.shape))

------------------------- Training Dataset -------------------------
train_x shape: (41986, 193)
train_y shape: (41986, 1)

------------------------- Validation Dataset -------------------------
validation_x shape: (1110, 193)
validation_y shape: (1110, 1)

------------------------- Test Dataset -------------------------
test_x shape: (999, 193)
test_y shape: (999, 1)


In [86]:
# Define model hyperparameters
params = {}
params["objective"] = "multiclass"
params['metric'] = 'multi_logloss'
params["max_depth"] = 7
params["num_leaves"] = 70
params["num_class"] = 7
params['boosting_type'] = 'gbdt'
params["min_data_in_leaf"] = 1
params["learning_rate"] = 0.00025
params["bagging_fraction"] = 0.8
params["feature_fraction"] = 0.8
params["bagging_freq"] = 5
params["bagging_seed"] = 0
params["verbosity"] = -1
num_rounds = 20000

In [87]:
lgtrain = lgb.Dataset(train_x, label=train_y.ravel())
lgvalidation = lgb.Dataset(validation_x, label=validation_y.ravel())

In [88]:
model = lgb.train(params, lgtrain, num_rounds, valid_sets=[lgvalidation], early_stopping_rounds=200, verbose_eval=1000)

Training until validation scores don't improve for 200 rounds
[1000]	valid_0's multi_logloss: 1.70041
[2000]	valid_0's multi_logloss: 1.5269
[3000]	valid_0's multi_logloss: 1.39699
[4000]	valid_0's multi_logloss: 1.29234
[5000]	valid_0's multi_logloss: 1.20266
[6000]	valid_0's multi_logloss: 1.1291
[7000]	valid_0's multi_logloss: 1.06763
[8000]	valid_0's multi_logloss: 1.01639
[9000]	valid_0's multi_logloss: 0.97355
[10000]	valid_0's multi_logloss: 0.937674
[11000]	valid_0's multi_logloss: 0.907542
[12000]	valid_0's multi_logloss: 0.881859
[13000]	valid_0's multi_logloss: 0.859459
[14000]	valid_0's multi_logloss: 0.839729
[15000]	valid_0's multi_logloss: 0.822653
[16000]	valid_0's multi_logloss: 0.807479
[17000]	valid_0's multi_logloss: 0.793784
[18000]	valid_0's multi_logloss: 0.781414
[19000]	valid_0's multi_logloss: 0.770256
[20000]	valid_0's multi_logloss: 0.760199
Did not meet early stopping. Best iteration is:
[20000]	valid_0's multi_logloss: 0.760199


In [89]:
y_pred=model.predict(test_x)
predicted_labels = np.array([np.argmax(y_pred, axis=1)]).T

In [90]:
temp_df = pd.DataFrame(predicted_labels, columns=['Class'])
temp_df.groupby(['Class']).size().reset_index().rename(columns={0:'count'})

Unnamed: 0,Class,count
0,0,605
1,1,114
2,2,264
3,4,6
4,5,9
5,6,1


In [91]:
f1 = f1_score(test_y, predicted_labels, average='weighted') * 100
accuracy = accuracy_score(test_y, predicted_labels)
print("F1 Score: {}".format(f1))
print("Accuracy: {}".format(accuracy))

F1 Score: 71.85646553874373
Accuracy: 0.7057057057057057
