In [1]:
# import python library
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
#ignore warning
import warnings
warnings.filterwarnings('ignore')

In [2]:
#import data
train = pd.read_excel('restaurant_Data_Train.xlsx')
test=pd.read_excel('restaurant_Data_Test.xlsx')

In [3]:
train["source"] = "train"
test["source"] = "test"

In [4]:
#join 2 data frame 
df = pd.concat([train,test])

In [5]:
#check information
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16921 entries, 0 to 4230
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   TITLE          16921 non-null  object 
 1   RESTAURANT_ID  16921 non-null  int64  
 2   CUISINES       16921 non-null  object 
 3   TIME           16921 non-null  object 
 4   CITY           16774 non-null  object 
 5   LOCALITY       16793 non-null  object 
 6   RATING         16917 non-null  object 
 7   VOTES          15315 non-null  object 
 8   COST           12690 non-null  float64
 9   source         16921 non-null  object 
dtypes: float64(1), int64(1), object(8)
memory usage: 1.4+ MB


In [6]:
#Investigating the entire dataset first
df.duplicated().sum()

26

In [7]:
#drop duplicates values
df= df.drop_duplicates()

In [8]:
#check null values
df.isna().sum()

TITLE               0
RESTAURANT_ID       0
CUISINES            0
TIME                0
CITY              147
LOCALITY          128
RATING              4
VOTES            1602
COST             4230
source              0
dtype: int64

In [9]:
# Data exploration for CITY
# CITY has 147 null values
#combining City and locality
df['Location']=df['CITY']+' '+df['LOCALITY']
df.drop(columns=['CITY','LOCALITY'])

Unnamed: 0,TITLE,RESTAURANT_ID,CUISINES,TIME,RATING,VOTES,COST,source,Location
0,CASUAL DINING,9438,"Malwani, Goan, North Indian","11am – 4pm, 7:30pm – 11:30pm (Mon-Sun)",3.6,49 votes,1200.0,train,Thane Dombivali East
1,"CASUAL DINING,BAR",13198,"Asian, Modern Indian, Japanese",6pm – 11pm (Mon-Sun),4.2,30 votes,1500.0,train,Chennai Ramapuram
2,CASUAL DINING,10915,"North Indian, Chinese, Biryani, Hyderabadi","11am – 3:30pm, 7pm – 11pm (Mon-Sun)",3.8,221 votes,800.0,train,Chennai Saligramam
3,QUICK BITES,6346,"Tibetan, Chinese",11:30am – 1am (Mon-Sun),4.1,24 votes,800.0,train,Mumbai Bandra West
4,DESSERT PARLOR,15387,Desserts,11am – 1am (Mon-Sun),3.8,165 votes,300.0,train,Mumbai Lower Parel
...,...,...,...,...,...,...,...,...,...
4226,CASUAL DINING,9057,"North Indian, Mughlai, Chinese",11:30am – 11:30pm (Mon-Sun),3.9,287 votes,,test,New Delhi Punjabi Bagh
4227,,1247,"Biryani, North Indian, Sandwich, Salad, Wraps",11am – 1am (Mon-Sun),4.3,469 votes,,test,Bangalore HSR Layout
4228,QUICK BITES,8617,"Continental, North Indian",9:30am – 10:30pm (Mon-Sun),3.7,53 votes,,test,Faridabad Sector 86
4229,QUICK BITES,6485,"Rolls, Beverages","11am – 11:30pm (Mon, Tue, Wed, Thu, Sat, Sun),...",-,,,test,Kochi Kochi


In [10]:
df.dropna(subset=['Location'],inplace=True)

In [11]:
from fuzzywuzzy import process
 
names_array=[]
def match_names(wrong_names,correct_names):
    for row in wrong_names:
        x=process.extractOne(row, correct_names)
        if x[1]<60:
            names_array.append('Others')
        else:
            names_array.append(x[0])
    return names_array
  
#Wrong country names dataset

correct_names=['Bangalore','Thane',
'Hyderabad','Andheri',
'Delhi', 'Kerala',
'Chennai', 'Bandra',
'Mumbai', 'Telangana',
'Kochi', 
'Noida', 
'Gurgaon', 'Ernakulam',
'Faridabad', 'Ghaziabad',
'Secunderabad' ]
name_match=match_names(df.Location,correct_names)    

print(len(names_array))
df['Location']=names_array

16747


In [12]:
cuisines_list=[]
for row in df['CUISINES']:
    cuisines_list.append(list(row.split(',')))

df['CUISINES']=cuisines_list

In [13]:
df['CUISINES'].isna().sum()

0

In [14]:
df_cuisines=df['CUISINES'].apply(lambda x: pd.Series(1, x))

In [15]:
title_list=[]
for row in df['TITLE']:
    title_list.append(list(row.split(',')))
df['TITLE']=title_list

In [16]:
df_title=df['TITLE'].apply(lambda x: pd.Series(1, x))

In [17]:
df_title.head()

Unnamed: 0,CASUAL DINING,BAR,QUICK BITES,DESSERT PARLOR,CAFÉ,MICROBREWERY,BEVERAGE SHOP,IRANI CAFE,BAKERY,None,...,FOOD TRUCK,MESS,KIOSK,CLUB,CONFECTIONERY,DHABA,MEAT SHOP,COCKTAIL BAR,PAAN SHOP,BHOJANALYA
0,1.0,,,,,,,,,,...,,,,,,,,,,
1,1.0,1.0,,,,,,,,,...,,,,,,,,,,
2,1.0,,,,,,,,,,...,,,,,,,,,,
3,,,1.0,,,,,,,,...,,,,,,,,,,
4,,,,1.0,,,,,,,...,,,,,,,,,,


In [18]:
# cleaning time - pending
df[df['RATING'].isna()]

Unnamed: 0,TITLE,RESTAURANT_ID,CUISINES,TIME,CITY,LOCALITY,RATING,VOTES,COST,source,Location
204,[BAKERY],15062,[Bakery],Not Available,Kochi,Marine Drive,,,200.0,train,Kochi
421,[CAFÉ],14127,"[Cafe, Fast Food]",Not Available,Hyderabad,Banjara Hills,,,350.0,train,Hyderabad
2035,[QUICK BITES],2776,"[Kerala, Arabian]",Not Available,Kochi,Kakkanad,,,,test,Kochi
2758,[None],11301,"[Chinese, North Indian, South Indian]",Not Available,Kochi,Tripunithura,,,,test,Kochi


In [19]:
df["RATING"] = df.groupby("CITY").RATING.transform(lambda x : x.fillna(x.mode()[0]))

In [20]:
df['RATING']=df['RATING'].str.extract('(\d+)').astype(float)

In [21]:
df['VOTES'].isna().sum()

1579

In [22]:
df.VOTES.fillna('0',inplace=True)
df['VOTES']=df['VOTES'].str.extract('(\d+)').astype(float)

In [23]:
df.drop(columns='CITY',inplace=True)
df.drop(columns='LOCALITY',inplace=True)
df.drop(columns='CUISINES',inplace=True)

In [24]:
df_City=pd.get_dummies(df['Location'])
df.drop(columns='Location',inplace=True)
df_City.head()

Unnamed: 0,Andheri,Bandra,Bangalore,Chennai,Delhi,Ernakulam,Faridabad,Ghaziabad,Gurgaon,Hyderabad,Kerala,Kochi,Mumbai,Noida,Others,Secunderabad,Telangana,Thane
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0


In [25]:
df = pd.concat([df,df_City,df_cuisines,df_title], axis=1)

In [26]:
df.drop(columns='TITLE',inplace=True)

In [27]:
df_column_category = df.select_dtypes(exclude=np.number).columns
df_column_category

Index(['TIME', 'source'], dtype='object')

In [28]:
#df.drop(columns='City found',inplace=True)
df.drop(columns='TIME',inplace=True)

In [29]:
df.fillna(0,inplace=True)

In [30]:
train_final = df[df.source=="train"]
test_final = df[df.source=="test"]

In [31]:
train_final.shape

(12552, 277)

In [32]:
train_final.drop(columns=["source"],inplace=True)

In [33]:
test_final.drop(columns=["source",'COST'],inplace=True)

In [34]:
train_X = train_final.drop(columns=["COST",'RESTAURANT_ID'])

In [35]:
train_Y = train_final["COST"]

In [36]:
test_X = test_final.drop(columns=["RESTAURANT_ID"])

In [37]:
train_X.fillna(0,inplace=True)
train_X.isna().sum()

RATING          0
VOTES           0
Andheri         0
Bandra          0
Bangalore       0
               ..
DHABA           0
MEAT SHOP       0
COCKTAIL BAR    0
PAAN SHOP       0
BHOJANALYA      0
Length: 274, dtype: int64

In [38]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(train_X, train_Y)
dtrain_predictions = model.predict(train_X)

In [39]:
from sklearn.model_selection import cross_val_score
a = cross_val_score(model, train_X, train_Y, cv=5, scoring='neg_mean_squared_error')

In [40]:
#Print model report:
from sklearn.metrics import mean_absolute_error,mean_squared_error,mean_squared_log_error, r2_score
print("\nModel Report")
print("RMSE : %.4g" % np.sqrt(mean_squared_error(train_Y.values, dtrain_predictions)))
    
#Predict on testing data:
test_X.fillna(0,inplace=True)
test_final["res_linear"] =  model.predict(test_X)



Model Report
RMSE : 331.3


In [41]:
print('r2 train',r2_score(train_Y,dtrain_predictions))
#print('r2 test',r2_score(test_y,test_predict))

r2 train 0.7224620540548967


In [42]:
Linear_submission = test_final[["RESTAURANT_ID","res_linear"]]

In [43]:
Linear_submission.head(20)

Unnamed: 0,RESTAURANT_ID,res_linear
0,4085,917.578003
1,12680,359.178619
2,1411,809.393555
3,204,363.64209
4,13453,205.616516
5,4518,51.630341
6,1643,615.144836
7,5109,620.427673
8,5606,433.677307
9,14319,654.86908
