## Develop a machine learning model to classify restaurants based on their cuisines.

In [1]:
# Import Libraries
import numpy as np
import pandas as pd

In [2]:
import warnings
warnings.filterwarnings("ignore")

## Pre-processing Steps

In [3]:
# Creating the Dataframe
df=pd.read_csv('Dataset.csv')

In [4]:
df.head(1)

Unnamed: 0,Restaurant ID,Restaurant Name,Country Code,City,Address,Locality,Locality Verbose,Longitude,Latitude,Cuisines,...,Currency,Has Table booking,Has Online delivery,Is delivering now,Switch to order menu,Price range,Aggregate rating,Rating color,Rating text,Votes
0,6317637,Le Petit Souffle,162,Makati City,"Third Floor, Century City Mall, Kalayaan Avenu...","Century City Mall, Poblacion, Makati City","Century City Mall, Poblacion, Makati City, Mak...",121.027535,14.565443,"French, Japanese, Desserts",...,Botswana Pula(P),Yes,No,No,No,3,4.8,Dark Green,Excellent,314


In [5]:
# Droping un-necessary Columns
df.drop('Restaurant ID', axis=1, inplace=True) 
df.drop('Country Code', axis=1, inplace=True) 
df.drop('City', axis=1, inplace=True) 
df.drop('Address', axis=1, inplace=True) 
df.drop('Locality', axis=1, inplace=True) 
df.drop('Locality Verbose', axis=1, inplace=True) 
df.drop('Longitude', axis=1, inplace=True) 
df.drop('Latitude', axis=1, inplace=True) 
df.drop('Currency', axis=1, inplace=True) 
df.drop('Has Table booking', axis=1, inplace=True) 
df.drop('Has Online delivery', axis=1, inplace=True) 
df.drop('Is delivering now', axis=1, inplace=True) 
df.drop('Switch to order menu', axis=1, inplace=True) 
df.drop('Price range', axis=1, inplace=True) 
df.drop('Aggregate rating', axis=1, inplace=True) 
df.drop('Rating color', axis=1, inplace=True) 
df.drop('Rating text', axis=1, inplace=True) 
df.drop('Votes', axis=1, inplace=True)

In [6]:
df

Unnamed: 0,Restaurant Name,Cuisines,Average Cost for two
0,Le Petit Souffle,"French, Japanese, Desserts",1100
1,Izakaya Kikufuji,Japanese,1200
2,Heat - Edsa Shangri-La,"Seafood, Asian, Filipino, Indian",4000
3,Ooma,"Japanese, Sushi",1500
4,Sambo Kojin,"Japanese, Korean",1500
...,...,...,...
9546,Naml۱ Gurme,Turkish,80
9547,Ceviz A��ac۱,"World Cuisine, Patisserie, Cafe",105
9548,Huqqa,"Italian, World Cuisine",170
9549,A���k Kahve,Restaurant Cafe,120


In [7]:
# Checking for missing values
df.isnull().sum()

Restaurant Name         0
Cuisines                9
Average Cost for two    0
dtype: int64

In [8]:
df.dropna(inplace=True)

In [9]:
df.shape

(9542, 3)

In [16]:
# Checking missing vaalues for each col
missing_values = df.isna().sum() 
missing_values_column = df['Restaurant Name'].isna().sum() 
missing_values_column = df['Cuisines'].isna().sum() 
missing_values_column = df['Average Cost for two'].isna().sum()

In [17]:
df_cleaned = df.dropna() 
df_cleaned = df.dropna(subset=['Restaurant Name']) 
df_cleaned = df.dropna(subset=['Cuisines']) 
df_cleaned = df.dropna(subset=['Average Cost for two'])

In [18]:
df.describe(include="all")

Unnamed: 0,Restaurant Name,Cuisines,Average Cost for two
count,9542,9542,9542.0
unique,7437,1825,
top,Cafe Coffee Day,North Indian,
freq,83,936,
mean,,,1200.326137
std,,,16128.743876
min,,,0.0
25%,,,250.0
50%,,,400.0
75%,,,700.0


In [19]:
# Converting Categorical data to NUmerical
from sklearn.preprocessing import LabelEncoder 

label_encoder = LabelEncoder() 
df['Restaurant Name'] = label_encoder.fit_transform(df['Restaurant Name']) 
df['Cuisines'] = label_encoder.fit_transform(df['Cuisines'])

In [20]:
df

Unnamed: 0,Restaurant Name,Cuisines,Average Cost for two
0,3742,920,1100
1,3167,1111,1200
2,2892,1671,4000
3,4700,1126,1500
4,5515,1122,1500
...,...,...,...
9546,4436,1813,80
9547,1310,1824,105
9548,3063,1110,170
9549,512,1657,120


## Building Random Forest Model

In [21]:
# Create X and Y
X = df[['Restaurant Name', 'Average Cost for two']] 
Y = df['Cuisines']

In [22]:
from sklearn.preprocessing import StandardScaler

scaler= StandardScaler()

scaler.fit(X)
X= scaler.transform(X)

In [23]:
from sklearn.model_selection import train_test_split

#Split the data into test and train
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2,
                                                    random_state=10)  

In [27]:
# predicting using Decision Tree Classifier.
from sklearn.tree import DecisionTreeClassifier

model_DT = DecisionTreeClassifier(random_state=10,
                                   criterion="gini")

# fit the model on data and predict the values
model_DT.fit(X_train,Y_train)      # fit is the function that is used for training the data
Y_pred = model_DT.predict(X_test) # Validation Data
#print(Y_pred)
print(list(zip(Y_test,Y_pred)))

[(1348, 1348), (1329, 1611), (1608, 1235), (1650, 1655), (1306, 1749), (1514, 549), (168, 50), (331, 1424), (1723, 1723), (582, 865), (682, 682), (1306, 1329), (159, 1098), (201, 1306), (1306, 1306), (58, 58), (1520, 546), (1597, 1381), (774, 765), (518, 497), (1626, 1626), (1306, 1714), (1749, 331), (1520, 186), (546, 684), (1514, 1306), (1334, 684), (1212, 518), (958, 1444), (986, 986), (1306, 1626), (1256, 177), (177, 518), (58, 58), (1626, 1626), (1288, 1650), (1514, 1514), (12, 1813), (497, 497), (1306, 1444), (233, 1792), (1554, 1765), (828, 331), (1559, 1598), (230, 230), (1306, 828), (1514, 549), (199, 1444), (1275, 249), (1520, 677), (937, 809), (331, 331), (576, 1340), (1329, 835), (955, 362), (1325, 865), (1306, 1334), (1014, 1691), (518, 1723), (177, 299), (1306, 1306), (1547, 549), (1384, 331), (986, 828), (1618, 280), (1429, 475), (841, 834), (243, 251), (1306, 1329), (1323, 1323), (1699, 1699), (497, 435), (1789, 1789), (1130, 82), (828, 518), (828, 828), (1329, 837), (1

In [28]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
 
cfm=confusion_matrix(Y_test,Y_pred)
print(cfm)
 
print("Classification report: ")
 
print(classification_report(Y_test,Y_pred))
 
acc=accuracy_score(Y_test, Y_pred)
print("Accuracy of the model: ",acc)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 1 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
Classification report: 
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.00      0.00      0.00         1
           6       0.25      0.14      0.18         7
           7       0.00      0.00      0.00         1
           8       0.00      0.00      0.00         1
           9       0.00      0.00      0.00         1
          11       0.00      0.00      0.00         1
          12       0.00      0.00      0.00         1
          13       0.00      0.00      0.00         1
          15       0.00      0.00      0.00         1
          16       1.00      1.00      1.00         2
          18       0.00      0.00      0.00         1
          20       0.00      0.00      0.00         1
          21       0.00      0.00      0.00         2
          24       0.00      0.00      0.00  

## Building Logistic Regression

In [29]:
from sklearn.linear_model import LogisticRegression
#create a model object
classifier = LogisticRegression(multi_class="multinomial")
#train the model object
classifier.fit(X_train,Y_train)      # fit is the function that is used for training the data

Y_pred = classifier.predict(X_test)
print(Y_pred)

[1306 1306 1306 ... 1306 1306 1306]


In [30]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
 
cfm=confusion_matrix(Y_test,Y_pred)
print(cfm)
 
print("Classification report: ")
 
print(classification_report(Y_test,Y_pred))
 
acc=accuracy_score(Y_test, Y_pred)
print("Accuracy of the model: ",acc)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
Classification report: 
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.00      0.00      0.00         1
           6       0.00      0.00      0.00         7
           7       0.00      0.00      0.00         1
           8       0.00      0.00      0.00         1
           9       0.00      0.00      0.00         1
          11       0.00      0.00      0.00         1
          12       0.00      0.00      0.00         1
          13       0.00      0.00      0.00         1
          15       0.00      0.00      0.00         1
          16       0.00      0.00      0.00         2
          18       0.00      0.00      0.00         1
          20       0.00      0.00      0.00         1
          21       0.00      0.00      0.00         2
          26       0.00      0.00      0.00  

### Conclusion:
    By comparing Random Forest and Logistic Regression. Random Forest is Performing Better as compared to Logistic Regression.