In [4]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score

In [5]:
# Load the dataset
data = pd.read_csv('Dataset.csv')

# Finding the missing values
data.isnull().sum()

Restaurant ID           0
Restaurant Name         0
Country Code            0
City                    0
Address                 0
Locality                0
Locality Verbose        0
Longitude               0
Latitude                0
Cuisines                9
Average Cost for two    0
Currency                0
Has Table booking       0
Has Online delivery     0
Is delivering now       0
Switch to order menu    0
Price range             0
Aggregate rating        0
Rating color            0
Rating text             0
Votes                   0
dtype: int64

In [6]:
# Fill missing values in the 'Cuisines' column with 'Unknown'
data['Cuisines'].fillna('Unknown', inplace=True)

# Define categorical columns
categorical_columns = ['Restaurant Name', 'City', 'Locality', 'Locality Verbose', 'Cuisines', 'Currency',
                       'Has Table booking', 'Has Online delivery', 'Is delivering now', 'Switch to order menu', 
                       'Rating color', 'Rating text', 'Address']

# One-hot encode categorical variables
data = pd.get_dummies(data, columns=['Restaurant Name', 'City', 'Locality', 'Locality Verbose', 'Cuisines', 'Currency',
                                     'Has Table booking', 'Has Online delivery', 'Is delivering now', 'Switch to order menu', 
                                     'Rating color', 'Rating text', 'Address'], drop_first=True)

 # Example for get_dummies
 
 # Original DataFrame:

   ### City
    New York
    Los Angeles
    Chicago
    New York
 
# City_New York	City_Los Angeles	City_Chicago
          1	                               0	                         0
          0	                               1	                         0
          0	                               0	                         1
          1	                               0	                         0


In [7]:
# Normalize numerical features
numerical_columns = ['Longitude', 'Latitude', 'Average Cost for two', 'Price range', 'Votes']
scaler = StandardScaler()
data[numerical_columns] = scaler.fit_transform(data[numerical_columns])

In [8]:
# Prepare the feature set and target variable
X = data.drop(columns=['Restaurant ID', 'Aggregate rating'])
y = data['Aggregate rating']

In [9]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
# Train and evaluate Linear Regression model
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)
print('Linear Regression MSE:', mean_squared_error(y_test, y_pred_lr))

Linear Regression MSE: 129991364191.36801


In [12]:
# Train and evaluate Decision Tree model
dt_model = DecisionTreeRegressor(random_state=42)
dt_model.fit(X_train, y_train)
y_pred_dt = dt_model.predict(X_test)
print('Decision Tree MSE:', mean_squared_error(y_test, y_pred_dt))

Decision Tree MSE: 0.05143380429094714


In [13]:
# Train and evaluate Random Forest model
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
print('Random Forest MSE:', mean_squared_error(y_test, y_pred_rf))

Random Forest MSE: 0.030054916797488153


In [14]:
def preprocess_input(user_input, scaler):
    # Create a DataFrame from the user input
    input_df = pd.DataFrame([user_input])
    
    # One-hot encode the input
    input_df = pd.get_dummies(input_df, columns=['Restaurant Name', 'City', 'Locality', 'Locality Verbose', 'Cuisines', 'Currency',
                                                 'Has Table booking', 'Has Online delivery', 'Is delivering now', 'Switch to order menu', 
                                                 'Rating color', 'Rating text', 'Address'], drop_first=True)
    
    # Ensure all columns in training data are present in the input
    missing_cols = set(X_train.columns) - set(input_df.columns)
    for col in missing_cols:
        input_df[col] = 0
    input_df = input_df[X_train.columns]
    
    # Normalize numerical features
    input_df[numerical_columns] = scaler.transform(input_df[numerical_columns])
    
    return input_df

In [15]:
def predict_rating(user_input, model, scaler):
    # Preprocess the input
    preprocessed_input = preprocess_input(user_input, scaler)
    
    # Make a prediction
    prediction = model.predict(preprocessed_input)
    
    return prediction[0]

In [17]:
# Example user input
user_input = {
    'Restaurant Name': 'Izakaya Kikufuji',
    'Country Code': 162,
    'City': 'Makati City',
    'Address': 'Little Tokyo, 2277 Chino Roces Avenue, Legaspi Village, Makati City',
    'Locality': 'Little Tokyo, Legaspi Village, Makati City',
    'Locality Verbose': 'Little Tokyo, Legaspi Village, Makati City, Makati City',
    'Longitude': 121.014101,
    'Latitude': 14.553708,
    'Cuisines': 'Japanese',
    'Average Cost for two': 1200,
    'Currency': 'Botswana Pula(P)',
    'Has Table booking': 'Yes',
    'Has Online delivery': 'No',
    'Is delivering now': 'No',
    'Switch to order menu': 'No',
    'Price range': 3,
    'Rating color': 'Dark Green',
    'Rating text': 'Excellent',
    'Votes': 591
}

# Ensure that all categorical values are strings
for key in user_input.keys():
    if key in categorical_columns:
        user_input[key] = str(user_input[key])

# Choose a trained model (e.g., lr_model, dt_model, rf_model)
model = rf_model

# Make a prediction
predicted_rating = predict_rating(user_input, model, scaler)
print('Predicted Aggregate Rating:', predicted_rating)


  input_df[col] = 0


Predicted Aggregate Rating: 4.464000000000002
