In [1]:
import pandas as pd

# Load datasets
zomato_data = pd.read_csv("https://github.com/dsrscientist/dataset4/raw/main/zomato.csv")
country_code_data = pd.read_excel("https://github.com/dsrscientist/dataset4/raw/main/Country-Code.xlsx")

# Merge datasets on 'Country Code'
merged_data = pd.merge(zomato_data, country_code_data, on='Country Code', how='left')

# Display the first few rows of the merged dataset
print(merged_data.head())

# Information about the merged dataset
print(merged_data.info())

# Summary statistics of numerical columns
print(merged_data.describe())

# Check for missing values
print(merged_data.isnull().sum())

# Data preprocessing
# Handle missing values
# For numerical columns, fill missing values with mean
numerical_columns = merged_data.select_dtypes(include=['float64', 'int64']).columns
merged_data[numerical_columns] = merged_data[numerical_columns].fillna(merged_data[numerical_columns].mean())

# For categorical columns, fill missing values with mode
categorical_columns = merged_data.select_dtypes(include=['object']).columns
merged_data[categorical_columns] = merged_data[categorical_columns].fillna(merged_data[categorical_columns].mode().iloc[0])

# Encoding categorical variables
merged_data = pd.get_dummies(merged_data, drop_first=True)

# Separate features and target variables for prediction
X_avg_cost = merged_data.drop(columns=['Average Cost for two'])
y_avg_cost = merged_data['Average Cost for two']

X_price_range = merged_data.drop(columns=['Price range'])
y_price_range = merged_data['Price range']

# Split the dataset into training and testing sets
from sklearn.model_selection import train_test_split
X_avg_cost_train, X_avg_cost_test, y_avg_cost_train, y_avg_cost_test = train_test_split(X_avg_cost, y_avg_cost, test_size=0.2, random_state=42)
X_price_range_train, X_price_range_test, y_price_range_train, y_price_range_test = train_test_split(X_price_range, y_price_range, test_size=0.2, random_state=42)

# Train machine learning models for prediction
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

# Model for predicting Average Cost for two
model_avg_cost = LinearRegression()
model_avg_cost.fit(X_avg_cost_train, y_avg_cost_train)

# Model for predicting Price range
model_price_range = RandomForestRegressor()
model_price_range.fit(X_price_range_train, y_price_range_train)

# Evaluate models
# For Average Cost for two prediction
avg_cost_score = model_avg_cost.score(X_avg_cost_test, y_avg_cost_test)
print("R^2 Score for Average Cost for two prediction:", avg_cost_score)

# For Price range prediction
price_range_score = model_price_range.score(X_price_range_test, y_price_range_test)
print("R^2 Score for Price range prediction:", price_range_score)

# Make predictions
# Predictions for Average Cost for two
avg_cost_predictions = model_avg_cost.predict(X_avg_cost_test)

# Predictions for Price range
price_range_predictions = model_price_range.predict(X_price_range_test)


UnicodeDecodeError: 'utf-8' codec can't decode byte 0xed in position 7044: invalid continuation byte