In [None]:
import pandas as pd
import json
from geopy.distance import geodesic
import re
import statsmodels.api as sm
import statsmodels.formula.api as smf
import numpy as np
from scipy import stats

In [None]:
file_path_business = "/content/yelp_academic_dataset_business (4).csv"
file_path_category = "/content/yelp_academic_dataset_category (4).csv"
file_path_user = "/content/yelp_academic_dataset_user (4).csv"
# file_path_review = "/content/yelp_academic_dataset_review_notext (4).csv"
file_path_hotels = '/content/Google Hotels.csv'
file_path_rental = '/content/Google Hotels Rental API FL.xlsx'
df_business = pd.read_csv(file_path_business, on_bad_lines='skip')
df_category = pd.read_csv(file_path_category, on_bad_lines='skip')
df_user = pd.read_csv(file_path_user, on_bad_lines='skip')
# df_review = pd.read_csv(file_path_review, on_bad_lines='skip')
df_hotel = pd.read_csv(file_path_hotels)
df_rental = pd.read_excel(file_path_rental)

# Hotel Data

In [None]:
def expand_ratings(rating_str):
    try:
        # Convert the string to a list of dictionaries
        ratings = json.loads(rating_str.replace("'", '"'))
        # Create a dictionary with star ratings as keys and counts as values
        expanded = {f"star_{rating['stars']}": rating['count'] for rating in ratings}
        return expanded
    except Exception as e:
        print(f"Error processing row: {e}")
        return {}

# Apply the function to each row in the 'ratings' column
expanded_ratings = df_hotel['Ratings'].apply(expand_ratings)

# Convert the expanded ratings into separate columns and concatenate them back
expanded_df = pd.DataFrame(expanded_ratings.tolist())

# Concatenate the expanded DataFrame with the original DataFrame (if you want to keep original data)
df_hotel_expanded = pd.concat([df_hotel, expanded_df], axis=1)

In [None]:
df_hotel_expanded = df_hotel_expanded.rename(columns={
    'star_1': 'Star 1',
    'star_2': 'Star 2',
    'star_3': 'Star 3',
    'star_4': 'Star 4',
    'star_5': 'Star 5'
})
# df_hotel_expanded.head()

In [None]:
merged_df = pd.merge(df_business, df_category, on='business_id', how='inner')

# Now filter the merged DataFrame for Florida restaurants
florida_restaurants = merged_df[
    (merged_df['state'] == 'FL') &
    (merged_df['category_name'] == 'Restaurants')
]

In [None]:
# Define a function to extract zip code using regex
def extract_zipcode(address):
    # Regex pattern for U.S. zip code (5 digits)
    match = re.search(r'\b\d{5}\b', address)
    if match:
        return match.group(0)  # Return the matched zip code
    return None  # Return None if no match is found

# Apply the function to the 'address' column of the hotels DataFrame
df_hotel_expanded['zipcode'] = df_hotel_expanded['Address'].apply(extract_zipcode)

# Show the first few rows to verify
# df_hotel_expanded[['Address', 'zipcode']].head(20)

In [None]:
df_hotel_expanded.columns

Index(['Name', 'Description', 'Coordinates', 'Latitude', 'Longitude',
       'Hotel Class', 'Link', 'Ratings', 'Reviews', 'Reviews Breakdown',
       'Address', 'Star 5', 'Star 4', 'Star 3', 'Star 2', 'Star 1', 'zipcode'],
      dtype='object')

In [None]:
florida_restaurants['postal_code'] = florida_restaurants['postal_code'].astype(str)
df_hotel_expanded['zipcode'] = df_hotel_expanded['zipcode'].astype(str)
florida_restaurants.rename(columns={'name': 'restaurant name'}, inplace=True)
df_hotel_expanded.rename(columns={'Name': 'Hotel Name'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  florida_restaurants['postal_code'] = florida_restaurants['postal_code'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  florida_restaurants.rename(columns={'name': 'restaurant name'}, inplace=True)


In [None]:
florida_restaurants.count()

Unnamed: 0,0
bid,8731
business_id,8731
restaurant name,8731
address,8635
city,8731
state,8731
postal_code,8731
latitude,8731
longitude,8731
stars,8731


In [None]:
# Merge the two tables using zipcode
matched_zip_df = pd.merge(florida_restaurants, df_hotel_expanded, left_on='postal_code', right_on='zipcode', how='inner')

# Define a function to calculate the distance between two geographical points (latitude and longitude)
def calculate_distance(row):
    restaurant_coords = (row['latitude'], row['longitude'])
    hotel_coords = (row['Latitude'], row['Longitude'])
    return geodesic(restaurant_coords, hotel_coords).kilometers

# Apply the function to calculate distances for each pair of matching restaurants and hotels
matched_zip_df['distance_km'] = matched_zip_df.apply(calculate_distance, axis=1)

matched_zip_df[['restaurant name', 'Hotel Name', 'postal_code', 'distance_km', 'Star 1', 'Star 2', 'Star 3', 'Star 4', 'Star 5']].head()

Unnamed: 0,restaurant name,Hotel Name,postal_code,distance_km,Star 1,Star 2,Star 3,Star 4,Star 5
0,Vietnamese Food Truck,Aloft Tampa Downtown,33602,1.002244,133,77,145,375,787
1,Vietnamese Food Truck,"Hotel Flor Tampa Downtown, Tapestry Collection...",33602,0.42197,230,110,158,329,829
2,Vietnamese Food Truck,JW Marriott Tampa Water Street,33602,1.622866,53,14,28,117,873
3,Vietnamese Food Truck,"Le MÃ©ridien Tampa, The Courthouse",33602,0.590933,64,34,102,261,837
4,Vietnamese Food Truck,Tampa Marriott Water Street,33602,1.622182,172,105,289,1163,3770


In [None]:
matched_zip_df.columns

Index(['bid', 'business_id', 'restaurant name', 'address', 'city', 'state',
       'postal_code', 'latitude', 'longitude', 'stars', 'review_count',
       'is_open', 'cid', 'category_name', 'Hotel Name', 'Description',
       'Coordinates', 'Latitude', 'Longitude', 'Hotel Class', 'Link',
       'Ratings', 'Reviews', 'Reviews Breakdown', 'Address', 'Star 5',
       'Star 4', 'Star 3', 'Star 2', 'Star 1', 'zipcode', 'distance_km'],
      dtype='object')

In [None]:
df_correlation = pd.DataFrame(columns=['distance_category', 'stars', 'average', 'thumbup'])

In [None]:
# Define the bins for distance
bins = [0, 1, 5, float('inf')]  # Bins: 0-1, 1-5, 5-10, and greater than 10 km

# Define the labels for each bin
labels = ['less than 1 km', '1-5 km', 'larger than 5 km']  # Labels for the bins

df_correlation['distance_category'] = pd.cut(matched_zip_df['distance_km'], bins=bins, labels=labels, right=False)

df_correlation['stars'] = matched_zip_df['stars']

df_correlation['average'] = ((matched_zip_df['Star 1'] + 2 * matched_zip_df['Star 2'] + 3 * matched_zip_df['Star 3']
                             + 4 * matched_zip_df['Star 4'] + 5 * matched_zip_df['Star 5'])
                             / (matched_zip_df['Star 1'] + matched_zip_df['Star 2'] + matched_zip_df['Star 3']
                             + matched_zip_df['Star 4'] + matched_zip_df['Star 5']))
df_correlation['thumbup'] = ((matched_zip_df['Star 4'] + matched_zip_df['Star 5'])
                             / (matched_zip_df['Star 1'] + matched_zip_df['Star 2'] + matched_zip_df['Star 3']
                             + matched_zip_df['Star 4'] + matched_zip_df['Star 5']))

In [None]:
df_correlation.head()

Unnamed: 0,distance_category,stars,average,thumbup
0,1-5 km,4.0,4.058668,0.765985
1,less than 1 km,4.0,3.855676,0.699275
2,1-5 km,4.0,4.606452,0.912442
3,less than 1 km,4.0,4.365948,0.845917
4,1-5 km,4.0,4.501,0.897072


In [None]:
model = smf.ols(formula='stars ~ average + C(distance_category)', data=df_correlation).fit()
model.summary()

0,1,2,3
Dep. Variable:,stars,R-squared:,0.004
Model:,OLS,Adj. R-squared:,0.003
Method:,Least Squares,F-statistic:,12.93
Date:,"Tue, 03 Dec 2024",Prob (F-statistic):,1.98e-08
Time:,07:25:59,Log-Likelihood:,-12128.0
No. Observations:,10542,AIC:,24260.0
Df Residuals:,10538,BIC:,24290.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,3.1981,0.080,40.014,0.000,3.041,3.355
C(distance_category)[T.1-5 km],-0.0255,0.017,-1.493,0.136,-0.059,0.008
C(distance_category)[T.larger than 5 km],0.0337,0.024,1.393,0.164,-0.014,0.081
average,0.1073,0.019,5.749,0.000,0.071,0.144

0,1,2,3
Omnibus:,643.367,Durbin-Watson:,0.653
Prob(Omnibus):,0.0,Jarque-Bera (JB):,765.165
Skew:,-0.651,Prob(JB):,7.020000000000001e-167
Kurtosis:,3.215,Cond. No.,48.3


# Randomly Extract 5 Distance in Each Range

In [None]:
# Step 1: Randomly sample 5 values from each category (distance category)
random_samples_per_bin = df_correlation.groupby('distance_category').apply(lambda x: x.sample(n=5, random_state=42))

# Step 2: Reset the index for clarity
random_samples_per_bin = random_samples_per_bin.reset_index(drop=True)

  random_samples_per_bin = df_correlation.groupby('distance_category').apply(lambda x: x.sample(n=5, random_state=42))
  random_samples_per_bin = df_correlation.groupby('distance_category').apply(lambda x: x.sample(n=5, random_state=42))


In [None]:
# Randomly Extract 5 Distance in Each Range
model = smf.ols(formula='stars ~ average + C(distance_category)', data=random_samples_per_bin).fit()
model.summary()

  res = hypotest_fun_out(*samples, **kwds)


0,1,2,3
Dep. Variable:,stars,R-squared:,0.16
Model:,OLS,Adj. R-squared:,-0.069
Method:,Least Squares,F-statistic:,0.6995
Date:,"Tue, 03 Dec 2024",Prob (F-statistic):,0.572
Time:,07:25:59,Log-Likelihood:,-14.776
No. Observations:,15,AIC:,37.55
Df Residuals:,11,BIC:,40.38
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,5.6543,2.577,2.194,0.051,-0.018,11.327
C(distance_category)[T.1-5 km],-0.1613,0.515,-0.313,0.760,-1.296,0.973
C(distance_category)[T.larger than 5 km],-0.5249,0.490,-1.072,0.307,-1.603,0.553
average,-0.4601,0.634,-0.726,0.483,-1.855,0.935

0,1,2,3
Omnibus:,0.228,Durbin-Watson:,1.636
Prob(Omnibus):,0.892,Jarque-Bera (JB):,0.137
Skew:,-0.177,Prob(JB):,0.934
Kurtosis:,2.691,Cond. No.,59.1


# Short Term Rental Data

In [None]:
# Define a function to extract zip code using regex
def extract_zipcode_rental(address):
    # Regex pattern for U.S. zip code (5 digits)
    match = re.search(r'\b\d{5}\b', address)
    if match:
        return match.group(0)  # Return the matched zip code
    return None  # Return None if no match is found

# Apply the function to the 'address' column of the hotels DataFrame
df_rental['zipcode'] = df_rental['Address'].apply(extract_zipcode_rental)

# Show the first few rows to verify
df_rental.head(20)

Unnamed: 0,Name,Coordinates,Latitude,Longitude,Link,Ratings,Reviews,Address,zipcode
0,"""Riverbend Gem"" - Studio|Courtyard|TPA Heights","{'latitude': 28.001779556274414, 'longitude': ...",28.00178,-82.46595,https://www.staytealdoor.com/properties/6621a3...,3.3,3,"6002, North Dexter Avenue, Kenilworth Grove, T...",33604
1,Creek Cabin - Two-Bedroom House,"{'latitude': 28.21503257751465, 'longitude': -...",28.215033,-82.698647,https://www.google.com/travel/search?q=7008%2C...,3.6,7,"7008, Creek Drive, Pasco County, Florida, 3465...",34655
2,TownePlace Suites by Marriott Tampa North/I-75...,"{'latitude': 28.067001342773438, 'longitude': ...",28.067001,-82.371696,https://deals.vio.com?sig=73aca13c7f952d2641c1...,3.6,189,"6814, Woodstork Drive, Temple Terrace, Hillsbo...",33637
3,BayCation,"{'latitude': 28.048564910888672, 'longitude': ...",28.048565,-82.4011,https://www.vacasa.com/unit/118653,3.8,4,"5025, East 110th Avenue, Tampa, Hillsborough C...",33617
4,Sunny Studio,"{'latitude': 27.776987075805664, 'longitude': ...",27.776987,-82.69062,https://cohostalbookings.hospitable.rentals/pr...,3.8,8,"4216, 5th Avenue North, Central Oak Park, Sain...",33713
5,Tropical Vibe,"{'latitude': 28.02916145324707, 'longitude': -...",28.029161,-82.743187,https://www.vacasa.com/unit/107891,3.8,11,"2299, Colonial Drive, Pinellas County, Florida...",34698
6,300 ftÂ² Private room âˆ™ 1 bedroom âˆ™ 2 guests,"{'latitude': 28.266399383544922, 'longitude': ...",28.266399,-82.725151,https://www.google.com/travel/search?q=5400%2C...,4.0,1,"5400, Bellview Avenue, New Port Richey, Pasco ...",34652
7,448 Little Harbor,"{'latitude': 27.725954055786133, 'longitude': ...",27.725954,-82.473824,https://www.google.com/travel/search?q=448%2C%...,4.0,20,"448, Bahia Beach Boulevard, Hillsborough Count...",33570
8,Land's End 203 building 1 Cute Coastal and Com...,"{'latitude': 27.7414608001709, 'longitude': -8...",27.741461,-82.756218,https://www.google.com/travel/search?q=7553%2C...,4.0,48,"7553, Bayshore Drive, Sunset Beach, Treasure I...",33706
9,"Villa KailÄsa - Pool, 6min drive to the Beach","{'latitude': 27.823320388793945, 'longitude': ...",27.82332,-82.778641,https://www.google.com/travel/search?q=5636%2C...,4.0,7,"5636, 100th Way North, Seminole, Pinellas Coun...",33708


In [None]:
df_rental.rename(columns={'Ratings': 'rental_ratings'}, inplace=True)
df_rental.rename(columns={'Name': 'Rental Name'}, inplace=True)

df_rental.head()

Unnamed: 0,Rental Name,Coordinates,Latitude,Longitude,Link,rental_ratings,Reviews,Address,zipcode
0,"""Riverbend Gem"" - Studio|Courtyard|TPA Heights","{'latitude': 28.001779556274414, 'longitude': ...",28.00178,-82.46595,https://www.staytealdoor.com/properties/6621a3...,3.3,3,"6002, North Dexter Avenue, Kenilworth Grove, T...",33604
1,Creek Cabin - Two-Bedroom House,"{'latitude': 28.21503257751465, 'longitude': -...",28.215033,-82.698647,https://www.google.com/travel/search?q=7008%2C...,3.6,7,"7008, Creek Drive, Pasco County, Florida, 3465...",34655
2,TownePlace Suites by Marriott Tampa North/I-75...,"{'latitude': 28.067001342773438, 'longitude': ...",28.067001,-82.371696,https://deals.vio.com?sig=73aca13c7f952d2641c1...,3.6,189,"6814, Woodstork Drive, Temple Terrace, Hillsbo...",33637
3,BayCation,"{'latitude': 28.048564910888672, 'longitude': ...",28.048565,-82.4011,https://www.vacasa.com/unit/118653,3.8,4,"5025, East 110th Avenue, Tampa, Hillsborough C...",33617
4,Sunny Studio,"{'latitude': 27.776987075805664, 'longitude': ...",27.776987,-82.69062,https://cohostalbookings.hospitable.rentals/pr...,3.8,8,"4216, 5th Avenue North, Central Oak Park, Sain...",33713


In [None]:
# Merge the two tables using zipcode
matched_rental_df = pd.merge(florida_restaurants, df_rental, left_on='postal_code', right_on='zipcode', how='inner')

# Define a function to calculate the distance between two geographical points (latitude and longitude)
def calculate_distance(row):
    restaurant_coords = (row['latitude'], row['longitude'])
    rental_coords = (row['Latitude'], row['Longitude'])
    return geodesic(restaurant_coords, rental_coords).kilometers

# Apply the function to calculate distances for each pair of matching restaurants and hotels
matched_rental_df['distance_km'] = matched_rental_df.apply(calculate_distance, axis=1)

matched_rental_df[['restaurant name', 'Rental Name', 'postal_code', 'distance_km', 'stars', 'rental_ratings']].head()

Unnamed: 0,restaurant name,Rental Name,postal_code,distance_km,stars,rental_ratings
0,The Pearl,Land's End 203 building 1 Cute Coastal and Com...,33706,3.283853,4.0,4.0
1,The Pearl,Sunrise Resort Unit 502,33706,4.620419,4.0,4.7
2,Trust Me BBQ,Discover Your Home Away from Home in Riverview,33569,0.875711,4.0,5.0
3,Trust Me BBQ,"Entire home just outside Tampa, FL",33569,2.269097,4.0,5.0
4,PDQ Temple Terrace,BayCation,33617,0.717325,3.0,3.8


In [None]:
matched_rental_df.to_csv('matched_rental_df.csv', index=False)

In [None]:
df_rental.head()

Unnamed: 0,Rental Name,Coordinates,Latitude,Longitude,Link,rental_ratings,Reviews,Address,zipcode
0,"""Riverbend Gem"" - Studio|Courtyard|TPA Heights","{'latitude': 28.001779556274414, 'longitude': ...",28.00178,-82.46595,https://www.staytealdoor.com/properties/6621a3...,3.3,3,"6002, North Dexter Avenue, Kenilworth Grove, T...",33604
1,Creek Cabin - Two-Bedroom House,"{'latitude': 28.21503257751465, 'longitude': -...",28.215033,-82.698647,https://www.google.com/travel/search?q=7008%2C...,3.6,7,"7008, Creek Drive, Pasco County, Florida, 3465...",34655
2,TownePlace Suites by Marriott Tampa North/I-75...,"{'latitude': 28.067001342773438, 'longitude': ...",28.067001,-82.371696,https://deals.vio.com?sig=73aca13c7f952d2641c1...,3.6,189,"6814, Woodstork Drive, Temple Terrace, Hillsbo...",33637
3,BayCation,"{'latitude': 28.048564910888672, 'longitude': ...",28.048565,-82.4011,https://www.vacasa.com/unit/118653,3.8,4,"5025, East 110th Avenue, Tampa, Hillsborough C...",33617
4,Sunny Studio,"{'latitude': 27.776987075805664, 'longitude': ...",27.776987,-82.69062,https://cohostalbookings.hospitable.rentals/pr...,3.8,8,"4216, 5th Avenue North, Central Oak Park, Sain...",33713


In [None]:
df_correlation_rental = pd.DataFrame(columns=['distance_category', 'stars', 'rental_ratings'])

# Define the bins for distance
bins = [0, 1, 5, float('inf')]  # Bins: 0-1, 1-5, 5-10, and greater than 10 km

# Define the labels for each bin
labels = ['less than 1 km', '1-5 km', 'larger than 5 km']  # Labels for the bins

df_correlation_rental['distance_category'] = pd.cut(matched_rental_df['distance_km'], bins=bins, labels=labels, right=False)

df_correlation_rental['stars'] = matched_rental_df['stars']
df_correlation_rental['rental_ratings'] = matched_rental_df['rental_ratings']
df_correlation_rental

Unnamed: 0,distance_category,stars,rental_ratings
0,1-5 km,4.0,4.0
1,1-5 km,4.0,4.7
2,less than 1 km,4.0,5.0
3,1-5 km,4.0,5.0
4,less than 1 km,3.0,3.8
...,...,...,...
11501,1-5 km,3.5,4.0
11502,less than 1 km,3.5,5.0
11503,1-5 km,3.5,5.0
11504,1-5 km,3.5,5.0


In [None]:
model = smf.ols(formula='stars ~ rental_ratings + C(distance_category)', data=df_correlation_rental).fit()
model.summary()

0,1,2,3
Dep. Variable:,stars,R-squared:,0.006
Model:,OLS,Adj. R-squared:,0.006
Method:,Least Squares,F-statistic:,22.82
Date:,"Tue, 03 Dec 2024",Prob (F-statistic):,1e-14
Time:,07:28:02,Log-Likelihood:,-13864.0
No. Observations:,11506,AIC:,27740.0
Df Residuals:,11502,BIC:,27770.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,4.1742,0.097,43.146,0.000,3.985,4.364
C(distance_category)[T.1-5 km],-0.0733,0.022,-3.365,0.001,-0.116,-0.031
C(distance_category)[T.larger than 5 km],-0.1784,0.030,-5.935,0.000,-0.237,-0.119
rental_ratings,-0.1106,0.020,-5.431,0.000,-0.150,-0.071

0,1,2,3
Omnibus:,557.711,Durbin-Watson:,0.675
Prob(Omnibus):,0.0,Jarque-Bera (JB):,638.74
Skew:,-0.573,Prob(JB):,1.99e-139
Kurtosis:,2.87,Cond. No.,63.8


In [None]:
# Step 1: Randomly sample 5 values from each category (distance category)
random_samples_per_bin = df_correlation_rental.groupby('distance_category').apply(lambda x: x.sample(n=5, random_state=42))

# Step 2: Reset the index for clarity
random_samples_per_bin = random_samples_per_bin.reset_index(drop=True)

  random_samples_per_bin = df_correlation_rental.groupby('distance_category').apply(lambda x: x.sample(n=5, random_state=42))
  random_samples_per_bin = df_correlation_rental.groupby('distance_category').apply(lambda x: x.sample(n=5, random_state=42))


In [None]:
model = smf.ols(formula='stars ~ rental_ratings + C(distance_category)', data=random_samples_per_bin).fit()
model.summary()

0,1,2,3
Dep. Variable:,stars,R-squared:,0.177
Model:,OLS,Adj. R-squared:,-0.042
Method:,Least Squares,F-statistic:,0.8082
Date:,"Tue, 03 Dec 2024",Prob (F-statistic):,0.539
Time:,07:26:04,Log-Likelihood:,-22.31
No. Observations:,20,AIC:,54.62
Df Residuals:,15,BIC:,59.6
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,4.1370,2.562,1.615,0.127,-1.324,9.598
C(distance_category)[T.1-5 km],0.6203,0.542,1.145,0.270,-0.535,1.775
C(distance_category)[T.5-10 km],0.7243,0.543,1.334,0.202,-0.433,1.882
C(distance_category)[T.larger than 10 km],0.9406,0.550,1.710,0.108,-0.232,2.113
rental_ratings,-0.2028,0.548,-0.370,0.717,-1.372,0.966

0,1,2,3
Omnibus:,5.187,Durbin-Watson:,1.911
Prob(Omnibus):,0.075,Jarque-Bera (JB):,3.004
Skew:,-0.874,Prob(JB):,0.223
Kurtosis:,3.741,Cond. No.,66.8


# API Price Level

In [None]:
import pandas as pd

In [None]:
# Link for API to extract price level:
# https://colab.research.google.com/drive/1E8gfRpXtURdHon4BbXBy9budVkToFKBL?usp=sharing#scrollTo=7WVRdaOCqHva

In [None]:
df_pricelevel = pd.read_excel('/content/pricelevels_final.xlsx')

In [None]:
df_pricelevel.head(15).notna()

Unnamed: 0,name,price
0,True,False
1,True,True
2,True,False
3,True,False
4,True,True
5,True,False
6,True,True
7,True,True
8,True,False
9,True,False


In [None]:
florida_restaurants.head()

Unnamed: 0,bid,business_id,restaurant name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,cid,category_name
56,12,eEOYSgkmpB90uNA7lDOMRA,Vietnamese Food Truck,,Tampa Bay,FL,33602,27.95527,-82.45632,4.0,10,0,57,Restaurants
70,15,0bPLkL0QhhPO5kt1_EXmNQ,Zio's Italian Market,2575 E Bay Dr,Largo,FL,33771,27.91612,-82.76046,4.5,100,0,71,Restaurants
275,59,uI9XODGY_2_ieTE6xJ0myw,Roman Forum,10440 N Dale Mabry Hwy,Tampa,FL,33618,28.0462,-82.50505,4.0,23,0,276,Restaurants
278,60,JgpnXv_0XhV3SfbfB50nxw,Joe's Pizza,2038 N Dale Mabry Hwy,Tampa,FL,33607,27.96051,-82.50613,4.0,35,0,279,Restaurants
360,80,pJfh3Ct8iL58NZa8ta-a5w,Top Shelf Sports Lounge,3173 Cypress Ridge Blvd,Wesley Chapel,FL,33544,28.19625,-82.38062,4.5,95,0,361,Restaurants


In [None]:
merged_florida_restaurants = pd.merge(florida_restaurants, df_pricelevel,
                                      left_on='restaurant name',
                                      right_on='name',
                                      how='left',
                                      suffixes=('_df1', '_df2'))
merged_florida_restaurants

Unnamed: 0,bid,business_id,restaurant name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,cid,category_name,name,price
0,12,eEOYSgkmpB90uNA7lDOMRA,Vietnamese Food Truck,,Tampa Bay,FL,33602,27.95527,-82.45632,4.0,10,0,57,Restaurants,Vietnamese Food Truck,
1,15,0bPLkL0QhhPO5kt1_EXmNQ,Zio's Italian Market,2575 E Bay Dr,Largo,FL,33771,27.91612,-82.76046,4.5,100,0,71,Restaurants,,
2,59,uI9XODGY_2_ieTE6xJ0myw,Roman Forum,10440 N Dale Mabry Hwy,Tampa,FL,33618,28.04620,-82.50505,4.0,23,0,276,Restaurants,Roman Forum,
3,60,JgpnXv_0XhV3SfbfB50nxw,Joe's Pizza,2038 N Dale Mabry Hwy,Tampa,FL,33607,27.96051,-82.50613,4.0,35,0,279,Restaurants,,
4,80,pJfh3Ct8iL58NZa8ta-a5w,Top Shelf Sports Lounge,3173 Cypress Ridge Blvd,Wesley Chapel,FL,33544,28.19625,-82.38062,4.5,95,0,361,Restaurants,Top Shelf Sports Lounge,$$
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76633,150250,8MzF1Tlgz0pOkxmhP5dYzA,El Cap Restaurant,3500 4th St N,St. Petersburg,FL,33704,27.80414,-82.63885,3.5,414,0,668163,Restaurants,El Cap Restaurant,$
76634,150263,-bZQH8yjm7ntTyGeLQwh8Q,Farmer's Kitchen Restaurant,3500 E Bay Dr,Largo,FL,33771,27.91679,-82.75039,4.0,6,0,668223,Restaurants,,
76635,150272,BIyT7Kr7tMJqlfp4oOOYQg,Copper Bell Cafe,11228 Boyette Rd,Riverview,FL,33569,27.85374,-82.31689,3.5,49,0,668269,Restaurants,,
76636,150293,esBGrrmuZzSiECyRBoKvvA,Colony Grill - St. Petersburg,670 Central Ave,St. Petersburg,FL,33701,27.77087,-82.64307,4.5,38,0,668358,Restaurants,Colony Grill - St. Petersburg,$$


In [None]:
print(florida_restaurants['business_id'].duplicated().sum())

0


In [None]:
print(merged_florida_restaurants['business_id'].duplicated().sum())

67907


In [None]:
merged_florida_restaurants = merged_florida_restaurants.drop_duplicates(subset='business_id')

In [None]:
merged_florida_restaurants.head()

Unnamed: 0,bid,business_id,restaurant name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,cid,category_name,name,price
0,12,eEOYSgkmpB90uNA7lDOMRA,Vietnamese Food Truck,,Tampa Bay,FL,33602,27.95527,-82.45632,4.0,10,0,57,Restaurants,Vietnamese Food Truck,
1,15,0bPLkL0QhhPO5kt1_EXmNQ,Zio's Italian Market,2575 E Bay Dr,Largo,FL,33771,27.91612,-82.76046,4.5,100,0,71,Restaurants,,
2,59,uI9XODGY_2_ieTE6xJ0myw,Roman Forum,10440 N Dale Mabry Hwy,Tampa,FL,33618,28.0462,-82.50505,4.0,23,0,276,Restaurants,Roman Forum,
3,60,JgpnXv_0XhV3SfbfB50nxw,Joe's Pizza,2038 N Dale Mabry Hwy,Tampa,FL,33607,27.96051,-82.50613,4.0,35,0,279,Restaurants,,
4,80,pJfh3Ct8iL58NZa8ta-a5w,Top Shelf Sports Lounge,3173 Cypress Ridge Blvd,Wesley Chapel,FL,33544,28.19625,-82.38062,4.5,95,0,361,Restaurants,Top Shelf Sports Lounge,$$


In [None]:
merged_florida_restaurants.count()

Unnamed: 0,0
bid,8731
business_id,8731
restaurant name,8731
address,8635
city,8731
state,8731
postal_code,8731
latitude,8731
longitude,8731
stars,8731


In [None]:
price_map = {'$': 1, '$$': 2, '$$$': 3, '$$$$': 4}

# Replace the symbols in the 'price_category' column with their corresponding numerical values
merged_florida_restaurants['price'] = merged_florida_restaurants['price'].replace(price_map)

  merged_florida_restaurants['price'] = merged_florida_restaurants['price'].replace(price_map)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_florida_restaurants['price'] = merged_florida_restaurants['price'].replace(price_map)


In [None]:
merged_florida_restaurants.head()

Unnamed: 0,bid,business_id,restaurant name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,cid,category_name,name,price
0,12,eEOYSgkmpB90uNA7lDOMRA,Vietnamese Food Truck,,Tampa Bay,FL,33602,27.95527,-82.45632,4.0,10,0,57,Restaurants,Vietnamese Food Truck,
1,15,0bPLkL0QhhPO5kt1_EXmNQ,Zio's Italian Market,2575 E Bay Dr,Largo,FL,33771,27.91612,-82.76046,4.5,100,0,71,Restaurants,,
2,59,uI9XODGY_2_ieTE6xJ0myw,Roman Forum,10440 N Dale Mabry Hwy,Tampa,FL,33618,28.0462,-82.50505,4.0,23,0,276,Restaurants,Roman Forum,
3,60,JgpnXv_0XhV3SfbfB50nxw,Joe's Pizza,2038 N Dale Mabry Hwy,Tampa,FL,33607,27.96051,-82.50613,4.0,35,0,279,Restaurants,,
4,80,pJfh3Ct8iL58NZa8ta-a5w,Top Shelf Sports Lounge,3173 Cypress Ridge Blvd,Wesley Chapel,FL,33544,28.19625,-82.38062,4.5,95,0,361,Restaurants,Top Shelf Sports Lounge,2.0


In [None]:
df_pricelevel

Unnamed: 0,name,price
0,Vietnamese Food Truck,
1,Cesarina's Italian Deli,$
2,Roman Forum,
3,Pizza Joe's - a Taste of Buffalo,
4,Top Shelf Sports Lounge,$$
...,...,...
8627,First Watch,$$
8628,El Cap Restaurant,$
8629,Benedict's Family Restaurant,$$
8630,Oxford Exchange,$$


In [None]:
matched_zip_df

Unnamed: 0,bid,business_id,restaurant name,address,city,state,postal_code,latitude,longitude,stars,...,Reviews,Reviews Breakdown,Address,Star 5,Star 4,Star 3,Star 2,Star 1,zipcode,distance_km
0,12,eEOYSgkmpB90uNA7lDOMRA,Vietnamese Food Truck,,Tampa Bay,FL,33602,27.95527,-82.45632,4.0,...,1517,"[{'name': 'Bar', 'description': 'Bar or lounge...","Kennedy Boulevard @ Ashley Drive, 100, West Ke...",787,375,145,77,133,33602,1.002244
1,12,eEOYSgkmpB90uNA7lDOMRA,Vietnamese Food Truck,,Tampa Bay,FL,33602,27.95527,-82.45632,4.0,...,1656,"[{'name': 'Service', 'description': 'Service',...","Hotel Flor Tampa Downtown, Tapestry Collection...",829,329,158,110,230,33602,0.421970
2,12,eEOYSgkmpB90uNA7lDOMRA,Vietnamese Food Truck,,Tampa Bay,FL,33602,27.95527,-82.45632,4.0,...,1085,"[{'name': 'Wellness', 'description': 'Wellness...","Tampa Marriott Water Street, 505, Water Street...",873,117,28,14,53,33602,1.622866
3,12,eEOYSgkmpB90uNA7lDOMRA,Vietnamese Food Truck,,Tampa Bay,FL,33602,27.95527,-82.45632,4.0,...,1298,"[{'name': 'Couples', 'description': 'Couple fr...","Le MÃ©ridien Tampa, 601, North Florida Avenue,...",837,261,102,34,64,33602,0.590933
4,12,eEOYSgkmpB90uNA7lDOMRA,Vietnamese Food Truck,,Tampa Bay,FL,33602,27.95527,-82.45632,4.0,...,5499,"[{'name': 'Location', 'description': 'Location...","Tampa Marriott Water Street, 505, Water Street...",3770,1163,289,105,172,33602,1.622182
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10537,150202,xGcpC8D4Sio-bN9KCr054g,Zen Bistro Grill + Sushi,2223 N Westshore Blvd,Tampa,FL,33607,27.96405,-82.52160,3.5,...,3636,"[{'name': 'Nature', 'description': 'Nature and...","Grand Hyatt Tampa Bay, 2900, Bayport Drive, Ta...",2596,670,168,76,126,33607,2.805708
10538,150202,xGcpC8D4Sio-bN9KCr054g,Zen Bistro Grill + Sushi,2223 N Westshore Blvd,Tampa,FL,33607,27.96405,-82.52160,3.5,...,1754,"[{'name': 'Service', 'description': 'Service',...","Marriott Tampa Westshore, 1001, North West Sho...",1012,449,156,54,83,33607,1.220064
10539,150202,xGcpC8D4Sio-bN9KCr054g,Zen Bistro Grill + Sushi,2223 N Westshore Blvd,Tampa,FL,33607,27.96405,-82.52160,3.5,...,2499,"[{'name': 'Nature', 'description': 'Nature and...","Sailport Waterfront Suites, 2506, North Rocky ...",1413,597,213,110,166,33607,4.949577
10540,150202,xGcpC8D4Sio-bN9KCr054g,Zen Bistro Grill + Sushi,2223 N Westshore Blvd,Tampa,FL,33607,27.96405,-82.52160,3.5,...,4051,"[{'name': 'Nature', 'description': 'Nature and...","The Godfrey Hotel & Cabanas Tampa, 7700, Bay H...",1549,810,569,360,763,33607,4.826658


In [None]:
# Merge the two DataFrames on 'restaurant_id'
merged_hotel_distance = pd.merge(matched_zip_df, df_pricelevel, left_on='restaurant name', right_on='name', how='inner')

# Define the price map for transforming the price category
price_map = {'$': 1, '$$': 2, '$$$': 3, '$$$$': 4}

# Apply the price map to the 'price_category' column
merged_hotel_distance['price'] = merged_hotel_distance['price'].replace(price_map)

# Print the merged DataFrame
print(merged_hotel_distance)

          bid             business_id             restaurant name  \
0          12  eEOYSgkmpB90uNA7lDOMRA       Vietnamese Food Truck   
1          12  eEOYSgkmpB90uNA7lDOMRA       Vietnamese Food Truck   
2          12  eEOYSgkmpB90uNA7lDOMRA       Vietnamese Food Truck   
3          12  eEOYSgkmpB90uNA7lDOMRA       Vietnamese Food Truck   
4          12  eEOYSgkmpB90uNA7lDOMRA       Vietnamese Food Truck   
...       ...                     ...                         ...   
55051  149526  eJxNnmDHBzYfxD5lFiifJw  Shaner's Land & Sea Market   
55052  149599  YaWt65tSRsyK6bppZUY-dg               Bernie's Deli   
55053  149599  YaWt65tSRsyK6bppZUY-dg               Bernie's Deli   
55054  149599  YaWt65tSRsyK6bppZUY-dg               Bernie's Deli   
55055  150158  s2eyoTuJrcP7I_XyjdhUHQ               Bros Pizzeria   

                       address            city state postal_code  latitude  \
0                          NaN       Tampa Bay    FL       33602  27.95527   
1              

  merged_hotel_distance['price'] = merged_hotel_distance['price'].replace(price_map)


In [None]:
df_clean

Unnamed: 0,bid,business_id,restaurant name,address,city,state,postal_code,latitude,longitude,stars,...,Star 4,Star 3,Star 2,Star 1,zipcode,distance_km,name,price,distance_category,average
8,107,vje0KIiE7vtpx7JzmBx5LQ,The Pearl,163 107th Ave,Treasure Island,FL,33706,27.76940,-82.76732,4.0,...,91,29,7,14,33706,4.720733,The Pearl,3.0,larger than 5 km,3.652850
9,107,vje0KIiE7vtpx7JzmBx5LQ,The Pearl,163 107th Ave,Treasure Island,FL,33706,27.76940,-82.76732,4.0,...,41,23,12,8,33706,3.977204,The Pearl,3.0,1-5 km,4.045890
10,107,vje0KIiE7vtpx7JzmBx5LQ,The Pearl,163 107th Ave,Treasure Island,FL,33706,27.76940,-82.76732,4.0,...,104,48,23,50,33706,4.540189,The Pearl,3.0,1-5 km,4.522002
11,107,vje0KIiE7vtpx7JzmBx5LQ,The Pearl,163 107th Ave,Treasure Island,FL,33706,27.76940,-82.76732,4.0,...,1275,646,411,738,33706,5.201573,The Pearl,3.0,1-5 km,4.284493
12,107,vje0KIiE7vtpx7JzmBx5LQ,The Pearl,163 107th Ave,Treasure Island,FL,33706,27.76940,-82.76732,4.0,...,52,13,10,8,33706,1.424125,The Pearl,3.0,larger than 5 km,4.192877
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10536,30832,Rq3Wjcg5ULV2yn-AwcmirA,McDonald's,4009 N Armenia,Tampa,FL,33607,27.97479,-82.48444,2.0,...,501,211,99,168,33607,4.229589,McDonald's,1.0,1-5 km,4.045890
10537,30832,Rq3Wjcg5ULV2yn-AwcmirA,McDonald's,4009 N Armenia,Tampa,FL,33607,27.97479,-82.48444,2.0,...,501,211,99,168,33607,4.229589,McDonald's,1.0,1-5 km,4.522002
10538,30832,Rq3Wjcg5ULV2yn-AwcmirA,McDonald's,4009 N Armenia,Tampa,FL,33607,27.97479,-82.48444,2.0,...,501,211,99,168,33607,4.229589,McDonald's,1.0,1-5 km,4.284493
10540,30832,Rq3Wjcg5ULV2yn-AwcmirA,McDonald's,4009 N Armenia,Tampa,FL,33607,27.97479,-82.48444,2.0,...,501,211,99,168,33607,4.229589,McDonald's,1.0,1-5 km,3.499136


In [None]:
# Define the bins for distance
bins = [0, 1, 5, float('inf')]  # Bins: 0-1, 1-5, 5-10, and greater than 10 km

# Define the labels for each bin
labels = ['less than 1 km', '1-5 km', 'larger than 5 km']  # Labels for the bins

merged_hotel_distance['distance_category'] = pd.cut(matched_zip_df['distance_km'], bins=bins, labels=labels, right=False)
merged_hotel_distance['average'] = ((matched_zip_df['Star 1'] + 2 * matched_zip_df['Star 2'] + 3 * matched_zip_df['Star 3']
                             + 4 * matched_zip_df['Star 4'] + 5 * matched_zip_df['Star 5'])
                             / (matched_zip_df['Star 1'] + matched_zip_df['Star 2'] + matched_zip_df['Star 3']
                             + matched_zip_df['Star 4'] + matched_zip_df['Star 5']))

In [None]:
df_clean = merged_hotel_distance.dropna()

In [None]:
model = smf.ols(formula='stars ~ C(price) + C(distance_category)', data=merged_hotel_distance).fit()
model.summary()

0,1,2,3
Dep. Variable:,stars,R-squared:,0.357
Model:,OLS,Adj. R-squared:,0.357
Method:,Least Squares,F-statistic:,1065.0
Date:,"Tue, 03 Dec 2024",Prob (F-statistic):,0.0
Time:,08:30:27,Log-Likelihood:,-9911.4
No. Observations:,9595,AIC:,19830.0
Df Residuals:,9589,BIC:,19880.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,2.3206,0.014,171.848,0.000,2.294,2.347
C(price)[T.2.0],1.2445,0.018,71.006,0.000,1.210,1.279
C(price)[T.3.0],1.1902,0.083,14.365,0.000,1.028,1.353
C(price)[T.4.0],1.9486,0.156,12.467,0.000,1.642,2.255
C(distance_category)[T.1-5 km],0.0261,0.016,1.637,0.102,-0.005,0.057
C(distance_category)[T.larger than 5 km],0.0846,0.023,3.761,0.000,0.041,0.129

0,1,2,3
Omnibus:,2090.275,Durbin-Watson:,0.234
Prob(Omnibus):,0.0,Jarque-Bera (JB):,4653.586
Skew:,1.247,Prob(JB):,0.0
Kurtosis:,5.329,Cond. No.,27.1


In [None]:
# Step 1: Randomly sample 5 values from each category (distance category)
random_samples_per_bin = merged_hotel_distance.groupby('distance_category').apply(lambda x: x.sample(n=45, random_state=42))

# Step 2: Reset the index for clarity
random_samples_per_bin = random_samples_per_bin.reset_index(drop=True)

  random_samples_per_bin = merged_hotel_distance.groupby('distance_category').apply(lambda x: x.sample(n=45, random_state=42))
  random_samples_per_bin = merged_hotel_distance.groupby('distance_category').apply(lambda x: x.sample(n=45, random_state=42))


In [None]:
model = smf.ols(formula='stars ~ average + price + C(distance_category)', data=random_samples_per_bin).fit()
model.summary()

0,1,2,3
Dep. Variable:,stars,R-squared:,0.248
Model:,OLS,Adj. R-squared:,0.222
Method:,Least Squares,F-statistic:,9.333
Date:,"Tue, 03 Dec 2024",Prob (F-statistic):,1.49e-06
Time:,08:24:22,Log-Likelihood:,-129.27
No. Observations:,118,AIC:,268.5
Df Residuals:,113,BIC:,282.4
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,1.9192,0.789,2.434,0.017,0.357,3.481
C(distance_category)[T.1-5 km],0.0863,0.170,0.507,0.613,-0.251,0.423
C(distance_category)[T.larger than 5 km],0.0742,0.169,0.438,0.662,-0.261,0.410
average,-0.0758,0.181,-0.420,0.675,-0.433,0.282
price,0.7769,0.133,5.854,0.000,0.514,1.040

0,1,2,3
Omnibus:,21.746,Durbin-Watson:,2.109
Prob(Omnibus):,0.0,Jarque-Bera (JB):,27.326
Skew:,1.036,Prob(JB):,1.16e-06
Kurtosis:,4.123,Cond. No.,53.7


In [None]:
df_rental.head()

Unnamed: 0,Rental Name,Coordinates,Latitude,Longitude,Link,rental_ratings,Reviews,Address,zipcode
0,"""Riverbend Gem"" - Studio|Courtyard|TPA Heights","{'latitude': 28.001779556274414, 'longitude': ...",28.00178,-82.46595,https://www.staytealdoor.com/properties/6621a3...,3.3,3,"6002, North Dexter Avenue, Kenilworth Grove, T...",33604
1,Creek Cabin - Two-Bedroom House,"{'latitude': 28.21503257751465, 'longitude': -...",28.215033,-82.698647,https://www.google.com/travel/search?q=7008%2C...,3.6,7,"7008, Creek Drive, Pasco County, Florida, 3465...",34655
2,TownePlace Suites by Marriott Tampa North/I-75...,"{'latitude': 28.067001342773438, 'longitude': ...",28.067001,-82.371696,https://deals.vio.com?sig=73aca13c7f952d2641c1...,3.6,189,"6814, Woodstork Drive, Temple Terrace, Hillsbo...",33637
3,BayCation,"{'latitude': 28.048564910888672, 'longitude': ...",28.048565,-82.4011,https://www.vacasa.com/unit/118653,3.8,4,"5025, East 110th Avenue, Tampa, Hillsborough C...",33617
4,Sunny Studio,"{'latitude': 27.776987075805664, 'longitude': ...",27.776987,-82.69062,https://cohostalbookings.hospitable.rentals/pr...,3.8,8,"4216, 5th Avenue North, Central Oak Park, Sain...",33713


In [None]:
# Merge the two DataFrames on 'restaurant_id'
merged_rental_distance = pd.merge(matched_rental_df, df_pricelevel, left_on='restaurant name', right_on='name', how='inner')

# Define the price map for transforming the price category
price_map = {'$': 1, '$$': 2, '$$$': 3, '$$$$': 4}

# Apply the price map to the 'price_category' column
merged_rental_distance['price'] = merged_rental_distance['price'].replace(price_map)

# Print the merged DataFrame
print(merged_rental_distance)

          bid             business_id                restaurant name  \
0         107  vje0KIiE7vtpx7JzmBx5LQ                      The Pearl   
1         107  vje0KIiE7vtpx7JzmBx5LQ                      The Pearl   
2         276  dJfkfBbJz7wi0RgW-ph_Eg                     Sake House   
3         276  dJfkfBbJz7wi0RgW-ph_Eg                     Sake House   
4         276  dJfkfBbJz7wi0RgW-ph_Eg                     Sake House   
...       ...                     ...                            ...   
91809  150158  s2eyoTuJrcP7I_XyjdhUHQ                  Bros Pizzeria   
91810  150158  s2eyoTuJrcP7I_XyjdhUHQ                  Bros Pizzeria   
91811  150158  s2eyoTuJrcP7I_XyjdhUHQ                  Bros Pizzeria   
91812  150293  esBGrrmuZzSiECyRBoKvvA  Colony Grill - St. Petersburg   
91813  150293  esBGrrmuZzSiECyRBoKvvA  Colony Grill - St. Petersburg   

                       address             city state postal_code  latitude  \
0                163 107th Ave  Treasure Island    FL   

  merged_rental_distance['price'] = merged_rental_distance['price'].replace(price_map)


In [None]:
# Define the bins for distance
bins = [0, 1, 5, float('inf')]  # Bins: 0-1, 1-5, 5-10, and greater than 10 km

# Define the labels for each bin
labels = ['less than 1 km', '1-5 km', 'larger than 5 km']  # Labels for the bins

merged_rental_distance['distance_category'] = pd.cut(matched_zip_df['distance_km'], bins=bins, labels=labels, right=False)

In [None]:
model = smf.ols(formula='stars ~ C(price) + C(distance_category)', data=merged_rental_distance).fit()
model.summary()

0,1,2,3
Dep. Variable:,stars,R-squared:,0.222
Model:,OLS,Adj. R-squared:,0.221
Method:,Least Squares,F-statistic:,554.1
Date:,"Tue, 03 Dec 2024",Prob (F-statistic):,0.0
Time:,08:39:56,Log-Likelihood:,-10962.0
No. Observations:,9738,AIC:,21940.0
Df Residuals:,9732,BIC:,21980.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,2.3584,0.015,161.472,0.000,2.330,2.387
C(price)[T.2.0],1.0128,0.020,51.287,0.000,0.974,1.051
C(price)[T.3.0],1.5833,0.167,9.478,0.000,1.256,1.911
C(price)[T.4.0],2.1474,0.249,8.623,0.000,1.659,2.636
C(distance_category)[T.1-5 km],-0.0111,0.017,-0.643,0.520,-0.045,0.023
C(distance_category)[T.larger than 5 km],-0.0528,0.025,-2.152,0.031,-0.101,-0.005

0,1,2,3
Omnibus:,1528.852,Durbin-Watson:,0.151
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2415.951
Skew:,1.094,Prob(JB):,0.0
Kurtosis:,4.082,Cond. No.,39.5


In [None]:
# Step 1: Randomly sample 5 values from each category (distance category)
random_samples_per_bin = merged_rental_distance.groupby('distance_category').apply(lambda x: x.sample(n=5, random_state=42))

# Step 2: Reset the index for clarity
random_samples_per_bin = random_samples_per_bin.reset_index(drop=True)

  random_samples_per_bin = merged_rental_distance.groupby('distance_category').apply(lambda x: x.sample(n=5, random_state=42))
  random_samples_per_bin = merged_rental_distance.groupby('distance_category').apply(lambda x: x.sample(n=5, random_state=42))


In [None]:
model = smf.ols(formula='stars ~ C(rental_ratings) + price + C(distance_category)', data=random_samples_per_bin).fit()
model.summary()

  res = hypotest_fun_out(*samples, **kwds)


0,1,2,3
Dep. Variable:,stars,R-squared:,0.78
Model:,OLS,Adj. R-squared:,0.341
Method:,Least Squares,F-statistic:,1.777
Date:,"Tue, 03 Dec 2024",Prob (F-statistic):,0.248
Time:,07:26:07,Log-Likelihood:,-6.0771
No. Observations:,19,AIC:,38.15
Df Residuals:,6,BIC:,50.43
Df Model:,12,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,1.6455,1.105,1.489,0.187,-1.058,4.349
C(rental_ratings)[T.4.0],-0.7788,1.050,-0.741,0.486,-3.349,1.792
C(rental_ratings)[T.4.4],-0.7788,1.050,-0.741,0.486,-3.349,1.792
C(rental_ratings)[T.4.5],0.0606,0.935,0.065,0.950,-2.226,2.347
C(rental_ratings)[T.4.6],0.7667,1.061,0.723,0.497,-1.829,3.362
C(rental_ratings)[T.4.7],1.0000,0.838,1.193,0.278,-1.052,3.052
C(rental_ratings)[T.4.8],-0.0242,0.756,-0.032,0.975,-1.873,1.825
C(rental_ratings)[T.4.9],-0.5061,0.909,-0.557,0.598,-2.731,1.719
C(rental_ratings)[T.5.0],1.0121,0.703,1.440,0.200,-0.708,2.732

0,1,2,3
Omnibus:,1.167,Durbin-Watson:,2.44
Prob(Omnibus):,0.558,Jarque-Bera (JB):,0.474
Skew:,-0.386,Prob(JB):,0.789
Kurtosis:,3.05,Cond. No.,29.3


# Data Prep for Machine Learning Models

In [None]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

In [None]:
# Prepare features (X) and target (y)
X = merged_hotel_distance[['price', 'distance_km']]  # You can include more features if necessary
y = merged_hotel_distance['stars']  # The target variable (restaurant stars)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Optionally, scale the features (important for some models like SVM or neural networks)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Random Forest Model

In [None]:
# Initialize and train the Random Forest Regressor
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train_scaled, y_train)

# Predict on the test set
y_pred_rf = rf_model.predict(X_test_scaled)

# Evaluate the Random Forest model
rf_mse = mean_squared_error(y_test, y_pred_rf)
rf_r2 = r2_score(y_test, y_pred_rf)

print(f"Random Forest Regressor - MSE: {rf_mse:.4f}, R-squared: {rf_r2:.4f}")

Random Forest Regressor - MSE: 0.0921, R-squared: 0.8876
