In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, RocCurveDisplay
import xgboost as xgb
from xgboost import XGBClassifier
from xgboost import XGBRegressor
from xgboost import plot_tree
from matplotlib import pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import random
from geopy.geocoders import Nominatim
import ast

pd.set_option('display.max_columns', 100)

In [2]:
listings = pd.read_csv('./data/Listings.csv', encoding='ISO-8859-1')
reviews = pd.read_csv('./data/Reviews.csv', encoding='ISO-8859-1')
si_df = pd.read_csv('./data/staten_island.csv')
man_df = pd.read_csv('./data/manhatten.csv')
bronx_df = pd.read_csv('./data/bronx.csv')
queens_df = pd.read_csv('./data/queens.csv')
brook_df = pd.read_csv('./data/brooklyn.csv')
paris_df = pd.read_csv('./data/paris.csv')

  listings = pd.read_csv('./data/Listings.csv', encoding='ISO-8859-1')


In [3]:
listings = listings[listings['city'].isin(['New York', 'Paris'])]
# listings.info()       

In [4]:
columns_to_drop = ['image_url','url','phone','display_phone']

In [5]:
si_df = si_df.drop(columns_to_drop, axis =1)
man_df = man_df.drop(columns_to_drop, axis =1)
bronx_df = bronx_df.drop(columns_to_drop, axis =1)
queens_df = queens_df.drop(columns_to_drop, axis =1)
brook_df = brook_df.drop(columns_to_drop, axis =1)
paris_df = paris_df.drop(columns_to_drop, axis =1)

In [6]:
si_df.drop_duplicates(subset=['name', 'coordinates.latitude','coordinates.longitude'], inplace=True)
man_df.drop_duplicates(subset=['name', 'coordinates.latitude','coordinates.longitude'], inplace=True)
bronx_df.drop_duplicates(subset=['name', 'coordinates.latitude','coordinates.longitude'], inplace=True)
queens_df.drop_duplicates(subset=['name', 'coordinates.latitude','coordinates.longitude'], inplace=True)
brook_df.drop_duplicates(subset=['name', 'coordinates.latitude','coordinates.longitude'], inplace=True)
paris_df.drop_duplicates(subset=['name', 'coordinates.latitude','coordinates.longitude'], inplace=True)

In [7]:
all_categories=si_df['categories']
all_categories_eval = all_categories.apply(ast.literal_eval)
all_cates = all_categories_eval.apply(lambda x: ', '.join(category['title'] for category in x))

In [8]:
si_df.categories = all_cates
man_df.categories = all_cates
brook_df.categories = all_cates
bronx_df.categories = all_cates
queens_df.categories = all_cates
paris_df.categories = all_cates

In [9]:
print(si_df['categories'].isna().value_counts())
print(man_df['categories'].isna().value_counts())
print(brook_df['categories'].isna().value_counts())
print(bronx_df['categories'].isna().value_counts())
print(queens_df['categories'].isna().value_counts())
print(paris_df['categories'].isna().value_counts())

False    466
Name: categories, dtype: int64
True     959
False    206
Name: categories, dtype: int64
True     793
False    258
Name: categories, dtype: int64
True     437
False    188
Name: categories, dtype: int64
True     1225
False     201
Name: categories, dtype: int64
True     588
False    234
Name: categories, dtype: int64


In [10]:
all_cates_split = all_cates.str.split(',').str[0]

In [11]:
si_df['First Listed Category'] = all_cates_split
man_df['First Listed Category'] = all_cates_split
brook_df['First Listed Category'] = all_cates_split
bronx_df['First Listed Category'] = all_cates_split
queens_df['First Listed Category'] = all_cates_split
paris_df['First Listed Category'] = all_cates_split

In [12]:
si_df = si_df.reset_index()
man_df = man_df.reset_index()
brook_df = brook_df.reset_index()
bronx_df = bronx_df.reset_index()
queens_df = queens_df.reset_index()
paris_df = paris_df.reset_index()

In [13]:
from geopy.distance import great_circle
newport_ri = (41.49008, -71.312796)
cleveland_oh = (41.499498, -81.695391)
great_circle(newport_ri, cleveland_oh).miles

536.9979906964345

In [14]:
si_df.columns

Index(['index', 'Unnamed: 0', 'id', 'alias', 'name', 'is_closed',
       'review_count', 'categories', 'rating', 'transactions', 'distance',
       'coordinates.latitude', 'coordinates.longitude', 'location.address1',
       'location.address2', 'location.address3', 'location.city',
       'location.zip_code', 'location.country', 'location.state',
       'location.display_address', 'price', 'total', 'region',
       'Restaurant or Attraction?', 'First Listed Category'],
      dtype='object')

In [15]:
def distance(lat,long,city,radius):
    listing_coordinate = (lat,long)

    df = pd.DataFrame()
    if city == 'staten_island':
        df = si_df
    elif city == 'manhattan':
        df = man_df
    elif city == 'brooklyn':
        df = brook_df
    elif city == 'bronx':
        df = bronx_df
    elif city == 'queens':
        df = queens_df
    elif city == 'paris':
        df = paris_df
    else:
        return None

    df['In range?'] = df.apply(lambda x: great_circle(listing_coordinate, 
                                (x['coordinates.latitude'], x['coordinates.longitude']))
                                .miles if pd.notnull(x['coordinates.latitude']) 
                                and pd.notnull(x['coordinates.longitude']) else np.nan, axis=1)
    df = df[df['In range?'] < radius]

    df = df[['name','rating','categories','First Listed Category','Restaurant or Attraction?', 'In range?']]
    return df

In [84]:
# test = distance(48.864516,2.345402,'paris',2)
# print(test['rating'].value_counts(),test['First Listed Category'].value_counts(), test['Restaurant or Attraction?'].value_counts())

In [85]:
# for i in range(10):
#     temp = distance(listings.loc[i,'latitude'],listings.loc[i,'longitude'],'paris', 2)
#     print(len(temp),temp['rating'].value_counts(),temp['First Listed Category'].value_counts(), temp['Restaurant or Attraction?'].value_counts(normalize=True))

In [None]:
ml_listings = listings.copy()
# geos = ['bronx']
geos = ['bronx', 'brooklyn', 'manhattan', 'queens', 'staten_island','paris']


for geo in geos:
    for i, row in ml_listings.iterrows():
        if str(row['city']).lower().find(geo) != -1 or str(row['district']).lower().find(geo) != -1:
            temp = distance(row['latitude'], row['longitude'], geo, 2)
            ml_listings.at[i, '# w/in 2mi'] = len(temp)
            ml_listings.at[i, 'Avg yelp rating'] = temp['rating'].mean() if not temp.empty else 0
            ml_listings.at[i, '% Restaurants'] = temp['Restaurant or Attraction?'].value_counts(normalize=True).iloc[0] if not temp.empty else 0

In [29]:
ml_listings[['listing_id','district','# w/in 2mi','Avg yelp rating','% Restaurants']]
ml_listings.to_csv('ml_listings.csv', index=False)

filtered_listings = ml_listings[(ml_listings['district'] == 'Bronx')| (ml_listings['district'] == 'Brooklyn')]
display_columns = ['listing_id', 'district', '# w/in 2mi', 'Avg yelp rating', '% Restaurants']
display_data = filtered_listings[display_columns]
display_data

In [None]:
paris_df.head()

Unnamed: 0.1,index,Unnamed: 0,id,alias,name,is_closed,review_count,categories,rating,transactions,price,distance,coordinates.latitude,coordinates.longitude,location.address1,location.address2,location.address3,location.city,location.zip_code,location.country,location.state,location.display_address,total,region,Restaurant or Attraction?,First Listed Category,In range?
0,0,0,-0iLH7iQNYtoURciDpJf6w,le-comptoir-de-la-gastronomie-paris,Le Comptoir de la Gastronomie,False,1231,"Hot Pot, Korean, Barbeque",4.5,[],€€,566.745918,48.864516,2.345402,34 rue Montmartre,,,Paris,75001,FR,75.0,"['34 rue Montmartre', '75001 Paris', 'France']",11100,"{'center': {'longitude': 2.3378562927246094, '...",Restaurant,Hot Pot,1.5e-05
1,1,1,IU9_wVOGBKjfqTTpAXpKcQ,bistro-des-augustins-paris,Bistro des Augustins,False,470,"Southern, Cocktail Bars",4.5,[],€€,1006.357971,48.854754,2.342119,39 quai des Grands Augustins,,,Paris,75006,FR,75.0,"['39 quai des Grands Augustins', '75006 Paris'...",11100,"{'center': {'longitude': 2.3378562927246094, '...",Restaurant,Southern,0.690802
2,2,2,ctP4c3mwVO5oOzLI48LtuQ,les-antiquaires-paris,Les Antiquaires,False,433,"Bars, American (New), Salad",4.5,[],€€€,917.337995,48.858066,2.328237,13 rue du Bac,,,Paris,75007,FR,75.0,"['13 rue du Bac', '75007 Paris', 'France']",11100,"{'center': {'longitude': 2.3378562927246094, '...",Restaurant,Bars,0.898532
3,3,3,cEjF41ZQB8-SST8cd3EsEw,l-avant-comptoir-paris-3,L'Avant Comptoir,False,648,Asian Fusion,4.5,[],€€,1263.351331,48.85202,2.3388,3 carrefour de l'Odéon,,,Paris,75006,FR,75.0,"[""3 carrefour de l'Odéon"", '75006 Paris', 'Fra...",11100,"{'center': {'longitude': 2.3378562927246094, '...",Restaurant,Asian Fusion,0.914064
4,4,4,-umFmobUgpW_05m_ud1vHw,la-cordonnerie-paris-5,La Cordonnerie,False,93,"Italian, American (Traditional), Sandwiches",4.5,[],€€€,461.497317,48.86543,2.33237,20 rue Saint Roch,,,Paris,75001,FR,75.0,"['20 rue Saint Roch', '75001 Paris', 'France']",11100,"{'center': {'longitude': 2.3378562927246094, '...",Restaurant,Italian,0.595689


In [None]:
si_df.iat[0, 12]

-74.14673