# Final project: Is there a link between the types of restaurants in a city and the obesity rate?

### For all details on the data, methodology, results and observations, refer to the final report here: __https://github.com/faddy-ds/Coursera_Capstone__

## Step 1: Import all required libraries

In [None]:
pip install -U scikit-learn

In [None]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

import folium

import requests

from scipy import stats
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import jaccard_score
from sklearn.metrics import f1_score

## Step 2: Retrieve and analyze obesity data

In [None]:
# Load the CDC 2011 obesity table from a CSV file in the working directory
# into a DataFrame; the bare name on the last line is there so the notebook
# renders the raw table for inspection.

obesity_rates_cities = pd.read_csv('Obesity data from CDC - 2011.csv')
obesity_rates_cities

In [None]:
# Clean the obesity table:
#   1. drop rows with any missing value, then the unused 'No physical activity' column
#   2. shorten every 'Area' label to "<first city>, <state code>"
#   3. strip the trailing "%" from the three weight columns and convert them to floats
# All string work is vectorised: no per-row chained assignment (which pandas
# does not guarantee to write back) and no re-splitting of the whole column
# on every iteration.

# NOTE: dropna runs before the column drop on purpose, so rows missing only
# 'No physical activity' are still removed, exactly as before.
obesity_rates_cities.dropna(inplace=True)
obesity_rates_cities.drop(columns=['No physical activity'], inplace=True)
obesity_rates_cities.reset_index(drop=True, inplace=True)

# Everything before the last two space-separated words of the raw label.
area_prefix = obesity_rates_cities['Area'].str.rsplit(" ", n=2).str[0]
# First city of a hyphenated multi-city label.
first_city = area_prefix.str.split("-").str[0]
# The prefix ends with the two-letter state code.
state_code = area_prefix.str[-2:]
obesity_rates_cities['Area'] = first_city + ", " + state_code

# Keep only the text before the first "%" in each percentage column.
for pct_col in ['Normal weight', 'Overweight', 'Obese']:
    obesity_rates_cities[pct_col] = obesity_rates_cities[pct_col].str.split("%").str[0]

obesity_rates_cities.set_index('Area', inplace=True)
cols = obesity_rates_cities.columns
# errors='coerce' turns anything that is not a number into NaN instead of raising.
obesity_rates_cities[cols] = obesity_rates_cities[cols].apply(pd.to_numeric, downcast='float', errors='coerce')

obesity_rates_cities

In [None]:
# Visualize the statistical distribution of each weight category as a
# histogram (one figure per category).
# The bars are drawn with matplotlib's Axes.hist instead of the deprecated
# sns.distplot; alpha=0.4 mirrors distplot's default bar transparency.

sns.set(color_codes=True)

# (column name, bar colour), in the order the figures are produced
weight_categories = [('Obese', 'r'), ('Overweight', 'y'), ('Normal weight', 'g')]

for category, colour in weight_categories:
    # The numeric conversion upstream uses errors='coerce', so NaNs are
    # possible; drop them so the bin range and the tick range stay finite.
    values = obesity_rates_cities[category].dropna()

    fig_cat, ax_cat = plt.subplots(figsize=(20, 10))
    ax_cat.hist(values, bins=191, color=colour, alpha=0.4)
    ax_cat.set_title(category)
    ax_cat.set(xlabel="Percentage of city's population in this category", ylabel='Count of cities')
    # One tick per whole percentage point, running one past the rounded-down maximum.
    ax_cat.set_xticks(range(int(values.min()), int(values.max() + 2)))

plt.show()

In [None]:
# Box plot of obesity_rates_cities (DataFrame.boxplot draws one box per
# column), used to spot outliers in each weight category.

obesity_rates_cities.boxplot()

In [None]:
# Get geographical coordinates for the cities, using dataset from https://simplemaps.com/data/us-cities
# Result: city_coord_keep, indexed by "City, ST" ('Area') with columns 'lat' and 'lng',
# restricted to the areas present in obesity_rates_cities and sorted by area.

city_coord = pd.read_csv('uscities.csv')
# NOTE(review): this drops a city when ANY column of the CSV is missing, not
# just the four columns used below. Kept as-is so the matched set of cities
# is unchanged, but it may be discarding usable rows - confirm.
city_coord.dropna(inplace=True)
city_coord['Area'] = city_coord['city'] + ', ' + city_coord['state_id']
# Select the needed columns by name rather than dropping "everything else"
# with a positional axis argument, which current pandas no longer accepts.
city_coord = city_coord[['lat', 'lng', 'Area']]

# Build the filtered frame in one chain so nothing is modified in place on a
# slice of city_coord.
city_coord_keep = (
    city_coord.loc[city_coord['Area'].isin(obesity_rates_cities.index)]
    .set_index('Area')
    .sort_index()
)
city_coord_keep

# There are only 160 rows, so 31 of the cities for which I have obesity data are not in the lat-long database

In [None]:
# Attach each city's coordinates to its obesity figures.
# Both frames are indexed by area, so this is an index-on-index join (default
# inner: only areas present in both frames survive); the coordinate columns
# are given readable names.

data = obesity_rates_cities.merge(
    city_coord_keep,
    left_index=True,
    right_index=True,
).rename(columns={'lat': 'Latitude', 'lng': 'Longitude'})
data

In [None]:
# Statistical analysis of the data: summary statistics for the merged
# obesity + coordinates table. include='all' asks describe() to summarise
# every column regardless of dtype.

data.describe(include='all')

In [None]:
# Plot every city on an interactive map; clicking a marker shows the city's
# obesity percentage.

us_map = folium.Map(location=[38, -115], zoom_start=4)

# Appearance shared by all city markers
marker_style = dict(
    radius = 5,
    color = 'blue',
    fill = True,
    fill_color = '#3186cc',
    fill_opacity = 0.7,
    parse_html = False,
)

city_rows = zip(data['Latitude'], data['Longitude'], data['Obese'], data.index)
for city_lat, city_lng, obese_pct, area_name in city_rows:
    popup_text = '{}% of population in {} is obese'.format(obese_pct, area_name)
    city_marker = folium.CircleMarker(
        [city_lat, city_lng],
        popup = folium.Popup(popup_text, parse_html=True),
        **marker_style)
    city_marker.add_to(us_map)

us_map

## Step 3: Compile most common restaurant categories per city

In [None]:
# Foursquare API setup, plus a single test request to check the query and the
# shape of the response before looping over every city.

import os

# SECURITY NOTE(review): these credentials are committed in plain text. They
# should be rotated and the literal fallbacks removed; until then, the
# FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET environment variables take
# precedence when set.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', 'UEHS0BUJQOPZQPPKWNBA1YZSCP3O14SZYGT1ZC0XECDCJFMG')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '3AM1HHTVARNZV5VMLJKY25YQOXKYHC0M3R45IZL3QQ4LXB5S')
VERSION = '20180604'

# Below is a test with the first city in `data` to test the query and results' format

LIMIT = 20
category = '4d4b7105d754a06374d81259'
intent = 'browse'
# Look the coordinates up by column name so the query does not depend on the
# column order of `data`.
lat = data['Latitude'].iloc[0]
long = data['Longitude'].iloc[0]

url = 'https://api.foursquare.com/v2/venues/explore'
# Let requests build and escape the query string instead of formatting it by hand.
query = {
    'client_id': CLIENT_ID,
    'client_secret': CLIENT_SECRET,
    'v': VERSION,
    'll': '{},{}'.format(lat, long),
    'limit': LIMIT,
    'categoryId': category,
    'intent': intent,
}

response = requests.get(url, params=query, timeout=30)
# Fail with the HTTP error itself (bad credentials, quota exceeded, ...)
# rather than with a KeyError on the JSON body.
response.raise_for_status()

results = response.json()["response"]['groups'][0]['items']
results[0]['venue']['categories'][0]['name']

In [None]:
# Query Foursquare once per city and collect its food venues.
# Not specifying a radius defaults to city wide search. The specified category is all "Food" venues
# Result: `venues`, a list with one entry per city; each entry is a list of
# (area, latitude, longitude, venue name, venue category) tuples.
area_list = data.index
latitudes = data['Latitude']
longitudes = data['Longitude']
category = '4d4b7105d754a06374d81259'
intent = 'browse'
LIMIT = 100

url = 'https://api.foursquare.com/v2/venues/explore'

venues = []

for areas, lat, long in zip(area_list, latitudes, longitudes):

    # Let requests build and escape the query string.
    response = requests.get(
        url,
        params={
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, long),
            'categoryId': category,
            'intent': intent,
            'limit': LIMIT,
        },
        timeout=30,
    )
    # Stop on an HTTP error (quota, credentials) instead of failing later
    # with a KeyError on an error payload.
    response.raise_for_status()

    groups = response.json()["response"].get('groups') or []
    # A city with no result group contributes an empty list.
    results = groups[0]['items'] if groups else []

    venues.append([
        (
            areas,
            lat,
            long,
            v['venue']['name'],
            v['venue']['categories'][0]['name'],
        )
        for v in results
        # Skip venues with no category: they cannot be counted by type.
        if v['venue'].get('categories')
    ])

In [None]:
# Flatten the per-city venue lists into one table with one row per venue.
# The frame is built in a single constructor call: appending row by row with
# .loc re-allocates the frame on every insert and leaves every column typed
# as generic objects.
food_venues = pd.DataFrame(
    [venue for city_venues in venues for venue in city_venues],
    columns = ['Area',
               'Area Latitude',
               'Area Longitude',
               'Venue',
               'Venue Category'])

display(food_venues)

In [None]:
# Summary statistics of the venue table; include='all' asks describe() to
# cover every column regardless of dtype.
food_venues.describe(include='all')

In [None]:
# Bar chart of how many venues fall into each category, with the count
# written at the top of every bar.
fig, ax = plt.subplots(figsize = (20,10))
sns.countplot(x="Venue Category", data=food_venues)
# Category names are long: rotate them so they do not overlap.
ax.set_xticklabels(ax.get_xticklabels(), rotation=90, ha="center")
plt.tight_layout()

for bar in ax.patches:
    bar_height = bar.get_height()
    bar_top = (bar.get_x(), bar.get_y() + bar_height)
    ax.annotate(bar_height, bar_top)

plt.show()

In [None]:
# Share of each venue category within every city.
# One-hot encode the category of every venue, then average the indicator
# columns per area: the mean of a 0/1 column is the fraction of that area's
# venues belonging to the category.

food_venues_onehot = pd.get_dummies(food_venues[['Venue Category']], prefix="", prefix_sep="")
food_venues_onehot['Area'] = food_venues['Area']

# Rotate the column that was just appended to the front of the table.
column_order = list(food_venues_onehot.columns)
food_venues_onehot = food_venues_onehot[column_order[-1:] + column_order[:-1]]

# Frequencies (not raw totals) are used here; grouping already leaves 'Area' as the index.
venues_grouped = food_venues_onehot.groupby('Area').mean()
venues_grouped

In [None]:
# For every area, record the venue category with the highest frequency and
# that frequency. Both reductions run once over the whole table instead of
# being recomputed for each row, and the result is aligned on the area index
# directly, with no positional [i] lookups on label-indexed Series.
most_common_venue = pd.DataFrame({
    'Most common venue category': venues_grouped.idxmax(axis=1),
    'Venue frequency': venues_grouped.max(axis=1),
})

most_common_venue

In [None]:
# Combine obesity figures, coordinates and the dominant venue category into a
# single table, matching rows on the shared area index.

total_data = data.merge(
    most_common_venue,
    left_index=True,
    right_index=True,
)
total_data

## Step 4: Attempt to find relationship between restaurant type & obesity rate

In [None]:
# Put the venue frequency on a percentage scale by multiplying it by 100.
# NOTE: re-running this cell multiplies the column again.

total_data['Venue frequency'] = total_data['Venue frequency'].mul(100)
total_data

In [None]:
# Split the cities into a training set (first n_train rows) and a test set
# (every remaining row). The previous slices [0:120] and [121:160] silently
# left row 120 out of both sets and assumed the table has exactly 160 rows.
# NOTE(review): the split is positional, not randomised.
n_train = 120
train_data = total_data.iloc[:n_train]
test_data = total_data.iloc[n_train:]

# Features used to predict the dominant venue category. A smaller feature set
# (['Obese', 'Venue frequency'] only) was also tried.
feature_cols = ['Obese', 'Venue frequency', 'Latitude', 'Longitude']
target_col = 'Most common venue category'

x = train_data[feature_cols]
y = train_data[target_col]
x_test = test_data[feature_cols]
y_test = test_data[target_col]

In [None]:
# KN algorithm: find the k with the best test accuracy, for both uniform and
# distance-weighted voting.
# NOTE(review): k is selected on the same test set that is later used to
# report accuracy, so the reported score is optimistic.

# n_neighbors cannot exceed the number of training samples, so derive the
# range from the training set instead of hard-coding its size.
k_values = list(range(1, len(x)))


def _knn_test_scores(weights):
    """Return a DataFrame (indexed by k) with the test accuracy of a KNN
    classifier fitted on (x, y) for every k in k_values.

    weights: passed to KNeighborsClassifier ('uniform' or 'distance').
    """
    accuracies = [
        KNeighborsClassifier(n_neighbors=k, weights=weights).fit(x, y).score(x_test, y_test)
        for k in k_values
    ]
    return pd.DataFrame({'K': k_values, 'Score': accuracies}, index=k_values)


scores = _knn_test_scores('uniform')
scores2 = _knn_test_scores('distance')

# Both curves share one axes; label them so they can be told apart.
sns.lineplot(x="K", y="Score", data=scores, label='uniform weights')
sns.lineplot(x="K", y="Score", data=scores2, label='distance weights')

# Best (K, Score) row for each weighting scheme
display(scores.loc[scores['Score'].idxmax()], scores2.loc[scores2['Score'].idxmax()])

In [None]:
# Fit a KNN classifier with k=18 on the training split (x, y) and report its
# accuracy on the held-out test split (x_test, y_test).

kn = KNeighborsClassifier(n_neighbors=18).fit(x, y)
kn.score(x_test, y_test)

In [None]:
# Gaussian Naive Bayes: fit on the training split and report accuracy on the
# test split.

clf = GaussianNB().fit(x, y)
clf.score(x_test, y_test)

In [None]:
# Decision tree (fixed random_state for reproducible results): fit on the
# training split and report accuracy on the test split.

dt = DecisionTreeClassifier(random_state=0).fit(x, y)
dt.score(x_test, y_test)

In [None]:
# Per-class evaluation of the fitted KNN model on the test split.
# average=None makes both metrics return one score per class instead of a
# single aggregate.

y_true, y_pred = y_test, kn.predict(x_test)

# Jaccard index per class
per_class_jaccard = jaccard_score(y_true, y_pred, average = None)
display(per_class_jaccard)

# F1 score per class
f1_score(y_true, y_pred, average = None)