# Pizza Lovers
## Your data analysis of pizzerias in Paris

### Import of the data

In [1]:
import pandas as pd
import numpy as np

# Load the first scraped batch; the file uses "|" as field separator.
df = pd.read_csv('inputs_from_exterior/basic_data.csv', sep='|')

### Merge with new scraping

In [2]:
# Load the second scraped batch (same "|"-separated layout as the first one).
new_df = pd.read_csv('inputs_from_exterior/basic_data_75020_75007.csv', sep='|')

In [3]:
# Stack the two scraped batches and renumber the rows 0..n-1 in one step;
# ignore_index=True replaces the separate in-place reset_index call.
df = pd.concat([df, new_df], ignore_index=True)
df

Unnamed: 0,name,average_rate,nb_of_reviews,address
0,Pizza Wawa,45,1535,"35 Rue Saint-Honoré, 75001 Paris"
1,Pizzeria Iovine's,45,2591,"7bis Rue du Colonel Driant, 75001 Paris"
2,Milo,46,579,"44 Rue Saint-Honoré, 75001 Paris"
3,La Tavola Calda,45,513,"39 Rue des Bourdonnais, 75001 Paris"
4,Liberto,47,1697,"23 Rue Berger, 75001 Paris"
...,...,...,...,...
592,15-12,15,6,"6 Av. Rapp, 75007 Paris"
593,Nicole's,40,426,"14 Rue de Bellechasse, 75007 Paris"
594,Pizza Fiorentina,30,571,"7 Bd de Grenelle, 75015 Paris"
595,Primo Piano,39,45,"Le Bon Marché, 24 Rue de Sèvres, 75007 Paris"


### Split of the postal code and the city

### Cleaning the data
#### Remove duplicates

In [4]:
# Remove exact duplicate rows. The surviving rows keep their original index
# labels (no renumbering); the cleaning cells below address rows by label.
df = df.drop_duplicates()

#### Show the restaurants that do not have Paris in their address

In [5]:
# Lift the row limit so every address lacking "Paris" is listed.
# display() is required: only a cell's last expression is rendered
# automatically, so the bare filter expressions were never shown.
pd.set_option('display.max_rows', None)
display(df[~df['address'].str.contains('Paris')])

# Drop the rows labelled 306, 411, 513, 519 and 528: they are not in Paris.
df = df.drop([306, 411, 513, 519, 528])

# Show what is still flagged after the drop, then cap displayed rows at 10
# for the rest of the notebook.
display(df[~df['address'].str.contains('Paris')])
pd.set_option('display.max_rows', 10)

In [6]:
# Look up the CAMPISI entry so its address can be inspected.
# display() is required because this is not the last expression of the cell.
display(df[df['name'].str.contains('CAMPISI')])

# Overwrite the address of the row labelled 369 with one that names Paris
# (presumably the CAMPISI row shown above; verify the label if the data changes).
df.at[369, 'address'] = 'Angle rue de Boulanvilliers, 1 Rue des Bauches, 75016 Paris'

# Check: no address should be missing "Paris" any more (expected: empty frame).
df[~df['address'].str.contains('Paris')]

Unnamed: 0,name,average_rate,nb_of_reviews,address


#### Look for missing values in each column

In [7]:
# Count the missing values in each column.
df.isna().sum()

name             0
average_rate     0
nb_of_reviews    0
address          0
dtype: int64

#### Display types of each column and change type of columns

In [8]:
# Convert average_rate from strings written with a decimal comma to floats.
# astype(str) keeps the cell re-runnable: once the column is already float,
# the .str accessor would otherwise raise. regex=False makes the replacement
# a literal one on every pandas version.
df['average_rate'] = pd.to_numeric(
    df['average_rate'].astype(str).str.replace(',', '.', regex=False)
)

# name and address stay as object columns; they are handled through the
# .str accessor.
df.dtypes

name              object
average_rate     float64
nb_of_reviews      int64
address           object
dtype: object

#### Display unique values of average_rate and nb_of_reviews columns

In [9]:
# df.average_rate.unique()
# df.nb_of_reviews.unique()

### Transform the data

#### Add a "city" column

In [10]:
# The city is the last whitespace-separated token of the address.
# The vectorised .str accessors replace the manual loop over the split lists.
df['city'] = df['address'].str.split().str[-1]

### Add a "postal_code" column

In [11]:
# The postal code is the second-to-last whitespace-separated token of the
# address (this assumes every address ends in "<postal code> <city>").
# The vectorised .str accessors replace the manual loop over the split lists.
df['postal_code'] = df['address'].str.split().str[-2]

In [12]:
# Fold postal code 75116 into 75016 so both are grouped under a single code.
df['postal_code'] = df['postal_code'].replace({'75116': '75016'})

### Add the latitude and longitude

In [13]:
# from geopy.geocoders import Nominatim
# geolocator = Nominatim(user_agent="pizza_lovers_THP")

# from geopy.extra.rate_limiter import RateLimiter
# geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)

# df['location'] = df['address'].apply(geocode)

# df['point'] = df['location'].apply(lambda loc: tuple(loc.point) if loc else None)

#### Check the number of none

In [14]:
# df.groupby(a['point'])['point'].count().sort_values(ascending=False).head(24
#     )

### Print to CSV

In [15]:
# Save the cleaned table as a "|"-separated file (the row index is written too).
df.to_csv('inputs_from_processing/basic_data_cleaned.csv', sep = '|')

## Analysis
### Statistics by district

#### First three districts by number of pizzerias

In [16]:
# Number of pizzerias per postal code, then the three largest counts.
pizzerias_per_district = df.groupby(['postal_code'])['postal_code'].count()
pizzerias_per_district.nlargest(3)

postal_code
75016    103
75005     68
75001     63
Name: postal_code, dtype: int64

#### First three districts by average rating

In [17]:
# Mean rating per postal code, then the three highest means.
mean_rate_per_district = df.groupby(['postal_code'])['average_rate'].mean()
mean_rate_per_district.nlargest(3)

postal_code
75009    4.688889
75003    4.540000
75014    4.490909
Name: average_rate, dtype: float64

#### First three pizzerias of each district by average rate

In [18]:
# val = input("Entrer the district you want :")

# df_3 = df.loc[df['postal_code'] == val].sort_values(by=['average_rate'], ascending=False).head(3)

In [19]:
# Full ranking of all pizzerias by rating (best first), without the address
# and city columns. Note: this shows every row, not a top three per district.
df_3 = (
    df.sort_values(by=['average_rate'], ascending=False)
      .drop(columns=['address', 'city'])
)
df_3

Unnamed: 0,name,average_rate,nb_of_reviews,postal_code
91,PALLA PIZZA,5.0,2,75010
168,Panuozzo,5.0,1,75005
521,Clip Pizza,5.0,1,75020
118,Univers Pizza,5.0,80,75005
511,Pizzeria Palermo,5.0,3,75020
...,...,...,...,...
39,ACDS Restauration,2.5,2,75001
176,Les Balkans,2.5,82,75005
358,Pizzeria Tradition,2.0,8,75016
592,15-12,1.5,6,75007


#### First three pizzerias of each district by number of reviews

In [20]:
# val = input("Entrer the district you want :")

# df.loc[df['postal_code'] == val].sort_values(by=['nb_of_reviews'], ascending=False).head(3)

# Dash
## Examples for the graph

In [21]:
# Table used by the charts: all pizzerias sorted by rating (best first),
# without the address and city columns.
df_graph = (
    df.sort_values(by=['average_rate'], ascending=False)
      .drop(columns=['address', 'city'])
)
df_graph


def df_graph_top_5_by_rating(df, district_number):
    """Return the five best-rated rows of ``df`` for one postal code.

    Rows whose ``postal_code`` equals ``district_number`` are sorted by
    ``average_rate`` in descending order and the first five are returned
    (fewer when the district has fewer rows).
    """
    in_district = df.loc[df['postal_code'] == district_number]
    ranked = in_district.sort_values(by=['average_rate'], ascending=False)
    return ranked.head(5)

def df_graph_top_5_by_nb_reviews(df, district_number):
    """Return the five most-reviewed rows of ``df`` for one postal code.

    Rows whose ``postal_code`` equals ``district_number`` are sorted by
    ``nb_of_reviews`` in descending order and the first five are returned
    (fewer when the district has fewer rows).
    """
    in_district = df.loc[df['postal_code'] == district_number]
    ranked = in_district.sort_values(by=['nb_of_reviews'], ascending=False)
    return ranked.head(5)

def series_top_5_district_by_average_rate(df):
    """Return the five postal codes with the highest mean ``average_rate``.

    The result is a Series indexed by ``postal_code`` whose values are the
    per-district means, ordered from highest to lowest.
    """
    mean_rate_by_district = df.groupby(['postal_code'])['average_rate'].mean()
    return mean_rate_by_district.nlargest(5)


## Visualization with Dash

In [22]:
import jupyter_dash
from jupyter_dash import JupyterDash
import plotly.express as px

import dash
import dash_bootstrap_components as dbc
from dash.dependencies import Input, Output
from dash import dcc, html

import pandas as pd
import numpy as np


#------------------------
# CREATION OF THE NEEDED DATASETS AND LISTS
#------------------------

# Table behind every chart: all pizzerias sorted by rating (best first),
# without the address and city columns.
df_graph = (
    df.sort_values(by=['average_rate'], ascending=False)
      .drop(columns=['address', 'city'])
)

#------------------------
# METHODS
#------------------------

def df_graph_top_5_by_rating(df, district_number):
    """Return the five best-rated rows of ``df`` for one postal code.

    Rows whose ``postal_code`` equals ``district_number`` are sorted by
    ``average_rate`` in descending order and the first five are returned
    (fewer when the district has fewer rows).
    """
    in_district = df.loc[df['postal_code'] == district_number]
    ranked = in_district.sort_values(by=['average_rate'], ascending=False)
    return ranked.head(5)

def df_graph_top_5_by_nb_reviews(df, district_number):
    """Return the five most-reviewed rows of ``df`` for one postal code.

    Rows whose ``postal_code`` equals ``district_number`` are sorted by
    ``nb_of_reviews`` in descending order and the first five are returned
    (fewer when the district has fewer rows).
    """
    in_district = df.loc[df['postal_code'] == district_number]
    ranked = in_district.sort_values(by=['nb_of_reviews'], ascending=False)
    return ranked.head(5)

def series_top_5_district_by_average_rate(df):
    """Return the five postal codes with the highest mean ``average_rate``.

    The result is a Series indexed by ``postal_code`` whose values are the
    per-district means, ordered from highest to lowest.
    """
    mean_rate_by_district = df.groupby(['postal_code'])['average_rate'].mean()
    return mean_rate_by_district.nlargest(5)

# Mean rating of the five best-rated districts; feeds the static bar chart
# at the bottom of the layout.
a = series_top_5_district_by_average_rate(df_graph)

#------------------------
# APP
#------------------------
# JupyterDash is the Jupyter-specific part: to use Dash outside of Jupyter,
# this class (and its import) has to be changed.
# The bare `jupyter_dash.JupyterDash` expression that used to precede this
# line had no effect and was removed.
app = JupyterDash(__name__, external_stylesheets=[dbc.themes.CYBORG])

#------------------------
# Layout
#------------------------

# Sorted postal codes offered by both dropdowns, as a plain list (the
# documented form for Dropdown options) rather than a NumPy array.
district_options = df_graph['postal_code'].sort_values().unique().tolist()
# Both dropdowns start on the smallest postal code.
default_district = df_graph['postal_code'].min()

# Page structure: two dropdown-driven charts (top 5 pizzerias of a district by
# rating, then by number of reviews) followed by a static chart of the five
# best-rated districts. The dropdown charts get their figures from callbacks.
app.layout = html.Div(children=
    [
    html.H1(children='Top 5 des pizzerias par arrondissement'),

    html.Div(children='''
        Selon leur note :
    '''),
    html.Div(
        dcc.Dropdown(options=district_options,
        id="district-filter-1",
        value=default_district
        )),

    dcc.Graph(
        id='top-5-rating-by-district-graph-with-dd'
    ),

    html.Div(children='''
        Selon le nombre de commentaires :
    '''),
    html.Div(
        dcc.Dropdown(options=district_options,
        id="district-filter-2",
        value=default_district
        )),

    dcc.Graph(
        id='top-5-nb-reviews-by-district-graph-with-dd'
    ),

    html.H1(children='Top 5 des arrondissements'),

    html.Div(children='''
        Selon leur note moyenne :
    '''),

    # Static figure: the y-axis is padded by 0.2 around the plotted means.
    dcc.Graph(
            figure= px.bar(a,
                 x= a.index , y= a.values, barmode="group",
                template='plotly_dark', range_y=[a.min() - 0.2 ,a.max()+0.2])

    ),
])
   
#------------------------
# CALLBACKS
#------------------------

@app.callback(
    Output('top-5-rating-by-district-graph-with-dd', 'figure'),
    Input('district-filter-1', 'value')
)
def update_figure_1(selected_district):
    """Build the "top 5 by rating" bar chart for the selected postal code.

    Parameters
    ----------
    selected_district
        Current value of the ``district-filter-1`` dropdown (``None`` when
        the dropdown has been cleared).

    Returns
    -------
    A bar figure of ``average_rate`` per pizzeria name, or ``dash.no_update``
    when no row matches the selection.
    """
    filtered_df = df_graph_top_5_by_rating(df_graph, selected_district)

    # A cleared dropdown sends None, which matches no row. Keep the current
    # figure instead of building a chart whose y-range would be NaN.
    if filtered_df.empty:
        return dash.no_update

    fig_1 = px.bar(filtered_df,
                 x='name', y='average_rate', barmode="group",
                template='plotly_dark')

    # y-axis runs from just under the lowest displayed rating up to 5.
    fig_1.update_layout(transition_duration=500, yaxis_range=[filtered_df['average_rate'].min() - 0.2 ,5])

    return fig_1

@app.callback(
    Output('top-5-nb-reviews-by-district-graph-with-dd', 'figure'),
    Input('district-filter-2', 'value')
)
def update_figure_2(selected_district):
    """Build the "top 5 by number of reviews" bar chart for the selected postal code.

    Parameters
    ----------
    selected_district
        Current value of the ``district-filter-2`` dropdown (``None`` when
        the dropdown has been cleared).

    Returns
    -------
    A bar figure of ``nb_of_reviews`` per pizzeria name, or ``dash.no_update``
    when no row matches the selection.
    """
    filtered_df = df_graph_top_5_by_nb_reviews(df_graph, selected_district)

    # A cleared dropdown sends None, which matches no row. Keep the current
    # figure instead of building a chart whose y-range would be NaN.
    if filtered_df.empty:
        return dash.no_update

    fig_2 = px.bar(filtered_df,
                 x='name', y='nb_of_reviews', barmode="group",
                template='plotly_dark')

    # y-axis is padded by 300 reviews below the minimum and above the maximum.
    fig_2.update_layout(transition_duration=500, yaxis_range=[filtered_df['nb_of_reviews'].min()-300  , filtered_df['nb_of_reviews'].max()+300])

    return fig_2
    
# Start the Dash server in debug mode when this code runs as the main module.
if __name__ == '__main__':
    app.run_server(debug=True)

Dash app running on http://127.0.0.1:8050/
