# File preprocessing notebook
This notebook aim to apply some cleaning and filters for future data analysis operations

## First step : Importation of pandas library and creation of the dataframe

In [1]:
import pandas as pd

In [2]:
parquet_df = pd.read_csv('mapstr.csv')

In [3]:
parquet_df

Unnamed: 0,name,address,icon,userComment,tags
0,Cookie Love,"84 Rue d'Aboukir, 75002 Paris, France",generic,,💸#Cookie 🍪
1,Troika Royale,"23 Rue des Filatiers, 31000 Toulouse, France",restaurant,🇷🇺🇬🇪,💸💸#Asiat#TheFork 🍴
2,Café Bong,"11 Rue de la Bourse, 31000 Toulouse, France",cafe,,💸💸#Asiat#Healthy
3,Au Péché Mignon La Maison De Ravel,"37 Rue du Languedoc, 31000 Toulouse, France",bakery,,💸💸#Boulangerie 🥖
4,La Brewlangerie,"306 Avenue de Muret, 31300 Toulouse, France",bakery,,💸#Boulangerie 🥖#Cookie 🍪
...,...,...,...,...,...
812,O'Cheese Naans,"20 Rue Pablo Neruda, 77200 Torcy, France",restaurant,,Burger 🍔🍟#Grec 🥙#💸#À faire
813,Broadway Coffee,"4 Boulevard Carnot, 93250 Villemomble, France",fastfood,,💸#Burger 🍔🍟#À faire
814,Mamagaya,24 Avenue Henri Barbusse 93700 Drancy France,restaurant,,💸#Grec 🥙#Pâtes 🍝#À faire
815,La Maison Du Tacos,"23 Rue Saint-Spire, 91100 Corbeil-Essonnes, Fr...",restaurant,Meilleur endroit pour prendre un tacos FR . 🖕🏾...,💸#Halal#Tacos 🇫🇷#Validé ✅


## Second step : Filtering out the Private addresses, replacing the NaN values for comments to empty string and desc ordering to have the oldests addresses first 

In [4]:
privateless_parquet_df = parquet_df[~parquet_df['tags'].str.contains("Private")]

In [5]:
privateless_parquet_df['userComment'] = privateless_parquet_df['userComment'].fillna("")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  privateless_parquet_df['userComment'] = privateless_parquet_df['userComment'].fillna("")


In [6]:
privateless_parquet_df

Unnamed: 0,name,address,icon,userComment,tags
0,Cookie Love,"84 Rue d'Aboukir, 75002 Paris, France",generic,,💸#Cookie 🍪
1,Troika Royale,"23 Rue des Filatiers, 31000 Toulouse, France",restaurant,🇷🇺🇬🇪,💸💸#Asiat#TheFork 🍴
2,Café Bong,"11 Rue de la Bourse, 31000 Toulouse, France",cafe,,💸💸#Asiat#Healthy
3,Au Péché Mignon La Maison De Ravel,"37 Rue du Languedoc, 31000 Toulouse, France",bakery,,💸💸#Boulangerie 🥖
4,La Brewlangerie,"306 Avenue de Muret, 31300 Toulouse, France",bakery,,💸#Boulangerie 🥖#Cookie 🍪
...,...,...,...,...,...
812,O'Cheese Naans,"20 Rue Pablo Neruda, 77200 Torcy, France",restaurant,,Burger 🍔🍟#Grec 🥙#💸#À faire
813,Broadway Coffee,"4 Boulevard Carnot, 93250 Villemomble, France",fastfood,,💸#Burger 🍔🍟#À faire
814,Mamagaya,24 Avenue Henri Barbusse 93700 Drancy France,restaurant,,💸#Grec 🥙#Pâtes 🍝#À faire
815,La Maison Du Tacos,"23 Rue Saint-Spire, 91100 Corbeil-Essonnes, Fr...",restaurant,Meilleur endroit pour prendre un tacos FR . 🖕🏾...,💸#Halal#Tacos 🇫🇷#Validé ✅


In [7]:
privateless_parquet_df = privateless_parquet_df.iloc[::-1]

In [8]:
privateless_parquet_df = privateless_parquet_df.reset_index(drop=True)

In [9]:
privateless_parquet_df

Unnamed: 0,name,address,icon,userComment,tags
0,French Cantine,"65 Avenue Gambetta, 93170 Bagnolet, France",restaurant,Meilleur sandwich que j’ai mangé. Les plats de...,💸#Grec 🥙#Halal#Pâtes 🍝#Validé ✅
1,La Maison Du Tacos,"23 Rue Saint-Spire, 91100 Corbeil-Essonnes, Fr...",restaurant,Meilleur endroit pour prendre un tacos FR . 🖕🏾...,💸#Halal#Tacos 🇫🇷#Validé ✅
2,Mamagaya,24 Avenue Henri Barbusse 93700 Drancy France,restaurant,,💸#Grec 🥙#Pâtes 🍝#À faire
3,Broadway Coffee,"4 Boulevard Carnot, 93250 Villemomble, France",fastfood,,💸#Burger 🍔🍟#À faire
4,O'Cheese Naans,"20 Rue Pablo Neruda, 77200 Torcy, France",restaurant,,Burger 🍔🍟#Grec 🥙#💸#À faire
...,...,...,...,...,...
790,La Brewlangerie,"306 Avenue de Muret, 31300 Toulouse, France",bakery,,💸#Boulangerie 🥖#Cookie 🍪
791,Au Péché Mignon La Maison De Ravel,"37 Rue du Languedoc, 31000 Toulouse, France",bakery,,💸💸#Boulangerie 🥖
792,Café Bong,"11 Rue de la Bourse, 31000 Toulouse, France",cafe,,💸💸#Asiat#Healthy
793,Troika Royale,"23 Rue des Filatiers, 31000 Toulouse, France",restaurant,🇷🇺🇬🇪,💸💸#Asiat#TheFork 🍴


##

## Final Step : Export of the preprocessed dataframe into csv and parquet formats

In [10]:
privateless_parquet_df.to_parquet('pp-mapstr-addresses.parquet')

In [11]:
privateless_parquet_df.to_csv('ps-mapstr-addresses.csv')