In [1]:
#Libraries and display options
#Goal of the script: reduce the size of the geolocated Sirene file
import pandas as pd
import geopandas as gpd
#we get rid of parquet warning
import warnings; warnings.filterwarnings('ignore', message='.*initial implementation of Parquet.*')



In [2]:
#Parameters
# File locations: raw input CSV, reduced CSV, and filtered Parquet output.
data_sources = {
    "in": "data/in/GeolocalisationEtablissement_Sirene_pour_etudes_statistiques_utf8.csv",
    "intermediate": "data/interim/sirene_smaller.csv",
    "out": "data/interim/sirene_filtered.pq",
}

#File size barely fits into memory so do this step only if needed
# When True, the raw input file is re-read and the reduced CSV is rewritten.
full_treatement = False

#city context
# presumably (longitude, latitude) of the city centre - TODO confirm
top_cities = {
    'Montpellier': (3.876716, 43.610769),
}

# Bounding box, stored as [lat_min, lat_max, lon_min, lon_max].
lat_min, lat_max = 43.2, 43.8
lon_min, lon_max = 3.3, 4.5
bbox = [lat_min, lat_max, lon_min, lon_max]

In [3]:

#We only keep a few columns to reduce the file size
# Runs only when full_treatement is True: reads the raw ';'-separated file and
# writes a reduced CSV containing just the identifier and the two coordinates.
if full_treatement:

    cols_to_keep = ['siret', 'y_latitude', 'x_longitude']

    # usecols makes the parser load only the needed columns, instead of
    # materialising the whole (barely-fits-in-memory) file and dropping
    # the rest afterwards.
    df = pd.read_csv(data_sources["in"], sep=';', usecols=cols_to_keep)

    # usecols keeps the file's column order, not the list's order, so
    # reindex explicitly to keep the written column order unchanged.
    df = df.loc[:, cols_to_keep]

    df.to_csv(data_sources["intermediate"], index=False)

    # Free the frame; the next cell reloads the reduced CSV from disk.
    del df

In [4]:
# Load the reduced CSV. It is only (re)generated when full_treatement is True,
# so the file must already exist on disk otherwise.
intermediate_csv = data_sources["intermediate"]
df = pd.read_csv(intermediate_csv)
df.head()

Unnamed: 0,siret,y_latitude,x_longitude
0,82644042200010,48.480519,5.857737
1,40909561900014,48.18757,6.187949
2,91091485200017,48.073265,6.014435
3,78341973200020,48.016545,5.832695
4,32169992800016,48.212595,6.260872


In [5]:
#We filter out based on lat , lon
# bbox is read as [lat_min, lat_max, lon_min, lon_max]; bounds are exclusive.
lat_min, lat_max, lon_min, lon_max = bbox

# Filter first with a single combined mask, then sort only the surviving rows.
# Sorting the full frame before filtering (twice) was wasted work: the
# latitude ordering was discarded by the later longitude sort anyway.
# Rows with NaN coordinates compare False and are dropped, as before.
inside_bbox = (
    (df["y_latitude"] > lat_min)
    & (df["y_latitude"] < lat_max)
    & (df["x_longitude"] > lon_min)
    & (df["x_longitude"] < lon_max)
)

# Final ordering is by longitude, as before. The relative order of rows that
# share the same longitude is not guaranteed (default sort is not stable).
df = df.loc[inside_bbox, :].sort_values(by="x_longitude")

df.head()

Unnamed: 0,siret,y_latitude,x_longitude
22472174,44911310900044,43.427691,3.300005
4735745,44847792700025,43.321289,3.30002
11814469,34121084700024,43.322189,3.300025
723190,42835880800015,43.322189,3.300025
10860389,89019614000010,43.323988,3.300034


In [6]:
# Summarize the filtered frame (row count, column dtypes, non-null counts, memory usage)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 652581 entries, 22472174 to 23393407
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   siret        652581 non-null  int64  
 1   y_latitude   652581 non-null  float64
 2   x_longitude  652581 non-null  float64
dtypes: float64(2), int64(1)
memory usage: 19.9 MB


In [7]:
# Build one point per row from the coordinate columns (x = longitude, y = latitude).
# NOTE(review): no CRS is attached (z and crs stay at their None defaults). The
# values look like degrees, presumably WGS84 / EPSG:4326 - confirm and set it
# if downstream code needs a CRS.
points = gpd.points_from_xy(df["x_longitude"], df["y_latitude"])
df["geometry"] = points

# Wrap the frame, which now carries a "geometry" column, as a GeoDataFrame.
gdf = gpd.GeoDataFrame(df)

In [8]:
# Summarize the GeoDataFrame to check that the geometry column was added and is fully populated
gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 652581 entries, 22472174 to 23393407
Data columns (total 4 columns):
 #   Column       Non-Null Count   Dtype   
---  ------       --------------   -----   
 0   siret        652581 non-null  int64   
 1   y_latitude   652581 non-null  float64 
 2   x_longitude  652581 non-null  float64 
 3   geometry     652581 non-null  geometry
dtypes: float64(2), geometry(1), int64(1)
memory usage: 24.9 MB


In [9]:
# Write the filtered GeoDataFrame to the Parquet output path, then release
# both frames from kernel memory.
out_path = data_sources["out"]
gdf.to_parquet(out_path)
del gdf, df