## PARIS SUBSIDIES: DATA CLEANING (SIRENE)

### LIBRARIES IMPORT

In [1]:
import pandas as pd
import numpy as np
from pandas_profiling import ProfileReport

In [2]:
from geopy.geocoders import Nominatim

### DATA IMPORT

In [3]:
# Every column is loaded as text (dtype=str), so pandas applies no numeric type inference.
SIRENE = pd.read_csv(
    '../00_DataFiles/01_Collected/ParisSubsidies_SIRENE.csv',
    dtype=str,
)
# Column overview: names, non-null counts and dtypes.
SIRENE.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11161 entries, 0 to 11160
Data columns (total 18 columns):
 #   Column                                     Non-Null Count  Dtype 
---  ------                                     --------------  ----- 
 0   fields.libellecommuneetablissement         11160 non-null  object
 1   fields.categorieentreprise                 6609 non-null   object
 2   fields.etatadministratifunitelegale        11161 non-null  object
 3   fields.geolocetablissement                 11021 non-null  object
 4   fields.trancheeffectifsunitelegale         6850 non-null   object
 5   fields.trancheeffectifsunitelegaletriable  6850 non-null   object
 6   fields.siretsiegeunitelegale               11161 non-null  object
 7   fields.siret                               11161 non-null  object
 8   fields.etablissementsiege                  11161 non-null  object
 9   fields.adresseetablissement                11155 non-null  object
 10  fields.sectionunitelegale         

#### Profile report before cleaning

In [4]:
# Profile the raw data (index reset to a default one first) and export the report as HTML.
profile = ProfileReport(
    SIRENE.reset_index(drop=True),
    title='SIRENE Profiling Report - Before Cleaning',
    interactions={'continuous': False},
)
profile.to_file('ProfileReports/ParisSubsidies_SIRENE_bf.html')

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

### FIELD BY FIELD APPROACH

#### Duplicates

In [5]:
# Show every row that has a twin on all columns except the two coordinate columns,
# ordered by SIRET so that the twins sit next to each other.
SIRENE[
    SIRENE.duplicated(
        subset=SIRENE.columns.drop(['geometry.coordinates', 'fields.geolocetablissement']),
        keep=False,
    )
].sort_values(by='fields.siret')

Unnamed: 0,fields.libellecommuneetablissement,fields.categorieentreprise,fields.etatadministratifunitelegale,fields.geolocetablissement,fields.trancheeffectifsunitelegale,fields.trancheeffectifsunitelegaletriable,fields.siretsiegeunitelegale,fields.siret,fields.etablissementsiege,fields.adresseetablissement,fields.sectionunitelegale,fields.codepostaletablissement,fields.caractereemployeurunitelegale,fields.datecreationunitelegale,fields.datecreationetablissement,fields.denominationunitelegale,geometry.type,geometry.coordinates
989,MONTREUIL,PME,Active,"[48.86182, 2.435903]",20 à 49 salariés,12,31081902400293,31081902400293,oui,14 Rue DE LA BEAUNE,Hébergement et restauration,93100.0,Oui,1973-10-30,2017-12-01,VACANCES & FAMILLES,Point,"[2.435903, 48.86182]"
11159,MONTREUIL,PME,Active,"[48.86182, 2.435903]",20 à 49 salariés,12,31081902400293,31081902400293,oui,14 Rue DE LA BEAUNE,Hébergement et restauration,93100.0,Oui,1973-10-30,2017-12-01,VACANCES & FAMILLES,Point,"[2.435903, 48.86182]"
11026,PARIS 11,PME,Active,"[48.853229, 2.389652]",6 à 9 salariés,03,32079827500030,32079827500030,oui,10 Impasse DELEPINE,Santé humaine et action sociale,75011.0,Oui,1980-08-01,1993-07-09,BABILLAGES,Point,"[2.389652, 48.853229]"
6978,PARIS 11,PME,Active,"[48.853229, 2.389652]",6 à 9 salariés,03,32079827500030,32079827500030,oui,10 Impasse DELEPINE,Santé humaine et action sociale,75011.0,Oui,1980-08-01,1993-07-09,BABILLAGES,Point,"[2.389652, 48.853229]"
11093,PARIS 6,PME,Active,"[48.845433, 2.321548]",1 ou 2 salariés,01,32239771200017,32239771200017,oui,103 Rue DE VAUGIRARD,Autres activités de services,75006.0,Oui,1981-08-01,1981-08-01,SOS PARIS,Point,"[2.321548, 48.845433]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1165,PARIS 14,,Active,"[48.829565, 2.329143]",,,83479403400019,83479403400019,oui,47 Rue REMY DUMONCEL,Information et communication,75014.0,Non,2017-12-01,2017-12-01,"PARIS 14, TERRITOIRE DE CINEMA",Point,"[2.329143, 48.829565]"
10994,PARIS 15,,Active,"[48.835341, 2.30214]",,,83526701400024,83526701400024,oui,60 Rue DOMBASLE,Autres activités de services,75015.0,Non,2010-04-28,2018-02-16,ASSOCIATION DES COMMERCANTS DE LA RUE VOUILLE,Point,"[2.30214, 48.835341]"
7846,PARIS 15,,Active,"[48.835341, 2.30214]",,,83526701400024,83526701400024,oui,60 Rue DOMBASLE,Autres activités de services,75015.0,Non,2010-04-28,2018-02-16,ASSOCIATION DES COMMERCANTS DE LA RUE VOUILLE,Point,"[2.30214, 48.835341]"
6962,PARIS 19,,Active,"[48.873641, 2.377641]",,,83834612000010,83834612000010,oui,15 Rue JULES ROMAINS,Autres activités de services,75019.0,Non,2017-03-29,2017-03-29,ESPOIR BERBERE,Point,"[2.377641, 48.873641]"


In [6]:
# Keep the first row of each duplicate group (coordinate columns are ignored for the comparison).
SIRENE = SIRENE.drop_duplicates(
    subset=SIRENE.columns.drop(['geometry.coordinates', 'fields.geolocetablissement'])
)

#### Missing values

In [7]:
# Number of missing values in each column.
SIRENE.isnull().sum()

fields.libellecommuneetablissement              1
fields.categorieentreprise                   4533
fields.etatadministratifunitelegale             0
fields.geolocetablissement                    139
fields.trancheeffectifsunitelegale           4292
fields.trancheeffectifsunitelegaletriable    4292
fields.siretsiegeunitelegale                    0
fields.siret                                    0
fields.etablissementsiege                       0
fields.adresseetablissement                     6
fields.sectionunitelegale                       0
fields.codepostaletablissement                  1
fields.caractereemployeurunitelegale            0
fields.datecreationunitelegale                  0
fields.datecreationetablissement                0
fields.denominationunitelegale                  4
geometry.type                                 139
geometry.coordinates                          139
dtype: int64

#### fields.categorieentreprise 
Many NaNs: they are 'non productive' entities (https://www.insee.fr/fr/information/1730869)

In [8]:
# Missing categories are labelled 'Non productif'.
# The result is assigned back to the column instead of calling fillna(inplace=True) on the
# column selection: that form is a chained assignment, which raises a warning in recent
# pandas versions and leaves SIRENE unchanged when Copy-on-Write is enabled.
SIRENE['fields.categorieentreprise'] = SIRENE['fields.categorieentreprise'].fillna('Non productif')

#### fields.etatadministratifunitelegale
Almost all entities are still active so this field is not very informative. We can drop it.

In [9]:
# Remove the administrative-status column.
SIRENE = SIRENE.drop(columns='fields.etatadministratifunitelegale')

#### fields.geolocetablissement
We need to examine and potentially fill the missing values.

In [10]:
# First rows whose geolocation is missing.
SIRENE.loc[SIRENE['fields.geolocetablissement'].isnull()].head()

Unnamed: 0,fields.libellecommuneetablissement,fields.categorieentreprise,fields.geolocetablissement,fields.trancheeffectifsunitelegale,fields.trancheeffectifsunitelegaletriable,fields.siretsiegeunitelegale,fields.siret,fields.etablissementsiege,fields.adresseetablissement,fields.sectionunitelegale,fields.codepostaletablissement,fields.caractereemployeurunitelegale,fields.datecreationunitelegale,fields.datecreationetablissement,fields.denominationunitelegale,geometry.type,geometry.coordinates
216,PARIS 16,Non productif,,,,83405071800016,83405071800016,oui,2 Avenue PAUL DOUMER,Autres activités de services,75116.0,Non,2017-10-11,2017-10-11,ASSOCIATION MANASSE,,
351,PARIS 16,PME,,1 ou 2 salariés,1.0,81065441800013,81065441800013,oui,15 Avenue PRESIDENT WILSON,"Arts, spectacles et activités récréatives",75116.0,Oui,2007-05-15,2007-05-15,ASSOCIATION QI GONG EE TONG FRANCE,,
495,PARIS 16,PME,,10 à 19 salariés,11.0,32765711000018,32765711000018,oui,71 Avenue HENRI MARTIN,"Arts, spectacles et activités récréatives",75116.0,Oui,1983-01-01,1983-01-01,CERCLE FEMININ PARIS,,
766,PARIS 16,PME,,6 à 9 salariés,3.0,34088665400011,34088665400011,oui,34 Avenue DE NEW YORK,Autres activités de services,75116.0,Oui,1987-01-01,1987-01-01,AMERICAN CENTER FOR ART AND CULTURE,,
797,PARIS 16,PME,,1 ou 2 salariés,1.0,78467027500022,78467027500022,oui,27 Rue DECAMPS,Autres activités de services,75116.0,Oui,1900-01-01,1900-01-01,AUMONERIE CATHOLIQUE JANSON-DELACROIX,,


In [11]:
# Are they all located in the XVIe arrondissement?
# Most frequent postal codes among the rows without geolocation.
(
    SIRENE['fields.codepostaletablissement']
    .loc[SIRENE['fields.geolocetablissement'].isna()]
    .value_counts()
    .head()
)

75116.0    112
75004.0      4
91080.0      2
75015.0      2
76620.0      2
Name: fields.codepostaletablissement, dtype: int64

In [12]:
# Almost all of them: probably the geolocalisation was not able to deal with 75116 as 'code postal' instead of 75016.
# We will try to get the geolocation based on the address (switching 75116 to 75016).

# Subsetting: work on a copy of the rows that have no geolocation.
subset = SIRENE.loc[SIRENE['fields.geolocetablissement'].isna()].copy()
subset['fields.codepostaletablissement'] = np.where(subset['fields.codepostaletablissement'] == '75116.0', '75016', subset['fields.codepostaletablissement'])

# Geocoding
geolocator = Nominatim(user_agent="ParisSubsidies")

for i in subset.index:
    # Postal codes are stored as text such as '91190.0'. Keep the integer part and restore a
    # leading zero when needed ('2600.0' -> '02600'); anything non-numeric is truncated as before.
    raw_postcode = str(subset.loc[i, 'fields.codepostaletablissement'])
    integer_part = raw_postcode.split('.')[0]
    postcode = integer_part.zfill(5) if integer_part.isdigit() else raw_postcode[0:5]

    d = {'street': subset.loc[i, 'fields.adresseetablissement'], 'postalcode': postcode, 'country': 'FRANCE'}
    p = geolocator.geocode(d)
    if p is None:
        # No match for this address: the value stays NaN so the row shows up in the
        # manual-input step that follows.
        continue
    # Store as [latitude,longitude], the order used by the existing values of this column and by
    # the manual inputs (geometry.coordinates is the column that holds [longitude, latitude]).
    subset.loc[i, 'fields.geolocetablissement'] = '[' + str(p.latitude) + ',' + str(p.longitude) + ']'

In [13]:
# Address, postal code and name of the rows that still have no geolocation.
subset.loc[
    subset['fields.geolocetablissement'].isnull(),
    [
        'fields.adresseetablissement',
        'fields.codepostaletablissement',
        'fields.denominationunitelegale',
    ],
]

Unnamed: 0,fields.adresseetablissement,fields.codepostaletablissement,fields.denominationunitelegale
2768,DEPARTEMENTALE 128,91190.0,SYSTEM@TIC PARIS REGION
3297,8 Place LES COPAINS D ABORD,91080.0,LES PASSEURS D'ONDES
3480,1 Place MAL DE LATTRE DE TASSIGNY,75016.0,ASSOCIATION SPI DAUPHINE
4570,Place MAL DE LATTRE DE TASSIGNY,75016.0,OREILLE DE DAUPHINE
5055,1 Place MAL DE LATTRE DE TASSIGNY,75016.0,THEATRE A DAUPHINE
5156,55 Rue DU 329E RGT D'INFANTERIE,76620.0,ASSOCIATION LA BAZOOKA
5876,7 Place M RENAUD -JL BARRAULT,75015.0,LES DONNEURS DE VOIX
6587,PARC D'ACTIVITES,84120.0,ACIDD
6954,5 Avenue LA GRANDE ARMEE,75016.0,CENTRE DE RECHERCHE INTERDISCIPLINAIRE EN JURI...
7114,55 Rue DU 329E RGT D'INFANTERIE,76620.0,COMPAGNIE AKTE


In [14]:
# Manual input for remaining NaNs
# Row label -> coordinates string, written one by one into the subset.
manual_geoloc = {
    2768: '[48.6572354,2.18586]',
    3297: '[48.6287341,2.4156858]',
    3480: '[48.8710273,2.271914]',
    4570: '[48.8710273,2.271914]',
    5055: '[48.8710273,2.271914]',
    5156: '[49.5039949,0.122207]',
    5876: '[48.8383529,2.2972237]',
    6587: '[43.7033551,5.5627938]',
    6954: '[48.8742464,2.2901752]',
    7114: '[49.5039949,0.122207]',
    7714: '[48.8710273,2.271914]',
    7768: '[47.3322116,5.0458991]',
    7968: '[48.9255848,2.3621314]',
    9092: '[48.8421348,2.3316201]',
    9625: '[48.8710273,2.271914]',
    10360: '[48.8710273,2.271914]',
}
for row_label, coordinates in manual_geoloc.items():
    subset.loc[row_label, 'fields.geolocetablissement'] = coordinates

In [15]:
# Load data into SIRENE
# Copy each geolocation of the subset back to the row carrying the same label.
for row_label, coordinates in subset['fields.geolocetablissement'].items():
    SIRENE.loc[row_label, 'fields.geolocetablissement'] = coordinates

#### fields.trancheeffectifsunitelegale & fields.trancheeffectifsunitelegaletriable 
NaNs are 'Etablissement non employeur' with triable = -1.

Ordered categorical field.

In [16]:
# Missing headcount brackets become 'Etablissement non employeur', with -1 as sortable code.
# The results are assigned back to the columns instead of calling fillna(inplace=True) on a
# column selection: that form is a chained assignment, which raises a warning in recent
# pandas versions and leaves SIRENE unchanged when Copy-on-Write is enabled.
SIRENE['fields.trancheeffectifsunitelegale'] = SIRENE['fields.trancheeffectifsunitelegale'].fillna('Etablissement non employeur')
SIRENE['fields.trancheeffectifsunitelegaletriable'] = SIRENE['fields.trancheeffectifsunitelegaletriable'].fillna(-1)

In [17]:
# Convert to integer
SIRENE = SIRENE.astype({'fields.trancheeffectifsunitelegaletriable': int})

In [18]:
# Map each headcount label to the sortable code found on the matching row with the smallest index label.
dct_tranche = {
    label: int(
        SIRENE.loc[
            SIRENE.index[SIRENE['fields.trancheeffectifsunitelegale'] == label].min(),
            'fields.trancheeffectifsunitelegaletriable',
        ]
    )
    for label in SIRENE['fields.trancheeffectifsunitelegale'].unique()
}

In [19]:
# Sort dictionary by values
# Rebuild the dict with its keys ordered by their associated code (stable sort).
dct_tranche = {label: dct_tranche[label] for label in sorted(dct_tranche, key=dct_tranche.get)}

In [20]:
# Ordered categorical whose category order follows the key order of dct_tranche.
tranche_dtype = pd.CategoricalDtype(categories=list(dct_tranche), ordered=True)
SIRENE['fields.trancheeffectifsunitelegale'] = SIRENE['fields.trancheeffectifsunitelegale'].astype(tranche_dtype)


#### fields.siretsiegeunitelegale
Uniquely identifies legal entity.

#### fields.siret
Uniquely identifies etablissement.

#### fields.etablissementsiege
'oui' if etablissement is siege.

To convert to binary field 1/0.

In [21]:
# 1 where the value is exactly 'oui', 0 for anything else.
SIRENE['fields.etablissementsiege'] = SIRENE['fields.etablissementsiege'].eq('oui').astype(int)

#### Date fields
Transform to datetime

##### fields.datecreationunitelegale  

In [22]:
# Parse the legal-unit creation date into datetime values.
SIRENE['fields.datecreationunitelegale'] = SIRENE['fields.datecreationunitelegale'].pipe(pd.to_datetime)

##### fields.datecreationetablissement

In [23]:
# Parse the establishment creation date into datetime values.
SIRENE['fields.datecreationetablissement'] = SIRENE['fields.datecreationetablissement'].pipe(pd.to_datetime)

#### Address fields
We keep them for visualisation purposes (hover labels).

##### fields.adresseetablissement
We fill the NaNs manually.

In [24]:
# Rows without a street address.
SIRENE[SIRENE['fields.adresseetablissement'].isnull()]

Unnamed: 0,fields.libellecommuneetablissement,fields.categorieentreprise,fields.geolocetablissement,fields.trancheeffectifsunitelegale,fields.trancheeffectifsunitelegaletriable,fields.siretsiegeunitelegale,fields.siret,fields.etablissementsiege,fields.adresseetablissement,fields.sectionunitelegale,fields.codepostaletablissement,fields.caractereemployeurunitelegale,fields.datecreationunitelegale,fields.datecreationetablissement,fields.denominationunitelegale,geometry.type,geometry.coordinates
1267,LES PILLES,PME,"[44.379526, 5.188954]",Etablissement non employeur,-1,41763712100019,41763712100019,1,,Autres activités de services,26110.0,Non,1997-11-01,1997-11-01,ASSOCIATION AFRICULTURES,Point,"[5.188954, 44.379526]"
4846,COEUVRES-ET-VALSERY,PME,"[49.335993, 3.151804]",1 ou 2 salariés,1,38750560500023,38750560500023,1,,"Arts, spectacles et activités récréatives",2600.0,Non,1991-06-25,2005-05-31,LA LANTERNE MAGIQUE,Point,"[3.151804, 49.335993]"
6327,LOUHOSSOA,PME,"[43.317797, -1.355818]",10 à 19 salariés,11,41103434100049,41103434100049,1,,"Arts, spectacles et activités récréatives",64250.0,Oui,1996-02-26,2008-09-20,LE PETIT THEATRE DE PAIN,Point,"[-1.355818, 43.317797]"
7612,FLEURY-DEVANT-DOUAUMONT,Non productif,"[49.193794, 5.43252]",Etablissement non employeur,-1,78339244200019,78339244200019,1,,"Arts, spectacles et activités récréatives",55100.0,Non,1900-01-01,1900-01-01,COMITE DU MEMORIAL DE VERDUN,Point,"[5.43252, 49.193794]"
10836,PRASLAY,PME,"[47.74088, 5.105861]",1 ou 2 salariés,1,47957806400012,47957806400012,1,,"Arts, spectacles et activités récréatives",52160.0,Oui,2004-10-18,2004-10-18,LES DECISIFS,Point,"[5.105861, 47.74088]"
11123,SOURCE-SEINE,Non productif,"[47.490598, 4.686979]",Etablissement non employeur,-1,50097582600016,50097582600016,1,,Autres activités de services,21690.0,Non,2007-11-15,2007-11-15,ASSOCIATION DES SOURCES DE LA SEINE,Point,"[4.686979, 47.490598]"


In [25]:
# Row label -> street address entered by hand.
manual_addresses = {
    1267: 'Le Village',
    4846: "28 Rue Gabrielle d'Estrées",
    6327: 'Le Bourg',
    7612: '1 Avenue du Corps Européen',
    10836: '2 Rue des Chassaignes',
    11123: '22 Rue de Saint-Germain',
}
for row_label, address in manual_addresses.items():
    SIRENE.loc[row_label, 'fields.adresseetablissement'] = address

##### fields.libellecommuneetablissement
We fill the NaNs manually.

In [26]:
# Rows without a commune name.
SIRENE[SIRENE['fields.libellecommuneetablissement'].isnull()]

Unnamed: 0,fields.libellecommuneetablissement,fields.categorieentreprise,fields.geolocetablissement,fields.trancheeffectifsunitelegale,fields.trancheeffectifsunitelegaletriable,fields.siretsiegeunitelegale,fields.siret,fields.etablissementsiege,fields.adresseetablissement,fields.sectionunitelegale,fields.codepostaletablissement,fields.caractereemployeurunitelegale,fields.datecreationunitelegale,fields.datecreationetablissement,fields.denominationunitelegale,geometry.type,geometry.coordinates
7622,,PME,"[49.028226, 3.957707]",10 à 19 salariés,11,31935615000015,31935615000015,1,HOTEL DE VILLE QUEBEC,Autres activités de services,,Oui,1980-01-01,1980-01-01,ASS INTERNATIONALE DES MAIRES FRANCOPHON,Point,"[3.957707, 49.028226]"


In [27]:
# Commune entered by hand for row 7622.
SIRENE.at[7622, 'fields.libellecommuneetablissement'] = 'QUEBEC'

##### fields.codepostaletablissement
Kept first 5 characters only.

In [28]:
def _format_postcode(raw):
    """Return the postal code as 5-character text.

    The raw column holds text such as '75011.0'. Slicing the first five characters breaks
    4-digit values: '2600.0' (postal code 02600) would become '2600.'. Numeric values are
    therefore converted to an integer and left-padded with zeros. Anything that cannot be
    read as a number (including missing values) falls back to the first five characters
    of its text form.
    """
    try:
        return str(int(float(raw))).zfill(5)
    except (TypeError, ValueError, OverflowError):
        return str(raw)[0:5]


SIRENE['fields.codepostaletablissement'] = SIRENE['fields.codepostaletablissement'].map(_format_postcode)
SIRENE.loc[7622, 'fields.codepostaletablissement'] = 'G1R 4S9' # Except for Quebec

##### Concatenate into a single field (adresseetablissementcomplete)

In [29]:
# Full address: "<street>, <postal code> <commune>".
SIRENE['adresseetablissementcomplete'] = (
    SIRENE['fields.adresseetablissement']
    + ', '
    + SIRENE['fields.codepostaletablissement']
    + ' '
    + SIRENE['fields.libellecommuneetablissement']
)
# We keep the 'code postal' for arrondissement & departement analysis
SIRENE = SIRENE.drop(columns=['fields.adresseetablissement', 'fields.libellecommuneetablissement'])

In [30]:
# Upper case
SIRENE = SIRENE.assign(
    adresseetablissementcomplete=SIRENE['adresseetablissementcomplete'].str.upper()
)

#### fields.sectionunitelegale 
Not very informative and we have more insights in the subsidies file ('Secteurs d'activités définies par l'association').
We can drop it.

In [31]:
# Remove the activity-section column.
SIRENE = SIRENE.drop(columns='fields.sectionunitelegale')

#### fields.caractereemployeurunitelegale
'Oui' if the legal unit can have employees.

To convert to binary field 1/0.

In [32]:
# 1 where the value is exactly 'Oui', 0 for anything else.
SIRENE['fields.caractereemployeurunitelegale'] = SIRENE['fields.caractereemployeurunitelegale'].eq('Oui').astype(int)

#### fields.denominationunitelegale
We fill the NaNs manually.

In [33]:
# Rows without a legal-unit name.
SIRENE[SIRENE['fields.denominationunitelegale'].isnull()]

Unnamed: 0,fields.categorieentreprise,fields.geolocetablissement,fields.trancheeffectifsunitelegale,fields.trancheeffectifsunitelegaletriable,fields.siretsiegeunitelegale,fields.siret,fields.etablissementsiege,fields.codepostaletablissement,fields.caractereemployeurunitelegale,fields.datecreationunitelegale,fields.datecreationetablissement,fields.denominationunitelegale,geometry.type,geometry.coordinates,adresseetablissementcomplete
1523,PME,"[48.863613, 2.405414]",Etablissement non employeur,-1,50980367200013,50980367200013,1,75020,0,2009-01-09,2009-01-09,,Point,"[2.405414, 48.863613]","4 VLA DES LYANES, 75020 PARIS 20"
2498,Non productif,"[48.837568, 2.259573]",Etablissement non employeur,-1,51265462500015,51265462500015,1,75016,0,2009-06-02,2009-06-02,,Point,"[2.259573, 48.837568]","10 RUE GUDIN, 75016 PARIS 16"
5732,PME,"[48.890392, 2.345489]",Etablissement non employeur,-1,82122548900011,82122548900011,1,75018,0,2016-07-01,2016-07-01,,Point,"[2.345489, 48.890392]","59 RUE RAMEY, 75018 PARIS 18"
5886,PME,"[48.819309, 2.343683]",Etablissement non employeur,-1,52837795500026,52837795500018,0,75014,0,2009-03-16,2009-03-16,,Point,"[2.343683, 48.819309]","1 BOULEVARD JOURDAN, 75014 PARIS 14"


In [34]:
# Row label -> legal-unit name entered by hand.
manual_denominations = {
    1523: 'ALESSANDRA LE DU',
    2498: 'NADIA BARBE',
    5732: 'VERONIQUE RIEFFEL',
    5886: 'AHMED LAHLOU',
}
for row_label, denomination in manual_denominations.items():
    SIRENE.loc[row_label, 'fields.denominationunitelegale'] = denomination

In [35]:
# Upper case
# The column name contains a dot, so it is passed to assign() through a dict.
SIRENE = SIRENE.assign(
    **{'fields.denominationunitelegale': SIRENE['fields.denominationunitelegale'].str.upper()}
)

#### geometry.type and geometry.coordinates
Redundant information with fields.geolocetablissement. We can drop them.

In [36]:
# Remove both geometry columns.
SIRENE = SIRENE.drop(columns=['geometry.type', 'geometry.coordinates'])

#### Columns name
We can remove all the 'fields.' prefixes.

In [37]:
# Remove the literal text 'fields.' from every column name.
SIRENE = SIRENE.rename(columns=lambda name: name.replace('fields.', ''))

### CLEAN VIEW

#### Profile report after cleaning

In [38]:
# Column overview of the cleaned frame: names, non-null counts and dtypes.
SIRENE.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11108 entries, 0 to 11156
Data columns (total 13 columns):
 #   Column                              Non-Null Count  Dtype         
---  ------                              --------------  -----         
 0   categorieentreprise                 11108 non-null  object        
 1   geolocetablissement                 11108 non-null  object        
 2   trancheeffectifsunitelegale         11108 non-null  category      
 3   trancheeffectifsunitelegaletriable  11108 non-null  int64         
 4   siretsiegeunitelegale               11108 non-null  object        
 5   siret                               11108 non-null  object        
 6   etablissementsiege                  11108 non-null  int64         
 7   codepostaletablissement             11108 non-null  object        
 8   caractereemployeurunitelegale       11108 non-null  int64         
 9   datecreationunitelegale             11108 non-null  datetime64[ns]
 10  datecreationetablissem

In [39]:
# NOTE(review): this cell is disabled. It was a copy of the "before cleaning" cell (same title and
# same '_bf.html' output), so re-enabling it as-is would have overwritten the "before" report.
# The title and file name below are adjusted for the "after cleaning" report; the '_af' suffix is
# a suggestion by analogy with '_bf' - confirm the intended name before uncommenting.
#profile = ProfileReport(SIRENE.reset_index(drop=True), title='SIRENE Profiling Report - After Cleaning', interactions={'continuous': False})
#profile.to_file('ProfileReports/ParisSubsidies_SIRENE_af.html')

#### Save to CSV

In [40]:
# Write the cleaned table to CSV without the index column.
SIRENE.to_csv(
    '../00_DataFiles/02_Cleaned/ParisSubsidies_SIRENE.csv',
    index=False,
)

#### Save to Feather

In [41]:
# Write the cleaned table to Feather, after replacing the index with a default one
# (presumably because Feather export expects a default index - confirm against the pandas docs).
(
    SIRENE
    .reset_index(drop=True)
    .to_feather('../00_DataFiles/02_Cleaned/ParisSubsidies_SIRENE.feather')
)