## PARIS SUBSIDIES: DATA JOINING

### LIBRARIES IMPORT

In [1]:
import pandas as pd
import numpy as np

### DATA IMPORT

#### Subsidies

In [2]:
# Load the cleaned subsidy records and print their column summary.
subsidies_path = '../00_DataFiles/02_Cleaned/ParisSubsidies_Records.feather'
subsidies = pd.read_feather(subsidies_path)
subsidies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 79968 entries, 0 to 79967
Data columns (total 8 columns):
 #   Column                                           Non-Null Count  Dtype 
---  ------                                           --------------  ----- 
 0   Numéro de dossier                                79968 non-null  object
 1   Année budgétaire                                 79968 non-null  object
 2   Numéro Siret                                     79968 non-null  object
 3   Objet du dossier                                 79968 non-null  object
 4   Montant voté                                     79968 non-null  object
 5   Direction                                        79968 non-null  object
 6   Nature de la subvention                          79968 non-null  object
 7   Secteurs d'activités définies par l'association  79968 non-null  object
dtypes: object(8)
memory usage: 4.9+ MB


In [3]:
# Count missing values per column of the subsidy records.
subsidies.isnull().sum()

Numéro de dossier                                  0
Année budgétaire                                   0
Numéro Siret                                       0
Objet du dossier                                   0
Montant voté                                       0
Direction                                          0
Nature de la subvention                            0
Secteurs d'activités définies par l'association    0
dtype: int64

In [4]:
# Both columns are loaded as object dtype; cast them to integers.
for int_column in ('Année budgétaire', 'Montant voté'):
    subsidies[int_column] = subsidies[int_column].astype(int)

In [5]:
# Map the original French column headers to snake_case identifiers.
dct_rename = {
    "Numéro de dossier": "numero_dossier",
    "Année budgétaire": "annee_budgetaire",
    "Numéro Siret": "siret",
    "Objet du dossier": "objet_dossier",
    "Montant voté": "montant_vote",
    "Direction": "direction",
    "Nature de la subvention": "nature_subvention",
    "Secteurs d'activités définies par l'association": "secteurs_activites",
}

subsidies = subsidies.rename(columns=dct_rename)

#### SIRENE

In [6]:
# Load the cleaned SIRENE extract and print its column summary.
sirene_path = '../00_DataFiles/02_Cleaned/ParisSubsidies_SIRENE.feather'
SIRENE = pd.read_feather(sirene_path)
SIRENE.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11108 entries, 0 to 11107
Data columns (total 13 columns):
 #   Column                              Non-Null Count  Dtype         
---  ------                              --------------  -----         
 0   categorieentreprise                 11108 non-null  object        
 1   geolocetablissement                 11108 non-null  object        
 2   trancheeffectifsunitelegale         11108 non-null  category      
 3   trancheeffectifsunitelegaletriable  11108 non-null  int64         
 4   siretsiegeunitelegale               11108 non-null  object        
 5   siret                               11108 non-null  object        
 6   etablissementsiege                  11108 non-null  int64         
 7   codepostaletablissement             11108 non-null  object        
 8   caractereemployeurunitelegale       11108 non-null  int64         
 9   datecreationunitelegale             11108 non-null  datetime64[ns]
 10  datecreationetablissem

In [7]:
# Count missing values per column of the SIRENE extract.
SIRENE.isnull().sum()

categorieentreprise                   0
geolocetablissement                   0
trancheeffectifsunitelegale           0
trancheeffectifsunitelegaletriable    0
siretsiegeunitelegale                 0
siret                                 0
etablissementsiege                    0
codepostaletablissement               0
caractereemployeurunitelegale         0
datecreationunitelegale               0
datecreationetablissement             0
denominationunitelegale               0
adresseetablissementcomplete          0
dtype: int64

In [8]:
# Turn categorieentreprise into an ordered categorical; the list below fixes the order.
company_size_order = ['Non productif', 'PME', 'ETI', 'GE']
SIRENE['categorieentreprise'] = pd.Categorical(
    SIRENE['categorieentreprise'],
    categories=company_size_order,
    ordered=True,
)

In [9]:
# Split geolocetablissement into a [latitude, longitude] list of strings.
# The brackets are removed as literal characters: regex=False is passed explicitly
# because '[' is a regex metacharacter and the default value of `regex` has changed
# between pandas versions.
SIRENE['geolocetablissement'] = (
    SIRENE['geolocetablissement']
    .str.replace('[', '', regex=False)
    .str.replace(']', '', regex=False)
    .str.split(',')
)

  SIRENE.geolocetablissement = SIRENE.geolocetablissement.str.replace('[','').str.replace(']','').str.split(',')


In [10]:
# Each geolocetablissement entry is a list produced by the split above:
# element 0 is stored as latitude, element 1 as longitude (still as strings here).
geoloc_pairs = SIRENE['geolocetablissement']
SIRENE['latitude'] = [pair[0] for pair in geoloc_pairs]
SIRENE['longitude'] = [pair[1] for pair in geoloc_pairs]

# The list column is no longer needed once it has been split.
SIRENE = SIRENE.drop(columns='geolocetablissement')

In [11]:
# Insert underscores between the words of the concatenated SIRENE column names.
dct_rename = {
    "categorieentreprise": "categorie_entreprise",
    "trancheeffectifsunitelegale": "tranche_effectifs_unite_legale",
    "trancheeffectifsunitelegaletriable": "tranche_effectifs_unite_legale_triable",
    "siretsiegeunitelegale": "siret_siege_unite_legale",
    "etablissementsiege": "etablissement_siege",
    "codepostaletablissement": "code_postal_etablissement",
    "caractereemployeurunitelegale": "caractere_employeur_unite_legale",
    "datecreationunitelegale": "date_creation_unite_legale",
    "datecreationetablissement": "date_creation_etablissement",
    "denominationunitelegale": "denomination_unite_legale",
    "adresseetablissementcomplete": "adresse_etablissement_complete",
}
SIRENE = SIRENE.rename(columns=dct_rename)

### JOINING

In [12]:
# Left join: every subsidy record is kept and enriched with the SIRENE row
# sharing the same siret, when there is one.
sirene_by_siret = SIRENE.set_index('siret')
joined = subsidies.join(sirene_by_siret, on='siret', how='left')
joined.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 79968 entries, 0 to 79967
Data columns (total 21 columns):
 #   Column                                  Non-Null Count  Dtype         
---  ------                                  --------------  -----         
 0   numero_dossier                          79968 non-null  object        
 1   annee_budgetaire                        79968 non-null  int64         
 2   siret                                   79968 non-null  object        
 3   objet_dossier                           79968 non-null  object        
 4   montant_vote                            79968 non-null  int64         
 5   direction                               79968 non-null  object        
 6   nature_subvention                       79968 non-null  object        
 7   secteurs_activites                      79968 non-null  object        
 8   categorie_entreprise                    79961 non-null  category      
 9   tranche_effectifs_unite_legale          79961 non-

In [13]:
# Count missing values per column after the join.
joined.isnull().sum()

numero_dossier                            0
annee_budgetaire                          0
siret                                     0
objet_dossier                             0
montant_vote                              0
direction                                 0
nature_subvention                         0
secteurs_activites                        0
categorie_entreprise                      7
tranche_effectifs_unite_legale            7
tranche_effectifs_unite_legale_triable    7
siret_siege_unite_legale                  7
etablissement_siege                       7
code_postal_etablissement                 7
caractere_employeur_unite_legale          7
date_creation_unite_legale                7
date_creation_etablissement               7
denomination_unite_legale                 7
adresse_etablissement_complete            7
latitude                                  7
longitude                                 7
dtype: int64

In [14]:
# Show the subsidy records with a missing categorie_entreprise after the join.
has_no_sirene_match = joined['categorie_entreprise'].isna()
joined.loc[has_no_sirene_match]

Unnamed: 0,numero_dossier,annee_budgetaire,siret,objet_dossier,montant_vote,direction,nature_subvention,secteurs_activites,categorie_entreprise,tranche_effectifs_unite_legale,...,siret_siege_unite_legale,etablissement_siege,code_postal_etablissement,caractere_employeur_unite_legale,date_creation_unite_legale,date_creation_etablissement,denomination_unite_legale,adresse_etablissement_complete,latitude,longitude
41308,2019_07630,2019,78426221400017,soutenir la jeune création artistique contempo...,2000,DAC,Projet,[Culture & Arts],,,...,,,,,NaT,NaT,,,,
42392,2019_07196,2019,51348142400036,Volontaires intervenant auprès de victimes d'a...,0,DASES,Non précisée,"[Défense des droits et des intérêts, Mémoire]",,,...,,,,,NaT,NaT,,,,
57130,2019_05369,2019,51348142400036,Volontaires intervenant auprès de Victimes d'A...,10000,DPSP,Projet,"[Défense des droits et des intérêts, Mémoire]",,,...,,,,,NaT,NaT,,,,
65774,2020_02265,2020,51348142400036,Volontaires intervenant auprès de victimes d'a...,10000,DPSP,Projet,"[Défense des droits et des intérêts, Mémoire]",,,...,,,,,NaT,NaT,,,,
66014,2020_07211,2020,78426221400017,Soutenir la jeune création artistique contempo...,1500,DAC,Projet,[Culture & Arts],,,...,,,,,NaT,NaT,,,,
73185,2021_09363,2021,78426221400017,aide à la création et aux jeunes artistes,0,DAC,Non précisée,[Culture & Arts],,,...,,,,,NaT,NaT,,,,
74471,2021_06477,2021,12345677911345,Test-MS-22/12/20,0,DDCT,Non précisée,"[Communication & média, Social, Sport]",,,...,,,,,NaT,NaT,,,,


In [15]:
# The records with a missing categorie_entreprise are either tests or
# 'non diffusible' entries, so we can drop them.
# They are exported first so the removal stays traceable.
df_drops = joined.loc[joined['categorie_entreprise'].isna()]
df_drops.to_csv('../00_DataFiles/99_Dropped/ParisSubsidies_NoJoining.csv', index = False)

# Restrict the drop to the column used to build the export above, so a NaN in any
# other column cannot silently remove a row that was not written to the dropped file.
joined.dropna(subset = ['categorie_entreprise'], inplace = True)

In [16]:
# Print the column summary of `joined` again to check the result of the drop above.
joined.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 79961 entries, 0 to 79967
Data columns (total 21 columns):
 #   Column                                  Non-Null Count  Dtype         
---  ------                                  --------------  -----         
 0   numero_dossier                          79961 non-null  object        
 1   annee_budgetaire                        79961 non-null  int64         
 2   siret                                   79961 non-null  object        
 3   objet_dossier                           79961 non-null  object        
 4   montant_vote                            79961 non-null  int64         
 5   direction                               79961 non-null  object        
 6   nature_subvention                       79961 non-null  object        
 7   secteurs_activites                      79961 non-null  object        
 8   categorie_entreprise                    79961 non-null  category      
 9   tranche_effectifs_unite_legale          79961 non-

In [17]:
# Convert both coordinate columns to floats.
for coord_column in ('latitude', 'longitude'):
    joined[coord_column] = joined[coord_column].astype(float)

### DATA ENRICHMENT

#### Bins for 'montant_vote'

In [18]:
# Order of magnitude of the voted amount (floor of log10).
# A zero amount yields -inf, hence the silenced divide-by-zero warning.
with np.errstate(divide='ignore'):
    joined['montant_vote_scale'] = np.floor(np.log10(joined['montant_vote']))

# Amount categories, from smallest to largest.
amount_labels = ['Rejected', '<1k EUR', '1-10k EUR', '10-100k EUR', '100k-1M EUR', '1-10M EUR', '> 10M EUR']

# Left-closed bin edges: [-inf, 1) -> 'Rejected', [1, 1e3) -> '<1k EUR', ...,
# [1e7, inf) -> '> 10M EUR'.
# The last bin is open-ended so amounts of 100M EUR or more are still categorised
# (an equality test on scale 7 would leave them without a category).
# Amounts below 1, including any negative value, are labelled 'Rejected'.
amount_edges = [-np.inf, 1, 1e3, 1e4, 1e5, 1e6, 1e7, np.inf]

# With labels given, pd.cut returns an ordered categorical whose categories follow
# the order of `amount_labels`.
joined['montant_vote_cat'] = pd.cut(
    joined['montant_vote'],
    bins=amount_edges,
    labels=amount_labels,
    right=False,
)

#### Filter accepted vs. rejected

In [19]:
# A subsidy counts as granted when the voted amount is strictly positive.
# The flag is stored both as a boolean and as a 'yes'/'no' label.
is_granted = joined['montant_vote'] > 0
joined['subsidy_granted_bool'] = is_granted.to_numpy()
joined['subsidy_granted'] = np.where(is_granted, 'yes', 'no')

#### Geography

In [20]:
# Paris vs. IdF vs. Beyond

# Department code = first two characters of the postal code.
joined['dpt_code'] = joined['code_postal_etablissement'].str[0:2]

# Department codes classified as 'IdF' (Paris itself, '75', is handled separately).
idf_dpt_codes = ['77', '78', '91', '92', '93', '94', '95']

# Every row matching neither condition falls through to the explicit default,
# so no 'nan' placeholder string is needed to detect the 'Beyond' rows.
joined['geo_cat'] = np.select(
    [joined['dpt_code'] == '75', joined['dpt_code'].isin(idf_dpt_codes)],
    ['Paris', 'IdF'],
    default='Beyond',
)

In [21]:
# Arrondissement: last two characters of the postal code for rows whose
# department code is '75', and the label 'Outside Paris' for all other rows.
in_paris = joined['dpt_code'] == '75'
postal_code_suffix = joined['code_postal_etablissement'].str[-2:]
joined['arrondissement'] = np.where(in_paris, postal_code_suffix, 'Outside Paris')

### SAVE

#### CSV

In [22]:
# Write the joined dataset to CSV, without the index column.
joined_csv_path = '../00_DataFiles/03_Joined/ParisSubsidies_Joined.csv'
joined.to_csv(joined_csv_path, index=False)

#### Feather

In [23]:
# Write the joined dataset to Feather; the index is reset to a fresh 0..n-1 range first.
joined_feather_path = '../00_DataFiles/03_Joined/ParisSubsidies_Joined.feather'
joined_reindexed = joined.reset_index(drop=True)
joined_reindexed.to_feather(joined_feather_path)