#### Libraries

In [6]:
import sys, os, wget, zipfile
import pandas as pd

#### Set Paths

In [2]:
# Root folder for every dataset, plus a scratch sub-folder that receives
# downloaded archives before they are extracted.
pathData = 'data'
pathTMP = os.path.join(pathData, 'tmp')
if not os.path.isdir(pathTMP):
    os.makedirs(pathTMP)

---
#### Functions

In [3]:
def bar_progress(current, total, width=80):
    """Write a single-line download progress message to stdout.

    The line starts with a carriage return so each call overwrites the
    previous message instead of scrolling the output.

    Args:
        current: Number of bytes received so far.
        total: Expected total size in bytes. A zero or negative value is
            treated as "size unknown" (this notebook's captured output shows
            ``-1 / -1`` for servers that do not report a size).
        width: Not used by this implementation; kept so the function can be
            called with three arguments.
    """
    if total > 0:
        progress_message = "Downloading: %d%% [%d / %d] bytes" % (current / total * 100, current, total)
    else:
        # Without a valid total a percentage is meaningless (and total == 0
        # would divide by zero), so only report that the download is running.
        progress_message = "Downloading: ... (total size unknown)"
    sys.stdout.write("\r" + progress_message)
    sys.stdout.flush()

---
#### Collect Data - SIREN/SIRET of the Companies and Their Establishments

In [4]:
# Catalogue of the stock archives to download.
# 'CORE_URL' is the common URL prefix; every other key is a stock name mapped to
# its resource identifier ('URL', appended to 'CORE_URL') and an 'UPDATE' stamp
# (presumably the date the resource was last refreshed - TODO confirm).
dictStocks = {
    'CORE_URL': 'https://www.data.gouv.fr/fr/datasets/r/',
    **{
        stockName: {'URL': resourceId, 'UPDATE': '2023_02_01'}
        for stockName, resourceId in (
            ('Stock Etablissement Historique', '88fbb6b4-0320-443e-b739-b4376a012c32'),
            ('Stock Etablissement', '0651fb76-bcf3-4f6a-a38d-bc04fa708576'),
            ('Stock Unite Legale Historique', '0835cd60-2c2a-497b-bc64-404de704ce89'),
            ('Stock Etablissement Liens Succession', '9c4d5d9c-4bbb-4b9c-837a-6155cb589e26'),
            ('Stock Unite Legale', '825f4199-cadd-486c-ac46-a65a8ea1a047'),
        )
    },
}

In [18]:
getStockData = False  # Change this BOOLEAN to "True" if you want to download the data regarding the stock.
if getStockData:
    # Every key of dictStocks containing the word "stock" names an archive;
    # this skips the 'CORE_URL' entry, which only holds the URL prefix.
    for stockKey in [cKey for cKey in dictStocks if 'stock' in cKey.lower()]:
        print(f'Current File -> {stockKey}')
        stockURL = dictStocks["CORE_URL"] + dictStocks[stockKey]["URL"]
        # The archive is downloaded into the temporary folder first.
        stockFilename = wget.download(stockURL, out=pathTMP, bar=bar_progress)
        # The context manager closes the archive even if extraction fails, so
        # no file handle is leaked (an open handle would also block removal on
        # some platforms).
        with zipfile.ZipFile(stockFilename) as z:
            z.extractall(pathData)
        # Only reached after a successful extraction: drop the temporary archive.
        os.remove(stockFilename)
        print('\nFile Downloaded and Extracted Successfully!')


Current File -> Stock Etablissement Historique
Downloading: 100% [978759807 / 978759807] bytesdata\tmp/88fbb6b4-0320-443e-b739-b4376a012c32

File Downloaded and Extracted Successfully!


---

#### Collect Data - BODACC

In [7]:
# Flip this flag to True to download the BODACC commercial-announcements
# dataset as a CSV export into the data folder.
get_BODACC_Data = False
if get_BODACC_Data:
    BODAAC_URL = (
        'https://bodacc-datadila.opendatasoft.com/api/explore/v2.1/catalog/datasets/annonces-commerciales/exports/csv'
    )
    filename = wget.download(BODAAC_URL, bar=bar_progress, out=pathData)

---

#### Collect Data - NAF Codes

In [43]:
def tidy_naf_codes(df_raw):
    """Turn the raw NAF spreadsheet into a flat NAF / label / SECTION table.

    Steps (identical to the inline transformation this replaces):
      1. drop every row that has a missing value and renumber the rows;
      2. drop the columns at positions 0, 2 and 3;
      3. rename 'Code' to 'NAF' and the last remaining column to 'Intitulés';
      4. read the section letter from rows whose code looks like 'SECTION X'
         and propagate it down to the rows that follow;
      5. remove the 'SECTION X' header rows themselves.

    Args:
        df_raw: DataFrame as read from the spreadsheet; it must have at least
            four columns and a column named 'Code' holding strings.

    Returns:
        A new DataFrame with a fresh 0..n-1 index; ``df_raw`` is not modified.
    """
    df = df_raw.dropna().reset_index(drop=True)
    df = df.drop(columns=[df.columns[0], df.columns[2], df.columns[3]])
    df = df.rename(columns={'Code': 'NAF', df.columns[-1]: 'Intitulés'})
    # Assign the forward-filled result back instead of calling
    # fillna(method='ffill', inplace=True) on the column selection: that form
    # is deprecated and does not reliably update the parent frame.
    df['SECTION'] = df['NAF'].str.extract(r'SECTION ([A-Z])', expand=False).ffill()
    return df[~df['NAF'].str.contains('SECTION')].reset_index(drop=True)


get_CodeNAFs_Data = False  # Change this BOOLEAN to "True" if you want to download the NAF codes.
if get_CodeNAFs_Data:
    CodeNAFs_URL = 'https://www.insee.fr/fr/statistiques/fichier/2120875/int_courts_naf_rev_2.xls'
    filename = wget.download(CodeNAFs_URL, out=pathData, bar=bar_progress)

    # Transform the Dataset Content
    df_codeNafs = tidy_naf_codes(pd.read_excel(filename))
    # Swap only the real extension, so the CSV can never be written over the
    # downloaded file that is deleted on the next line.
    df_codeNafs.to_csv(os.path.splitext(filename)[0] + '.csv')  # Transform XLS into CSV
    os.remove(filename)

Downloading: 100% [-1 / -1] bytes

#### Split Stock Data by NAF Code SECTION

In [1]:
split_Stock_Data = False  # Change this BOOLEAN to "True" if you want to split the stock files by NAF code SECTION.
if split_Stock_Data:
    # NAF code -> SECTION lookup table, read from the CSV in the data folder.
    df_codeNafs = pd.read_csv(os.path.join(pathData, 'int_courts_naf_rev_2.csv'))
    # Candidate files: names containing 'stock' and '.csv' but not 'lien'
    # (presumably the succession-links file, which is not split by activity).
    lsStocksFiles = [
        fileName for fileName in os.listdir(pathData)
        if 'lien' not in fileName.lower()
        and 'stock' in fileName.lower()
        and '.csv' in fileName.lower()
    ]
    for stockFile in lsStocksFiles:  # Loop through each Stock File
        try:
            stockFolderName = stockFile.split('_')[0]
            print(stockFolderName, f'\n{"-"*len(stockFolderName)}')
            sectionPath = os.path.join(pathData, stockFolderName, 'SECTION')
            os.makedirs(sectionPath, exist_ok=True)

            # The activity column depends only on the file, so pick it once
            # per file rather than once per section.
            colActivity = 'activitePrincipaleUniteLegale' if 'Etablissement' not in stockFile else 'activitePrincipaleEtablissement'

            df = pd.read_csv(os.path.join(pathData, stockFile), low_memory=False)
            for section in df_codeNafs['SECTION'].unique():
                NAF_codes = list(df_codeNafs[df_codeNafs['SECTION'] == section]['NAF'])
                subdf = df[df[colActivity].isin(NAF_codes)].reset_index(drop=True)
                subdf.to_csv(os.path.join(sectionPath, f'{stockFolderName}-SECTION_{section}.csv'))
                print(f'Dataset {stockFolderName} - Filtered on Section {section} -> Created')
            print('\n')
        except Exception as err:
            # Still best-effort (one bad file must not stop the others), but
            # report the failure instead of hiding it. Catching Exception
            # rather than everything lets Ctrl-C interrupt the loop.
            print(f'Skipping {stockFile}: {type(err).__name__}: {err}')
            continue

---

#### Collect Data - Legal Categories by INSEE 

In [5]:
# Flip this flag to True to download the INSEE legal-categories dataset as a
# CSV export into the data folder.
get_LEGAL_CATEGORIES_Data = False
if get_LEGAL_CATEGORIES_Data:
    LEGAL_CATEGORIES_URL = (
        'https://public.opendatasoft.com/api/explore/v2.1/catalog/datasets/categories-juridiques-insee/exports/csv?lang=fr&timezone=Europe%2FBerlin&use_labels=true&delimiter=%3B'
    )
    filename = wget.download(LEGAL_CATEGORIES_URL, bar=bar_progress, out=pathData)

Downloading: 100% [-1 / -1] bytes