In [None]:
import json
import os
import pathlib
import sys
from bisect import bisect_left
from functools import lru_cache
from typing import Union, List, Dict, Optional

import numpy as np
import pandas as pd
from wasabi import msg

In [None]:
def get_file_paths(data_path: Union[str, pathlib.Path], file_prefix: str, sort: Optional[bool] = True) -> List[pathlib.Path]:
    """
    Util for getting a sorted/unsorted list of file paths from disk.

    Only the direct children of ``data_path`` are inspected (the glob is not
    recursive).

        --Parameters
     - data_path: (str, or Path), the directory to scan
     - file_prefix: (str), despite the name this is the file *extension* to
       look for (txt, json, csv, etc ..); a single leading dot is accepted
       and ignored, so 'json' and '.json' are equivalent
     - sort: (bool), whether the returned list is sorted; a falsy value
       (False or None) keeps the order produced by the glob

        --Return
     - A list of pathlib.Path objects, one per file whose name ends with
       '.<file_prefix>'

        --Raises
     - ValueError: if data_path is not a directory, or if file_prefix is
       empty once the optional leading dot has been removed
    """
    # pathlib.Path() accepts both str and Path, so no isinstance branch is needed.
    data_path = pathlib.Path(data_path)

    if not data_path.is_dir():
        raise ValueError(f'data path {data_path} is not a regular directory.')

    # startswith() is safe on an empty string, unlike indexing file_prefix[0].
    if file_prefix.startswith('.'):
        file_prefix = file_prefix[1:]
    if not file_prefix:
        raise ValueError('file_prefix must contain a file extension such as "json" or ".json".')

    ls = list(data_path.glob(f'*.{file_prefix}'))
    msg.info(f'Founded {len(ls)} file with prefix {file_prefix} at {data_path}')
    if sort:
        return sorted(ls)
    return ls

@lru_cache
def get_raw_enterprise_paths():
    """
    Return the sorted enterprise ids found in ``../../data/raw_enterprise``.

    Despite the function name, the result is a list of ids (str), not paths:
    each id is the 4th ``_``-separated field of a JSON file's stem.
    NOTE(review): this assumes every JSON file in that directory has at least
    four ``_``-separated fields in its name; a file that does not will raise
    IndexError -- confirm against the naming used by the writer of those files.

    The result is memoised by ``lru_cache``: the directory is scanned once per
    kernel session and every call returns the *same* list object, so callers
    must not mutate it. Call ``get_raw_enterprise_paths.cache_clear()`` to
    force a rescan after new files have been written.

        --Return
     - A sorted list of enterprise ids (empty when no JSON file is present)
    """
    enterprise_ids_paths = get_file_paths(data_path='../../data/raw_enterprise', file_prefix='json')
    ent_ids = [path.stem.split('_')[3] for path in enterprise_ids_paths]
    # Guarded: indexing an empty list would raise IndexError. Nothing extra is
    # logged in that case because get_file_paths already reports the file count.
    if ent_ids:
        # This is the id of the first *path* in sorted order, logged before the ids are sorted.
        msg.info(f'ENT ID: {ent_ids[0]}')
    return sorted(ent_ids)

def check_raw_enterprise_exist(enterprise_id: str):
    """
    Tell whether ``enterprise_id`` is among the ids returned by
    ``get_raw_enterprise_paths``.

        --Parameters
     - enterprise_id: (str), the id to look up

        --Return
     - Whatever ``binary_search`` returns for that id: True when it is
       present, None otherwise
    """
    known_ids = get_raw_enterprise_paths()
    msg.info(f'Looking if {enterprise_id} has been already processed')
    lookup_result = binary_search(known_ids, enterprise_id)
    return lookup_result

def binary_search(array, item):
    """
    Look up ``item`` in ``array`` with a binary search.

        --Parameters
     - array: a sequence sorted in ascending order (may be empty)
     - item: the value to look for; must be comparable with the elements

        --Return
     - True when item is present, None otherwise (None rather than False is
       kept so existing callers see the same values as before)
    """
    # bisect_left gives the leftmost position where item could be inserted
    # while keeping the order; item is present only if that slot already holds it.
    position = bisect_left(array, item)
    if position < len(array) and array[position] == item:
        msg.good('found')
        return True
    return None

In [None]:
# Read the txt file line by line (presumably one enterprise id per line --
# TODO confirm) and show the first two entries as a quick preview.
txt = pathlib.Path('../../data/enterprise_ids/202207211828_Abruzzo_AQUILA.txt').read_text().splitlines()
txt[0], txt[1]

In [None]:
# Sorted enterprise ids extracted from the JSON file names in ../../data/raw_enterprise.
# The call is memoised with lru_cache, so re-running this cell does not rescan the directory.
ent_ids = get_raw_enterprise_paths()

In [None]:
# Lines of `txt` that are not among `ent_ids`. The result goes through a set,
# so duplicates are dropped and the displayed order is arbitrary.
list(set(txt).difference(ent_ids))

In [None]:
# Spot check of the ordering of `ent_ids`. The ids are strings, so this is a
# lexicographic comparison, not a numeric one. Needs at least 53 ids, otherwise IndexError.
ent_ids[8] < ent_ids[52]

In [None]:
# Load the CSV using its first column as the index, then preview the first rows.
df = pd.read_csv('../../data/final_df_prova.csv', index_col=0)
df.head()

In [None]:
# ROOT_PATH = '../..'
# DATA_PATH = Path(ROOT_PATH)/'data'
# file_paths = sorted(list(map(lambda x: str(x), Path(DATA_PATH).glob('*.txt'))))
# logger.info(f'Retrieved {len(file_paths)} txt files in {DATA_PATH}')

# BASE_URL = 'https://www.fgas.it/RicercaSezC/DettaglioImpresa?id='


# def gen_dataset(file_paths: str):
#     for f_path in file_paths:
#         with open(f_path) as f:
#             logger.info(f'yielding {f_path}')
#             yield (f_path, f.read())


# def get_full_url(enterprise_id: str) -> str:
#     return BASE_URL + enterprise_id

In [None]:
# dataset = gen_dataset(file_paths)

# for data in dataset:
#     p = Path(data[0])
#     print(p.parent)
#     ddd = p.stem.split('_')
#     print(ddd[0], ddd[1], ddd[2])
#     print(p.name)
#     lines = data[1].splitlines()
#     print(len(lines))
#     for line in lines:
#         print(line)
#         break
#     break

In [None]:
# Form fields of the search request, in the order they appear in the raw body below.
# Every value is kept as a string; note that `displayReport` is already
# percent-encoded (%3A is ':' and %3B is ';').
payload_dict = dict(
    FormatoReport='3',
    GeneraReport='false',
    displayReport='display%3Anone%3B',
    flagAreaPub='True',
    DataFromSession='false',
    DownloadToken='',
    NumRecord='160',
    Nazionalita='I',
    IDRegione='19',
    IstatProv='083',
    Identificativo='',
    RadioBtnDenominazione='C',
    Denominazione='',
    NumCertProv='',
    TipoSoggetto='I',
    IDAttivita_I='',
    IDAttivita='',
)

# The same fields as a single form-encoded string
# (presumably captured from a real request -- TODO confirm the source).
raw_payload = (
    "FormatoReport=3&GeneraReport=false"
    "&displayReport=display%3Anone%3B"
    "&flagAreaPub=True&DataFromSession=false&DownloadToken="
    "&NumRecord=160&Nazionalita=I&IDRegione=19&IstatProv=083"
    "&Identificativo=&RadioBtnDenominazione=C&Denominazione="
    "&NumCertProv=&TipoSoggetto=I&IDAttivita_I=&IDAttivita="
)


In [None]:
# Rebuild the form-encoded body from the dict (insertion order is preserved)
# and verify that it reproduces the raw string exactly.
s = ['='.join(field) for field in payload_dict.items()]
ss = '&'.join(s)
print(ss, raw_payload, sep='\n')
assert ss == raw_payload

In [None]:
# Decode the two JSON asset files and report how many entries each one holds.
regions = json.loads(pathlib.Path('../../assets/regions-ids.json').read_text())
provinces = json.loads(pathlib.Path('../../assets/provinces-ids.json').read_text())
len(regions), len(provinces)

In [None]:
data_path = '../../data/202207181700_Abruzzo_AQUILA.json'
# The content is decoded twice: the first pass must yield a string (json.loads
# only accepts str/bytes), which is itself a JSON document.
with open(data_path) as f:
    inner_json = json.load(f)
d = json.loads(inner_json)