In [1]:
import pandas as pd
import numpy as np
import json

In [2]:
def preprocess(w):
    w = w.split(',')[0].strip()
    w = w.split('(')[0].strip()
    w = w.split('/')[0].strip()
    w = "_".join(list(map(lambda x:x.strip(), w.split('&'))))
    w = w.replace("St ","Saint ")
    w = w.replace("-"," ")
    w = w.replace("."," ")
    w = w.replace("*"," ")
    w = w.replace("\'"," ")
    w = w.split()
    if w[-1].lower() == 'airport': w = w[0:-1]
    if w[-1].lower() == 'intl': w = w[0:-1]
    w = "_".join(w)
    return w

In [3]:
df_aircraft = pd.read_csv('./data/df_aircraft.csv', sep='§', engine='python',
                          index_col=0, encoding='utf-8')
df_airline = pd.read_csv('./data/df_airline.csv', sep='§', engine='python',
                         index_col=0, encoding='utf-8')
df_airport = pd.read_csv('./data/df_airport.csv', sep='§', engine='python',
                         index_col=0, encoding='utf-8')
df_country = pd.read_csv('./data/df_country.csv', sep='§', engine='python',
                         index_col=0, encoding='utf-8')


In [4]:
tabs = ['default','bars']
cate = ['Seat', 'Food']
manu = ['Airbus', 'Boeing', 'Cessna', 'Cirrus ','Pilatus', 'Matra']
airl = list(df_airline['airline'].apply(preprocess))
airc = list(df_aircraft['model'].apply(preprocess))

In [15]:
CT_tabs = 'tab'
CT_tabs_default = 'default'
CT_sess = 'session'
CT_date = 'date'
CT_filt = 'filters'
CT_filt_manu = 'manufacturer'
CT_filt_airc = 'aircraft'
CT_filt_airl = 'company'
CT_filt_cate = 'category'
CT_filt_date = 'date'

CT_bdd_sess = 'session'
CT_bdd_date = 'date'
CT_bdd_json = 'json'
CT_bdd_hash = 'hash'

In [16]:
def make_date() -> str :
    year = str(np.random.randint(2010,2020)).zfill(4)
    month = str(np.random.randint(12)).zfill(2)
    day = str(np.random.randint(31)).zfill(2)
    return day+month+year

def make_timestamp() -> str :
    year = str(np.random.randint(2010,2020)).zfill(4)
    month = str(np.random.randint(12)).zfill(2)
    day = str(np.random.randint(31)).zfill(2)
    hour = str(np.random.randint(24)).zfill(2)
    minute = str(np.random.randint(60)).zfill(2)
    sec = str(np.random.randint(60)).zfill(2)
    return day+'/'+month+'/'+year+' '+hour+'h'+minute+'m'+sec+'s'

def make_manu() -> str:
    return np.random.choice(manu,1)[0]

def make_airl() -> str:
    return np.random.choice(airl,1)[0]

def make_airc() -> str:
    return np.random.choice(airc,1)[0]

def make_cate() -> str:
    return np.random.choice(cate,1)[0]

def make_tab() -> str:
    return np.random.choice(tabs,1)[0]

def init_filters() -> dict:
    filters = {
        CT_filt_manu : [],
        CT_filt_airc : [],
        CT_filt_airl : [],
        CT_filt_cate : [],
        CT_filt_date : [],
    }
    return filters

def make_event_str(sessid: int,
                   timestamp: str,
                   tab: str = None,
                   filters : dict = None) -> str :
    if tab is None:
        tab = CT_tabs_default
    if filters is None:
        filters = init_filters()
    
    event = {
        CT_sess : sessid,
        CT_filt : filters,
        CT_tabs : tab,
        CT_date : timestamp,
    }
    
    event_string = json.dumps(event)
    return event_string

def make_filters() -> dict:
    filters = {
        CT_filt_manu : [make_manu() for i in range(np.random.randint(5))],
        CT_filt_airc : [make_airc() for i in range(np.random.randint(5))],
        CT_filt_airl : [make_airl() for i in range(np.random.randint(5))],
        CT_filt_cate : [make_cate() for i in range(np.random.randint(5))],
        CT_filt_date : [make_date(), make_date()],
    }
    return filters

def make_session(sessid:int, n=None) -> pd.DataFrame:
    df = make_bdd(0)
    if n is None:
        n = np.random.randint(1,10)
    for i in range(n):
        timestamp = make_timestamp()
        filters = make_filters()
        tab = make_tab()
        event = make_event_str(sessid, timestamp, tab , filters)
        row = pd.Series([sessid, timestamp, event],
                        index = df.columns)
        df = df.append(row, ignore_index=True)
    return df

def make_bdd(nb_session : int = 1):
    bdd = pd.DataFrame(columns=[CT_bdd_session, CT_bdd_date, CT_bdd_json])
    for i in range(nb_session):
        bdd = bdd.append(make_session(i)).reset_index(drop=True)
    return bdd

def hash_event_dict(content):
    if isinstance(content, str):
        res = content.lower()
    
    elif isinstance(content, int) or isinstance(content, float):
        res = content
    
    elif isinstance(content, list):
        res = []
        content = sorted(set(content))
        for ik,k in enumerate(content):
            res += [hash_event_dict(content[ik])]
    
    elif isinstance(content, dict):
        res = {}
        for k in content.keys():
            res[k] = hash_event_dict(content[k])
    
    elif isinstance(content, tuple):
        res = tuple([])
        content = sorted(set(content))
        for ik,k in enumerate(content):
            res += tuple([hash_event_dict(content[ik])])
        
    return res

def json_string_to_hash(json_string):
    event_dict = json.loads(str(json_string))
    event_dict_to_hash = {
        key : event_dict[key] 
        for key in sorted([k for k in event_dict.keys()
                           if k not in [CT_sess, CT_date]])
    }
    event_dict_filtered = hash_event_dict(event_dict_to_hash)
    event_dict_hasheded = json.dumps(event_dict_filtered)
    return event_dict_hasheded

##### make the database

In [17]:
bdd = make_bdd(50)
bdd.head()

Unnamed: 0,session,date,json
0,0,19/11/2011 14h20m27s,"{""session"": 0, ""filters"": {""manufacturer"": [""C..."
1,0,21/06/2011 16h05m50s,"{""session"": 0, ""filters"": {""manufacturer"": [],..."
2,1,01/08/2016 00h04m51s,"{""session"": 1, ""filters"": {""manufacturer"": [""C..."
3,1,08/06/2016 04h02m18s,"{""session"": 1, ""filters"": {""manufacturer"": [""A..."
4,1,25/00/2017 17h35m44s,"{""session"": 1, ""filters"": {""manufacturer"": [],..."


##### hash the jsons : create a single representation for every corresponding json

In [18]:
bdd[CT_bdd_hash] = bdd[CT_bdd_json].apply(json_string_to_hash)

##### extract the states

In [19]:
etats = dict([(a,b) for (b,a) in enumerate(sorted(bdd[CT_bdd_hash].unique()))])
id_to_etats = dict(enumerate(sorted(bdd[CT_bdd_hash].unique())))

##### extract the transitions

In [20]:
transitions = np.zeros([len(etats),len(etats)])
for i in range(1,len(bdd)):
    transitions[etats[bdd[CT_bdd_hash][i-1]], etats[bdd[CT_bdd_hash][i]]] += 1