In [4]:
import pandas as pd
import numpy as np

In [5]:
def preprocess(w):
    """Normalise a raw label into an underscore-joined token.

    Steps, in order: keep only the text before the first ``,``, ``(`` and
    ``/``; join ``&``-separated parts with ``_``; expand ``"St "`` to
    ``"Saint "``; turn ``-``, ``.``, ``*`` and ``'`` into spaces; drop a
    trailing ``airport`` word and then a trailing ``intl`` word (case
    insensitive); join the remaining words with ``_``.

    Returns an empty string when no word is left (e.g. ``""`` or
    ``"Airport"``) instead of raising ``IndexError``.
    """
    for sep in (',', '(', '/'):
        w = w.split(sep)[0].strip()
    w = "_".join(part.strip() for part in w.split('&'))
    w = w.replace("St ", "Saint ")
    for char in "-.*'":
        w = w.replace(char, " ")
    words = w.split()
    # Order matters: "X Intl Airport" -> "X", but "X Airport Intl" -> "X_Airport".
    for suffix in ('airport', 'intl'):
        # Guard against an empty list before looking at the last word.
        if words and words[-1].lower() == suffix:
            words = words[:-1]
    return "_".join(words)

# For the sentence generation with tags

In [6]:
def word_gen(model, word_list, up=False, cap=False, same=False, low=False):
    """Collect the case variants of ``word_list`` present in ``model.vocab``.

    Each flag enables one variant of every word: ``up`` -> upper case,
    ``cap`` -> each ``_``-separated part capitalised, ``same`` -> unchanged,
    ``low`` -> lower case.  Returns the set of variants found in the
    vocabulary (empty when no flag is set).
    """
    def capitalize_parts(token):
        return "_".join(part.capitalize() for part in token.split('_'))

    # (enabled, transform) pairs, in the order up / cap / same / low.
    candidates = (
        (up, lambda token: token.upper()),
        (cap, capitalize_parts),
        (same, lambda token: token),
        (low, lambda token: token.lower()),
    )

    found = set()
    for enabled, transform in candidates:
        if not enabled:
            continue
        for word in word_list:
            variant = transform(word)
            if variant in model.vocab:
                found.add(variant)
    return found

# init the tags
def init_sent(list_words):
    """Turn structure entries into an initial list of ``(token, tag)`` pairs.

    Entries starting with ``#`` or ``$`` are kept whole and tagged ``'#'``;
    any other entry is split on whitespace and each word is tagged
    ``CT_TAG_O``.
    """
    tagged = []
    for entry in list_words:
        if entry.startswith(('#', '$')):
            tagged.append((entry, '#'))
            continue
        tagged.extend((word, CT_TAG_O) for word in entry.split())
    return tagged

# initialize a structure
# if tuple : choose one at random
def init_structure(struct):
    """Resolve a nested structure description into a flat list of strings.

    * ``str``: stripped; kept only when non-empty.
    * ``tuple``: one element is drawn at random and resolved recursively.
    * ``list``: elements are processed in order; strings and tuples follow
      the rules above, while a nested list contributes one randomly drawn
      element (resolved recursively).
    * anything else yields an empty list / is ignored inside a list.
    """
    def pick_one(options):
        # Same draw as everywhere else in this file: index via np.random.choice.
        return list(options)[np.random.choice(len(options), 1)[0]]

    if isinstance(struct, str):
        text = struct.strip()
        return [text] if text else []
    if isinstance(struct, tuple):
        return init_structure(pick_one(struct))
    if not isinstance(struct, list):
        return []

    flat = []
    for item in struct:
        if isinstance(item, (str, tuple)):
            flat.extend(init_structure(item))
        elif isinstance(item, list):
            flat.extend(init_structure(pick_one(item)))
    return flat

# make the swaps
def swap(tagger,W):
    """Expand one ``(token, tag)`` pair into concrete tagged word(s).

    ``W`` is a ``(hw, tag)`` pair.  Depending on the prefix of ``hw``:

    * ``#...``: a random entry of ``tagger[hw]`` is drawn; the entry is a
      ``(words, tags)`` pair of whitespace-separated strings that are zipped
      token by token and expanded recursively.
    * ``$GEN$...``: a phrase is generated with ``select_from_sentence_gen``
      from the module-level ``gen`` (not a parameter of this function).
    * ``$VOC$#key#sub_key``: a random word is drawn from the module-level
      ``voc[key][sub_key]``.
    * anything else: the pair is returned unchanged.

    The return value is either a single ``(word, tag)`` tuple or a (possibly
    nested) list of such tuples.

    NOTE(review): ``random`` is used below but is not imported in the visible
    part of the file; confirm it is imported elsewhere.
    """
    def wt_augm(w, tag):
        """Split ``w`` on ``_`` and give every resulting word a positional tag."""
        ws = w.split("_")
        ws = [w for w in ws if w != '']
        # tag is "<name>" or "<name><CT_SEP>...<suffix>"; default suffix is CT_SUF_B
        t = tag.split(CT_SEP)
        t_name = t[0]
        if len(t)>1:
            t_suffix = t[-1]
        else : t_suffix = CT_SUF_B
        
        # first word: the incoming suffix; last word (if not also first): CT_SUF_E;
        # every word in between: CT_SUF_I
        tags = [t_name+CT_SEP+t_suffix if i==0 
                else t_name+CT_SEP+CT_SUF_E if i+1==len(ws)
                else t_name+CT_SEP+CT_SUF_I 
                for i in range(len(ws))]
        # any tag that starts with CT_TAG_O is collapsed to plain CT_TAG_O
        tags = [CT_TAG_O if t.startswith(CT_TAG_O) 
                else t 
                for t in tags]
        return list(zip(ws,tags))
    
    hw, tag = W
    if hw.startswith('#'):
        # draw one (words, tags) entry for this key
        n = random.sample(range(len(tagger[hw])),1)[0]
        w = tagger[hw][n]
        w = list(zip(w[0].split(), w[1].split()))
        # a tag other than '#' on the incoming pair overrides the entry's own
        # per-token tags; '#' means "use the tags stored in the entry"
        w = [swap(tagger,(m[0], tag)) if (tag!='#')
             else swap(tagger,(m[0], m[1])) 
             for m in w]
        # a one-token entry is unwrapped so it is returned without the extra list
        if len(w)==1: 
            w = w[0]
            
    elif hw.startswith('$GEN$'):
        key = hw.split('$GEN$')[1]
        # [1:] drops the first character; select_from_sentence_gen prefixes
        # every literal token it emits with '_'
        a = select_from_sentence_gen(gen, key)[1:]
        w = wt_augm(a, tag)
            
    elif hw.startswith('$VOC$'):
        # the text after the marker is split on '#' into exactly three parts;
        # the first part is discarded
        _,key,sub_key = hw.split('$VOC$')[1].split('#')
        words_set = voc[key][sub_key]
        n = random.sample(range(len(words_set)),1)[0]
        w = list(words_set)[n]
        w = wt_augm(w, tag)
    else:
        # plain word: nothing to expand
        w = (hw, tag)
    
    return w

# flatten nested lists into a single flat list
def remove_lists(listsOfLists):
    """Flatten arbitrarily nested lists into one flat list.

    Only ``list`` instances are expanded; tuples and every other kind of
    element are kept as they are.  Element order is preserved.
    """
    flat = []
    # Stack of iterators: the top one is the list currently being walked.
    pending = [iter(listsOfLists)]
    while pending:
        for item in pending[-1]:
            if isinstance(item, list):
                # Descend; the parent iterator resumes once this one is done.
                pending.append(iter(item))
                break
            flat.append(item)
        else:
            pending.pop()
    return flat

# generate a new sentence with tags from a structure
def generete_sentence_from_structure(tagger, structure):
    """Build one tagged sentence from an initialised structure.

    The structure is tagged with ``init_sent``, every pair is expanded with
    ``swap``, and the nested result is flattened with ``remove_lists``.
    Returns ``(sentence, tags)`` as two space-joined strings.
    """
    expanded = []
    for pair in init_sent(structure):
        expanded.append(swap(tagger, pair))

    words, labels = [], []
    for pair in remove_lists(expanded):
        words.append(pair[0])
        labels.append(pair[1])
    return " ".join(words), " ".join(labels)

# return the closest word in the vocabulary, the key and score
def closest_word_in_voc(voc, word):
    """Find the vocabulary word most similar to ``word`` (Jaro-Winkler).

    ``voc`` maps a key to a dict whose ``'voc'`` entry is an iterable of
    words.  Returns ``(best_word, key, similarity)``; on ties the entry seen
    last wins.  When the vocabulary holds no word at all the result is
    ``(None, None, 0)`` instead of an ``UnboundLocalError``, so callers that
    unpack three values and compare the score keep working.
    """
    result = (None, None, 0)
    if not voc:
        return result

    jarowinkler = JaroWinkler()
    sim = 0
    for k in voc:
        for w in voc[k]['voc']:
            jaro = jarowinkler.similarity(w, word)
            if jaro >= sim:
                sim = jaro
                result = w, k, sim
    return result

# autocorrection of a sentence based on the model and our vocabulary
def auto_correction(model_vocab, voc, voc_stopwords, sentence):
    """Correct a sentence word by word and return the list of corrected words.

    For a word starting with an upper-case letter: a vocabulary match with
    similarity above 0.95 wins; otherwise the word is kept if it is in
    ``model_vocab`` or ``voc_stopwords``; otherwise the capitalised spell
    checker suggestion is used.

    For any other word: it is kept if known; otherwise a vocabulary match
    above 0.95 wins; otherwise the spell checker suggestion is used.

    When the spell checker has no suggestion the original word is kept.
    """
    spell = SpellChecker()

    def is_known(word):
        return word in model_vocab or word in voc_stopwords

    def spell_fix(word):
        # correction() may return None (recent pyspellchecker versions do so
        # when no candidate is found); fall back to the word itself rather
        # than crashing on None.capitalize() or emitting None.
        fixed = spell.correction(word)
        return word if fixed is None else fixed

    correction = []
    for w in sentence.split():
        if w[0].isupper():
            wc, _, ws = closest_word_in_voc(voc, w)
            if ws > 0.95:
                correction.append(wc)
            elif is_known(w):
                correction.append(w)
            else:
                correction.append(spell_fix(w).capitalize())
        elif is_known(w):
            correction.append(w)
        else:
            wc, _, ws = closest_word_in_voc(voc, w)
            if ws > 0.95:
                correction.append(wc)
            else:
                correction.append(spell_fix(w))
    return correction

# generate and select a sentence from the gen dictionary
def select_from_sentence_gen(gen, ks):
    """Generate a phrase from the generator dictionary.

    ``ks`` is a whitespace-separated list of keys.  For every key found in
    ``gen`` one of its sentences is drawn at random; each token of that
    sentence is either expanded recursively (when it starts with ``#``) or
    appended as ``'_' + token``.  Unknown keys print a warning and add
    nothing.  The result therefore starts with ``_`` whenever it is
    non-empty.
    """
    pieces = []
    for key in ks.split():
        if key not in gen:
            print("warning :",key,"not found")
            continue
        options = list(gen[key])
        chosen = options[np.random.choice(len(gen[key]), 1)[0]]
        for token in chosen.split():
            if token.startswith('#'):
                pieces.append(select_from_sentence_gen(gen, token))
            else:
                pieces.append('_' + token)
    return ''.join(pieces)

def generate_dataframe_for_bert(n=10000):
    """Generate ``n`` random tagged sentences as a token-level DataFrame.

    Each sentence is built from a randomly drawn entry of the module-level
    ``structures`` using the module-level ``tagger``.  The result has one row
    per token with columns ``sentence_id``, ``words`` and ``labels``.
    """
    rows = []
    for sentence_id in range(n):
        pick = np.random.choice(len(structures), 1)[0]
        initialised = init_structure(structures[pick])
        words, labels = generete_sentence_from_structure(tagger, initialised)
        rows.extend(
            [sentence_id, word, label]
            for word, label in zip(words.split(), labels.split())
        )
    return pd.DataFrame(rows, columns=['sentence_id', 'words', 'labels'])

##### data verifications

In [7]:
# function that verifies the integrity of the tagger dictionary
def verif_dict_tagger(tagger):
    """Check the internal consistency of the tagger dictionary.

    Keys starting with ``$`` are skipped.  Every other key must map to a
    list of ``(words, tags)`` pairs of whitespace-separated strings.  For
    each key three checks run, each printing a message on failure:

    1. every entry has as many words as tags;
    2. in every entry the first word and the first tag agree on whether they
       start with ``#`` (an entry where only one side is empty fails);
    3. every ``#``-prefixed word is itself a key of ``tagger``.

    The checks compare each entry with its own tags in plain Python, so a
    malformed entry is reported instead of misaligning or crashing an
    element-wise array comparison.

    Returns ``True`` only when every key passes all three checks.
    """
    verif_global = True
    for k in [key for key in tagger if not key.startswith('$')]:
        entries = [(pair[0].split(), pair[1].split()) for pair in tagger[k]]

        # verif number of words = number of tags
        verif1 = all(len(ws) == len(ts) for ws, ts in entries)
        if not verif1: print(k, "\t1-Number of words != number of tags")

        # verif all #words have #tags counterpart (first token of each entry)
        verif2 = all(
            bool(ws) == bool(ts)
            and (not ws or ws[0].startswith("#") == ts[0].startswith("#"))
            for ws, ts in entries
        )
        if not verif2: print(k, "\t2-Not all words have their tags")

        # verif all links have key
        verif3 = all(m in tagger
                     for ws, _ in entries
                     for m in ws
                     if m.startswith("#"))
        if not verif3: print(k, "\t3-Not all link have a key")

        verif_key = verif1 and verif2 and verif3
        verif_global = verif_global and verif_key

        if not verif_key: print(k)

    return verif_global

# verify the integrity of the generator dictionary
def verif_dict_generator(dict_gen):
    """Check that every ``#`` reference in the generator dictionary resolves.

    ``dict_gen`` maps a key to an iterable of sentences.  Every
    whitespace-separated token starting with ``#`` must itself be a key of
    ``dict_gen``.  The check runs on the dictionary passed in, not on a
    module-level global.

    Returns ``True`` when all references resolve (also for an empty dict).
    """
    return all(
        token in dict_gen
        for sentences in dict_gen.values()
        for sentence in sentences
        for token in sentence.split()
        if token.startswith('#')
    )

# verify the integrity of the structures
def verif_structure_link_tag(structure, tagger, display=False):
    """Check that every ``#`` token of a structure is a key of ``tagger``.

    The (possibly nested) structure is flattened first.  With
    ``display=True`` the list of unresolved tokens is printed.  Returns
    ``True`` when nothing is missing.
    """
    missing = []
    for token in pd.core.common.flatten([[structure]]):
        if token.startswith('#') and token not in tagger.keys():
            missing.append(token)

    if display:
        print(missing)
    return not missing

# verify the integrity of the tagger with respect to the generator
def verif_dict_tagger_links_gen(tagger,gen):
    """Check that every ``$GEN$`` reference in the tagger exists in ``gen``.

    Keys of ``tagger`` starting with ``$`` are skipped.  For the others,
    each ``$GEN$<key>`` token found in the words of an entry must have
    ``<key>`` present in ``gen``.  A failing tagger key is printed together
    with ``False``.  Returns ``True`` when every key passes.
    """
    all_ok = True
    for key in list(tagger.keys()):
        if key.startswith('$'):
            continue

        key_ok = True
        for entry in tagger[key]:
            for token in entry[0].split():
                if not token.startswith("$GEN$"):
                    continue
                if token.split("$GEN$")[1] not in gen:
                    key_ok = False

        if not key_ok:
            all_ok = False
            print(key, key_ok)

    return all_ok

# verify the integrity of the tagger with respect to the vocabulary
def verif_dict_tagger_links_voc(tagger,voc):
    """Check that every ``$VOC$`` reference in the tagger exists in ``voc``.

    Keys of ``tagger`` starting with ``$`` are skipped.  A reference has the
    form ``$VOC$#<key>#<sub_key>``; it is valid when ``<key>`` is in ``voc``
    and ``<sub_key>`` is in ``voc[<key>]``.  A missing ``<key>`` or a
    reference without a sub-key is reported as a failure rather than raising
    ``KeyError`` / ``IndexError``.

    Prints the offending tagger key on failure and returns ``True`` only when
    every key passes.
    """
    verif_global = True
    for k in [key for key in tagger if not key.startswith('$')]:
        voc_refs = [token.split("$VOC$")[1].split('#')[1:]
                    for entry in tagger[k]
                    for token in entry[0].split()
                    if token.startswith("$VOC$")]

        # Short-circuit so voc[ref[0]] is only touched once ref[0] is known.
        verif = all(len(ref) >= 2
                    and ref[0] in voc
                    and ref[1] in voc[ref[0]]
                    for ref in voc_refs)
        if not verif: print(k, ": a voc key is not in the voc")
        verif_global = verif_global and verif

    return verif_global

# For Markov

In [9]:
def random_date() -> str :
    """Return a random valid calendar date between 2010 and 2019 as ``DDMMYYYY``.

    The month is drawn from 1..12 and the day from 1..(last day of that
    month), so "00" components and impossible dates cannot occur.
    """
    year = int(np.random.randint(2010, 2020))
    month = int(np.random.randint(1, 13))
    # Number of days of the drawn month (handles February / leap years).
    last_day = pd.Timestamp(year=year, month=month, day=1).days_in_month
    day = int(np.random.randint(1, last_day + 1))
    return str(day).zfill(2) + str(month).zfill(2) + str(year).zfill(4)

def random_timestamp() -> str :
    """Return a random valid timestamp formatted ``DD/MM/YYYY HHhMMmSSs``.

    The year is drawn from 2010..2019, the month from 1..12 and the day from
    1..(last day of that month), so "00" date components and impossible
    dates cannot occur.
    """
    year = int(np.random.randint(2010, 2020))
    month = int(np.random.randint(1, 13))
    # Number of days of the drawn month (handles February / leap years).
    last_day = pd.Timestamp(year=year, month=month, day=1).days_in_month
    day = int(np.random.randint(1, last_day + 1))
    hour = str(np.random.randint(24)).zfill(2)
    minute = str(np.random.randint(60)).zfill(2)
    sec = str(np.random.randint(60)).zfill(2)
    date_part = str(day).zfill(2) + '/' + str(month).zfill(2) + '/' + str(year).zfill(4)
    return date_part + ' ' + hour + 'h' + minute + 'm' + sec + 's'

def random_manu(prevoc) -> str:
    """Return one randomly drawn entry of ``prevoc["manu"]``."""
    (picked,) = np.random.choice(prevoc["manu"], 1)
    return picked

def random_airl(prevoc) -> str:
    """Return one randomly drawn entry of ``prevoc["airl"]``."""
    (picked,) = np.random.choice(prevoc["airl"], 1)
    return picked

def random_airc(prevoc) -> str:
    """Return one randomly drawn entry of ``prevoc["airc"]``."""
    (picked,) = np.random.choice(prevoc["airc"], 1)
    return picked

def random_cate(prevoc) -> str:
    """Return one randomly drawn entry of ``prevoc["cate"]``."""
    (picked,) = np.random.choice(prevoc["cate"], 1)
    return picked

def random_tab(prevoc) -> str:
    """Return one randomly drawn entry of ``prevoc["tabs"]``."""
    (picked,) = np.random.choice(prevoc["tabs"], 1)
    return picked

def random_coun(prevoc) -> str:
    """Return one randomly drawn entry of ``prevoc["coun"]``."""
    (picked,) = np.random.choice(prevoc["coun"], 1)
    return picked

def init_filters() -> dict:
    """Return a filters dict with an empty list under every filter key."""
    filter_keys = (
        CT_filt_manu,
        CT_filt_airc,
        CT_filt_airl,
        CT_filt_coun,
        CT_filt_cate,
        CT_filt_date,
    )
    return {key: [] for key in filter_keys}

def init_event() -> dict:
    """Return an event with an empty tab name and empty filters."""
    event = dict()
    event[CT_tabs] = ""
    event[CT_filt] = init_filters()
    return event

def random_filters(prevoc) -> dict:
    """Return a filters dict filled with random values.

    Each of the five list filters receives between 0 and 4 random entries
    (duplicates possible); the date filter receives two random dates.
    """
    def draw_some(picker):
        # np.random.randint(5) yields 0..4, i.e. the number of entries to draw
        return [picker(prevoc) for _ in range(np.random.randint(5))]

    filters = {}
    filters[CT_filt_manu] = draw_some(random_manu)
    filters[CT_filt_airc] = draw_some(random_airc)
    filters[CT_filt_airl] = draw_some(random_airl)
    filters[CT_filt_coun] = draw_some(random_coun)
    filters[CT_filt_cate] = draw_some(random_cate)
    filters[CT_filt_date] = [random_date(), random_date()]
    return filters

def random_event(prevoc) -> dict :
    """Return an event made of a random tab and random filters."""
    tab = random_tab(prevoc)
    filters = random_filters(prevoc)
    return {CT_tabs: tab, CT_filt: filters}


def random_session(prevoc, sessid:int, n=None) -> pd.DataFrame:
    """Generate one random session as a DataFrame of events.

    Parameters
    ----------
    prevoc : vocabulary passed on to ``random_event``.
    sessid : session identifier, stored as a string in every row.
    n : number of events; when ``None`` a random count in 1..9 is drawn.

    Returns a DataFrame with the same columns as ``make_bdd`` and one row
    ``[str(sessid), timestamp, json-encoded event]`` per event.

    Rows are collected in a list and the frame is built once:
    ``DataFrame.append`` no longer exists in pandas 2.x and copied the whole
    frame on every call.
    """
    # An empty make_bdd() frame is only used as the source of column names.
    columns = make_bdd(prevoc, 0).columns
    if n is None:
        n = np.random.randint(1,10)

    rows = []
    for _ in range(n):
        timestamp = random_timestamp()
        event_str = json.dumps(random_event(prevoc))
        rows.append([str(sessid), timestamp, event_str])
    return pd.DataFrame(rows, columns=columns)

def make_bdd(prevoc, nb_session : int = 1):
    """Build a random event database of ``nb_session`` sessions.

    Returns a DataFrame with columns ``CT_bdd_sess``, ``CT_bdd_date`` and
    ``CT_bdd_json`` and a fresh 0..N-1 index.  With ``nb_session == 0`` an
    empty frame carrying only those columns is returned.

    Sessions are concatenated in one ``pd.concat`` call:
    ``DataFrame.append`` no longer exists in pandas 2.x.
    """
    sessions = [random_session(prevoc, i) for i in range(nb_session)]
    if not sessions:
        return pd.DataFrame(columns=[CT_bdd_sess, CT_bdd_date, CT_bdd_json])
    return pd.concat(sessions, ignore_index=True)

def hash_event_dict(content):
    """Return a canonical form of an event structure, for use as a state key.

    * ``str``: lower-cased.
    * ``dict``: same keys, values canonicalised recursively.
    * ``list`` / ``tuple``: elements are canonicalised first, then
      de-duplicated and sorted, so inputs that differ only by case, order or
      duplicates give the same result.  When the elements cannot be hashed or
      ordered (e.g. nested dicts) they are kept in their original order.
    * anything else (numbers, ``None``, ...): returned unchanged.
    """
    if isinstance(content, str):
        return content.lower()

    if isinstance(content, dict):
        return {key: hash_event_dict(value) for key, value in content.items()}

    if isinstance(content, (list, tuple)):
        # Normalise BEFORE sorting/dedup: ["B", "a"] and ["A", "b"] must match.
        normalised = [hash_event_dict(item) for item in content]
        try:
            normalised = sorted(set(normalised))
        except TypeError:
            # Unhashable or mutually unorderable elements: keep given order.
            pass
        return normalised if isinstance(content, list) else tuple(normalised)

    # int, float, bool, None (JSON null) and unknown types pass through.
    return content

def json_string_to_hash(json_string):
    """Convert a JSON-encoded event into its canonical JSON string.

    The event is parsed, the ``CT_sess`` and ``CT_date`` top-level keys are
    dropped, the remaining top-level keys are sorted, the values are
    canonicalised with ``hash_event_dict`` and the result is re-encoded.
    """
    parsed = json.loads(str(json_string))
    excluded = [CT_sess, CT_date]

    kept = {}
    for key in sorted(parsed.keys()):
        if key not in excluded:
            kept[key] = parsed[key]

    return json.dumps(hash_event_dict(kept))

def predict_next_state(state, df_transitions):
    """Return the label holding the largest value in row ``state``."""
    row = df_transitions.loc[(state,)]
    return row.idxmax()

In [None]:
def tfd (date):
    """Insert dashes into a compact date string: ``DDMMYYYY`` -> ``DD-MM-YYYY``."""
    first, second, rest = date[:2], date[2:4], date[4:]
    return "-".join((first, second, rest))

def _filter_phrase(values, singular, plural):
    """Return ``' for the <noun> a, b'`` for a non-empty list, ``''`` otherwise."""
    if not values:
        return ''
    noun = singular if len(values) == 1 else plural
    return ' for the ' + noun + ' ' + ', '.join(values)


def make_sentence_fom_json(event_dict):
    """Build the user-facing suggestion sentence for an event.

    The sentence names the study (global or per tab), the date range and one
    phrase per non-empty filter (manufacturer, aircraft, company, category).
    The filter phrases are joined with ``" and"``.  Empty filters add
    nothing, list values are joined by position with ``", "`` (so duplicates
    are rendered correctly) and every noun is separated from its value by a
    space.

    Example shape::

        We suggest you the global study from 01-02-2015 to 03-04-2016
        for the manufacturer X and for the aircrafts A, B. If you agree,
        click onto the following link ;)
    """
    filters = event_dict[CT_filt]

    if event_dict[CT_tabs] == CT_tabs_gen:
        subject = 'the global study'
    else:
        subject = 'the ' + event_dict[CT_tabs] + "s' study"

    # NOTE(review): every other filter is read through a CT_filt_* key and
    # random_filters() stores the dates under CT_filt_date; confirm that
    # CT_date is the intended key here.
    dates = filters[CT_date]

    phrases = [
        _filter_phrase(filters[CT_filt_manu], 'manufacturer', 'manufacturers'),
        _filter_phrase(filters[CT_filt_airc], 'aircraft', 'aircrafts'),
        _filter_phrase(filters[CT_filt_airl], 'company', 'companies'),
        _filter_phrase(filters[CT_filt_cate], 'category', 'categories'),
    ]

    sent = "We suggest you " + subject
    sent = sent + ' from ' + tfd(dates[0]) + ' to ' + tfd(dates[1])
    sent = sent + ' and'.join(phrase for phrase in phrases if phrase)
    sent = sent + '. If you agree, click onto the following link ;)'
    return sent