In [1]:
import pandas as pd
import numpy as np
import random
import re
import gensim
from gensim.models import Word2Vec
from strsimpy.jaro_winkler import JaroWinkler
import nltk
from nltk.corpus import stopwords
from spellchecker import SpellChecker
import itertools

In [2]:
# Path to the pre-trained word2vec vectors (binary word2vec format).
pathword2vec = './data/GoogleNews-vectors-negative300-SLIM.bin'
# Load the vectors as a gensim KeyedVectors object; `model` is used below only
# for vocabulary membership tests.
model = gensim.models.KeyedVectors.load_word2vec_format(pathword2vec, binary=True)
# English stop words as a set for fast membership tests.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be available locally.
voc_stopwords = set(stopwords.words('english'))

In [3]:
# Reference tables. The files use '§' as field separator (read with the
# python parsing engine) and their first column is used as the index.
df_aircraft = pd.read_csv('./data/df_aircraft.csv', sep='§', engine='python', index_col=0, encoding='utf-8')
df_airline = pd.read_csv('./data/df_airline.csv', sep='§', engine='python', index_col=0, encoding='utf-8')
df_airport = pd.read_csv('./data/df_airport.csv', sep='§', engine='python', index_col=0, encoding='utf-8')
df_country = pd.read_csv('./data/df_country.csv', sep='§', engine='python', index_col=0, encoding='utf-8')


In [4]:
def preprocess(w):
    """Normalise a raw label into an underscore-joined token string.

    Steps, in order:
      1. keep only the text before the first ',', '(' and '/';
      2. join the parts around '&' with '_';
      3. expand the standalone abbreviation "St " to "Saint ";
      4. turn '-', '.', '*' and "'" into spaces;
      5. drop a trailing "airport" and then a trailing "intl" (case-insensitive);
      6. join the remaining words with '_'.

    Parameters
    ----------
    w : str
        Raw label (e.g. an airport, airline or country name).

    Returns
    -------
    str
        The normalised label; an empty string when nothing is left.
    """
    # Keep only the leading part of the label.
    for sep in (',', '(', '/'):
        w = w.split(sep)[0].strip()
    w = "_".join(part.strip() for part in w.split('&'))
    # Only expand "St" when it is not the tail of a longer word: a plain
    # str.replace("St ", ...) would also rewrite "East "/"West " etc.
    # `[^\W\d_]` matches any letter, so the lookbehind rejects a preceding letter.
    w = re.sub(r'(?<![^\W\d_])St ', 'Saint ', w)
    for ch in "-.*'":
        w = w.replace(ch, " ")
    w = w.split()
    # Guard against an empty list: the label may be empty or consist only of
    # the suffixes being stripped.
    for suffix in ('airport', 'intl'):
        if w and w[-1].lower() == suffix:
            w = w[:-1]
    return "_".join(w)

def word_gen(model, word_list, up=False, cap=False, same=False, low=False):
    """Return the case variants of `word_list` that exist in the model vocabulary.

    Parameters
    ----------
    model : object
        Word-vector model. Its vocabulary is read from `key_to_index` when that
        attribute exists (gensim >= 4) and from `vocab` otherwise (gensim < 4).
    word_list : iterable of str
        Candidate words; multi-word entries are '_'-joined.
    up, cap, same, low : bool
        Which variants to try: upper-case, capitalised (each '_'-separated part
        is passed through `str.capitalize`, which also lower-cases the rest of
        the part), unchanged, lower-case.

    Returns
    -------
    set of str
        Every requested variant that is present in the vocabulary.
    """
    def my_capitalize(w):
        return "_".join(part.capitalize() for part in w.split('_'))

    # `KeyedVectors.vocab` no longer exists in gensim 4; prefer the new mapping.
    vocab = getattr(model, 'key_to_index', None)
    if vocab is None:
        vocab = model.vocab

    transforms = []
    if up:
        transforms.append(str.upper)
    if cap:
        transforms.append(my_capitalize)
    if same:
        transforms.append(lambda w: w)
    if low:
        transforms.append(str.lower)

    found = set()
    for word in word_list:
        for transform in transforms:
            candidate = transform(word)  # computed once per variant
            if candidate in vocab:
                found.add(candidate)
    return found

In [5]:
# initialise the (word, tag) pairs of a structure
def init_sent(list_words):
    """Turn a structure into a flat list of (token, tag) pairs.

    Items starting with '#' or '$' are kept whole and tagged '#'.
    Any other item is split on whitespace and each word is tagged '0'.
    """
    tagged = []
    for item in list_words:
        if item.startswith(('#', '$')):
            tagged.append((item, '#'))
        else:
            tagged.extend((word, '0') for word in item.split())
    return tagged

# make the swaps
def swap(tagger,W):
    """Expand one (word, tag) pair into concrete tagged word(s).

    Parameters
    ----------
    tagger : dict
        Expansion rules. '#...' keys are indexed as sequences of
        (words, tags) string pairs; '$...' keys are collections of words.
    W : tuple
        The (word, tag) pair to expand.

    Returns
    -------
    tuple or list
        The unchanged (word, tag) pair for a plain word, otherwise a list of
        (word, tag) pairs which may itself contain nested lists.

    Notes
    -----
    Reads the module-level names `gen`, `CT_SEP`, `CT_SUF_B`, `CT_SUF_I` and
    `CT_SUF_E` at call time, and draws random choices via `random.sample`.
    """
    def wt_augm(w, tag):
        """Split the '_'-joined word `w` and give every piece a position-suffixed tag."""
        ws = w.split("_")
        # drop empty pieces (e.g. from doubled or leading underscores)
        ws = [w for w in ws if w != '']
        t = tag.split(CT_SEP)
        t_name = t[0]
        # keep the suffix already carried by the tag, default to "begin"
        if len(t)>1:
            t_suffix = t[-1]
        else : t_suffix = CT_SUF_B
        
        # first piece: incoming/default suffix; last piece: E; pieces in between: I
        tags = [t_name+CT_SEP+t_suffix if i==0 
                else t_name+CT_SEP+CT_SUF_E if i+1==len(ws)
                else t_name+CT_SEP+CT_SUF_I 
                for i in range(len(ws))]
        # a tag starting with '0' stays a bare '0', without suffix
        tags = ['0' if t.startswith('0') else t for t in tags]
        return list(zip(ws,tags))
    
    hw, tag = W
    if hw.startswith('#'):
        # rule reference: pick one (words, tags) alternative at random
        n = random.sample(range(len(tagger[hw])),1)[0]
        w = tagger[hw][n]
        w = list(zip(w[0].split(), w[1].split()))
        # a concrete incoming tag overrides the tags of the chosen alternative;
        # the '#' placeholder lets the alternative's own tags apply
        w = [swap(tagger,(m[0], tag)) if (tag!='#')
             else swap(tagger,(m[0], m[1])) 
             for m in w]
        # unwrap single-element expansions
        if len(w)==1: 
            w = w[0]
            
    elif hw.startswith('$GEN$'):
        # generator reference: build a phrase from the module-level `gen` dict
        key = hw.split('$GEN$')[1]
        # [1:] drops the leading '_' of the generated string
        a = select_from_sentence_gen(gen, key)[1:]
        w = wt_augm(a, tag)
            
    elif hw.startswith('$'):
            # vocabulary selector: pick one word at random from the collection
            n = random.sample(range(len(tagger[hw])),1)[0]
            w = list(tagger[hw])[n]
            w = wt_augm(w, tag)
    else:
        # plain word: returned unchanged
        w = (hw, tag)
    
    return w

# flatten nested lists
def remove_lists(listsOfLists):
    """Flatten arbitrarily nested lists into one flat list.

    Only `list` instances are unpacked; tuples and other values are kept
    as single elements. A new list is always returned.
    """
    current = listsOfLists
    while True:
        unpacked_any = False
        flattened = []
        for item in current:
            if isinstance(item, list):
                # unpack one nesting level; deeper levels are handled by the next pass
                flattened += item
                unpacked_any = True
            else:
                flattened.append(item)
        current = flattened
        if not unpacked_any:
            return current

# generate a new tagged sentence from a structure
def generete_sentence_from_structure(tagger, structure):
    """Expand `structure` with `tagger` and return (sentence, tags).

    Both results are space-joined strings built from the same flattened
    list of (word, tag) pairs.
    """
    expanded = []
    for pair in init_sent(structure):
        expanded.append(swap(tagger, pair))
    pairs = remove_lists(expanded)
    words = (pair[0] for pair in pairs)
    labels = (pair[1] for pair in pairs)
    return " ".join(words), " ".join(labels)

# return the closest word in the vocabulary, the key and score
def closest_word_in_voc(voc, word):
    """Find the vocabulary word most similar to `word` (Jaro-Winkler).

    Parameters
    ----------
    voc : dict
        Maps a category key to a dict whose 'voc' entry is an iterable of words.
    word : str
        Word to match.

    Returns
    -------
    tuple
        (best_word, category_key, similarity). On ties the last candidate
        visited wins. When the vocabulary holds no word at all the result is
        (None, None, 0) instead of an UnboundLocalError.
    """
    jarowinkler = JaroWinkler()
    # Sentinel so that an empty vocabulary yields a well-defined result.
    result = (None, None, 0)
    for k in voc:
        for w in voc[k]['voc']:
            jaro = jarowinkler.similarity(w, word)
            if jaro >= result[2]:
                result = (w, k, jaro)
    return result

# autocorrection of a sentence based on the model and our vocabulary
def auto_correction(model_vocab, voc, voc_stopwords, sentence):
    """Correct every whitespace-separated word of `sentence`.

    For a word starting with an upper-case letter:
      1. use the closest word of `voc` when its similarity is above 0.95;
      2. else keep the word if it is in `model_vocab` or `voc_stopwords`;
      3. else use the capitalised spell-checker suggestion.
    For any other word:
      1. keep the word if it is in `model_vocab` or `voc_stopwords`;
      2. else use the closest word of `voc` when its similarity is above 0.95;
      3. else use the spell-checker suggestion.
    When the spell checker has no suggestion (it may return None), the
    original word is kept unchanged.

    Parameters
    ----------
    model_vocab : container of str
        Known words of the embedding model.
    voc : dict
        Domain vocabulary, as expected by `closest_word_in_voc`.
    voc_stopwords : container of str
        Stop words that must be left untouched.
    sentence : str
        Text to correct.

    Returns
    -------
    list of str
        One corrected word per input word.
    """
    spell = SpellChecker()
    correction = []
    for w in sentence.split():
        if w[0].isupper():
            wc, wk, ws = closest_word_in_voc(voc, w)
            if ws > 0.95:
                correction.append(wc)
            elif w in model_vocab or w in voc_stopwords:
                correction.append(w)
            else:
                wc = spell.correction(w)
                # No suggestion: keep the word as typed rather than failing on None.
                correction.append(w if wc is None else wc.capitalize())
        elif w in model_vocab or w in voc_stopwords:
            correction.append(w)
        else:
            wc, wk, ws = closest_word_in_voc(voc, w)
            if ws > 0.95:
                correction.append(wc)
            else:
                wc = spell.correction(w)
                correction.append(w if wc is None else wc)
    return correction

# build a random phrase from the gen dictionary
def select_from_sentence_gen(gen, ks):
    """Expand the space-separated keys `ks` into a '_'-prefixed word string.

    For each key one alternative of `gen[key]` is drawn with
    `np.random.choice`. Its tokens starting with '#' are expanded
    recursively; every other token is emitted preceded by '_'. Unknown keys
    print a warning and contribute nothing.
    """
    pieces = []
    for key in ks.split():
        if key in gen:
            pick = np.random.choice(len(gen[key]), 1)[0]
            alternatives = list(gen[key])
            for token in alternatives[pick].split():
                if token.startswith('#'):
                    pieces.append(select_from_sentence_gen(gen, token))
                else:
                    pieces.append('_' + token)
        else:
            print("warning :", key, "not found")
    return ''.join(pieces)

In [6]:
# verify the integrity of the tagger dictionary
def verif_dict_tagger(dict_swaps):
    """Check the rule entries of a tagger dictionary.

    Only keys that do not start with '$' are inspected; each must map to an
    iterable of (words, tags) string pairs. For every such key:

      1. each entry must have as many words as tags;
      2. at every position, the word starts with '#' exactly when its tag
         starts with '#' (all positions are checked, not only the first);
      3. every '#...' word and every '$...' word other than '$GEN$...' must
         itself be a key of `dict_swaps`.

    A message is printed for each failed check, followed by the key.
    Malformed entries are reported through the return value; they no longer
    make the comparison itself fail.

    Returns
    -------
    bool
        True when every inspected key passes all three checks.
    """
    verif_global = True
    hash_keys = [k for k in dict_swaps if not k.startswith('$')]
    for k in hash_keys:
        entries = [(entry[0].split(), entry[1].split()) for entry in dict_swaps[k]]

        # verif number of words = number of tags
        verif1 = all(len(ws) == len(ts) for ws, ts in entries)
        if not verif1: print(k, "\t1-Number of words != number of tags")

        # verif all #words have a #tag counterpart (position by position)
        verif2 = all(
            w.startswith('#') == t.startswith('#')
            for ws, ts in entries
            for w, t in zip(ws, ts)
        )
        if not verif2: print(k, "\t2-Not all words have their tags")

        # verif all links have key
        verif3 = all(
            m in dict_swaps
            for ws, _ in entries
            for m in ws
            if m.startswith("#") or (m.startswith("$") and not m.startswith("$GEN$"))
        )
        if not verif3: print(k, "\t3-Not all link have a key")

        verif_key = verif1 and verif2 and verif3
        verif_global = verif_global and verif_key

        if not verif_key: print(k)

    return verif_global

# verify the integrity of the generator dictionary
def verif_dict_generator(dict_gen):
    """Check that every '#...' token used inside `dict_gen` is one of its keys.

    Parameters
    ----------
    dict_gen : dict
        Maps a '#key' to an iterable of phrases; a phrase may reference other
        keys through tokens starting with '#'.

    Returns
    -------
    bool
        True when all referenced keys exist (also for an empty dictionary).
    """
    # Use the argument, not the module-level `gen`, so that any dictionary
    # can be validated.
    return all(
        token in dict_gen
        for phrases in dict_gen.values()
        for phrase in phrases
        for token in phrase.split()
        if token.startswith('#')
    )

# check that every rule used by the structures exists in the tagger
def verif_structures(structures, tagger, display=False):
    """Return True when each '#...' item of each structure is a key of `tagger`.

    With `display=True` the per-structure results are printed as a list of
    booleans before returning.
    """
    per_structure = []
    for struct in structures:
        struct_ok = True
        for word in struct:
            if word.startswith('#') and word not in tagger:
                struct_ok = False
        per_structure.append(struct_ok)

    if display:
        print(per_structure)

    return all(per_structure)

# check the tagger's $GEN$ links against the generator
def verif_dict_tagger_links_gen(tagger,gen):
    """Return True when every '$GEN$<key>' word used by the tagger rules has
    its <key> in `gen`.

    Tagger keys starting with '$' are skipped. For each rule key with a
    missing generator key, the rule key and False are printed.
    """
    overall = True
    for rule_key, entries in tagger.items():
        if rule_key.startswith('$'):
            continue
        linked_keys = []
        for entry in entries:
            for token in entry[0].split():
                if token.startswith("$GEN$"):
                    linked_keys.append(token.split("$GEN$")[1])
        rule_ok = all(name in gen for name in linked_keys)

        overall = overall & rule_ok
        if not rule_ok:
            print(rule_key, rule_ok)

    return overall

In [7]:
# Seed word lists, one per entity type. They are filtered against the
# embedding vocabulary when `voc` is built.
# 'Cirrus' had a trailing space, so it could never match a vocabulary entry.
manu = ['Airbus','Boeing','Cessna','Cirrus','Pilatus','Matra']
# Names taken from the reference tables, normalised with preprocess().
coun = list(df_country['country'].apply(preprocess))
citi = list(df_airport['location'].apply(preprocess))
airp = list(df_airport['airport'].apply(preprocess))
airl = list(df_airline['airline'].apply(preprocess))
mont = ['January','February','March','April','May','June','July','August','September','October','November','December']
# 'Winter' was listed twice and 'Summer' was missing.
seas = ['Winter','Spring','Summer','Autumn']
days = ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday']
# NOTE(review): these are the literal words "Year"/"Years", not year numbers;
# generated sentences therefore contain e.g. "year YEAR" - confirm this is intended.
year = ['Year','Years']

In [8]:
# Separator between a tag name and its position suffix (e.g. DATE1_B).
CT_SEP = '_'

# Position suffixes applied by swap() to multi-word entities:
# B = first word, I = word in between, E = last word.
CT_SUF_B='B'
CT_SUF_I='I'
CT_SUF_E='E'

# Tag names used in the tagger rules and the vocabulary definition.
CT_TAG_STAT = 'STAT'
CT_TAG_MANU = 'MANU'
CT_TAG_AIRP = 'AIRP'
CT_TAG_AIRL = 'AIRL'
CT_TAG_COUN = 'COUN'
CT_TAG_DATE1 = 'DATE1'
CT_TAG_DATE2 = 'DATE2'
CT_TAG_STUD = 'STUDIED'

In [9]:
# Domain vocabulary: for each category, the case variants of its seed words
# that exist in the embedding model, plus the associated tag and a label.
# Each row below is (key, seed words, tag, display name, include lower-case).
voc = {
    key: {
        'voc': word_gen(model, words, up=True, cap=True, same=True, low=with_lower),
        'tag': tag,
        'name': label,
    }
    for key, words, tag, label, with_lower in (
        ("manu", manu, CT_TAG_MANU, 'Airplane Manufacturer', False),
        ("coun", coun, CT_TAG_COUN, 'Country', False),
        ("citi", citi, None, 'City', False),
        ("airp", airp, CT_TAG_AIRP, 'Airport', False),
        ("airl", airl, CT_TAG_AIRL, 'Airline', False),
        ("mont", mont, None, 'Month', False),
        ("seas", seas, None, 'Season', True),
        ("days", days, None, 'Days', True),
        ("year", year, None, 'Year', False),
    )
}

In [10]:
# Phrase generator dictionary.
# Each '#key' maps to a set of alternative phrases. A token starting with '#'
# inside a phrase refers to another key and is expanded recursively by
# select_from_sentence_gen(); an empty-string alternative contributes no word.
gen = {
    # --- basic word families ---
    '#client' : {
        'client','clients','customer','customers','passenger','passengers',
    },
    '#satisfaction' : {
        'contentment','contentments','satisfaction','satisfactions',
    },
    '#quantity' : {
        'amount','amounts','number','quantity','sum','sums',
    },
    '#flight' : {
        'flight','flights','travel','travels',
    },
    # --- composite phrases built from the families above ---
    '#client_satisfaction' : {
        '#client #satisfaction',
        '#satisfaction of #client'
    },
    '#quantity_of_flights' : {
        '#flight #quantity',
        '#quantity of #flight'
    },
    # --- prepositions introducing the end / start of a period ---
    '#PROP#to' : {
        'to','to the beginning of','to the end of','until',
    },
    '#PROP#from' : {
        'from','from the beginning of','from the end of','in',
    },
    # --- chart vocabulary ---
    '#STATS#graph' : {
        'chart','charts','graph','graphs','histogram','histograms','pie chart','pie charts',
        'slope','slopes',
    },
    # --- verbs ---
    '#VERB#show' : {
        'display','highlight','plot','print','show','view',
    },
    '#VERB#be' : {
        'happen to be','is','seems','seems to be', 
    },
    '#VERB#like' : {
        'like','likes','love','loves',
    },
    '#VERB#dislike' : {
        'cannot stand','dislike','dislikes','hate','hates',
    },
    # --- optional pronoun / article ('' means "no word") ---
    '#PRONON#meus' : {
        'me','us','',
    },
    '#ARTICLE#' : {
        '','a','an','the',
    },
    # --- positive qualifiers ---
    '#COMP#good' : {
        'able', 'acceptable', 'ace', 'admirable', 'advantageous', 'agreeable', 'amazing', 'appropriate', 
        'awesome', 'benefic', 'capable', 'capital', 'clever', 'comfortable', 'commendable', 'common', 
        'congenial', 'convenient', 'decent', 'deluxe', 'efficient', 'excellent', 'exceptional', 
        'expert', 'fascinating', 'favorable', 'first-class', 'first-rate', 'flawless', 'fresh', 
        'friendly', 'good', 'gratifying', 'great', 'healthy', 'helpful', 'honest', 'honorable', 
        'hygienic', 'incredible', 'intact', 'kindhearted', 'marvelous', 'neat', 'nice', 'normal', 
        'opportune', 'perfect', 'pleasant', 'pleasing', 'positive', 'precious', 'prime', 'prodigious', 
        'profitable', 'qualified', 'rad', 'reliable', 'reputable', 'respectable', 'right', 'safe', 
        'salutary', 'satisfactory', 'satisfying', 'serviceable', 'shipshape', 'shocking', 'skillful',
        'solid', 'splendid', 'stable', 'sterling', 'stunning', 'stupendous', 'suitable', 'suited', 
        'super', 'superb', 'superior', 'surprising', 'talented', 'tasty', 'tip-top', 'tolerable', 
        'trustworthy', 'unbelievable', 'useful', 'valuable', 'welcome', 'wonderful', 'worthy',
    },
    # --- negative qualifiers ---
    '#COMP#bad' : {
        'abominable', 'amiss', 'atrocious', 'awful', 'bad', 'bummer', 'careless', 'catastrophic', 
        'chaotic', 'cheap', 'cheesy', 'crap', 'crappy', 'crummy', 'damaging', 'dangerous', 'defective', 
        'deficient', 'deleterious', 'detrimental', 'disagreeable', 'disastrous', 'discouraging', 
        'displeasing', 'distressing', 'dreadful', 'dumb', 'erroneous', 'evil', 'fallacious', 
        'garbage', 'godawful', 'grim', 'grody', 'gross', 'grungy', 'harsh', 'hurtful', 'icky', 
        'imperfect', 'impolite', 'inadequate', 'incorrect', 'iniquitous', 'injurious', 'junky', 
        'lame', 'loud', 'lousy', 'mean', 'moldy', 'noisy', 'not good', 'old', 'painful', 'poor', 
        'rancid', 'regretful', 'rotten', 'rude', 'ruinous', 'sad', 'shitty', 'slipshod', 'spoiled', 
        'stinking', 'strident', 'substandard', 'terrible', 'tragic', 'troubled', 'troubling', 
        'unacceptable', 'unfavorable', 'unfortunate', 'unhappy', 'unhealthy', 'unlucky', 'unpleasant', 
        'unsatisfactory', 'unwell', 'upsetting', 'vicious', 'wicked', 'wrong',
    },
}

# Sanity check: every '#' reference used in a phrase must be a key of the dictionary.
verif_dict_generator(gen)

True

In [11]:
# Example: draw a random "<good qualifier> <client satisfaction>" phrase.
# [1:] drops the leading '_' that select_from_sentence_gen puts before the first word.
select_from_sentence_gen(gen, "#COMP#good #client_satisfaction")[1:]

'fresh_clients_satisfactions'

In [12]:
# Expansion rules consumed by swap().
#   '#...' keys  -> list of (words, tags) alternatives; words and tags are
#                   space-separated and aligned position by position. A '#'
#                   tag means "use the tags of the referenced rule".
#   '$GEN$<key>' -> phrase generated from gen[<key>].
#   '$...' keys  -> vocabulary selectors: a set of words to pick from.
tagger = {
    '#STATS#graph' : [('$GEN$#STATS#graph',CT_TAG_STAT)],
    '#VERB#show' : [('$GEN$#VERB#show','0')],
    '#PRONON#meus' : [('$GEN$#PRONON#meus','0')],
    '#ARTICLE#' : [('$GEN$#ARTICLE#','0')],

    # Named Entity
    '#NE#manu' : [
        ('$manu$voc',CT_TAG_MANU),
    ],

    '#NE#airp' : [
        ('$airp$voc',CT_TAG_AIRP),
        ('$airp$voc airport',CT_TAG_AIRP+' 0'),
        ('airport of $airp$voc','0 0 '+CT_TAG_AIRP),
    ],

    '#NE#airl' : [
        ('$airl$voc',CT_TAG_AIRL),
    ],

    '#NE#coun' : [
        ('$coun$voc',CT_TAG_COUN),
    ],

    # Date1
    '#DATE1#' : [
      ('#DATE1#y','#'),
      ('the year #DATE1#y','0 0 #'),
      ('year #DATE1#y','0 #'),
      ('#DATE1#my','#'),
      ('#DATE1#sy','#'),
    ],
    '#DATE1#y' : [
      ('$year$voc',
       CT_TAG_DATE1+CT_SEP+CT_SUF_B),
    ],
    '#DATE1#my' : [
      ('$mont$voc $year$voc',
       CT_TAG_DATE1+CT_SEP+CT_SUF_B+' '+
       CT_TAG_DATE1+CT_SEP+CT_SUF_E),
    ],
    '#DATE1#sy' : [
      ('$seas$voc $year$voc',
       CT_TAG_DATE1+CT_SEP+CT_SUF_B+' '+
       CT_TAG_DATE1+CT_SEP+CT_SUF_E),
    ],

    # Date2
    '#DATE2#' : [
      ('#DATE2#y','#'),
      ('the year #DATE2#y','0 0 #'),
      ('year #DATE2#y','0 #'),
      ('#DATE2#my','#'),
      ('#DATE2#sy','#'),
    ],
    '#DATE2#y' : [
      ('$year$voc',
       CT_TAG_DATE2+CT_SEP+CT_SUF_B)
    ],
    '#DATE2#my' : [
      ('$mont$voc $year$voc',
       CT_TAG_DATE2+CT_SEP+CT_SUF_B+' '+
       CT_TAG_DATE2+CT_SEP+CT_SUF_E),
    ],
    '#DATE2#sy' : [
      ('$seas$voc $year$voc',
       CT_TAG_DATE2+CT_SEP+CT_SUF_B+' '+
       CT_TAG_DATE2+CT_SEP+CT_SUF_E),
    ],

    # Studied variable
    '#STUDIED#' : [
      ('#ARTICLE# $GEN$#quantity_of_flights','# '+CT_TAG_STUD),
      ('#ARTICLE# $GEN$#client_satisfaction','# '+CT_TAG_STUD),
    ],

    # Propositions SHOW
    '#PROP#show_meus_the' : [
      ('#VERB#show #PRONON#meus #ARTICLE#','# # #')
    ],

    # Propositions DATES
    '#PROP#DATE#from_to' : [
      ('$GEN$#PROP#from #DATE1# $GEN$#PROP#to #DATE2#','0 # 0 #'),
    ],
    '#PROP#DATE#for' : [
      ('for #ARTICLE# #DATE1#','0 # #'),
    ],
    '#PROP#DATE#since' : [
      ('since #ARTICLE# #DATE1#','0 # #'),
    ],

    # SELECTORS
    # The '$airl$voc' key used to be listed twice; the duplicate is removed.
    # NOTE(review): voc['citi'] has no selector here - the duplicate may have
    # been meant to be '$citi$voc'; confirm before adding it.
    '$manu$voc' : voc['manu']['voc'],
    '$coun$voc' : voc['coun']['voc'],
    '$airl$voc' : voc['airl']['voc'],
    '$airp$voc' : voc['airp']['voc'],
    '$mont$voc' : voc['mont']['voc'],
    '$seas$voc' : voc['seas']['voc'],
    '$days$voc' : voc['days']['voc'],
    '$year$voc' : voc['year']['voc'],
}

# Sanity checks: rule consistency, then $GEN$ links against the generator.
print(verif_dict_tagger(tagger))
print(verif_dict_tagger_links_gen(tagger,gen))

True
True


In [13]:
# Sentence templates: both share the same body and differ only in the
# closing date proposition.
structures = [
    ['#PROP#show_meus_the', '#STATS#graph', 'of', '#STUDIED#', 'for', '#NE#manu', 'and',
     '#NE#manu', 'in', '#NE#coun', date_proposition]
    for date_proposition in ('#PROP#DATE#from_to', '#PROP#DATE#for')
]

# Sanity check: every '#' rule used by a template must exist in the tagger.
verif_structures(structures, tagger)

True

In [14]:
# Pick one template at random and expand it into a sentence with aligned tags.
# NOTE(review): no random seed is set in the visible cells, so the output
# differs from run to run.
structure = structures[np.random.choice(len(structures),1)[0]]
sent,tags = generete_sentence_from_structure(tagger,structure)
# Sanity check: exactly one tag per word.
print("Same length :", len(sent.split()) == len(tags.split()))
sent,tags

Same length : True


('view a graph of a client contentments for MATRA and Pilatus in Guatemala for year YEAR',
 '0 0 STAT_B 0 0 STUDIED_B STUDIED_E 0 MANU_B 0 MANU_B 0 COUN_B 0 0 DATE1_B')

In [15]:
# Example: correct a sentence containing typos, using the embedding
# vocabulary, the domain vocabulary and the stop-word list.
sentence = 'I woulde lirke the graph of the numbr of fligt for Boing on mondy 2020'
auto_correction(model.vocab, voc, voc_stopwords, sentence)

['I',
 'would',
 'like',
 'the',
 'graph',
 'of',
 'the',
 'number',
 'of',
 'flight',
 'for',
 'Boeing',
 'on',
 'monday',
 '2020']