! pip install strsimpy  
! pip install pyspellchecker  

In [1]:
import pandas as pd
import numpy as np
import random
import re
import gensim
from gensim.models import Word2Vec
from strsimpy.jaro_winkler import JaroWinkler
import nltk
from nltk.corpus import stopwords
from spellchecker import SpellChecker
import itertools

##### Libraries
! pip install pyspellchecker
##### Models:
- Slim model (the file this notebook loads from the data directory): https://github.com/eyaler/word2vec-slim/blob/master/GoogleNews-vectors-negative300-SLIM.bin.gz  
- Full model: https://github.com/mmihaltz/word2vec-GoogleNews-vectors/blob/master/GoogleNews-vectors-negative300.bin.gz

!wget -P ./data -c "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"

In [2]:
# Execute the two companion notebooks inside this kernel's namespace.
# Later cells use names that are never defined in this notebook (e.g.
# data_directory, the CT_* constants, df_country, preprocess, word_gen,
# init_structure), so presumably they are provided by these runs —
# TODO confirm which notebook defines which name.
%run constants.ipynb
%run functions.ipynb

In [3]:
# Load the pre-trained word2vec vectors (binary word2vec format) from the data
# directory. `data_directory` is concatenated as-is with the file name.
w2v_filename = 'GoogleNews-vectors-negative300-SLIM.bin.gz'
pathword2vec = data_directory + w2v_filename
load_w2v = gensim.models.KeyedVectors.load_word2vec_format
model_w2v = load_w2v(pathword2vec, binary=True)

In [4]:
# Make sure the NLTK stop-word corpus is available locally, then keep the
# English stop words as a set.
nltk.download('stopwords')
english_stopwords = stopwords.words('english')
voc_stopwords = set(english_stopwords)

[nltk_data] Downloading package stopwords to /home/david/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# Building bricks of the tag set: entity prefixes, the separator and the
# position suffixes.
prefixe = [
    CT_TAG_STAT, CT_TAG_MANU, CT_TAG_AIRP, CT_TAG_AIRL,
    CT_TAG_COUN, CT_TAG_DATE1, CT_TAG_DATE2, CT_TAG_STUD,
]
separat = [CT_SEP]
suffixe = [CT_SUF_B, CT_SUF_I, CT_SUF_E]

# The "outside" tag first, then every prefix/separator/suffix combination in
# itertools.product order (prefix varies slowest, suffix fastest).
all_tags = [
    CT_TAG_O,
    *(pre + sep + suf
      for pre, sep, suf in itertools.product(prefixe, separat, suffixe)),
]
print(all_tags)

['O', 'STAT_B', 'STAT_I', 'STAT_E', 'MANU_B', 'MANU_I', 'MANU_E', 'AIRP_B', 'AIRP_I', 'AIRP_E', 'AIRL_B', 'AIRL_I', 'AIRL_E', 'COUN_B', 'COUN_I', 'COUN_E', 'DATE1_B', 'DATE1_I', 'DATE1_E', 'DATE2_B', 'DATE2_I', 'DATE2_E', 'STUDIED_B', 'STUDIED_I', 'STUDIED_E']


In [6]:
# Seed word lists for each vocabulary family. They are passed to word_gen()
# in the next cell to build the `voc` dictionary.

# Aircraft manufacturers (fixed: 'Cirrus ' carried a stray trailing space).
manu = ['Airbus', 'Boeing', 'Cessna', 'Cirrus', 'Pilatus', 'Matra']

# Entities read from the reference data frames, each value passed through
# preprocess().
coun = list(df_country['country'].apply(preprocess))
citi = list(df_airport['location'].apply(preprocess))
airp = list(df_airport['airport'].apply(preprocess))
airl = list(df_airline['airline'].apply(preprocess))

# Calendar vocabulary.
mont = ['January', 'February', 'March', 'April', 'May', 'June',
        'July', 'August', 'September', 'October', 'November', 'December']
# Fixed: the list previously contained 'Winter' twice and no 'Summer'.
seas = ['Winter', 'Spring', 'Summer', 'Autumn']
days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday',
        'Saturday', 'Sunday']
# Only referenced by a commented-out word_gen() call in the next cell.
year = ['Year', 'Years']

In [7]:
def _voc_entry(words, tag, name, low=False):
    """Build one `voc` entry from a seed word list.

    The vocabulary is produced by word_gen() with up/cap/same always enabled;
    only the `low` flag differs between families.
    """
    return {
        'voc': word_gen(model_w2v, words, up=True,
                        cap=True, same=True, low=low),
        'tag': tag,
        'name': name,
    }


# One entry per vocabulary family: its generated word set ('voc'), the tag
# prefix attached to it ('tag', None when the family has no tag of its own)
# and a display name ('name').
voc = {
    "manu": _voc_entry(manu, CT_TAG_MANU, 'Airplane Manufacturer'),
    "coun": _voc_entry(coun, CT_TAG_COUN, 'Country'),
    "citi": _voc_entry(citi, None, 'City'),
    "airp": _voc_entry(airp, CT_TAG_AIRP, 'Airport'),
    "airl": _voc_entry(airl, CT_TAG_AIRL, 'Airline'),
    "mont": _voc_entry(mont, None, 'Month'),
    "seas": _voc_entry(seas, None, 'Season', low=True),
    "days": _voc_entry(days, None, 'Days', low=True),
    # Years are not generated from word2vec: the vocabulary is simply every
    # year from 1900 to 2049 as a string.
    "year": {
        'voc': {str(y) for y in range(1900, 2050)},
        'tag': None,
        'name': 'Year',
    },
}

In [8]:
# Generator dictionary: each '#'-prefixed key maps to a set of alternative
# surface forms. Some alternatives are themselves written with other keys
# (e.g. '#client #satisfaction'), so entries can be composed from one another.
# NOTE(review): how an alternative is picked and expanded is decided by
# select_from_sentence_gen() / the functions notebook — confirm there.
gen = {
    # --- Basic nouns -------------------------------------------------------
    '#client' : {
        'client', 'clients', 'customer', 'customers', 'passenger', 'passengers',
    },
    '#satisfaction' : {
        'contentment', 'contentments', 'satisfaction', 'satisfactions',
    },
    '#quantity' : {
        'amount', 'amounts', 'number', 'quantity', 'sum', 'sums',
    },
    '#flight' : {
        'flight', 'flights', 'travel', 'travels',
    },
    # --- Composite entries built from the nouns above ----------------------
    '#client_satisfaction' : {
        '#client #satisfaction',
        '#satisfaction of #client'
    },
    '#quantity_of_flights' : {
        '#flight #quantity',
        '#quantity of #flight'
    },
    # --- Prepositions introducing the bounds of a period --------------------
    '#PROP#to' : {
        'to', 'to the beginning of', 'to the end of', 'until',
    },
    '#PROP#from' : {
        'from', 'from the beginning of', 'from the end of', 'in',
    },
    # --- Kinds of statistics / visualisations ------------------------------
    '#STATS#graph' : {
        'chart','charts', 'graph','graphs', 'histogram', 'histograms',
        'pie chart', 'pie charts', 'slope', 'slopes',
    },
    '#STATS#wordcloud' : {
        'cloud of words', 'wordcloud',
    },
    '#STATS#plan_of_plane' : {
        'plan of plane', 'plans of plane', 'plans of planes',
    },
    # --- Verbs -------------------------------------------------------------
    '#VERB#show' : {
        'display', 'highlight', 'plot', 'print', 'show', 'view',
    },
    '#VERB#be' : {
        'happen to be', 'is', 'seems', 'seems to be', 
    },
    '#VERB#like' : {
        'like', 'likes', 'love', 'loves',
    },
    '#VERB#dislike' : {
        'cannot stand', 'dislike', 'dislikes', 'hate', 'hates',
    },
    # --- Pronouns and articles ---------------------------------------------
    # The key spelling 'PRONON' is also used by the tagger cell, so it must
    # stay in sync with it.
    '#PRONON#meus' : {
        'me', 'us',
    },
    '#ARTICLE#' : {
        'a', 'an', 'the',
    },
    # --- Positive qualifiers -----------------------------------------------
    '#COMP#good' : {
        'able', 'acceptable', 'ace', 'admirable', 'advantageous', 'agreeable',
        'amazing', 'appropriate', 'awesome', 'benefic', 'capable', 'capital', 
        'clever', 'comfortable', 'commendable', 'common', 'congenial', 
        'convenient', 'decent', 'deluxe', 'efficient', 'excellent', 
        'exceptional', 'expert', 'fascinating', 'favorable', 'first-class', 
        'first-rate', 'flawless', 'fresh', 'friendly', 'good', 'gratifying', 
        'great', 'healthy', 'helpful', 'honest', 'honorable', 'hygienic', 
        'incredible', 'intact', 'kindhearted', 'marvelous', 'neat', 'nice', 
        'normal', 'opportune', 'perfect', 'pleasant', 'pleasing', 'positive', 
        'precious', 'prime', 'prodigious', 'profitable', 'qualified', 'rad', 
        'reliable', 'reputable', 'respectable', 'right', 'safe', 'salutary',
        'satisfactory', 'satisfying', 'serviceable', 'shipshape', 'shocking', 
        'skillful','solid', 'splendid', 'stable', 'sterling', 'stunning', 
        'stupendous', 'suitable', 'suited', 'super', 'superb', 'superior', 
        'surprising', 'talented', 'tasty', 'tip-top', 'tolerable', 
        'trustworthy', 'unbelievable', 'useful', 'valuable', 'welcome', 
        'wonderful', 'worthy',
    },
    # --- Negative qualifiers -----------------------------------------------
    '#COMP#bad' : {
        'abominable', 'amiss', 'atrocious', 'awful', 'bad', 'bummer', 'careless', 
        'catastrophic', 'chaotic', 'cheap', 'cheesy', 'crap', 'crappy', 'crummy', 
        'damaging', 'dangerous', 'defective', 'deficient', 'deleterious', 
        'detrimental', 'disagreeable', 'disastrous', 'discouraging', 'displeasing', 
        'distressing', 'dreadful', 'dumb', 'erroneous', 'evil', 'fallacious', 
        'garbage', 'godawful', 'grim', 'grody', 'gross', 'grungy', 'harsh', 'hurtful', 
        'icky', 'imperfect', 'impolite', 'inadequate', 'incorrect', 'iniquitous', 
        'injurious', 'junky', 'lame', 'loud', 'lousy', 'mean', 'moldy', 'noisy', 
        'not good', 'old', 'painful', 'poor', 'rancid', 'regretful', 'rotten', 
        'rude', 'ruinous', 'sad', 'shitty', 'slipshod', 'spoiled', 
        'stinking', 'strident', 'substandard', 'terrible', 'tragic', 'troubled', 
        'troubling', 'unacceptable', 'unfavorable', 'unfortunate', 'unhappy',
        'unhealthy', 'unlucky', 'unpleasant', 'unsatisfactory', 'unwell', 'upsetting', 
        'vicious', 'wicked', 'wrong',
    },
}

# Sanity check of the dictionary; the value is displayed as the cell output.
# NOTE(review): the exact rules checked live in verif_dict_generator() —
# confirm in the functions notebook.
verif_dict_generator(gen)

True

In [9]:
# Demo: expand a template made of two generator keys into one concrete phrase.
# The first character of the returned string is dropped by the [1:] slice.
# NOTE(review): presumably the helper prefixes its result with a separator
# character — confirm in select_from_sentence_gen().
select_from_sentence_gen(gen, "#COMP#good #client_satisfaction")[1:]

'stable_satisfaction_of_client'

In [10]:
def _seq(*token_tags):
    """Join per-element tags into one space-separated tag string."""
    return ' '.join(token_tags)


# Begin/end tags for the two date slots, spelled out once.
_DATE1_B = CT_TAG_DATE1 + CT_SEP + CT_SUF_B
_DATE1_E = CT_TAG_DATE1 + CT_SEP + CT_SUF_E
_DATE2_B = CT_TAG_DATE2 + CT_SEP + CT_SUF_B
_DATE2_E = CT_TAG_DATE2 + CT_SEP + CT_SUF_E

# Tagging grammar. Each key maps to a list of (pattern, tags) alternatives:
# the pattern is a space-separated sequence of literals, '$GEN$…' / '$VOC$…'
# references and other grammar keys; the tag string carries one tag per
# pattern element.
# NOTE(review): CT_HASH appears to mean "use the tags of the referenced rule"
# — confirm in the functions notebook.
tagger = {
    '#STATS#graph': [('$GEN$#STATS#graph', CT_TAG_STAT)],
    '#STATS#wordcloud': [('$GEN$#STATS#wordcloud', CT_TAG_STAT)],
    '#STATS#plan_of_plane': [('$GEN$#STATS#plan_of_plane', CT_TAG_STAT)],
    '#VERB#show': [('$GEN$#VERB#show', CT_TAG_O)],
    '#PRONON#meus': [('$GEN$#PRONON#meus', CT_TAG_O)],
    '#ARTICLE#': [('$GEN$#ARTICLE#', CT_TAG_O)],

    # ---- Named entities ----
    '#NE#manu': [
        ('$VOC$#manu#voc', CT_TAG_MANU),
    ],
    '#NE#airp': [
        ('$VOC$#airp#voc', CT_TAG_AIRP),
        ('$VOC$#airp#voc airport', _seq(CT_TAG_AIRP, CT_TAG_O)),
        ('airport of $VOC$#airp#voc', _seq(CT_TAG_O, CT_TAG_O, CT_TAG_AIRP)),
    ],
    '#NE#airl': [
        ('$VOC$#airl#voc', CT_TAG_AIRL),
    ],
    '#NE#coun': [
        ('$VOC$#coun#voc', CT_TAG_COUN),
    ],

    # ---- First date slot ----
    '#DATE1#': [
        ('#DATE1#y', CT_HASH),
        ('year #DATE1#y', _seq(CT_TAG_O, CT_HASH)),
        ('#DATE1#my', CT_HASH),
        ('#DATE1#sy', CT_HASH),
    ],
    '#DATE1#y': [
        ('$VOC$#year#voc', _DATE1_B),
    ],
    '#DATE1#my': [
        ('$VOC$#mont#voc $VOC$#year#voc', _seq(_DATE1_B, _DATE1_E)),
    ],
    '#DATE1#sy': [
        ('$VOC$#seas#voc $VOC$#year#voc', _seq(_DATE1_B, _DATE1_E)),
    ],

    # ---- Second date slot ----
    '#DATE2#': [
        ('#DATE2#y', CT_HASH),
        ('year #DATE2#y', _seq(CT_TAG_O, CT_HASH)),
        ('#DATE2#my', CT_HASH),
        ('#DATE2#sy', CT_HASH),
    ],
    '#DATE2#y': [
        ('$VOC$#year#voc', _DATE2_B),
    ],
    '#DATE2#my': [
        ('$VOC$#mont#voc $VOC$#year#voc', _seq(_DATE2_B, _DATE2_E)),
    ],
    '#DATE2#sy': [
        ('$VOC$#seas#voc $VOC$#year#voc', _seq(_DATE2_B, _DATE2_E)),
    ],

    # ---- Studied variable ----
    '#STUDIED#': [
        ('#ARTICLE# $GEN$#quantity_of_flights', _seq(CT_HASH, CT_TAG_STUD)),
        ('#ARTICLE# $GEN$#client_satisfaction', _seq(CT_HASH, CT_TAG_STUD)),
    ],

    # ---- "Show me the ..." proposition ----
    '#PROP#show_meus_the': [
        ('#VERB#show #PRONON#meus #ARTICLE#', _seq(CT_HASH, CT_HASH, CT_HASH)),
    ],

    # ---- Date propositions ----
    '#PROP#DATE#': [
        ('#PROP#DATE#from_to', CT_HASH),
        ('#PROP#DATE#for', CT_HASH),
        ('#PROP#DATE#since', CT_HASH),
    ],
    '#PROP#DATE#from_to': [
        ('$GEN$#PROP#from #DATE1# $GEN$#PROP#to #DATE2#',
         _seq(CT_TAG_O, CT_HASH, CT_TAG_O, CT_HASH)),
    ],
    '#PROP#DATE#for': [
        ('for #DATE1#', _seq(CT_TAG_O, CT_HASH)),
        ('for the #DATE1#', _seq(CT_TAG_O, CT_TAG_O, CT_HASH)),
    ],
    '#PROP#DATE#since': [
        ('since #DATE1#', _seq(CT_TAG_O, CT_HASH)),
        ('since the #DATE1#', _seq(CT_TAG_O, CT_TAG_O, CT_HASH)),
    ],
}

# Consistency checks: the grammar itself, then its links to `gen` and `voc`.
print(verif_dict_tagger(tagger))
print(verif_dict_tagger_links_gen(tagger, gen))
print(verif_dict_tagger_links_voc(tagger, voc))

True
True
True


In [11]:
# Sentence skeletons. NOTE(review): tuples appear to list alternatives (one is
# picked) and lists appear to be sequences emitted in order; '' stands for
# "nothing" — confirm in init_structure().

# Link between two entities of the same kind: "and", a comma, or nothing.
_link = ('and', ',', '')
_one_or_two_manu = ('#NE#manu', ['#NE#manu', _link, '#NE#manu'])
_one_or_two_coun = ('#NE#coun', ['#NE#coun', _link, '#NE#coun'])

# Optional "show me the" opening.
s_do_or_not = ('', '#PROP#show_meus_the')

# What is requested: a plane plan, a word cloud, or a graph of a variable.
s_what = (
    '#STATS#plan_of_plane',
    '#STATS#wordcloud',
    ['#STATS#graph', 'of', '#STUDIED#'],
)

# Optional manufacturer restriction.
s_who_or_not = ('', ['for', _one_or_two_manu])

# Optional country + date restriction.
s_when_or_not = ('', ['in', _one_or_two_coun, "#PROP#DATE#"])

# The six orderings of the four segments used to generate sentences.
structures = [
    [s_do_or_not, s_what, s_who_or_not, s_when_or_not],
    [s_do_or_not, s_what, s_when_or_not, s_who_or_not],
    [s_who_or_not, s_do_or_not, s_what, s_when_or_not],
    [s_when_or_not, s_do_or_not, s_what, s_who_or_not],
    [s_who_or_not, s_when_or_not, s_do_or_not, s_what],
    [s_when_or_not, s_who_or_not, s_do_or_not, s_what],
]

# Check the structures against the tagging grammar (result is the cell output).
verif_structure_link_tag(structures, tagger, display=False)

True

In [12]:
# Demo: draw one sentence structure at random and show its initialised form.
structure_idx = np.random.choice(len(structures), 1)[0]
structure = structures[structure_idx]
init_structure(structure)

['#PROP#show_meus_the', '#STATS#plan_of_plane', 'for', '#NE#manu']

In [13]:
# Demo: generate one sentence with its tags from a randomly drawn structure
# and check that there is exactly one tag per word.
structure_idx = np.random.choice(len(structures), 1)[0]
structure = structures[structure_idx]
structure_init = init_structure(structure)
sent, tags = generete_sentence_from_structure(tagger, structure_init)
n_words = len(sent.split())
n_tags = len(tags.split())
print("Same length :", n_words == n_tags)
(sent, tags)

Same length : True


('print me the plan of plane', 'O O O STAT_B STAT_I STAT_E')

In [14]:
# Demo: spell-correct a deliberately misspelled request.
sentence = 'I woulde lirke the graph of the numbr of fligt for Boing on mondy 2020'

# gensim >= 4 removed `KeyedVectors.vocab` (accessing it raises AttributeError)
# and exposes the vocabulary as the `key_to_index` dict instead. Use it when
# present and fall back to `.vocab` on gensim 3.x, where behaviour is unchanged.
# NOTE(review): assumes auto_correction() only needs word membership/keys from
# this mapping — confirm in the functions notebook.
w2v_vocab = getattr(model_w2v, 'key_to_index', None)
if w2v_vocab is None:
    w2v_vocab = model_w2v.vocab
auto_correction(w2v_vocab, voc, voc_stopwords, sentence)

['I',
 'would',
 'like',
 'the',
 'graph',
 'of',
 'the',
 'number',
 'of',
 'flight',
 'for',
 'Boeing',
 'on',
 'monday',
 '2020']

##### Generation of the datasets

In [15]:
# Build the training set: 10000 generated sentences, stored as one
# [sentence_id, word, label] row per token.
data_train = []
for i in range(10000):
    structure = structures[np.random.choice(len(structures),1)[0]]
    structure_init = init_structure(structure)
    sent, tags = generete_sentence_from_structure(tagger, structure_init)
    words, labels = sent.split(), tags.split()
    # zip() would silently drop the surplus tokens and shift nothing into
    # view, leaving mislabelled rows in the dataset; fail loudly instead.
    if len(words) != len(labels):
        raise ValueError(
            "Sentence %d has %d words but %d tags: %r / %r"
            % (i, len(words), len(labels), sent, tags)
        )
    data_train.extend([i, word, label] for word, label in zip(words, labels))

In [16]:
# Build the evaluation set: 1000 generated sentences, stored as one
# [sentence_id, word, label] row per token.
data_eval = []
for i in range(1000):
    structure = structures[np.random.choice(len(structures),1)[0]]
    structure_init = init_structure(structure)
    sent, tags = generete_sentence_from_structure(tagger, structure_init)
    words, labels = sent.split(), tags.split()
    # zip() would silently drop the surplus tokens when the counts differ,
    # leaving mislabelled rows in the dataset; fail loudly instead.
    if len(words) != len(labels):
        raise ValueError(
            "Sentence %d has %d words but %d tags: %r / %r"
            % (i, len(words), len(labels), sent, tags)
        )
    data_eval.extend([i, word, label] for word, label in zip(words, labels))

In [17]:
# Creating train_df  and eval_df 
train_df = pd.DataFrame(data_train, columns=['sentence_id', 'words', 'labels'])
eval_df = pd.DataFrame(data_eval, columns=['sentence_id', 'words', 'labels'])

In [18]:
# Write both datasets to the data directory as UTF-8 text files that use '§'
# as the field separator (the data frame index is written too, as by default).
csv_options = dict(sep='§', encoding='utf-8')
train_csv_path = data_directory + 'tagger_train.csv'
eval_csv_path = data_directory + 'tagger_eval.csv'
train_df.to_csv(train_csv_path, **csv_options)
eval_df.to_csv(eval_csv_path, **csv_options)