Created on Fri Jan 17 17:03:35 2019  
Group 9  
@authors:  

Sources (DataFrame : scraping notebook that produced it):  
df_aircraft : scrap_flugzeuginfo_aircrafts.ipynb  
df_airline : scrap_flugzeuginfo_airlines.ipynb  
df_airport : scrap_flugzeuginfo_airports.ipynb  
df_country : scrap_flugzeuginfo_countries.ipynb  

In [None]:
from gensim.models import KeyedVectors

In [None]:
def preprocess(w: str) -> str:
    """Normalize a raw database label into an underscore-joined token.

    The steps are applied in this order:
      1. keep only the text before the first ',', then '(' and then '/';
      2. join the parts found around '&' with '_';
      3. expand "St " into "Saint " and turn '-', '.', '*' and "'" into spaces;
      4. drop a trailing "airport", then a trailing "intl" (case-insensitive);
      5. join the remaining whitespace-separated words with '_'.

    Parameters:
        w: raw label

    Out:
        the normalized label; an empty string when no word is left
        (e.g. for "", "(unknown)" or "Airport")

    """
    # Keep only the leading part of the label, one delimiter after another.
    for delimiter in (',', '(', '/'):
        w = w.split(delimiter)[0].strip()

    w = "_".join(part.strip() for part in w.split('&'))
    w = w.replace("St ", "Saint ")
    # One pass replaces every punctuation character by a blank.
    w = w.translate(str.maketrans("-.*'", " " * 4))

    words = w.split()
    # `words` may be empty (blank label, or nothing left once "airport" is
    # removed), so check it before looking at the last word.
    for suffix in ('airport', 'intl'):
        if words and words[-1].lower() == suffix:
            words.pop()
    return "_".join(words)

In [None]:
def get_prevoc() -> dict:
    """Return the raw vocabulary found in the initial database.

    Four CSV files (aircraft, airline, airport, country) are read from
    `data_directory`; the relevant column of each one is normalized with
    `preprocess`. Hard-coded lists (manufacturers, months, seasons, days, ...)
    complete the vocabulary. It's used for the sentence generation and the
    train.

    Out:
        prevoc: dictionary mapping a short category key ("manu", "coun",
                "citi", ...) to the list of words of that category

    """
    def _load(file_name: str):
        # All the database dumps share the same '§'-separated layout.
        return pd.read_csv(data_directory + file_name, sep='§', engine='python',
                           index_col=0, encoding='utf-8')

    df_aircraft = _load('df_aircraft.csv')
    df_airline = _load('df_airline.csv')
    df_airport = _load('df_airport.csv')
    df_country = _load('df_country.csv')

    prevoc = {
        # No surrounding blanks: these words are matched as-is later on.
        "manu": ['Airbus', 'Boeing', 'Cessna', 'Cirrus', 'Pilatus', 'Matra'],
        "coun": list(df_country['country'].apply(preprocess)),
        "citi": list(df_airport['location'].apply(preprocess)),
        "airc": list(df_aircraft['model'].apply(preprocess)),
        "airp": list(df_airport['airport'].apply(preprocess)),
        "airl": list(df_airline['airline'].apply(preprocess)),
        "mont": ['January', 'February', 'March', 'April', 'May', 'June', 'July',
                 'August', 'September', 'October', 'November', 'December'],
        # The four seasons, each listed once.
        "seas": ['Winter', 'Spring', 'Summer', 'Autumn'],
        "days": ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday',
                 'Saturday', 'Sunday'],
        "year": ['Year', 'Years'],
        "cate": ['Business', 'Premium', 'Economy'],
        "tabs": ['General', 'Seat', 'Plane plan', 'Flight characteristics', 'Comments'],
    }
    return prevoc

In [None]:
def get_voc(model_w2v: KeyedVectors, prevoc: dict) -> dict:
    """Build the tagged vocabulary from the prevoc and the word2vec model.

    Every category except "year" is produced by `word_gen`, always called with
    up=True, cap=True and same=True; `low` is True only for the seasons and
    the days. The "year" category is simply every year from 1900 to 2049
    written as a string.

    Parameters:
        model_w2v: word2vec model handed over to word_gen
        prevoc: dictionary of prevoc vocabulary (see get_prevoc)

    Out:
        voc: dictionary mapping each category key to a dictionary holding its
             words ('voc'), its tag ('tag', None when the category has none)
             and its display name ('name')

    """
    # (category key, tag, display name, value of the `low` flag of word_gen)
    generated_categories = (
        ("manu", CT_TAG_MANU, 'Airplane Manufacturer', False),
        ("coun", CT_TAG_COUN, 'Country', False),
        ("citi", None, 'City', False),
        ("airc", CT_TAG_AIRC, 'Aircraft', False),
        ("airp", CT_TAG_AIRP, 'Airport', False),
        ("airl", CT_TAG_AIRL, 'Airline', False),
        ("mont", None, 'Month', False),
        ("seas", None, 'Season', True),
        ("days", None, 'Days', True),
    )

    voc = {}
    for key, tag, label, with_low in generated_categories:
        voc[key] = {
            'voc': word_gen(model_w2v, prevoc[key], up=True,
                            cap=True, same=True, low=with_low),
            'tag': tag,
            'name': label,
        }

    # Years do not go through word_gen: they are enumerated directly.
    voc["year"] = {
        'voc': set(map(str, range(1900, 2050))),
        'tag': None,
        'name': 'Year',
    }
    return voc

In [None]:
def get_gen() -> dict:
    """Return the interchangeable wordings available for each '#' placeholder.

    Every key is a '#...' placeholder and every value is the set of words or
    phrases that can stand for it. The phrases of '#client_satisfaction' and
    '#quantity_of_flights' are themselves written with other placeholders.

    Out:
        gen: dictionary with several verbs, adjectives, subjects usable for
             each '#' placeholder

    """

    def _phrases(listing: str) -> set:
        # Comma-separated listing -> set of phrases without surrounding blanks.
        candidates = (chunk.strip() for chunk in listing.split(','))
        return {phrase for phrase in candidates if phrase}

    gen = {}

    # Nouns
    gen['#client'] = _phrases(
        'client, clients, customer, customers, passenger, passengers')
    gen['#satisfaction'] = _phrases(
        'contentment, contentments, satisfaction, satisfactions')
    gen['#quantity'] = _phrases('amount, amounts, number, quantity, sum, sums')
    gen['#flight'] = _phrases('flight, flights, travel, travels')

    # Noun groups written with other placeholders
    gen['#client_satisfaction'] = _phrases(
        '#client #satisfaction, #satisfaction of #client')
    gen['#quantity_of_flights'] = _phrases(
        '#flight #quantity, #quantity of #flight')

    # Prepositions
    gen['#PROP#to'] = _phrases('to, to the beginning of, to the end of, until')
    gen['#PROP#from'] = _phrases(
        'from, from the beginning of, from the end of, in')

    # Kinds of statistics
    gen['#STATS#graph'] = _phrases("""
        chart, charts, graph, graphs, histogram, histograms,
        pie chart, pie charts, slope, slopes,
    """)
    gen['#STATS#wordcloud'] = _phrases('cloud of words, wordcloud')
    gen['#STATS#plan_of_plane'] = _phrases(
        'plan of plane, plans of plane, plans of planes')

    # Verbs
    gen['#VERB#show'] = _phrases('display, highlight, plot, print, show, view')
    gen['#VERB#be'] = _phrases('happen to be, is, seems, seems to be')
    gen['#VERB#like'] = _phrases('like, likes, love, loves')
    gen['#VERB#dislike'] = _phrases(
        'cannot stand, dislike, dislikes, hate, hates')

    # Pronouns and articles
    gen['#PRONON#meus'] = _phrases('me, us')
    gen['#ARTICLE#'] = _phrases('a, an, the')

    # Qualifiers
    gen['#COMP#good'] = _phrases("""
        able, acceptable, ace, admirable, advantageous, agreeable,
        amazing, appropriate, awesome, benefic, capable, capital,
        clever, comfortable, commendable, common, congenial,
        convenient, decent, deluxe, efficient, excellent,
        exceptional, expert, fascinating, favorable, first-class,
        first-rate, flawless, fresh, friendly, good, gratifying,
        great, healthy, helpful, honest, honorable, hygienic,
        incredible, intact, kindhearted, marvelous, neat, nice,
        normal, opportune, perfect, pleasant, pleasing, positive,
        precious, prime, prodigious, profitable, qualified, rad,
        reliable, reputable, respectable, right, safe, salutary,
        satisfactory, satisfying, serviceable, shipshape, shocking,
        skillful, solid, splendid, stable, sterling, stunning,
        stupendous, suitable, suited, super, superb, superior,
        surprising, talented, tasty, tip-top, tolerable,
        trustworthy, unbelievable, useful, valuable, welcome,
        wonderful, worthy,
    """)
    gen['#COMP#bad'] = _phrases("""
        abominable, amiss, atrocious, awful, bad, bummer, careless,
        catastrophic, chaotic, cheap, cheesy, crap, crappy, crummy,
        damaging, dangerous, defective, deficient, deleterious,
        detrimental, disagreeable, disastrous, discouraging, displeasing,
        distressing, dreadful, dumb, erroneous, evil, fallacious,
        garbage, godawful, grim, grody, gross, grungy, harsh, hurtful,
        icky, imperfect, impolite, inadequate, incorrect, iniquitous,
        injurious, junky, lame, loud, lousy, mean, moldy, noisy,
        not good, old, painful, poor, rancid, regretful, rotten,
        rude, ruinous, sad, shitty, slipshod, spoiled,
        stinking, strident, substandard, terrible, tragic, troubled,
        troubling, unacceptable, unfavorable, unfortunate, unhappy,
        unhealthy, unlucky, unpleasant, unsatisfactory, unwell, upsetting,
        vicious, wicked, wrong,
    """)
    return gen

In [None]:
def get_tagger() -> dict:
    """Return, for each '#' placeholder, its possible expansions and their tags.

    Every value is a list of (pattern, tags) couples: `pattern` is a
    space-separated text and `tags` holds one space-separated tag per word of
    the pattern. CT_HASH is used as the tag of a word that is itself a
    placeholder.
    NOTE(review): the '$GEN$' and '$VOC$' prefixes presumably point to the
    get_gen and get_voc dictionaries - confirm against the code expanding them.

    Out:
        tagger: dictionary with all the taggers

    """

    def _tags(*parts):
        # One tag per word, separated by a blank.
        return ' '.join(parts)

    def _date_rules(prefix, date_tag):
        # Rules of one date family; both families share the same shape.
        begin = date_tag + CT_SEP + CT_SUF_B
        end = date_tag + CT_SEP + CT_SUF_E
        return {
            prefix: [
                (prefix + 'y', CT_HASH),
                ('year ' + prefix + 'y', _tags(CT_TAG_O, CT_HASH)),
                (prefix + 'my', CT_HASH),
                (prefix + 'sy', CT_HASH),
            ],
            prefix + 'y': [
                ('$VOC$#year#voc', begin),
            ],
            prefix + 'my': [
                ('$VOC$#mont#voc $VOC$#year#voc', _tags(begin, end)),
            ],
            prefix + 'sy': [
                ('$VOC$#seas#voc $VOC$#year#voc', _tags(begin, end)),
            ],
        }

    tagger = {}

    # Placeholders replaced by a single generated wording
    generated = (
        ('#STATS#graph', CT_TAG_STAT),
        ('#STATS#wordcloud', CT_TAG_STAT),
        ('#STATS#plan_of_plane', CT_TAG_STAT),
        ('#VERB#show', CT_TAG_O),
        ('#PRONON#meus', CT_TAG_O),
        ('#ARTICLE#', CT_TAG_O),
    )
    for placeholder, tag in generated:
        tagger[placeholder] = [('$GEN$' + placeholder, tag)]

    # Named Entity
    tagger['#NE#manu'] = [('$VOC$#manu#voc', CT_TAG_MANU)]
    tagger['#NE#airp'] = [
        ('$VOC$#airp#voc', CT_TAG_AIRP),
        ('$VOC$#airp#voc airport', _tags(CT_TAG_AIRP, CT_TAG_O)),
        ('airport of $VOC$#airp#voc', _tags(CT_TAG_O, CT_TAG_O, CT_TAG_AIRP)),
    ]
    tagger['#NE#airl'] = [('$VOC$#airl#voc', CT_TAG_AIRL)]
    tagger['#NE#coun'] = [('$VOC$#coun#voc', CT_TAG_COUN)]

    # Date1 then Date2
    tagger.update(_date_rules('#DATE1#', CT_TAG_DATE1))
    tagger.update(_date_rules('#DATE2#', CT_TAG_DATE2))

    # Studied variable
    studied_tags = _tags(CT_HASH, CT_TAG_STUD)
    tagger['#STUDIED#'] = [
        ('#ARTICLE# $GEN$#quantity_of_flights', studied_tags),
        ('#ARTICLE# $GEN$#client_satisfaction', studied_tags),
    ]

    # Propositions SHOW
    tagger['#PROP#show_meus_the'] = [
        ('#VERB#show #PRONON#meus #ARTICLE#', _tags(CT_HASH, CT_HASH, CT_HASH)),
    ]

    # Propositions DATES
    tagger['#PROP#DATE#'] = [
        ('#PROP#DATE#from_to', CT_HASH),
        ('#PROP#DATE#for', CT_HASH),
        ('#PROP#DATE#since', CT_HASH),
    ]
    tagger['#PROP#DATE#from_to'] = [
        ('$GEN$#PROP#from #DATE1# $GEN$#PROP#to #DATE2#',
         _tags(CT_TAG_O, CT_HASH, CT_TAG_O, CT_HASH)),
    ]
    tagger['#PROP#DATE#for'] = [
        ('for #DATE1#', _tags(CT_TAG_O, CT_HASH)),
        ('for the #DATE1#', _tags(CT_TAG_O, CT_TAG_O, CT_HASH)),
    ]
    tagger['#PROP#DATE#since'] = [
        ('since #DATE1#', _tags(CT_TAG_O, CT_HASH)),
        ('since the #DATE1#', _tags(CT_TAG_O, CT_TAG_O, CT_HASH)),
    ]
    return tagger

In [None]:
def get_structures() -> list:
    """Return the sentence structures that can be generated.

    A structure is a list of four slots (do, what, who, when); the six
    structures only differ by the order of their slots. Slots are nested
    tuples and lists of words and '#' placeholders.
    NOTE(review): tuples look like alternatives (the '' entry making a slot
    optional) and lists like sequences - confirm against the code walking them.

    Out:
        structures: list of differents type of structures

    """

    def _one_or_two(entity):
        # Either a single entity or two of them around 'and', ',' or nothing.
        return (entity, [entity, ('and', ',', ''), entity])

    slots = {
        'do': ('', '#PROP#show_meus_the'),
        'what': (
            '#STATS#plan_of_plane',
            '#STATS#wordcloud',
            ['#STATS#graph', 'of', '#STUDIED#'],
        ),
        'who': ('', ['for', _one_or_two('#NE#manu')]),
        'when': ('', ['in', _one_or_two('#NE#coun'), "#PROP#DATE#"]),
    }

    # One line per structure, slots named in their order of appearance.
    slot_orders = (
        'do what who when',
        'do what when who',
        'who do what when',
        'when do what who',
        'who when do what',
        'when who do what',
    )
    return [[slots[name] for name in order.split()] for order in slot_orders]

In [None]:
def get_DB() -> dict:
    """Create the database vocabulary of every filter and feature.

    The filter vocabularies come from get_prevoc, with the '_' of each entry
    turned back into a blank. The feature vocabularies are enumerated here
    (seat numbers, ratings, categories, ...).

    Out:
        DB: dictionary mapping each filter / feature constant to a dictionary
            holding the set of its accepted values under the "voc" key
    """
    def _spaced(words) -> set:
        # Prevoc entries are '_'-joined: give them their blanks back.
        return {word.replace('_', ' ') for word in words}

    def _numbers(start: int, stop: int) -> set:
        # Integers of range(start, stop), written as strings.
        return {str(i) for i in range(start, stop)}

    # Row on two digits followed by a seat letter: '01a' ... '98z'.
    # NOTE(review): range(1, 99) stops at row 98 - confirm row 99 is not expected.
    seat_numbers = {str(row).zfill(2) + letter
                    for row in range(1, 99)
                    for letter in 'abcdefghijklmnopqrstuvwxyz'}

    prevoc = get_prevoc()

    DB = {
        CT_filt_manu: {
            "voc": _spaced(prevoc['manu'])},
        CT_filt_airc: {
            "voc": _spaced(prevoc['airc'])},
        CT_filt_airl: {
            "voc": _spaced(prevoc['airl'])},
        CT_filt_airp: {
            "voc": _spaced(prevoc['airp'])},
        CT_filt_cate: {
            "voc": {'business', 'economy', 'premium'}},
        CT_filt_coun: {
            "voc": _spaced(prevoc['coun'])},
        CT_filt_city: {
            "voc": _spaced(prevoc['citi'])},

        # The two seat features share the same set object.
        CT_feat_seat: {
            "voc": seat_numbers},
        CT_feat_seat_typ: {
            "voc": {'business', 'economy', 'premium'}},
        CT_feat_seat_pos: {
            "voc": seat_numbers},
        # Ratings go from '0' to '4', except the overall one ('0' to '9').
        CT_feat_rate_seat_conf: {
            "voc": _numbers(0, 5)},
        CT_feat_cater: {
            "voc": {'business', 'economy', 'premium'}},
        CT_feat_travel: {
            "voc": {'business', 'solo leisure', 'family leisure', 'couple leisure'}},
        CT_feat_power: {
            "voc": {'yes', 'no'}},
        CT_feat_pitch: {
            "voc": _numbers(10, 30)},
        CT_feat_rate_clean: {
            "voc": _numbers(0, 5)},
        CT_feat_rate_staff: {
            "voc": _numbers(0, 5)},
        CT_feat_rate_entertai: {
            "voc": _numbers(0, 5)},
        CT_feat_rate_value: {
            "voc": _numbers(0, 5)},
        CT_feat_rate_overall: {
            "voc": _numbers(0, 10)},
    }
    return DB