In [1]:
import pickle
import pandas as pd
from collections import OrderedDict
import spacy
# Load the spaCy pipeline registered under the 'en' shortcut; lemmanize() runs every text through it.
# NOTE(review): newer spaCy releases reportedly reject shortcut names such as 'en' and require
# a full model package name — confirm against the installed spaCy version.
nlp = spacy.load('en')

In [2]:
# Translation table; the cells below read its `key` and `en` columns.
df = pd.read_excel('_data/uniques_translate_final.xlsx')

In [3]:
# Coarse part-of-speech tags (compared against token.pos_) whose tokens is_noise() discards.
# NOTE(review): "PROP" does not look like a spaCy / Universal POS tag (proper nouns are "PROPN"),
# so this entry may never match — confirm whether proper nouns were meant to be dropped.
noisy_pos_tags = ["PROP","DET","PART","CCONJ","ADP","PRON","VERB","ADJ"]
min_token_length = 2 # tokens this long or shorter are treated as noise (is_noise compares with <=)

In [4]:
def is_noise(token, noisy_tags=None, min_length=None):
    '''
    Decide whether a spaCy token should be dropped before lemmatization.

    A token counts as noise when at least one of these holds:
      * its coarse POS tag (``token.pos_``) is in ``noisy_tags``;
      * spaCy flags it as a stop word (``token.is_stop``);
      * spaCy flags it as digits (``token.is_digit``);
      * spaCy flags it as whitespace (``token.is_space``);
      * its text is ``min_length`` characters long or shorter.

    Parameters
    ----------
    token : spaCy token (any object exposing ``pos_``, ``is_stop``,
        ``is_digit``, ``is_space`` and ``text``).
    noisy_tags : collection of str, optional
        POS tags to reject. Defaults to the notebook-level ``noisy_pos_tags``.
    min_length : int, optional
        Longest text length still rejected. Defaults to the notebook-level
        ``min_token_length``.

    Returns
    -------
    bool
        True when the token is noise, False otherwise.
    '''
    # Resolve the notebook-level defaults lazily so the configuration cell
    # can be edited and re-run without redefining this function.
    if noisy_tags is None:
        noisy_tags = noisy_pos_tags
    if min_length is None:
        min_length = min_token_length

    # Measure token.text rather than token.string: the latter carries the
    # trailing whitespace, which made the length test depend on whether the
    # token happened to be followed by a space.
    return bool(
        token.pos_ in noisy_tags
        or token.is_stop
        or token.is_digit
        or token.is_space
        or len(token.text) <= min_length
    )

In [5]:
def lemmanize(string):
    '''
    Run `string` through the spaCy pipeline and return the lemmas of the
    tokens that is_noise() does not reject, joined by single spaces.

    Returns '' when every token is rejected.
    '''
    kept_lemmas = [tok.lemma_ for tok in nlp(string) if not is_noise(tok)]
    # Joining an empty list already yields '', so no separate empty case is needed.
    return ' '.join(kept_lemmas)

In [6]:
# Source key -> English text; filled from the translation spreadsheet.
dic_translated = {}

In [7]:
# Walk the `key` and `en` columns in parallel; a repeated key keeps the value of its last row.
for source_key, english_text in zip(df['key'], df['en']):
    dic_translated[source_key] = english_text

In [8]:
# Display the key -> English text mapping built from the spreadsheet.
# NOTE(review): this renders the whole dict; a small slice would keep the notebook lighter.
dic_translated

{'corail_rouge': 'red coral',
 'perles_naturelles': 'natural pearls',
 'coquille': 'shell',
 'laiton_bronze_antique': 'antique bronze brass',
 'agate_blanche': 'white agate',
 'papier_mache': 'paper mache',
 'cover_painting_from_georges_braque': 'cover painting from george braque',
 'midcentury_vintage_paquebot_cruise': 'midcentury vintage cruise liner',
 'vintage_paquebot_brazza_menu': 'vintage liner brazza menu',
 'donation_for_charity_red_cross_paris': 'donation for charity red cross paris',
 'metal': 'metal',
 'texture': 'texture',
 'lace': 'laced',
 'papier': 'paper',
 'zen_and_warm_greetings': 'zen and warm greetings',
 'miroir': 'mirror',
 'coton': 'cotton',
 'peinture': 'painting',
 'orange': 'orange',
 'vegetal': 'vegetal',
 'medieval': 'medieval',
 'baleine_en_acier': 'steel whale',
 'synthetique': 'synthetic',
 'cone': 'cone',
 'tissus': 'tissues',
 'laine': 'wool',
 'photocopie': 'photocopy',
 'atelier_sylphe': 'sylphe workshop',
 'coton_recycle': 'recycled cotton',
 'perle

In [10]:
# Load the key -> text mapping.
# NOTE: unpickling can execute arbitrary code — only load files produced by this project.
# The context manager closes the file handle, which the bare open() call left open.
with open("_data/dic_uniques.pickle", "rb") as pickle_file:
    dic_uniques = pickle.load(pickle_file)

In [11]:
# Overlay the translations: every key present in both mappings takes the translated
# text; entries without a translation keep the value they were loaded with.
for key in dic_uniques.keys() & dic_translated.keys():
    dic_uniques[key] = dic_translated[key]

In [12]:
# Spot-check a key that has an entry in the translation table.
dic_uniques['corail_rouge']

'red coral'

In [13]:
# Entry count; the translation overlay only rewrites existing keys, so it does not change this.
len(dic_uniques)

81980

In [14]:
# Will hold key -> [text, lemmatized text], in the iteration order of dic_uniques.
dic_uniques_lem = OrderedDict()

In [15]:
# Pair every text with its lemmatized form: key -> [text, lemmatized text].
for key, texte in dic_uniques.items():
    dic_uniques_lem[key] = [texte, lemmanize(texte)]

In [16]:
# Spot-check one entry: the value is [text, lemmatized text].
dic_uniques_lem['corail_rouge']

['red coral', 'coral']

In [22]:
# Persist the lemmatized mapping. The context manager flushes and closes the file,
# which the bare open() call left to garbage collection.
with open("_data/dic_uniques_lem.pickle", "wb") as pickle_file:
    pickle.dump(dic_uniques_lem, pickle_file)

In [17]:
# Load the per-listing rows (row key -> list whose first two items are the listing id and the text key).
# NOTE: unpickling can execute arbitrary code — only load files produced by this project.
# The context manager closes the file handle, which the bare open() call left open.
with open("_data/data_dict.pickle", "rb") as pickle_file:
    data_dict = pickle.load(pickle_file)

In [18]:
# Number of rows loaded from data_dict.pickle.
len(data_dict)

1038485

In [23]:
# Display the loaded mapping.
# NOTE(review): this renders the whole object; given its size a small slice would be lighter.
data_dict

OrderedDict([('444500_1', [444500, 'clasp', 'clasp']),
             ('444500_2', [444500, 'turpentine', 'turpentine']),
             ('444500_3', [444500, 'toggle', 'toggle']),
             ('444500_4', [444500, 'sterling', 'sterling']),
             ('444500_5', [444500, 'silver', 'silver']),
             ('444500_6', [444500, 'pearls', 'pearls']),
             ('444500_7', [444500, 'swarovski', 'swarovski']),
             ('444500_8', [444500, 'beads', 'beads']),
             ('444500_9', [444500, 'glass', 'glass']),
             ('444500_10', [444500, 'yellow_turquoise', 'yellow turquoise']),
             ('444500_11', [444500, 'swarovski_pearls', 'swarovski pearls']),
             ('6361780_1', [6361780, 'watercolour', 'watercolour']),
             ('6361780_2', [6361780, 'ink', 'ink']),
             ('6361780_3', [6361780, 'paper', 'paper']),
             ('6361780_4', [6361780, 'paint', 'paint']),
             ('6361916_1', [6361916, 'watercolour', 'watercolour']),
             (

In [24]:
# Will hold row key -> [id_listing, key_texte, translated_texte, lemmanized_texte].
dict_final = OrderedDict()

In [25]:
# Join each data_dict row with the translated and lemmatized text of its text key.
# Only the first two items of a row are used (listing id, text key).
for key, row in data_dict.items():
    id_listing, key_texte = row[0], row[1]
    lem_entry = dic_uniques_lem[key_texte]  # [translated text, lemmatized text]
    dict_final[key] = [id_listing, key_texte, lem_entry[0], lem_entry[1]]

In [27]:
# Accumulates the rows of dict_final, in order, for the DataFrame built from it.
data_final = []

In [28]:
# Append the rows of dict_final, in insertion order, to data_final.
data_final.extend(dict_final.values())
# idx ends up equal to the number of rows appended, as a manual counter would.
idx = len(dict_final)

In [29]:
# Tabulate the collected rows; the column order matches the layout of each row.
# Note that this rebinds `df`, which previously held the translation spreadsheet.
df = pd.DataFrame(
    data=data_final,
    columns=['id_listing', 'key_texte', 'translated_texte', 'lemmanized_texte'],
)

In [37]:
# Preview the first rows of the frame.
df.head()

Unnamed: 0,id_listing,key_texte,translated_texte,lemmanized_texte
0,444500,clasp,clasp,
1,444500,turpentine,turpentine,
2,444500,toggle,toggle,toggle
3,444500,sterling,sterling,sterling
4,444500,silver,silver,silver


In [32]:
# Persist the final table as CSV.
# NOTE(review): with the default arguments pandas also writes the row index as an unnamed
# first column; pass index=False if that column is unwanted — confirm with downstream readers.
df.to_csv('_data/df_final.csv')