In [1]:
import pickle
import pandas as pd
from collections import OrderedDict
import spacy
# Load the English spaCy pipeline once; `lemmanize` reuses this object for every string.
# NOTE(review): the 'en' shortcut name only resolves on spaCy 2.x installs; spaCy 3 expects
# a full package name such as 'en_core_web_sm' — confirm the pinned spaCy version.
nlp = spacy.load('en')

In [2]:
# Translation spreadsheet; a later cell reads its `key` and `en` columns into `dic_translated`.
df = pd.read_excel('_data/uniques_translate_final.xlsx')

In [3]:
# Coarse part-of-speech tags (compared against `token.pos_` in `is_noise`) that mark a token as noise.
# NOTE(review): "PROP" is not one of the Universal POS tags (proper nouns are "PROPN"),
# so this entry presumably never matches — confirm whether proper nouns should be filtered.
noisy_pos_tags = [
    "PROP",
    "DET",
    "PART",
    "CCONJ",
    "ADP",
    "PRON",
    "VERB",
    "ADJ",
]

# Tokens whose length is less than or equal to this value are treated as noise.
min_token_length = 2

In [4]:
def is_noise(token):
    '''
    Tell whether a spaCy token should be dropped before lemmatization.

    A token is noise when any of the following holds:
      - its coarse POS tag (token.pos_) is listed in noisy_pos_tags,
      - it is a stop word, a digit-only token, or whitespace,
      - its text is min_token_length characters long or shorter.

    The length test uses token.text rather than token.string: token.string
    carries the token's trailing whitespace, so the same short word was kept
    or dropped depending on whether a space followed it, and the attribute is
    no longer available in spaCy 3.

    :param token: object exposing pos_, is_stop, is_digit, is_space and text
                  (a spaCy Token in this notebook).
    :return: True when the token must be filtered out, False otherwise.
    '''
    return bool(
        token.pos_ in noisy_pos_tags
        or token.is_stop
        or token.is_digit
        or token.is_space
        or len(token.text) <= min_token_length
    )

In [5]:
def lemmanize(string):
    '''
    Return the space-joined lemmas of the non-noise tokens of `string`.

    The text is parsed with the module-level `nlp` pipeline and every token
    rejected by `is_noise` is skipped. An empty string is returned when no
    token survives the filter.
    '''
    kept_lemmas = [token.lemma_ for token in nlp(string) if not is_noise(token)]
    # Joining an empty list already yields '', so no separate empty case is needed.
    return ' '.join(kept_lemmas)

In [6]:
# key -> English translation; starts empty and is filled from the translation table.
dic_translated = {}

In [7]:
# Fill the key -> English translation mapping straight from the two columns.
# Zipping the columns avoids DataFrame.iterrows(), which builds a Series for every row;
# as with a row-by-row loop, a repeated key keeps its last translation.
dic_translated.update(zip(df['key'], df['en']))

In [8]:
# Eyeball the key -> English mapping built from the translation table.
dic_translated

{'simili_cuir': 'leatherette',
 'polaire': 'polar',
 'lin': 'linen',
 'lin_enduit': 'coated linen',
 'coton_enduit': 'coated cotton',
 'suedine': 'suede',
 'lapis_lazuli': 'lapis lazuli',
 'cosses': 'thimbles',
 'sarrasin': 'buckwheat',
 'flocons_de_fibres_de_polyester': 'flakes of polyester fibers',
 'rembourrage_de_flocons_de_fibres_de_polyester': 'padding of polyester fiber flakes',
 'flocons_fibres_de_polyester': 'polyester fiber flakes',
 'giclee_print': 'giclée print',
 'heavy_cotton_moulin_du_roy_paper': 'heavy cotton moulin du roy paper',
 'moulin_du_roy_heavy_cotton_paper': 'roy heavy cotton paper mill',
 'heavy_cotton_moulin_de_roy_paper': 'heavy cotton roy paper mill',
 'moulin_du_roy_cotton_paper': 'mill of roy cotton paper',
 'moulin_du_roy_paper': 'moulin du roy paper',
 'verre_file': 'spun glass',
 'verre_de_murano': 'Murano glass',
 'verre': 'glass',
 'cuivre': 'copper',
 'pierre': 'Pierre',
 'pierre_fine': 'fine stone',
 'gemme': 'gem',
 'pierre_de_gemme': 'gemstone',


In [9]:
# Load the pickled `dic_uniques` mapping from disk.
# The context manager closes the file handle, which a bare open() call leaves dangling.
# NOTE(review): pickle.load can execute arbitrary code — only load files from a trusted source.
with open("_data/dic_uniques.pickle", "rb") as pickle_file:
    dic_uniques = pickle.load(pickle_file)

In [10]:
# Overwrite every entry that has a translation with its English text;
# keys absent from dic_translated keep the value they were loaded with.
dic_uniques.update(
    {key: dic_translated[key] for key in dic_uniques if key in dic_translated}
)

In [11]:
# Spot-check a single entry after the translation pass.
dic_uniques['corail_rouge']

'red coral'

In [12]:
# Number of entries in dic_uniques.
len(dic_uniques)

87376

In [13]:
# key -> [text, lemmatized text]; filled in dic_uniques iteration order by the following cell.
dic_uniques_lem = OrderedDict()

In [14]:
# Pair every text with its lemmatized form, keyed exactly like dic_uniques.
for key, texte in dic_uniques.items():
    dic_uniques_lem[key] = [texte, lemmanize(texte)]

In [15]:
# Spot-check one entry: [text, lemmatized text].
dic_uniques_lem['corail_rouge']

['red coral', 'coral']

In [16]:
# Persist the lemmatized mapping.
# The context manager closes (and therefore flushes) the file as soon as the dump finishes,
# instead of leaving the write handle open until it happens to be garbage-collected.
with open("_data/dic_uniques_lem.pickle", "wb") as pickle_file:
    pickle.dump(dic_uniques_lem, pickle_file)

In [17]:
# Load the pickled `data_dict` mapping from disk.
# The context manager closes the file handle, which a bare open() call leaves dangling.
# NOTE(review): pickle.load can execute arbitrary code — only load files from a trusted source.
with open("_data/data_dict.pickle", "rb") as pickle_file:
    data_dict = pickle.load(pickle_file)

In [18]:
# Number of entries in data_dict.
len(data_dict)

1268606

In [19]:
# NOTE(review): this echoes the whole mapping (1,268,606 entries per the length check);
# consider displaying only a handful of items to keep the notebook output small.
data_dict

OrderedDict([('550866727_1', [550866727, 'glass', 'glass']),
             ('639584419_1', [639584419, 'copper', 'copper']),
             ('637314587_1', [637314587, 'cotton', 'cotton']),
             ('550857329_1', [550857329, 'cotton', 'cotton']),
             ('537056276_1', [537056276, 'wood', 'wood']),
             ('537064510_1', [537064510, 'wood', 'wood']),
             ('550858757_1', [550858757, 'wood', 'wood']),
             ('550856537_1', [550856537, 'wood', 'wood']),
             ('537057576_1', [537057576, 'cotton', 'cotton']),
             ('537067898_1', [537067898, 'wood', 'wood']),
             ('537067458_1', [537067458, 'cotton', 'cotton']),
             ('634617401_1', [634617401, 'cotton', 'cotton']),
             ('603638000_1', [603638000, 'cotton', 'cotton']),
             ('537060516_1', [537060516, 'wood', 'wood']),
             ('537056102_1', [537056102, 'cotton', 'cotton']),
             ('537056096_1', [537056096, 'cotton', 'cotton']),
             ('640

In [20]:
# data_dict key -> [id_listing, key_texte, translated_texte, lemmanized_texte]; filled by the following cell.
dict_final = OrderedDict()

In [21]:
# Enrich every data_dict row with the translated and lemmatized text looked up by its text key.
for key, row in data_dict.items():
    id_listing, key_texte = row[0], row[1]
    # One lookup per row; the entry holds [translated text, lemmatized text].
    lem_entry = dic_uniques_lem[key_texte]
    translated_texte, lemmanized_texte = lem_entry[0], lem_entry[1]
    dict_final[key] = [id_listing, key_texte, translated_texte, lemmanized_texte]

In [22]:
# Row list that becomes the final DataFrame; populated from dict_final's values.
data_final = []

In [23]:
# Collect the rows of dict_final in insertion order.
# Rebuilding the list (rather than appending to it) keeps this cell idempotent:
# running it twice no longer duplicates every row. The former `idx` counter was
# never read, so it is gone.
data_final = list(dict_final.values())

In [24]:
# Build the final table, one row per data_dict entry.
# NOTE(review): this rebinds `df`, which earlier held the translation spreadsheet; re-running the
# cell that fills dic_translated after this point would read the wrong frame — consider a distinct name.
df = pd.DataFrame(data_final, columns=['id_listing', 'key_texte', 'translated_texte', 'lemmanized_texte'])

In [25]:
# Preview the first rows of the final table.
df.head()

Unnamed: 0,id_listing,key_texte,translated_texte,lemmanized_texte
0,550866727,glass,glass,glass
1,639584419,copper,copper,copper
2,637314587,cotton,cotton,cotton
3,550857329,cotton,cotton,cotton
4,537056276,wood,wood,wood


In [26]:
# Persist the final table as CSV.
# NOTE(review): with the default index=True the row index is written as an extra unnamed
# first column — confirm downstream readers expect it, otherwise pass index=False.
df.to_csv('_data/df_final.csv')