In [1]:
import pickle
import pandas as pd
from collections import OrderedDict
import spacy
# Load the English spaCy pipeline used by lemmanize().
# The short name 'en' is tried first so existing environments keep loading
# exactly what they loaded before. Newer spaCy releases no longer resolve that
# shortcut and raise OSError, in which case the full package name is used.
# NOTE(review): assumes 'en' pointed at en_core_web_sm — confirm for this project.
try:
    nlp = spacy.load('en')
except OSError:
    nlp = spacy.load('en_core_web_sm')

In [2]:
# Identifier of the data round being processed, and the folder (relative to the
# working directory) that holds every input/output artifact of this notebook.
ETSY_ROUND = '_2019'
OUTPUT_FOLDER = ''.join((ETSY_ROUND, '/output/'))

In [3]:
# Read the translation spreadsheet; a later cell reads its `key` and `en` columns.
df = pd.read_excel(OUTPUT_FOLDER + 'uniques_translate_final.xlsx')

In [4]:
# Part-of-speech labels (compared against token.pos_) that is_noise() discards.
# NOTE(review): spaCy's label for proper nouns is "PROPN"; "PROP" presumably
# never matches, so proper nouns are kept — confirm whether that is intended.
noisy_pos_tags = ["PROP","DET","PART","CCONJ","ADP","PRON","VERB","ADJ"]
min_token_length = 2 # tokens whose length is <= this value are treated as noise

In [5]:
def is_noise(token):
    '''
    Tell whether a spaCy token should be dropped before lemmatization.

    A token is noise when at least one of these holds:
      * its part-of-speech label (token.pos_) is listed in noisy_pos_tags
      * token.is_stop, token.is_digit or token.is_space is set
      * its text is at most min_token_length characters long

    Parameters
    ----------
    token : object exposing pos_, is_stop, is_digit, is_space and text
        (in this notebook, a token yielded by the spaCy pipeline).

    Returns
    -------
    bool
        True when the token must be discarded, False when it is kept.
    '''
    # The length is measured on token.text, not token.string: `string` carries
    # the whitespace that follows the token, so the same short word was kept
    # or dropped depending on whether a space came after it. `string` is also
    # unavailable on recent spaCy releases.
    return bool(
        token.pos_ in noisy_pos_tags
        or token.is_stop
        or token.is_digit
        or token.is_space
        or len(token.text) <= min_token_length
    )

In [6]:
def lemmanize(string):
    '''
    Lemmatize a text and keep only its non-noise tokens.

    The text is run through the module-level spaCy pipeline `nlp`; every token
    rejected by is_noise() is dropped and the lemmas of the remaining tokens
    are joined with single spaces.

    Parameters
    ----------
    string : str
        Text to process. Any other value (the notebook run shows NaN reaching
        this function) is echoed with print() for inspection.

    Returns
    -------
    str
        Space-separated lemmas, or '' when nothing is kept or when the input
        is not a string. The function never returns None, so callers always
        receive the same type.
    '''
    if not isinstance(string, str):
        # Keep the diagnostic echo, but return an explicit empty result
        # instead of falling off the end of the function.
        print(string)
        return ''
    kept_lemmas = [token.lemma_ for token in nlp(string) if not is_noise(token)]
    # ' '.join([]) is '', which covers the "nothing kept" case.
    return ' '.join(kept_lemmas)

In [7]:
# Mapping key -> English translation, filled in by the next cell.
dic_translated = {}

In [8]:
# Register the English translation of every key found in the spreadsheet.
# The two columns are read directly and paired up, which avoids building one
# Series per row with iterrows() and avoids attribute-style column access on
# the row. When a key appears several times, the last row wins.
dic_translated.update(zip(df['key'], df['en']))

In [9]:
# Echo the key -> English mapping for a visual check (the whole dict is shown).
dic_translated

{'laine': 'wool',
 'soie': 'silk',
 'pure_laine': 'pure wool',
 'alpaga': 'alpaca',
 'tricot': 'knitting',
 'fait_main': 'handmade',
 'merinos': 'merino',
 'perles': 'beads',
 'agate_blanche': 'white agate',
 'agate_marbree': 'marbled agate',
 'perle_de_bois_noir': 'black wooden bead',
 'metal_argente': 'silver plated',
 'verre_craquele': 'glass cracked',
 'verre_blanc': 'white glass',
 'cristal': 'crystal',
 'cristal_noir': 'black crystal',
 'murano_ovale_noir': 'Murano black oval',
 'murano_rond_blanc': 'Murano round white',
 'cubes_de_verre_noir': 'cubes of black glass',
 'papillon_cristal_noir': 'butterfly black crystal',
 'perles_de_bois': 'wooden beads',
 'perle_metallique': 'metallic pearl',
 'strass': 'strass',
 'pierre_fine': 'gemstone',
 'pierre_fine_noire_et_blanche': 'black and white fine stone',
 'agate_noire_rayee': 'striped black agate',
 'pierre_naturelle': 'natural stone',
 'crochet_ou_clip': 'hook or clip',
 'quartz_pasteque': 'quartz watermelon',
 'pierres_fines': 'p

In [10]:
# Load the dictionary of unique texts, indexed by key.
# The context manager closes the file handle once the load is done; the bare
# open() call used previously left it to the garbage collector.
# NOTE: pickle.load can execute arbitrary code — only load files you produced.
with open(OUTPUT_FOLDER + "dic_uniques.pickle", "rb") as pickle_file:
    dic_uniques = pickle.load(pickle_file)

In [11]:
# Overwrite each unique text with its English translation when one is known;
# keys without a translation keep their current value.
# Only existing keys are reassigned, so the dict size is stable while iterating.
for key in dic_uniques:
    if key not in dic_translated:
        continue
    dic_uniques[key] = dic_translated[key]

In [12]:
# Spot-check one key after the translations were applied.
dic_uniques['corail_rouge']

'red coral'

In [13]:
# Number of unique keys.
len(dic_uniques)

61097

In [14]:
# Will hold, for each key, the pair [text, lemmatized text] (filled by the next cell).
dic_uniques_lem = OrderedDict()

In [15]:
# Store, for every key, its text together with the lemmatized form of that text.
# lemmanize() prints any value that is not a string, hence the echoed output.
for key, texte in dic_uniques.items():
    dic_uniques_lem[key] = [texte, lemmanize(texte)]

nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan


In [16]:
# Spot-check one entry: [text, lemmatized text].
dic_uniques_lem['corail_rouge']

['red coral', 'coral']

In [17]:
# Persist the [text, lemmatized text] pairs.
# The context manager guarantees the file is flushed and closed when the dump
# finishes; the bare open() call used previously never closed it explicitly.
with open(OUTPUT_FOLDER + "dic_uniques_lem.pickle", "wb") as pickle_file:
    pickle.dump(dic_uniques_lem, pickle_file)

In [18]:
# Load the per-listing rows (each value starts with the listing id and the key).
# The context manager closes the file handle once the load is done.
# NOTE: pickle.load can execute arbitrary code — only load files you produced.
with open(OUTPUT_FOLDER + "data_dict.pickle", "rb") as pickle_file:
    data_dict = pickle.load(pickle_file)

In [19]:
# Number of rows loaded.
len(data_dict)

734124

In [20]:
# Echo the loaded rows for a visual check (the whole mapping is shown).
data_dict

OrderedDict([('743909687_1', [743909687, 'laine', 'laine']),
             ('743909687_2', [743909687, 'mohair', 'mohair']),
             ('743909687_3', [743909687, 'soie', 'soie']),
             ('743909687_4', [743909687, 'pure_laine', 'pure laine']),
             ('742464511_1', [742464511, 'laine', 'laine']),
             ('742464511_2', [742464511, 'mohair', 'mohair']),
             ('742464511_3', [742464511, 'soie', 'soie']),
             ('742464511_4', [742464511, 'pure_laine', 'pure laine']),
             ('742463375_1', [742463375, 'laine', 'laine']),
             ('742463375_2', [742463375, 'soie', 'soie']),
             ('742463375_3', [742463375, 'mohair', 'mohair']),
             ('732804293_1', [732804293, 'laine', 'laine']),
             ('732804293_2', [732804293, 'mohair', 'mohair']),
             ('732804293_3', [732804293, 'soie', 'soie']),
             ('669600823_1', [669600823, 'laine', 'laine']),
             ('669600823_2', [669600823, 'mohair', 'mohair']),
  

In [21]:
# Will hold the enriched rows, indexed by the same keys as data_dict (filled by the next cell).
dict_final = OrderedDict()

In [22]:
# Enrich every row with the translated and lemmatized text of its key.
# A key that is missing from dic_uniques_lem raises KeyError.
for key, row in data_dict.items():
    id_listing, key_texte = row[0], row[1]
    lem_entry = dic_uniques_lem[key_texte]  # [translated text, lemmatized text]
    dict_final[key] = [id_listing, key_texte, lem_entry[0], lem_entry[1]]

In [23]:
# Row list that is turned into the final DataFrame.
data_final = list()

In [24]:
# Append the enriched rows in their stored order.
# idx ends up equal to the number of rows appended, as the manual counter did.
data_final.extend(dict_final.values())
idx = len(dict_final)

In [25]:
# Build the final table from the collected rows.
# NOTE(review): this rebinds `df`, which earlier held the translation spreadsheet.
df = pd.DataFrame(data_final, columns=['id_listing', 'key_texte', 'translated_texte', 'lemmanized_texte'])

In [26]:
# Preview the first rows of the final table.
df.head()

Unnamed: 0,id_listing,key_texte,translated_texte,lemmanized_texte
0,743909687,laine,wool,wool
1,743909687,mohair,mohair,mohair
2,743909687,soie,silk,silk
3,743909687,pure_laine,pure wool,wool
4,742464511,laine,wool,wool


In [27]:
# Write the final table to the output folder.
# NOTE(review): `index` is not passed, so pandas' default index handling applies —
# confirm whether the row index is wanted in the file.
df.to_csv(OUTPUT_FOLDER + 'df_final.csv')