In [44]:
import pandas as pd
import numpy as np
import re
import random
from sklearn import feature_extraction
import nltk
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer


In [21]:
cols_to_keep = ['product_id', 'client_product_name']

# Load only the id and name columns, treat empty strings as missing values,
# drop rows missing either column, then keep the first row seen per product_id.
df = (
    pd.read_parquet('part-00000-tid-8327383503513150656-7bd9eb0d-66d7-4422-840c-d7abd7313e0a-283-c000.snappy.parquet',
                    engine='pyarrow')[cols_to_keep]
    .replace('', np.nan)
    .dropna(subset=cols_to_keep)
    .drop_duplicates('product_id')
)

In [24]:
# Preview the first rows of the cleaned frame (product_id and client_product_name).
df.head()

Unnamed: 0,product_id,client_product_name
0,111715368,4-in-1 urban metal in bronze
1,j2112185whte,nora skinny
2,je258185whte,plus peri straight
3,je453361bkri,plus nora narrow
4,t2110357,clara tunic


In [4]:
# NLTK's English stop-word list.
# NOTE(review): no other cell visible in this section references `stopwords`
# (the vectorizer passes stop_words='english' instead) — confirm it is still needed.
stopwords = nltk.corpus.stopwords.words('english')
# English Snowball stemmer; `tokenize_and_stem` calls `stemmer.stem` on every kept token.
stemmer = SnowballStemmer("english")

In [72]:
num_obs = 1000  # how many leading rows of `df` to work with
seed = 1        # seed for the shuffle below, so the order is reproducible

# Slice first, then materialise: the first `num_obs` ids and names, in frame order.
prod_id = list(df.product_id.iloc[:num_obs])
prod_name = list(df.client_product_name.iloc[:num_obs])

# Pair each id with its name and shuffle the pairs in place using a dedicated
# seeded generator. `prod_id` and `prod_name` themselves keep the frame order.
prod_id_name = list(zip(prod_id, prod_name))
random.Random(seed).shuffle(prod_id_name)
prod_id_name[:10]

[('00146732_916_26', '3/4 wide sleeve blouse'),
 ('00145202_100_10', 'pedal pusher stud hem bengaline'),
 ('tv529', 'pantalon boule en denim taille élastiquée'),
 ('prod3110010', 'rc new bright 1:10 ford raptor'),
 ('fn580', 'tee-shirt motifs fauve'),
 ('prod3420345', 'set of 4 colorful graters'),
 ('110421_lilac_36', 'memory foam velcro slipper'),
 ('00135739_103_one', 'm175ce-crm'),
 ('ip127325', 'morphy richards 300254 breeze steam iron (2600w)'),
 ('5030109-0026', 'glitter mesh bolero')]

In [50]:
# Tokenizer helpers: `tokenize_and_stem` returns the list of stems found in the text it is passed; `tokenize_only` returns the lower-cased tokens without stemming.

def tokenize_and_stem(text):
    """Tokenize ``text`` and return the stems of its word-like tokens.

    The text is split into sentences and then into words with NLTK, so that
    punctuation comes out as its own token. Tokens that contain no alphabetic
    character (numeric tokens, raw punctuation) are discarded, and every
    remaining token is stemmed with the module-level ``stemmer``.

    Parameters
    ----------
    text : str
        The raw text to process.

    Returns
    -------
    list
        The stems in order of appearance; duplicates are kept.
    """
    stems = []
    for sent in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sent):
            # str.isalpha is Unicode-aware, so tokens made only of accented or
            # non-Latin letters (e.g. French "à") are kept; an ASCII-only
            # [a-zA-Z] check would silently drop them.
            if any(ch.isalpha() for ch in word):
                stems.append(stemmer.stem(word))
    return stems


def tokenize_only(text):
    """Tokenize ``text`` into lower-cased word-like tokens, without stemming.

    The text is split into sentences and then into words with NLTK, so that
    punctuation comes out as its own token. Each token is lower-cased, and
    tokens that contain no alphabetic character (numeric tokens, raw
    punctuation) are discarded.

    Parameters
    ----------
    text : str
        The raw text to process.

    Returns
    -------
    list
        The lower-cased tokens in order of appearance; duplicates are kept.
    """
    kept = []
    for sent in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sent):
            lowered = word.lower()
            # str.isalpha is Unicode-aware, so tokens made only of accented or
            # non-Latin letters (e.g. French "à") are kept; an ASCII-only
            # [a-zA-Z] check would silently drop them.
            if any(ch.isalpha() for ch in lowered):
                kept.append(lowered)
    return kept

In [31]:
# TF-IDF over stemmed unigrams, bigrams and trigrams of the product names.
tfidf_vectorizer = TfidfVectorizer(
    tokenizer=tokenize_and_stem,  # custom NLTK tokenizer + Snowball stemmer
    stop_words='english',         # scikit-learn's built-in English stop-word list
    ngram_range=(1, 3),
    min_df=0.05,                  # float => proportion of documents (lower cut-off)
    max_df=0.8,                   # float => proportion of documents (upper cut-off)
    max_features=200000,
    use_idf=True,
)

In [40]:
# Fit the vectorizer on the product names and build the TF-IDF matrix, timing the call.
# NOTE(review): `prod_name` is already truncated to `num_obs` entries where it is built,
# so the [:100000] slice only has an effect when num_obs exceeds 100000 — confirm the intended cap.
%time tfidf_matrix = tfidf_vectorizer.fit_transform(prod_name[:100000])


CPU times: user 28.7 s, sys: 190 ms, total: 28.9 s
Wall time: 28.9 s
