In [1]:
import pandas as pd
import numpy as np
import re
import random
from sklearn import feature_extraction
import nltk
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer


In [2]:
cols_to_keep = ['product_id', 'client_product_name']

# Read only the id/name columns, treat empty strings as missing values, then keep
# rows where both fields are present and reduce to one row per product_id.
df = (
    pd.read_parquet(
        'part-00000-tid-8327383503513150656-7bd9eb0d-66d7-4422-840c-d7abd7313e0a-283-c000.snappy.parquet',
        engine='pyarrow',
    )[cols_to_keep]
    .replace('', np.nan)
    .dropna(subset=cols_to_keep)
    .drop_duplicates('product_id')
)

In [None]:
# Preview the first rows of the cleaned product table.
df.head()

In [None]:
# Keep only the iPhone listings. The match is case-insensitive so that names spelled
# "iPhone" or "IPHONE" are not silently excluded, and literal (regex=False) because the
# search term contains no pattern syntax.
df_aapl = df.loc[df.client_product_name.str.contains('iphone', case=False, regex=False)]
df_aapl.shape

In [None]:
num_obs = 1000  # number of products to keep in the working sample
seed = 1        # fixed seed so the same sample is drawn on every run

prod_id = list(df_aapl.product_id)
prod_name = list(df_aapl.client_product_name)

# Shuffle the (id, name) pairs together so each id stays attached to its name, then keep
# the first num_obs pairs. Slicing returns every pair when fewer than num_obs are available.
prod_id_name = list(zip(prod_id, prod_name))
random.Random(seed).shuffle(prod_id_name)
prod_id_name = prod_id_name[:num_obs]

prod_id_name_df = pd.DataFrame(prod_id_name, columns=cols_to_keep)
prod_id_name_df.head()

In [None]:
# Tokenizer + stemmer: returns the list of stems found in the text it is passed.
def _get_stemmer():
    """Return the shared stemmer, creating it on first use.

    A notebook-level ``stemmer`` is reused when one already exists. Otherwise an English
    Snowball stemmer is built and stored under that name, so the tokenizer no longer fails
    with ``NameError`` when no cell has defined ``stemmer``.
    """
    shared = globals().get('stemmer')
    if shared is None:
        shared = SnowballStemmer('english')
        globals()['stemmer'] = shared
    return shared


def tokenize_and_stem(text):
    """Tokenize ``text`` and return the stems of the tokens that contain a letter.

    Parameters
    ----------
    text : str
        Raw text to tokenize.

    Returns
    -------
    list of str
        One stem per retained token, in order of appearance. Tokens without any ASCII
        letter (e.g. numeric tokens, raw punctuation) are dropped before stemming.
    """
    stemmer = _get_stemmer()
    # first tokenize by sentence, then by word to ensure that punctuation is caught as its own token
    tokens = (word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent))
    return [stemmer.stem(token) for token in tokens if re.search('[a-zA-Z]', token)]


def tokenize_only(text):
    """Lower-case and tokenize ``text``, keeping only the tokens that contain a letter."""
    # Split into sentences first and then into words, so punctuation ends up as its own token.
    lowered_words = (
        word.lower()
        for sentence in nltk.sent_tokenize(text)
        for word in nltk.word_tokenize(sentence)
    )
    # Discard tokens with no ASCII letter (e.g., numeric tokens, raw punctuation).
    return [candidate for candidate in lowered_words if re.search('[a-zA-Z]', candidate)]

In [None]:
# Vectorizer over stemmed 1- to 3-grams. use_idf=False turns off the inverse-document-frequency
# re-weighting, so the resulting weights are based on term frequency only.
tfidf_vectorizer = TfidfVectorizer(
    tokenizer=tokenize_and_stem,
    stop_words='english',
    ngram_range=(1,3),
    min_df=0.01,          # lower document-frequency cut-off
    max_df=0.5,           # upper document-frequency cut-off
    max_features=200000,
    use_idf=False,
)

In [None]:
%time tfidf_matrix = tfidf_vectorizer.fit_transform(list(prod_id_name_df.client_product_name))


In [None]:
# Number of features (vocabulary size). len(vocabulary_) is used instead of
# get_feature_names(), which was deprecated in scikit-learn 1.0 and removed in 1.2.
len(tfidf_vectorizer.vocabulary_)

In [None]:
# Preview the first five rows of the document-term matrix. Only the displayed rows are
# densified rather than the whole sparse matrix. Column labels come from
# get_feature_names_out() when it exists, because get_feature_names() was removed in
# scikit-learn 1.2; older releases fall back to the original method.
_feature_names = getattr(tfidf_vectorizer, 'get_feature_names_out', None) or tfidf_vectorizer.get_feature_names
pd.DataFrame(tfidf_matrix[:5].toarray(), columns=_feature_names())

In [None]:
# get_feature_names() was removed in scikit-learn 1.2: prefer get_feature_names_out() and
# fall back to the old method on older releases. list() keeps `terms` a plain list either way.
_get_terms = getattr(tfidf_vectorizer, 'get_feature_names_out', None) or tfidf_vectorizer.get_feature_names
terms = list(_get_terms())

In [None]:
# Display the full list of vocabulary terms produced by the vectorizer.
terms