In [53]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import gensim
from gensim.utils import simple_preprocess
from gensim import corpora, models
# import nltk
from gensim.parsing.preprocessing import STOPWORDS
# from nltk.stem import WordNetLemmatizer, SnowballStemmer
# from nltk.stem.porter import *
import re
import spacy

In [None]:
# Load the two CSV inputs into dataframes.
raw_data = pd.read_csv('/home/raw_data.csv')

# From the new-attribute file keep only the product key and the attribute value,
# exposing the value under the column name 'attribute_value'.
new_attriubute_data = (
    pd.read_csv('/home/new_attribute_data.csv')
    .loc[:, ['product_key', 'attribute_value_y']]
    .rename(columns={'attribute_value_y': 'attribute_value'})
)
print(new_attriubute_data)

In [None]:
# keep necessary data
# Columns that identify a product; these are the keys used when attribute values
# are later aggregated per product.
product_columns = ['product_key', 'product_name', 'cluster', 'h1_l1_hierarchy_name','h1_l2_hierarchy_name','h1_l3_hierarchy_name']
column_list = product_columns + ['attribute_id', 'attribute_name', 'attribute_value']
remove_attr_list = ['1007', '1009', '16', '27', '301', '303', '304', '305', '317', '32', '324', '35', '4', '42', '45', '49', '5', '518', '523', '526', '527', '528', '53', '530', '533', '549', '550', '6', '755', '764']

# Row filters: only 'UDA' attributes, skip attributes named 'NOT USE', and drop
# the excluded attribute ids.
is_uda = raw_data['attribute_type'] == 'UDA'
is_used = raw_data['attribute_name'] != 'NOT USE'
# remove_attr_list holds *string* ids. Compare on the string form so the filter
# also works when pandas parsed attribute_id as integers (comparing ints against
# strings would match nothing and silently keep every attribute).
# NOTE(review): if attribute_id was parsed as float (e.g. '1007.0') this still
# will not match -- confirm the column dtype.
is_kept_attribute = ~raw_data['attribute_id'].astype('str').isin(remove_attr_list)

# Build the filtered frame in one step (avoids assigning into a filtered slice)
# and normalise the key/value columns to strings.
raw_data = (
    raw_data.loc[is_uda & is_used & is_kept_attribute, column_list]
    .assign(
        attribute_value=lambda frame: frame['attribute_value'].astype('str'),
        product_key=lambda frame: frame['product_key'].astype('str'),
    )
)
print(raw_data)

# The new attribute rows only carry product_key and attribute_value. Without the
# product descriptor columns their group keys are NaN, and a groupby on those
# columns drops such rows by default, so the new values would never reach the
# aggregated text. Attach each product's descriptors before appending.
# If a product_key appears with several descriptor combinations, the first one is used.
product_info = raw_data[product_columns].drop_duplicates(subset='product_key')
new_attribute_rows = (
    new_attriubute_data
    .assign(
        # Same string normalisation as raw_data so keys match and values can be joined.
        product_key=lambda frame: frame['product_key'].astype('str'),
        attribute_value=lambda frame: frame['attribute_value'].astype('str'),
    )
    # Left join: rows for products unknown to raw_data are kept (with NaN descriptors).
    .merge(product_info, on='product_key', how='left')
)
raw_data = pd.concat([raw_data, new_attribute_rows])
print(raw_data)

In [None]:
# Inspect a single example product (product_key '101215'): show its rows and the row count.
is_example_product = raw_data['product_key'] == '101215'
raw_data_example = raw_data[is_example_product]
print(raw_data_example)
print(len(raw_data_example))

In [None]:
# Aggregate attribute values: one row per product, with all of that product's
# attribute values joined into a single space-separated string.
group_keys = [
    'product_key',
    'product_name',
    'cluster',
    'h1_l1_hierarchy_name',
    'h1_l2_hierarchy_name',
    'h1_l3_hierarchy_name',
]
df = (
    raw_data
    .groupby(group_keys, as_index=False)
    .agg({'attribute_value': ' '.join})
)
print(df)

In [58]:
# Load the spaCy English pipeline used for lemmatization.
spacy_model_name = 'en_core_web_sm'
nlp = spacy.load(spacy_model_name)

In [60]:
# lemmatization + stopword removal
def preprocess(text):
    """Turn a raw text string into a list of cleaned keyword tokens.

    The text is run through the spaCy pipeline ``nlp`` and every token is
    replaced by its lemma. The lemmatized text is then tokenized with gensim's
    ``simple_preprocess`` and a token is kept only if it

    * is not in gensim's ``STOPWORDS``,
    * is longer than 3 characters, and
    * is not the literal ``'pron'`` (presumably the residue of spaCy's
      ``-PRON-`` pronoun lemma placeholder -- confirm against the spaCy
      version in use).

    Parameters
    ----------
    text : str
        Text to process.

    Returns
    -------
    list of str
        The surviving tokens, in their original order.
    """
    lemmatized_text = ' '.join(token.lemma_ for token in nlp(text))
    return [
        word
        for word in simple_preprocess(lemmatized_text)
        if len(word) > 3 and word != 'pron' and word not in STOPWORDS
    ]

# remove special characters and punctuation
# Raw-string patterns: the original '\s{2,}' relied on an invalid escape sequence
# in a normal string literal, which newer Python versions warn about.
non_letter_pattern = re.compile(r"[^A-Za-z ]+")
repeated_whitespace_pattern = re.compile(r"\s{2,}")

df['product_key'] = df['product_key'].astype('int')
# Delete every run of characters that is not an ASCII letter or a space.
# NOTE(review): the run is removed without inserting a space, so text such as
# "cotton/polyester" becomes "cottonpolyester" -- confirm this is intended.
df['attribute_value'] = df['attribute_value'].apply(lambda x: non_letter_pattern.sub("", x))
# Collapse runs of two or more whitespace characters into a single space.
df['attribute_value'] = df['attribute_value'].apply(lambda x: repeated_whitespace_pattern.sub(' ', x))
# Replace each cleaned string with its list of tokens from preprocess().
df['attribute_value'] = df['attribute_value'].apply(preprocess)

In [61]:
# Train TFIDF model
# One token list per product row.
texts = list(df['attribute_value'])
# Token <-> integer id mapping built from all documents.
dictionary = corpora.Dictionary(texts)
# Bag-of-words representation of every document.
corpus = list(map(dictionary.doc2bow, texts))
# Fit TF-IDF on the bag-of-words corpus and apply it to the same corpus.
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
print(corpus_tfidf)

<gensim.interfaces.TransformedCorpus object at 0x7fc71912d588>


In [None]:
# Preview the first few dictionary entries. The preview is capped at the
# vocabulary size so a dictionary with fewer than 10 tokens does not raise KeyError.
preview_size = min(10, len(dictionary))

# token id -> token
for i in range(preview_size):
    print(i, dictionary[i])
print('\n', 'Number of total words: ', len(dictionary), '\n')

# token id -> value stored in dictionary.dfs for that id
# (per the gensim documentation: the number of documents containing the token)
for i in range(preview_size):
    print(i, dictionary.dfs[i])

In [63]:
# map keyword from tfidf result
# For every document, turn its (token id, weight) pairs into [token, weight rounded
# to 3 decimals] pairs. A comprehension keeps the loop variables local: the original
# loop bound the name `id` at notebook scope, shadowing the builtin id() for every
# later cell.
result_list = [
    [
        [dictionary[token_id], np.around(weight, decimals=3)]
        for token_id, weight in doc_weights
    ]
    for doc_weights in corpus_tfidf
]

In [None]:
# consolidate result of keyword extraction
from operator import itemgetter

# Sort each document's [keyword, score] pairs by score, highest first.
sorted_keyword_lists = [
    sorted(pairs, key=itemgetter(1), reverse=True) for pairs in result_list
]

# Index the keyword lists like df so rows line up one-to-one; a length mismatch
# between result_list and df raises here instead of silently misaligning.
result_df = pd.DataFrame({'keyword_list': pd.Series(sorted_keyword_lists, index=df.index)})

# Add the keywords as a named column. The previous
# pd.concat(..., axis=1, ignore_index=True) replaced every column label with
# 0..n-1, discarding the column names of df. Assigning by name also keeps the
# cell safe to re-run (no duplicated column).
df = df.assign(keyword_list=result_df['keyword_list'])
print(df)

In [None]:
# Train an LDA topic model (20 topics, 2 passes, 4 worker processes) on the
# TF-IDF weighted corpus.
# random_state fixes the model's random initialisation so repeated runs start
# from the same state. NOTE(review): with multiple workers the result may still
# vary slightly between runs -- confirm if exact reproducibility is required.
lda_random_state = 42
lda_model_tfidf = gensim.models.LdaMulticore(
    corpus_tfidf,
    num_topics=20,
    id2word=dictionary,
    passes=2,
    workers=4,
    random_state=lda_random_state,
)

# Print every topic (-1 = all topics) with its top weighted words.
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))