In [2]:
import os
import numpy as np
import pandas as pd
from preprocessing import *

# Read and preprocess the dataset

In [3]:
# Load the raw reviews and discard the first CSV column
# (presumably an exported row index -- TODO confirm against the file).
reviews = pd.read_csv("Womens Clothing E-Commerce Reviews.csv")
reviews = reviews.drop(columns=reviews.columns[0])

# All derived columns are added to this copy so `reviews` stays untouched.
df = reviews.copy()

In [4]:
# Run the project's preprocessText over the review body and the title.
# Missing values are replaced by '' first so the helper always receives a string.
# NOTE(review): the meaning of the second argument (True) is defined in
# preprocessing.py -- confirm there. The result is later fed to pos_tag, so it is
# presumably a list of tokens.
df['Processed Review Text'] = df['Review Text'].fillna('').apply(preprocessText, args=(True,))
df['Processed Title'] = df['Title'].fillna('').apply(preprocessText, args=(True,))

# Method 1: Keyword/aspect extraction with POS tagging

In [7]:
import nltk
from collections import Counter
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk import pos_tag
import re

In [8]:
# Make sure the 'averaged_perceptron_tagger' NLTK resource is installed before tagging
# (presumably the model pos_tag relies on -- confirm against the NLTK docs).
# Alternative kept for reference: nltk.download() with no arguments.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\lswht\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [9]:
def extract_keywords(tokens):
    """Keep only the nouns and adjectives of a tokenised text.

    The tokens are tagged with ``pos_tag`` and a token is kept when its tag is
    ``NN`` (singular noun), ``NNS`` (plural noun) or ``JJ`` (adjective).
    Numerals and every other tag are dropped.

    Parameters
    ----------
    tokens : list of str
        Tokens of one document.

    Returns
    -------
    list of str
        The kept tokens, in their original order and with duplicates preserved.
    """
    wanted_tags = {'NN', 'NNS', 'JJ'}
    selected = []
    for token, pos in pos_tag(tokens):
        if pos in wanted_tags:
            selected.append(token)
    return selected

In [10]:
# get key words from review text
df['Keywords'] = df['Processed Review Text'].apply(extract_keywords)
# Flatten the per-review keyword lists in a single linear pass.
# sum(lists, []) would re-copy the growing result for every row (quadratic time).
all_keywords = [word for row_keywords in df['Keywords'] for word in row_keywords]
keyword_freq = Counter(all_keywords)
print(keyword_freq.most_common(10))

[('dress', 10929), ('size', 9355), ('fit', 9021), ('top', 8168), ('color', 6848), ('great', 6076), ('look', 4863), ('fabric', 4744), ('small', 4574), ('wear', 4305)]


In [11]:
# get key words from review title
df["Keywords_title"] = df['Processed Title'].apply(extract_keywords)
# Flatten the per-title keyword lists in a single linear pass.
# sum(lists, []) would re-copy the growing result for every row (quadratic time).
all_keywords_title = [word for row_keywords in df["Keywords_title"] for word in row_keywords]
keyword_freq_title = Counter(all_keywords_title)
print(keyword_freq_title.most_common(10))

[('great', 1788), ('dress', 1649), ('cute', 1531), ('love', 1421), ('beautiful', 1404), ('top', 1172), ('perfect', 803), ('fit', 668), ('nice', 506), ('comfortable', 469)]


### The keywords extracted above still contain generic words that are common in everyday English, such as "great" and "look".
We will try to filter these out by comparing them against word frequencies from a general-purpose reference corpus.

In [12]:
# Use the Brown corpus as a general-English reference for keyword filtering
nltk.download('brown')
from nltk.corpus import brown
from nltk.probability import FreqDist

# Count every Brown token after lower-casing it, so look-ups by lower-case word
# hit the same entry regardless of the token's original capitalisation.
brown_words = brown.words()
fdist_brown = FreqDist(map(str.lower, brown_words))

[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\lswht\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!


In [13]:
# With threshold=0.001 generic words like "great" or "love" still passed the
# filter, hence the tighter default of 0.0004.
def extract_keywords_withCorpus(tokens, fdist_external, threshold=0.0004):
    """Return noun/adjective keywords that are rare in a reference corpus.

    Parameters
    ----------
    tokens : list of str
        Tokens of one document; handed unchanged to ``extract_keywords``.
    fdist_external : object with a ``freq(word)`` method
        Frequency distribution of the reference corpus. ``freq(word)`` is read
        as the relative frequency of ``word`` in that corpus.
    threshold : float, default 0.0004
        A keyword is kept only when ``fdist_external.freq(word)`` is strictly
        below this value.

    Returns
    -------
    list of str
        The surviving keywords, in their original order and with duplicates
        preserved.
    """
    # Reuse the shared POS filter so both extractors always keep the same tags.
    candidates = extract_keywords(tokens)
    # Drop candidates that are too common in the external corpus.
    return [word for word in candidates if fdist_external.freq(word) < threshold]

In [14]:
# Review-text keywords, minus words that are common in the Brown corpus
df['Keywords_v2'] = df['Processed Review Text'].apply(lambda tokens: extract_keywords_withCorpus(tokens, fdist_brown))
# Count straight from a flattening generator; sum(lists, []) would re-copy the
# growing list for every row (quadratic time).
all_keywords_v2 = Counter(word for row_keywords in df["Keywords_v2"] for word in row_keywords)

In [15]:
# Top 15 review-text keywords after the Brown-corpus frequency filter
all_keywords_v2.most_common(15)

[('dress', 10929),
 ('size', 9355),
 ('fit', 9021),
 ('top', 8168),
 ('color', 6848),
 ('look', 4863),
 ('fabric', 4744),
 ('wear', 4305),
 ('love', 3969),
 ('soft', 3300),
 ('perfect', 3268),
 ('comfortable', 3007),
 ('beautiful', 2908),
 ('cute', 2887),
 ('nice', 2855)]

In [16]:
# Title keywords, minus words that are common in the Brown corpus
df['Keywords_title_v2'] = df['Processed Title'].apply(lambda tokens: extract_keywords_withCorpus(tokens, fdist_brown))
# Count straight from a flattening generator; sum(lists, []) would re-copy the
# growing list for every row (quadratic time).
all_keywords_title_v2 = Counter(word for row_keywords in df["Keywords_title_v2"] for word in row_keywords)

In [17]:
# Top 15 title keywords after the Brown-corpus frequency filter
all_keywords_title_v2.most_common(15)

[('dress', 1649),
 ('cute', 1531),
 ('love', 1421),
 ('beautiful', 1404),
 ('top', 1172),
 ('perfect', 803),
 ('fit', 668),
 ('nice', 506),
 ('comfortable', 469),
 ('color', 455),
 ('comfy', 453),
 ('gorgeous', 446),
 ('soft', 432),
 ('summer', 429),
 ('sweater', 420)]

In [18]:
# Counter addition sums the per-word counts of the review-text and title keywords
combined_counter = all_keywords_v2 + all_keywords_title_v2
combined_counter.most_common(15)

[('dress', 12578),
 ('fit', 9689),
 ('size', 9581),
 ('top', 9340),
 ('color', 7303),
 ('love', 5390),
 ('look', 5041),
 ('fabric', 5026),
 ('cute', 4418),
 ('wear', 4407),
 ('beautiful', 4312),
 ('perfect', 4071),
 ('soft', 3732),
 ('comfortable', 3476),
 ('nice', 3361)]

#### Will filter the words manually later

### Background data

In [19]:
# Some of the remaining keywords look like they belong to the same aspect, so it
# may be worth deriving aspect categories from the background dataset as well.
# The background data is a text file with one review per row; only the part of
# each row before the first '###' is kept.
with open('background_data_dev.txt') as f:
    background_reviews = [line.split('###')[0] for line in f]

df_background = pd.DataFrame(background_reviews, columns=["Review"])

In [22]:
# preprocess
df_background['Processed Review'] = df_background['Review'].fillna('').apply(lambda x: preprocessText(x, True))
# get key words
df_background['Keywords'] = df_background['Processed Review'].apply(extract_keywords)
# Aggregating Keywords: flatten the per-review lists in a single linear pass.
# sum(lists, []) would re-copy the growing result for every row (quadratic time).
all_keywords_bg = [word for row_keywords in df_background['Keywords'] for word in row_keywords]
keyword_freq_bg = Counter(all_keywords_bg)

# Display most common keywords
print(keyword_freq_bg.most_common(10))

[('great', 39), ('good', 33), ('size', 31), ('shoe', 31), ('time', 28), ('year', 28), ('fit', 26), ('comfortable', 24), ('pair', 23), ('use', 18)]


The keywords from the background data do not seem substantially better than those from the main dataset, so we will use the keywords derived from the original data (review text + title).

# Method 2: Keyword/aspect extraction with LDA topic modelling

In [9]:
# pip install gensim

Collecting gensim
  Downloading gensim-4.2.0-cp37-cp37m-win_amd64.whl (24.0 MB)
     --------------------------------------- 24.0/24.0 MB 20.4 MB/s eta 0:00:00
Collecting numpy>=1.17.0 (from gensim)
  Using cached numpy-1.21.6-cp37-cp37m-win_amd64.whl (14.0 MB)
Collecting smart-open>=1.8.1 (from gensim)
  Downloading smart_open-6.4.0-py3-none-any.whl.metadata (21 kB)
Collecting Cython==0.29.28 (from gensim)
  Downloading Cython-0.29.28-py2.py3-none-any.whl (983 kB)
     ------------------------------------- 983.8/983.8 kB 20.7 MB/s eta 0:00:00
Downloading smart_open-6.4.0-py3-none-any.whl (57 kB)
   ---------------------------------------- 57.0/57.0 kB 2.9 MB/s eta 0:00:00
Installing collected packages: smart-open, numpy, Cython, gensim
  Attempting uninstall: numpy
    Found existing installation: numpy 1.16.4
    Uninstalling numpy-1.16.4:
Note: you may need to restart the kernel to use updated packages.


ERROR: Could not install packages due to an OSError: [WinError 5] Access is denied: 'c:\\programdata\\anaconda3\\lib\\site-packages\\numpy-1.16.4.dist-info\\entry_points.txt'
Consider using the `--user` option or check the permissions.



In [23]:
import nltk
from nltk.corpus import stopwords
from gensim import corpora
from gensim.models.ldamodel import LdaModel
from nltk.tokenize import RegexpTokenizer

# Fetch NLTK's stop-word lists, needed by stopwords.words('english') in the tokenisation step
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lswht\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [25]:
# One document per row: review body and title joined by a space (missing values become '')
df['Combined_Text'] = df['Review Text'].fillna('') + ' ' + df['Title'].fillna('')

# Split on runs of word characters and drop English stop words
tokenizer = RegexpTokenizer(r'\w+')
stop_words = set(stopwords.words('english'))

texts = []
for document in df['Combined_Text']:
    # Lower-case before tokenising so the stop-word comparison sees lower-case tokens
    kept = [word for word in tokenizer.tokenize(document.lower()) if word not in stop_words]
    texts.append(kept)

# Map tokens to ids, then encode every document as a bag of words
dictionary = corpora.Dictionary(texts)
corpus = list(map(dictionary.doc2bow, texts))

In [26]:
# Set parameters
num_topics = 10  # Adjust this based on your needs
passes = 10  # Number of passes through corpus during training
random_state = 42  # Fixed seed: LDA training is stochastic, so unseeded runs may yield different topics

# Create and train the LDA model (seeded so Restart & Run All reproduces the same topics)
lda = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=passes, random_state=random_state)

# Print the topics, each as its 4 highest-weighted words
topics = lda.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.031*"blue" + 0.030*"color" + 0.021*"white" + 0.021*"one"')
(1, '0.039*"great" + 0.034*"sweater" + 0.029*"love" + 0.026*"soft"')
(2, '0.158*"dress" + 0.026*"beautiful" + 0.020*"love" + 0.014*"flattering"')
(3, '0.020*"size" + 0.019*"fit" + 0.018*"would" + 0.014*"like"')
(4, '0.071*"top" + 0.022*"bra" + 0.022*"wear" + 0.021*"tank"')
(5, '0.084*"5" + 0.033*"size" + 0.028*"fit" + 0.027*"length"')
(6, '0.042*"size" + 0.035*"small" + 0.026*"large" + 0.022*"medium"')
(7, '0.033*"like" + 0.022*"fabric" + 0.018*"top" + 0.016*"look"')
(8, '0.073*"skirt" + 0.053*"color" + 0.036*"blouse" + 0.036*"beautiful"')
(9, '0.040*"love" + 0.030*"great" + 0.027*"jeans" + 0.022*"wear"')


# Results

By examining the data and the keywords and topics extracted above, we came up with the aspects below.

**list of words**:

aspects = ["dress", "love", "fit", "size", "top", "color", "look", "wear", "fabric", "cute", "flattering", "comfortable"]

**word cluster**:

{"name": "praise", "keywords": ["great", "love", "perfect", "wonderful", "good", "glad", "compliment", "favorite", "well"]},

{"name": "fabric", "keywords": ["fabric", "soft", "silky"]},

{"name": "size", "keywords": ["size", "small", "large", "fit", "length"]},

{"name": "look", "keywords": ["look", "beautiful", "flattering", "sexy", "pretty", "flirty", "fabulous"]},

{"name": "color", "keywords": ["color", "blue", "white", "black"]},

{"name": "price", "keywords": ["price", "worth", "quality"]}
