# Topic modeling on product reviews

In [None]:
#imports
import json
import spacy

from collections import Counter
import json
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import silhouette_score

#visualize data
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# paths
path_reviews =".\\src\\reviews.jsonl"
path_token =".\\outputs\\token_reviews.jsonl"

## Linguistic treatment

### Read jsonl file, tokenization and lemmatisation

In [None]:
nlp = spacy.load("en_core_web_sm")

# read JSONL line by line
with open(path_reviews, 'r', encoding='utf-8') as file:
    lines = file.readlines()

all_tokens = []
all_lemmas = []
for line in lines:
    review = json.loads(line)
    title = review.get("title", "")
    text = review.get("text", "")
    full_text = f"{title} {text}"

    #spacy tokens
    doc = nlp(full_text)
    tokens = [token.text for token in doc]
    all_tokens.append(tokens)

    #spacy lemmas without stop words, numericals and prepositions
    lemmas = [token.lemma_ for token in doc if token.is_alpha and not token.is_stop and token.tag_ != "IN" ]
    all_lemmas.append(lemmas)


### Tokens

In [None]:
for tokens in all_tokens:
    print(tokens)

['No', 'white', 'background', '!', 'It', '’s', 'clear', '!', 'I', 'bought', 'this', 'bc', 'I', 'thought', 'it', 'had', 'the', 'nice', 'white', 'background', '.', 'Turns', 'out', 'it', '’s', 'clear', '&', 'since', 'my', 'phone', 'is', 'blue', 'it', 'does', 'n’t', 'look', 'anything', 'like', 'this', '.', ' ', 'If', 'I', 'had', 'known', 'that', 'I', 'would', 'have', 'purchased', 'something', 'else', '.', 'It', 'works', 'ok', '.']
['Awesome', '!', ' ', 'Great', 'price', '!', ' ', 'Works', 'well', '!', 'Perfect', '.', 'How', 'pissed', 'am', 'I', 'that', 'I', 'recently', 'paid', '$', '20', 'for', '1', 'Fitbit', 'cable', 'and', 'promptly', 'lost', 'the', 'damned', 'thing', '?', ' ', 'Extremely', 'pissed', '!', ' ', 'I', 'keep', 'the', 'spare', 'in', 'my', 'medicine', 'bag', 'so', 'hopefully', 'I', 'wo', 'n’t', 'lose', 'it', 'and', 'my', 'grandson', 'ca', 'n’t', 'get', 'to', 'it', 'and', 'try', 'to', 'use', 'it', 'as', 'a', 'belt', 'or', 'a', 'dog', 'leash', 'or', 'any', 'of', 'the', 'other', 

### Lemmmes
Without stop words and non pertinent elements (ponctuation, 123..., WWW...)

In [None]:
 for lemas in all_lemmas:
    print(lemas)

['white', 'background', 'clear', 'buy', 'bc', 'think', 'nice', 'white', 'background', 'turn', 'clear', 'phone', 'blue', 'look', 'know', 'purchase', 'work', 'ok']
['awesome', 'great', 'price', 'work', 'perfect', 'pissed', 'recently', 'pay', 'Fitbit', 'cable', 'promptly', 'lose', 'damned', 'thing', 'extremely', 'pissed', 'spare', 'medicine', 'bag', 'hopefully', 'will', 'lose', 'grandson', 'try', 'use', 'belt', 'dog', 'leash', 'nutty', 'thing']
['work', 'take', 'hour', 'install', 'Overall', 'happy', 'end', 'result', 'hate', 'puzzle', 'love', 'puzzle', 'work', 'took', 'lot', 'concentration', 'attention', 'detail', 'hour', 'YouTube', 'video', 'help', 'ton', 'instal', 'new', 'screen', 'highly', 'recommend', 'video', 'replace', 'screen', 'tool', 'supply', 'provide', 'adequate', 'use', 'additional', 'tool', 'home', 'successfully', 'instal', 'new', 'screen', 'screw', 'inside', 'iPhone', 'stick', 'use', 'x', 'acto', 'knife', 'come', 'glass', 'Screen', 'iPhone', 'beautiful', 'work', 'great', 'scr

### Save data into a jsonl file to use later

In [None]:
with open(path_token, 'w', encoding='utf-8') as output: 
    for lemmm in all_lemmas :
        json.dump({"token":lemmm}, output, ensure_ascii=False)
        output.write('\n')

## Non supervised clustering

### Starting from output file
load file

In [None]:
corpus = []
with open(path_token, 'r', encoding='utf-8') as file:
    for line in file:
        doc = json.loads(line)
        text = doc.get("token", "")
        corpus.extend(text)

# print(corpus)

vectori initialization and matrix TF IDF

In [None]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)

In [11]:
print(f"dimension of matrix TF IDF : {X.shape}")
print(f"feature names : {vectorizer.get_feature_names_out()}")

dimension of matrix TF-IDF : (22374, 2756)
feature names : ['aaaarrrrrggggghhhh' 'ability' 'able' ... 'zipper' 'zippy' 'zoom']


dense matrix

In [None]:
dense_matrix = np.asarray(X.todense())
print(dense_matrix)

[[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
...
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]]


Reduction test before DBSCAN

In [None]:
svd = TruncatedSVD(n_components=50, random_state=42)
X_svd = svd.fit_transform(dense_matrix)

pca = PCA(n_components=3, random_state=42)
X_reduced = pca.fit_transform(X_svd)

In [None]:
clusters = DBSCAN(eps=0.5, min_samples=4, metric='cosine').fit(X_reduced)

In [None]:
print(clusters.labels_)
print(len(clusters.labels_))
print(clusters.n_features_in_)

[0 0 0 ... 0 0 0]
22374
3


### Analyse des clusters

In [None]:
# score 
silhouette = silhouette_score(X_reduced, clusters.labels_, metric='cosine')
print(f"Silhouette Score: {silhouette}")

Silhouette Score: 0.9968194984750952


In [None]:
#mots les plus fréquents des clusters 
clustered_docs = {}
for idx, cluster in enumerate(clusters.labels_):
    if cluster not in clustered_docs:
        clustered_docs[cluster] = []
    clustered_docs[cluster].append(corpus[idx])

print("\nmost frquent words in clusters :")
for cluster, docs in clustered_docs.items():
    
    combined_text = " ".join(docs)
    # split text and keep alphabetical tokens
    tokens = combined_text.lower().split()
    tokens = [token for token in tokens if token.isalpha()]
    #get 10 most common words
    word_counts = Counter(tokens)
    most_common_words = word_counts.most_common(10)
    
    print(f"\nCluster {cluster}:")
    for word, count in most_common_words:
        print(f"{word}: {count} occurences")


most frquent words in clusters :

Cluster 0:
work: 338 occurence
good: 296 occurence
fit: 258 occurence
love: 254 occurence
charge: 225 occurence
screen: 204 occurence
use: 197 occurence
easy: 178 occurence
nice: 177 occurence
look: 175 occurence


### Visualize clusters 

In [None]:

df = pd.DataFrame({
    "Dim 1": X_reduced[:, 0],
    "Dim 2": X_reduced[:, 1],
    "Cluster": clusters.labels_
})

plt.figure(figsize=(12, 8))
sns.scatterplot(
    data=df,
    x="Dimension 1",
    y="Dimension 2",
    hue="Cluster",
    palette="tab10", 
    style="Cluster",
    markers=True,
    s=100 
    )

plt.title("Clusters visualisation", fontsize=16)
plt.legend(title="Clusters", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()
