# #0 Table of Content
1. Import packages and data
2. Data cleanse
3. r
4. r
5. f

# #1 Import packages, data and creating functions

In [1]:
import pandas as pd
import numpy as np
import re
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer

In [2]:
review_table   = pd.read_csv("../asset/etlReview.csv")
user_table   = pd.read_csv("../asset/etlUser.csv")

In [3]:
review_table

Unnamed: 0,review_id,username,product_id,date,rating,content,location
0,6ef347c0-c603-422a-b8d1-c4b96bed0207,i*****b,912d838b-5e84-4aef-a0dd-bb23f44e5913,2023-09-02 01:13:00,5,Best buy ever\r\nit looks great works great\r\...,
1,af3ccbed-1865-4492-88e3-723e9dda0de9,jessylim70,912d838b-5e84-4aef-a0dd-bb23f44e5913,2023-07-21 23:33:00,5,Item received in good condition.\r\nBought dur...,
2,d17ebe72-919d-4c2a-a230-88119aac725c,s*****b,912d838b-5e84-4aef-a0dd-bb23f44e5913,2023-05-08 19:37:00,5,Value For Money: yes\r\nBest Feature(s): comfo...,
3,9b72986c-7525-425e-89c6-e27d7e18af50,a*****w,912d838b-5e84-4aef-a0dd-bb23f44e5913,2023-06-13 11:03:00,5,Impressed with the quality and looks really ni...,
4,3af89ba2-29c8-4adb-bb3a-cf9dd3ef2255,j*****n,912d838b-5e84-4aef-a0dd-bb23f44e5913,2023-10-06 12:56:00,5,Value For Money: yes\r\nBest Feature(s): the w...,
...,...,...,...,...,...,...,...
155248,6d9aedbd-acf4-43c3-aefc-6a7eae22387b,nnumbs,e634b9ba-08e3-4d00-997b-ee4a3437b448,2021-08-05 11:38:00,5,,
155249,5a7c4ed5-a77d-4c77-92da-633f20147265,yld7zt3fab,e634b9ba-08e3-4d00-997b-ee4a3437b448,2021-11-14 08:31:00,5,,
155250,3864b6fe-2dde-4ebf-a7d9-4ab2ab54e53e,elischan93,8d69cbbc-7c81-41fd-8414-fc22895f3f6b,2023-05-03 20:43:00,5,Value For Money: yes\r\nBest Feature(s): Nice ...,
155251,dca0ff22-40cf-48fc-a222-cf30d5e00388,yxfxrblqz5,8d69cbbc-7c81-41fd-8414-fc22895f3f6b,2023-03-25 18:24:00,5,Value For Money: song system is good .buy again,


In [4]:
def pre_process(text):
    # remove self-generated words
    text = re.sub('Value For Money','', str(text))
    text = re.sub('Best Feature\(s\)','', text)
    text = re.sub('Performance: ','', text)

    # remove characters that are not a-zA-z
    text = re.sub('[^a-zA-Z]+',' ', text)
    
    # change all to lowercase
    text=text.lower()
    
    return str(text)

def remove_stop_words(contents):
    removed_stop_words = []
    for content in contents:
        removed_stop_words.append(
            ' '.join([word for word in content.split() 
                      if word not in english_stop_words])
        )
    return removed_stop_words

def sentiment_score(contents):
    sia = SentimentIntensityAnalyzer()
    sentiment_list = []
    custom_lexicon = {
        'but': -1,
        'nice': 2,
        'solid': 2,
        'well': 2,
        'fast': 2,
        'ok': 0,
        'no': -2,
        'worst': -2   
    }
    
    sia.lexicon.update(custom_lexicon)
    
    for content in contents:
        if content != 'nan':
            sentiment_dict = sia.polarity_scores(content)
        
            if sentiment_dict['compound'] >= 0.4:
                sentiment_list.append("Positive")
                
            elif sentiment_dict['compound'] < 0:
                sentiment_list.append("Negative")
            else :
                sentiment_list.append("Neutral")
        else:
            sentiment_list.append(np.nan)
            
    return sentiment_list

# #2 Cleanse the text data and get sentiment scores

In [5]:
contents_all = review_table['content'].apply(lambda x:pre_process(x))
review_table['cleansed_content'] = contents_all
contents_all

0         best buy ever it looks great works great the s...
1         item received in good condition bought during ...
2          yes comfortable to wear good received very fa...
3         impressed with the quality and looks really ni...
4          yes the weight good excited to try the perfor...
                                ...                        
155248                                                  nan
155249                                                  nan
155250     yes nice bass great seller is very patient an...
155251                        song system is good buy again
155252                                                  nan
Name: content, Length: 155253, dtype: object

In [6]:
sentiment_pd = pd.DataFrame(sentiment_score(contents_all),columns=['sentiment'])
review_table = pd.concat([review_table, sentiment_pd], axis = 1)
review_table

Unnamed: 0,review_id,username,product_id,date,rating,content,location,cleansed_content,sentiment
0,6ef347c0-c603-422a-b8d1-c4b96bed0207,i*****b,912d838b-5e84-4aef-a0dd-bb23f44e5913,2023-09-02 01:13:00,5,Best buy ever\r\nit looks great works great\r\...,,best buy ever it looks great works great the s...,Positive
1,af3ccbed-1865-4492-88e3-723e9dda0de9,jessylim70,912d838b-5e84-4aef-a0dd-bb23f44e5913,2023-07-21 23:33:00,5,Item received in good condition.\r\nBought dur...,,item received in good condition bought during ...,Positive
2,d17ebe72-919d-4c2a-a230-88119aac725c,s*****b,912d838b-5e84-4aef-a0dd-bb23f44e5913,2023-05-08 19:37:00,5,Value For Money: yes\r\nBest Feature(s): comfo...,,yes comfortable to wear good received very fa...,Positive
3,9b72986c-7525-425e-89c6-e27d7e18af50,a*****w,912d838b-5e84-4aef-a0dd-bb23f44e5913,2023-06-13 11:03:00,5,Impressed with the quality and looks really ni...,,impressed with the quality and looks really ni...,Positive
4,3af89ba2-29c8-4adb-bb3a-cf9dd3ef2255,j*****n,912d838b-5e84-4aef-a0dd-bb23f44e5913,2023-10-06 12:56:00,5,Value For Money: yes\r\nBest Feature(s): the w...,,yes the weight good excited to try the perfor...,Positive
...,...,...,...,...,...,...,...,...,...
155248,6d9aedbd-acf4-43c3-aefc-6a7eae22387b,nnumbs,e634b9ba-08e3-4d00-997b-ee4a3437b448,2021-08-05 11:38:00,5,,,,
155249,5a7c4ed5-a77d-4c77-92da-633f20147265,yld7zt3fab,e634b9ba-08e3-4d00-997b-ee4a3437b448,2021-11-14 08:31:00,5,,,,
155250,3864b6fe-2dde-4ebf-a7d9-4ab2ab54e53e,elischan93,8d69cbbc-7c81-41fd-8414-fc22895f3f6b,2023-05-03 20:43:00,5,Value For Money: yes\r\nBest Feature(s): Nice ...,,yes nice bass great seller is very patient an...,Positive
155251,dca0ff22-40cf-48fc-a222-cf30d5e00388,yxfxrblqz5,8d69cbbc-7c81-41fd-8414-fc22895f3f6b,2023-03-25 18:24:00,5,Value For Money: song system is good .buy again,,song system is good buy again,Positive


In [7]:
review_table['sentiment'].value_counts()

sentiment
Positive    68264
Neutral      8532
Negative     5201
Name: count, dtype: int64

In [8]:
review_table.shape

(155253, 8)

# #3 Transform the text data: TF-IDF

In [9]:
vectorizer = TfidfVectorizer()
tfidf_vect = vectorizer.fit_transform(contents_all.tolist())



In [10]:
tfidf_transformer=TfidfTransformer(smooth_idf=True, use_idf=True)
reviews_tfidf = tfidf_transformer.fit_transform(reviews_counts)

NameError: name 'reviews_counts' is not defined

In [None]:
tfidf_vect.shape

# TESTING OF CODES

In [77]:
user_review_table = user_table.merge(review_table)
user_sentiments = user_review_table.groupby('username').apply(lambda x: x['sentiment'].values.tolist()).reset_index(name='sentiment_list')
user_contents = user_review_table.groupby('username').apply(lambda x: x['cleansed_content'].values.tolist()).reset_index(name='content_list')
user_contents_sentiments = user_review_table.groupby('username').apply(lambda x: x.set_index('cleansed_content')['sentiment'].to_dict()).reset_index(name='content__sentiment_list')

In [53]:
user_review_table

Unnamed: 0,username,no_review,no_product,mean_rating,review_list,product_dict,review_id,product_id,date,rating,content,location,cleansed_content,sentiment
0,.*****.,10,5,4.9,"['03afe402-5369-42bf-9b01-1427b239d0ee', 'afa6...","{'d19cf120-93e6-4f1c-89c2-2b18187139cb': 2, '5...",03afe402-5369-42bf-9b01-1427b239d0ee,d19cf120-93e6-4f1c-89c2-2b18187139cb,2023-06-23 22:07:00,4,the F keys on top and volume buttons don’t wor...,,the f keys on top and volume buttons don t wor...,Positive
1,.*****.,10,5,4.9,"['03afe402-5369-42bf-9b01-1427b239d0ee', 'afa6...","{'d19cf120-93e6-4f1c-89c2-2b18187139cb': 2, '5...",afa6c7a3-ce3e-4e95-8313-11de41c58f26,d19cf120-93e6-4f1c-89c2-2b18187139cb,2022-02-16 21:39:00,5,Well received in good packaging and condition....,,well received in good packaging and condition ...,Positive
2,.*****.,10,5,4.9,"['03afe402-5369-42bf-9b01-1427b239d0ee', 'afa6...","{'d19cf120-93e6-4f1c-89c2-2b18187139cb': 2, '5...",c6cc52fc-05e4-44e6-ab72-70e2a780a8c2,5407f33f-02df-4f64-91ec-339e7c6ddcce,2022-09-27 14:32:00,5,delivery was quite fast \r\neasy to set up\r\n...,,delivery was quite fast easy to set up worth i...,Positive
3,.*****.,10,5,4.9,"['03afe402-5369-42bf-9b01-1427b239d0ee', 'afa6...","{'d19cf120-93e6-4f1c-89c2-2b18187139cb': 2, '5...",e7d91051-3eff-4968-81a2-4c614ceff259,24aa663b-b441-4b06-876c-126c218f04f3,2022-05-14 14:05:00,5,Received the mouse in good condition. Nice met...,,received the mouse in good condition nice meta...,Positive
4,.*****.,10,5,4.9,"['03afe402-5369-42bf-9b01-1427b239d0ee', 'afa6...","{'d19cf120-93e6-4f1c-89c2-2b18187139cb': 2, '5...",b4368c55-0d95-429e-9adb-b39f1036d3b5,df80f796-2e87-4584-b2ac-a66afb249e3d,2022-05-14 14:05:00,5,Received the mouse in good condition. Nice met...,,received the mouse in good condition nice meta...,Positive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155248,zzzhenling,1,1,5.0,['157eddc6-1c51-4dcc-8ac6-b7b60b456a1e'],{'c85e68dc-cd67-4947-a0ef-ed3e725d144d': 1},157eddc6-1c51-4dcc-8ac6-b7b60b456a1e,c85e68dc-cd67-4947-a0ef-ed3e725d144d,2022-11-02 14:34:00,5,,,,
155249,zzzjj.e,1,1,5.0,['8916a469-20c1-4efd-aa6e-57ef121de797'],{'34bb8bfe-e134-41af-a487-3b8d4ce6fff0': 1},8916a469-20c1-4efd-aa6e-57ef121de797,34bb8bfe-e134-41af-a487-3b8d4ce6fff0,2022-12-25 19:46:00,5,,,,
155250,zzzkelzel,1,1,5.0,['b17b99ab-3579-493e-a386-ce402d557263'],{'4718d2a4-f0a4-48cd-a0ce-e1b663bdba02': 1},b17b99ab-3579-493e-a386-ce402d557263,4718d2a4-f0a4-48cd-a0ce-e1b663bdba02,2022-08-27 19:05:00,5,,,,
155251,zzzmonsterz,1,1,5.0,['52ff1429-7fe1-4722-8741-9419676d0ab8'],{'4dba642d-d7be-409f-87b2-e21e1389a794': 1},52ff1429-7fe1-4722-8741-9419676d0ab8,4dba642d-d7be-409f-87b2-e21e1389a794,2023-09-13 07:50:00,5,,,,


In [124]:
user_contents_sentiments = pd.concat([user_sentiments, user_contents], axis = 1)

In [123]:
tfidf_vectorizer_positive = TfidfVectorizer()

for x in user_sentiments['sentiment_list'][0]:
    positive_content = ''
    if x == 'Positive':
        for y in user_contents['content_list'][0]:
            print(y)
            positive_content = ' '.join(y)
           
            print('next')
        
            
        tfidf_matrix_positive = tfidf_vectorizer_positive.fit_transform(positive_content)
positive_data = user_contents_sentiments[user_contents_sentiments['sentiment_list'] == 'positive']['content_list']

the f keys on top and volume buttons don t work for some reason other than that pretty okay fast delivery too
next
well received in good packaging and condition very nice color and smooth to type doesnt make too much noise when typing and mouse quite comfortable to use 
next
delivery was quite fast easy to set up worth it for the price
next
received the mouse in good condition nice metallic pink colour 
next
received the mouse in good condition nice metallic pink colour 
next
fast delivery for overseas shipping not bad reasonable price 
next
fast delivery for overseas shipping not bad reasonable price 
next
fast delivery for overseas shipping not bad reasonable price 
next
fast delivery for overseas shipping not bad reasonable price 
next
fast delivery for overseas shipping not bad reasonable price 
next
the f keys on top and volume buttons don t work for some reason other than that pretty okay fast delivery too
next
well received in good packaging and condition very nice color and smo

In [126]:
positive_data = user_contents_sentiments[user_contents_sentiments['sentiment_list'] == 'positive']['content_list']
positive_data = user_contents_sentiments[user_contents_sentiments['sentiment_list'] == 'negative']['content_list']

In [74]:
for i in range(len(user_sentiments['sentiment_list'])):
    for x in range(len(user_sentiments['sentiment_list'][i])):
        
#user_sentiments['sentiment_list'][0]

SyntaxError: incomplete input (2959065947.py, line 4)

In [None]:
vectorizer = TfidfVectorizer()
tfidf_vect = vectorizer.fit_transform(contents_all.tolist())

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Sample data (replace with your DataFrame)
data = {
    'user_sentiment': ['positive', 'negative', 'positive', 'positive', 'negative', 'positive'],
    'user_reviews': ["great product", "poor quality", "excellent service", "highly recommend", "disappointing experience", "wonderful job"]
}

df = pd.DataFrame(data)

# Separate data into positive and negative subsets
positive_data = df[df['user_sentiment'] == 'positive']['user_reviews']
negative_data = df[df['user_sentiment'] == 'negative']['user_reviews']

# Initialize TF-IDF vectorizers for positive and negative data
tfidf_vectorizer_positive = TfidfVectorizer()
tfidf_vectorizer_negative = TfidfVectorizer()

# Calculate TF-IDF scores for positive and negative data
tfidf_matrix_positive = tfidf_vectorizer_positive.fit_transform(positive_data)
tfidf_matrix_negative = tfidf_vectorizer_negative.fit_transform(negative_data)

# Get feature names (words) for positive and negative data
feature_names_positive = tfidf_vectorizer_positive.get_feature_names_out()
feature_names_negative = tfidf_vectorizer_negative.get_feature_names_out()

# Calculate the average TF-IDF score for each term in positive and negative data
avg_tfidf_scores_positive = tfidf_matrix_positive.mean(axis=0).A1
avg_tfidf_scores_negative = tfidf_matrix_negative.mean(axis=0).A1

# Sort terms by their average TF-IDF scores in descending order
top_positive_words = [feature_names_positive[i] for i in avg_tfidf_scores_positive.argsort()[::-1][:3]]
top_negative_words = [feature_names_negative[i] for i in avg_tfidf_scores_negative.argsort()[::-1][:3]]

# Print the top positive and negative words
print("Top 3 Positive Words:", top_positive_words)
print("Top 3 Negative Words:", top_negative_words)

Top 3 Positive Words: ['wonderful', 'service', 'recommend']
Top 3 Negative Words: ['quality', 'poor', 'experience']


# #References

In [9]:
# Download the English stopwords file
import nltk
from nltk.corpus import stopwords
 
nltk.download('stopwords')
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rache\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
