In [1]:
# Importing dependencies
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Load the combined multi-category review dataset from the working directory
# and report its (rows, columns) shape as a quick sanity check.
combined_csv_path = r'Combined_Categories_dataset.csv'
cmb_data = pd.read_csv(combined_csv_path)
cmb_data.shape

(40000, 12)

In [3]:
# Keep only the review text column; the result is still a (single-column) DataFrame.
# Note: this rebinds cmb_data, so the other columns are no longer reachable
# without re-reading the CSV.
cmb_data = cmb_data.loc[:, ['reviewText']]

In [4]:
# First 10,000 review texts of the combined file (rows 0-9999).
# NOTE(review): the previous comment labelled these as Amazon food reviews, but the
# rows displayed below read like footwear reviews -- confirm the category order in the CSV.
first_10K = cmb_data.iloc[:10000]
first_10K.tail(5)

Unnamed: 0,reviewText
9995,"these are no immatation, they are the real dea..."
9996,Great pair of shoes. This is my third pair of ...
9997,I typically wear a size 8-9 (W) depending on t...
9998,My husband has always loved deck shoes but lat...
9999,"i had this shoes but in different color, since..."


In [5]:
# Last 10,000 review texts of the combined file (the electronics reviews).
tail_10K = cmb_data.iloc[-10000:]
tail_10K.tail(5)

Unnamed: 0,reviewText
39995,"Had it 1 day and it quit working, will be retu..."
39996,Received item in 2 days. Product worked as adv...
39997,I have it plugged into a usb extension on my g...
39998,Fast delivery product was simple to use
39999,"Working as advertised, so far no problems."


In [6]:
# Combine both categories into one 20,000-row frame.
# ignore_index=True renumbers the rows 0..19999 instead of keeping the
# original 0-9999 / 30000-39999 labels.
category_frames = [first_10K, tail_10K]
total_20K = pd.concat(category_frames, ignore_index=True)
total_20K.tail(5)

Unnamed: 0,reviewText
19995,"Had it 1 day and it quit working, will be retu..."
19996,Received item in 2 days. Product worked as adv...
19997,I have it plugged into a usb extension on my g...
19998,Fast delivery product was simple to use
19999,"Working as advertised, so far no problems."


In [7]:
# Lowercase every review so tokens that differ only in case count as one term.
# The result goes into a new column; the raw 'reviewText' column is left untouched.
lowercased_reviews = total_20K['reviewText'].str.lower()
total_20K['reviewText_processed'] = lowercased_reviews
total_20K

Unnamed: 0,reviewText,reviewText_processed
0,Exactly what I needed.,exactly what i needed.
1,"I agree with the other review, the opening is ...","i agree with the other review, the opening is ..."
2,Love these... I am going to order another pack...,love these... i am going to order another pack...
3,too tiny an opening,too tiny an opening
4,Okay,okay
...,...,...
19995,"Had it 1 day and it quit working, will be retu...","had it 1 day and it quit working, will be retu..."
19996,Received item in 2 days. Product worked as adv...,received item in 2 days. product worked as adv...
19997,I have it plugged into a usb extension on my g...,i have it plugged into a usb extension on my g...
19998,Fast delivery product was simple to use,fast delivery product was simple to use


In [8]:
import string

In [9]:
# Strip punctuation from the lowercased reviews.
# The translation table deletes every character listed in string.punctuation
# (ASCII punctuation only). Characters are removed, not replaced by a space,
# so a token such as "well-made" becomes "wellmade".
punctuation_deleter = str.maketrans('', '', string.punctuation)
processed_reviews = total_20K['reviewText_processed']
total_20K['reviewText_processed'] = processed_reviews.str.translate(punctuation_deleter)
total_20K.tail(5)

Unnamed: 0,reviewText,reviewText_processed
19995,"Had it 1 day and it quit working, will be retu...",had it 1 day and it quit working will be returned
19996,Received item in 2 days. Product worked as adv...,received item in 2 days product worked as adve...
19997,I have it plugged into a usb extension on my g...,i have it plugged into a usb extension on my g...
19998,Fast delivery product was simple to use,fast delivery product was simple to use
19999,"Working as advertised, so far no problems.",working as advertised so far no problems


In [10]:
# Make sure every document handed to the vectorizer is a string.
# Missing reviews survive .str.lower()/.str.translate() as NaN, and a bare
# astype(str) on an object column turns them into the literal text 'nan',
# which TfidfVectorizer would then count as a real word. Replace missing
# values with an empty document first; the cast is kept as a safety net.
total_20K['reviewText_processed'] = (
    total_20K['reviewText_processed']
    .fillna('')
    .astype(str)
)

# Compute IDF of all words in these reviews. Report the top 30 words and bottom 30 words, based on IDF, with their IDF scores

In [12]:

# Fit a TF-IDF model (default settings) on the cleaned review texts.
# X is the document-term matrix; idf holds one inverse-document-frequency
# weight per vocabulary term, aligned with vectorizer.get_feature_names_out().
review_documents = total_20K['reviewText_processed']
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(review_documents)
idf = vectorizer.idf_
idf

array([10.21039037, 10.21039037, 10.21039037, ...,  9.51724319,
       10.21039037, 10.21039037])

In [13]:
# Pair every vocabulary term with its IDF weight.
terms_idf = {term: score for term, score in zip(vectorizer.get_feature_names_out(), idf)}
column_names = ['word', 'IDF_score']

# Rank the terms by ascending IDF. sorted() is stable, so terms with equal IDF
# keep the order in which the vectorizer lists them.
ranked_pairs = sorted(terms_idf.items(), key=lambda pair: pair[1])
sorted_words_by_idf = pd.DataFrame(ranked_pairs, columns=column_names)

# Highest-IDF terms: the last 30 rows of the ranking, flipped so the largest comes first.
top_30_words = sorted_words_by_idf.tail(30).iloc[::-1]
# Lowest-IDF terms: the first 30 rows of the ranking.
bottom_30_words = sorted_words_by_idf.head(30)


In [14]:
# Display the 30 words with the highest IDF scores, largest first
# (top_30_words is the tail of the ascending IDF ranking, reversed).
# NOTE(review): the rows shown all share the same IDF value, so which of the
# tied words land in this "top 30" depends only on their order in the ranking.
top_30_words

Unnamed: 0,word,IDF_score
20238,zx,10.21039
20237,zwave,10.21039
20236,zurich,10.21039
20235,zs100,10.21039
20234,zreo,10.21039
20233,zpool,10.21039
20232,zooms,10.21039
20231,zooming,10.21039
20230,zoomed,10.21039
20229,zookki,10.21039


In [15]:
# Display the 30 words with the lowest IDF scores, smallest first
# (bottom_30_words is the head of the ascending IDF ranking).
bottom_30_words

Unnamed: 0,word,IDF_score
0,the,1.683643
1,and,1.732458
2,to,1.987031
3,it,2.007223
4,for,2.082986
5,my,2.257599
6,is,2.269273
7,this,2.360482
8,of,2.426542
9,but,2.455052
