In [1]:
import os

In [2]:
%pwd

'c:\\Users\\Dell\\Desktop\\amazon_scraper\\research'

In [3]:
os.chdir('../')

In [4]:
%pwd

'c:\\Users\\Dell\\Desktop\\amazon_scraper'

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from textblob import TextBlob
import pandas as pd
import numpy as np
import sqlite3

In [6]:
conn = sqlite3.connect('amazon_reviews.db')

In [7]:
df = pd.read_sql_query("SELECT * FROM reviews", conn)

In [8]:
df.head()

Unnamed: 0,Title,Description,Rating,Color,Storage Size,Verified Purchase
0,Confused? Read on! One stop review for your fl...,I went through a lot of reviews and articles b...,5.0,Black,128GB,Verified Purchase
1,Excellent product; please purchase if your dat...,The iPhone 12 is the latest iteration of Apple...,5.0,Black,128GB,Verified Purchase
2,An overall good premium experience,"SUMMERY: As a first time iPhone user, I would ...",4.0,,,Verified Purchase
3,Perfect product,It's my second iPhone ever and I bought it wit...,5.0,,,Verified Purchase
4,First step into the ecosystem!,"I have been a smartphone user since 2015, used...",5.0,Blue,128GB,Verified Purchase


In [9]:
df['Rating'].value_counts()

Rating
5.0    53
4.0    41
3.0     5
1.0     1
Name: count, dtype: int64

In [10]:
df['Verified Purchase'].value_counts()

Verified Purchase
Verified Purchase    100
Name: count, dtype: int64

In [11]:
df['Color'].value_counts()

Color
N/A             62
Blue            20
Black           16
(PRODUCT)RED     2
Name: count, dtype: int64

In [12]:
df['Storage Size'].value_counts()

Storage Size
N/A      62
128GB    36
64GB      2
Name: count, dtype: int64

In [13]:
def analyze_keywords(reviews):
    all_text = ' '.join(reviews['Description'])
    vectorizer = TfidfVectorizer(stop_words='english', max_features=2000)
    tfidf_matrix = vectorizer.fit_transform([all_text])
    feature_names = vectorizer.get_feature_names_out()
    sentiments = [TextBlob(text).sentiment.polarity for text in reviews['Description']]
    word_sentiments = {word: [] for word in feature_names}
    for review, sentiment in zip(reviews['Description'], sentiments):
        words = set(review.lower().split())
        for word in words.intersection(feature_names):
            word_sentiments[word].append(sentiment)

    avg_sentiments = {word: np.mean(sents) if sents else 0 for word, sents in word_sentiments.items()}
    sorted_words = sorted(avg_sentiments.items(), key=lambda x: x[1])
    worst_keywords = sorted_words[:10]
    best_keywords = sorted_words[-10:][::-1]
    
    return best_keywords, worst_keywords

In [14]:
best_keywords, worst_keywords = analyze_keywords(df)

In [15]:
print("Best Keywords:", best_keywords)
print("Worst Keywords:", worst_keywords)

Best Keywords: [('hii', np.float64(1.0)), ('gays', np.float64(1.0)), ('branded', np.float64(1.0)), ('sister', np.float64(0.9099999999999999)), ('loves', np.float64(0.9099999999999999)), ('law', np.float64(0.9099999999999999)), ('lags', np.float64(0.701764705882353)), ('pricelove', np.float64(0.7)), ('jindabad', np.float64(0.7)), ('superb', np.float64(0.6966911764705883))]
Worst Keywords: [('accesories', np.float64(-0.051679841897233215)), ('accessory', np.float64(-0.051679841897233215)), ('annoying', np.float64(-0.051679841897233215)), ('answer', np.float64(-0.051679841897233215)), ('boring', np.float64(-0.051679841897233215)), ('claim', np.float64(-0.051679841897233215)), ('consistency', np.float64(-0.051679841897233215)), ('decade', np.float64(-0.051679841897233215)), ('exorbitant', np.float64(-0.051679841897233215)), ('explore', np.float64(-0.051679841897233215))]


In [16]:
len(best_keywords)

10

In [17]:
len(worst_keywords)

10

In [18]:
best_keyword=[]
for i in range(len(best_keywords)):
    best_keyword.append(best_keywords[i][0])

In [19]:
best_keyword

['hii',
 'gays',
 'branded',
 'sister',
 'loves',
 'law',
 'lags',
 'pricelove',
 'jindabad',
 'superb']

In [20]:
worst_keyword=[]
for i in range(len(worst_keywords)):
    worst_keyword.append(worst_keywords[i][0])

In [21]:
worst_keyword

['accesories',
 'accessory',
 'annoying',
 'answer',
 'boring',
 'claim',
 'consistency',
 'decade',
 'exorbitant',
 'explore']