In [1]:

import pandas as pd
import numpy as np
import re
import string
import nltk

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

df = pd.read_csv("Reviews.csv")

# Show first few rows
df.head()


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/charvi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/charvi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/charvi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [2]:
def label_sentiment(score):
    if score <= 2:
        return 0 #negative
    elif score == 3:
        return 1 #neutral 
    else: 
        return 2 #positive 

df['sentiment_score'] = df['Score'].apply(label_sentiment)


df['review'] = df['Summary'].fillna('') + ' ' + df['Text'].fillna('')

df = df[['review', 'sentiment_score']]
print(df.head())


                                              review  sentiment_score
0  Good Quality Dog Food I have bought several of...                2
1  Not as Advertised Product arrived labeled as J...                0
2  "Delight" says it all This is a confection tha...                2
3  Cough Medicine If you are looking for the secr...                0
4  Great taffy Great taffy at a great price.  The...                2


In [3]:

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    # Convert to lowercase
    text = text.lower()
    
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    
    # Tokenize text
    tokens = nltk.word_tokenize(text)
   
    # Remove stopwords + short words
    tokens = [word for word in tokens if word not in stop_words and len(word) > 2]
    
    # Lemmatization
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
  
    # Join tokens back to text
    return " ".join(tokens)
   

df["cleaned_review"] = df["review"].astype(str).apply(clean_text)

df.head()


Unnamed: 0,review,sentiment_score,cleaned_review
0,Good Quality Dog Food I have bought several of...,2,good quality dog food bought several vitality ...
1,Not as Advertised Product arrived labeled as J...,0,advertised product arrived labeled jumbo salte...
2,"""Delight"" says it all This is a confection tha...",2,delight say confection around century light pi...
3,Cough Medicine If you are looking for the secr...,0,cough medicine looking secret ingredient robit...
4,Great taffy Great taffy at a great price. The...,2,great taffy great taffy great price wide assor...


In [4]:
df = df[['cleaned_review', 'sentiment_score']]

In [5]:
df.head(20)

Unnamed: 0,cleaned_review,sentiment_score
0,good quality dog food bought several vitality ...,2
1,advertised product arrived labeled jumbo salte...,0
2,delight say confection around century light pi...,2
3,cough medicine looking secret ingredient robit...,0
4,great taffy great taffy great price wide assor...,2
5,nice taffy got wild hair taffy ordered five po...,2
6,great good expensive brand saltwater taffy gre...,2
7,wonderful tasty taffy taffy good soft chewy fl...,2
8,yay barley right mostly sprouting cat eat gras...,2
9,healthy dog food healthy dog food good digesti...,2


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 568454 entries, 0 to 568453
Data columns (total 2 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   cleaned_review   568454 non-null  object
 1   sentiment_score  568454 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 8.7+ MB


In [7]:
!pip install sqlalchemy psycopg2-binary




In [8]:
import psycopg2

conn = psycopg2.connect(
    dbname="Customer Feedback Analyzer DB",
    host="localhost",
    user="postgres",
    password="Charvi1234!",
    port=5433
)

cur = conn.cursor()

cur.execute("""
    CREATE TABLE IF NOT EXISTS cleaned_reviews(
        id SERIAL PRIMARY KEY,
        cleaned_review VARCHAR(50000) NOT NULL,
        sentiment_score INT NOT NULL
    );
""")

conn.commit()
conn.close()




In [9]:
from sqlalchemy import create_engine

engine = create_engine("postgresql://postgres:Charvi1234!@localhost:5433/Customer Feedback Analyzer DB")



In [10]:
df.to_sql(
    "cleaned_reviews",
    engine,
    if_exists="append",   # add new rows
    index=False
)


454

In [11]:
import psycopg2

conn = psycopg2.connect(
    dbname="Customer Feedback Analyzer DB",
    host="localhost",
    user="postgres",
    password="Charvi1234!",
    port=5433
)

cur = conn.cursor()

cur.execute("""
    DROP TABLE IF EXISTS cleaned_reviews;

    CREATE TABLE cleaned_reviews(
        id SERIAL PRIMARY KEY,
        cleaned_review TEXT NOT NULL,
        sentiment_score INT NOT NULL
    );
""")

conn.commit()
conn.close()


In [12]:
from sqlalchemy import create_engine

engine = create_engine("postgresql://postgres:Charvi1234!@localhost:5433/Customer Feedback Analyzer DB")

df.to_sql(
    "cleaned_reviews",
    engine,
    if_exists="append",   # add new rows
    index=False
)


454

In [13]:
import psycopg2

conn = psycopg2.connect(
    dbname="Customer Feedback Analyzer DB",
    host="localhost",
    user="postgres",
    password="Charvi1234!",
    port=5433
)

cur = conn.cursor()
cur.execute("""
    ALTER TABLE cleaned_reviews
    ALTER COLUMN cleaned_review TYPE TEXT;
""")

conn.commit()
conn.close()


In [14]:
from sqlalchemy import create_engine

engine = create_engine("postgresql://postgres:Charvi1234!@localhost:5433/Customer Feedback Analyzer DB")

df.to_sql(
    "cleaned_reviews",
    engine,
    if_exists="append",   # add new rows
    index=False
)


454

In [15]:
test = pd.read_sql("SELECT * FROM cleaned_reviews LIMIT 10;", engine)
test

Unnamed: 0,id,cleaned_review,sentiment_score
0,1,good quality dog food bought several vitality ...,2
1,2,advertised product arrived labeled jumbo salte...,0
2,3,delight say confection around century light pi...,2
3,4,cough medicine looking secret ingredient robit...,0
4,5,great taffy great taffy great price wide assor...,2
5,6,nice taffy got wild hair taffy ordered five po...,2
6,7,great good expensive brand saltwater taffy gre...,2
7,8,wonderful tasty taffy taffy good soft chewy fl...,2
8,9,yay barley right mostly sprouting cat eat gras...,2
9,10,healthy dog food healthy dog food good digesti...,2


In [28]:
# Split the data based on the mapped sentiment_score
negative_df = df[df['sentiment_score'] == 0]  # negative
positive_df = df[df['sentiment_score'] == 2]  # positive

# Optional: store neutral separately
neutral_df = df[df['sentiment_score'] == 1]


In [29]:


# Write two tables only
negative_df.to_sql("negative_reviews", engine, if_exists="replace", index=False)
positive_df.to_sql("positive_reviews", engine, if_exists="replace", index=False)

# Verify
print("Negative reviews:", len(negative_df))
print("Positive reviews:", len(positive_df))


Negative reviews: 82037
Positive reviews: 443777


In [30]:
print(df['sentiment_score'].value_counts())


sentiment_score
2    443777
0     82037
1     42640
Name: count, dtype: int64
