In [None]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
import nltk
import re
import string
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer
# Download every NLTK resource this notebook requests, in one place.
# 'punkt_tab' is fetched in addition to 'punkt': newer NLTK releases load the
# word/sentence tokenizer tables from 'punkt_tab', and nltk.word_tokenize
# raises LookupError there when only the legacy 'punkt' package is present.
for _nltk_resource in (
    'stopwords',
    'wordnet',
    'vader_lexicon',
    'averaged_perceptron_tagger',
    'movie_reviews',
    'punkt',
    'punkt_tab',
    'conll2000',
    'brown',
):
    nltk.download(_nltk_resource)
from wordcloud import WordCloud
import matplotlib.pyplot as plt

In [None]:
# Load the review CSV into `df` and preview its first rows.
reviews_csv = "/content/Amazon_Reviews_Oneplus_10R.csv"
df = pd.read_csv(reviews_csv)
df.head()

In [None]:
# Independent working frames for the exploratory statistics computed below.
df1 = df.copy()
df2 = df.copy()
# head() returns a slice of `df`; take an explicit copy so that adding a
# column to `df3` later is a plain assignment on its own frame rather than a
# chained assignment on a slice (SettingWithCopyWarning).
df3 = df.head().copy()

In [None]:
# Character length of every review, stored on the df1 working copy.
review_lengths = df["Review"].str.len()
df1["char_count"] = review_lengths
df1[["Review", "char_count"]].head()

In [None]:
# Number of words in every review, stored on the df2 working copy.
# Splitting on any run of whitespace (str.split() with no separator) avoids
# counting the empty tokens that split(" ") yields for repeated spaces, and
# also separates words divided by tabs or newlines. Missing reviews are
# treated as empty text (0 words) instead of the literal string "nan".
df2["word_count"] = df["Review"].fillna("").astype(str).str.split().str.len()
df2[["Review", "word_count"]].head()

In [None]:
from nltk.corpus import stopwords
stop = stopwords.words("english")
# Set lookup instead of scanning the list once per word.
stop_lookup = set(stop)
# Count the stop words in each previewed review.
#  - str(review) keeps the cell from crashing on a non-string (missing) entry.
#  - word.lower() makes the match case-insensitive, consistent with the
#    `word.lower()` comparison used by the stop-word removal step further down.
stopword_counts = df3["Review"].apply(
    lambda review: sum(word.lower() in stop_lookup for word in str(review).split())
)
# assign() builds a new frame, so nothing is written onto a slice of `df`.
df3 = df3.assign(stopwords=stopword_counts)
df3[["Review", "stopwords"]]

In [None]:
# Drop these four columns from `df`; the steps below work on the 'Review' text.
columns_to_discard = ['Unnamed: 0', 'Rating', 'Title', "Review_Date"]
df = df.drop(columns_to_discard, axis=1)

In [None]:
#Text Preprocessing
# Strip @mentions from the raw review text into a new 'Final_review' column.
# regex=True must be passed explicitly: on pandas >= 2.0 Series.str.replace
# defaults to regex=False, which would search for the literal text '@\w+'
# and leave every mention in place.
df['Final_review'] = df['Review'].str.replace(r'@\w+', '', regex=True)

# Display the modified DataFrame
df.head()

In [None]:
#Removing reviews with empty text
# A bare `!= ''` comparison lets missing values through (NaN != '' is True)
# and also keeps whitespace-only strings, so both are filtered explicitly.
has_text = df['Final_review'].notna() & (df['Final_review'].astype(str).str.strip() != '')
df = df[has_text]
df.head(5)

In [None]:
# Rebuild each review without the whitespace-separated words that contain
# 'http'. Entries that are not strings (e.g. a float) become an empty string.
cleaned_reviews = [
    ' '.join(token for token in entry.split() if 'http' not in token)
    if isinstance(entry, str)
    else ''
    for entry in df['Final_review']
]

# Overwrite the column with the link-free text
df['Final_review'] = cleaned_reviews

df.head()

In [None]:
# Converting text to lowercase, removing text in square brackets, links, HTML-style tags, punctuation and words containing numbers
def clean_text(text):
    """Normalise one review string for tokenisation.

    Steps, in order: lower-case; drop ``[...]`` spans; drop ``http(s)://`` and
    ``www.`` links; drop ``<...>`` tags; delete ASCII punctuation; replace
    newlines with a space; delete every word that contains a digit.

    Parameters
    ----------
    text : str
        Raw review text. Any non-string value (e.g. NaN) is treated as empty.

    Returns
    -------
    str
        The cleaned text. Whitespace left behind by removed pieces is not
        collapsed.
    """
    # Missing values reach this function as floats; there is nothing to clean.
    if not isinstance(text, str):
        return ''

    text = text.lower()
    # Raw strings: '\[', '\S', '\w' and '\d' are invalid escape sequences in
    # ordinary string literals and trigger Deprecation/SyntaxWarning.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # Replace (not delete) line breaks so the last word of one line is not
    # glued to the first word of the next.
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

# Run clean_text over every review and preview the outcome.
df['Final_review'] = df['Final_review'].apply(clean_text)
df.head()

In [None]:
#Removing Emojis
# Compiled once instead of on every call. The character class lists emoji
# blocks individually; a single wide range such as U+24C2-U+1F251 would also
# match CJK ideographs, kana, Hangul and most other characters above U+24C2.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # miscellaneous symbols and pictographs
    "\U0001F680-\U0001F6FF"  # transport and map symbols
    "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
    "\U0001F900-\U0001F9FF"  # supplemental symbols and pictographs
    "\U0001FA70-\U0001FAFF"  # symbols and pictographs extended-A
    "\U0001F170-\U0001F251"  # enclosed alphanumeric / ideographic supplement
    "\U00002600-\U000027BF"  # miscellaneous symbols and dingbats
    "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows
    "\U000024C2"             # circled latin capital letter M
    "\U0000FE0F"             # variation selector-16 (emoji presentation)
    "\U0000200D"             # zero width joiner used in emoji sequences
    "]+",
    flags=re.UNICODE,
)


def remove_emoji(text):
    """Return ``text`` with emoji characters removed.

    Parameters
    ----------
    text : str
        Text to filter.

    Returns
    -------
    str
        ``text`` without any character matched by ``_EMOJI_PATTERN``. All
        other characters, including non-Latin scripts, are left untouched and
        surrounding whitespace is not collapsed.
    """
    return _EMOJI_PATTERN.sub('', text)

# Strip emoji from every review and preview the first ten rows.
df['Final_review'] = df['Final_review'].apply(remove_emoji)
df.head(10)

In [None]:
# Split each cleaned review into a list of tokens with NLTK's word tokenizer.
review_tokens = df['Final_review'].apply(nltk.word_tokenize)
df['tokenized_review'] = review_tokens
df.head(10)

In [None]:
# 'Final_review' is removed here; a later cell re-creates it from the tokens.
df = df.drop('Final_review', axis=1)

In [None]:
#Stopwords
from nltk.corpus import stopwords
# NLTK's English stop words plus a few extra words excluded from the analysis.
my_stop_words = stopwords.words('english')
sw = ['am','using','phone','may']
my_stop_words.extend(sw)
# Membership is tested against this set; the original built it but then
# scanned the list for every token.
stopwords_set = set(my_stop_words)
cleaned_tweets = []

for tokens in df['tokenized_review']:
    # Keep tokens that are not stop words (case-insensitive) and contain no '#'.
    words_without_stopwords = [
        word for word in tokens
        if word.lower() not in stopwords_set and '#' not in word
    ]
    # Store each review as a single space-joined string of the kept tokens.
    cleaned_tweets.append(' '.join(words_without_stopwords))

df['Final_review'] = cleaned_tweets
df.head()

In [None]:
#Lemmatization
# Split every review on whitespace and lemmatize each word with WordNet,
# giving one list of lemmas per review.
word_lemmatizer = WordNetLemmatizer()
tokenized_review = df['Final_review'].apply(
    lambda text: [word_lemmatizer.lemmatize(token) for token in text.split()]
)

In [None]:
# Turn each list of lemmas back into one space-separated string.
df['Final_review'] = tokenized_review.apply(' '.join)
df.head(10)

In [None]:
#Feature Extraction
from functools import lru_cache
from nltk.sentiment.vader import SentimentIntensityAnalyzer


@lru_cache(maxsize=1)
def _get_sentiment_analyzer():
    """Build the VADER analyzer on first use and reuse it afterwards."""
    return SentimentIntensityAnalyzer()


def fetch_sentiment_using_SIA(text):
    """Label ``text`` as ``'neg'`` or ``'pos'`` from its VADER polarity scores.

    Parameters
    ----------
    text : str
        Text to score.

    Returns
    -------
    str
        ``'neg'`` when the negative score is strictly greater than the
        positive score, otherwise ``'pos'`` (ties, including texts where both
        scores are 0, are therefore labelled ``'pos'``).
    """
    # Reuse one analyzer: constructing a new SentimentIntensityAnalyzer per
    # call repeats the same setup for every row it is applied to.
    polarity_scores = _get_sentiment_analyzer().polarity_scores(text)
    return 'neg' if polarity_scores['neg'] > polarity_scores['pos'] else 'pos'

# Label every review as 'neg'/'pos' and tabulate how often each label occurs.
sentiments_using_SIA = df.Final_review.apply(fetch_sentiment_using_SIA)
label_counts = sentiments_using_SIA.value_counts()
pd.DataFrame(label_counts)

In [None]:
# Build df4 with each processed review next to its VADER compound score.
sid = SentimentIntensityAnalyzer()
df4 = pd.DataFrame({'Reviews': df.Final_review})
df4['compound'] = df4['Reviews'].apply(
    lambda review: sid.polarity_scores(review)['compound']
)
df4.head()


In [None]:
def _label_from_compound(c):
    """Map a compound score to 'Positive' (>= 0.05), 'Negative' (<= -0.05) or 'Neutral'."""
    if c >= 0.05:
        return 'Positive'
    if c <= -0.05:
        return 'Negative'
    return 'Neutral'


df4['sentiment'] = df4['compound'].apply(_label_from_compound)
df4.head()

In [None]:
# Number of reviews carrying each sentiment label.
sentiment_column = df4['sentiment']
sentiment_column.value_counts()

In [None]:
# Bar chart of how many reviews fall under each sentiment label.
# The labels are categorical, so the counts are drawn as one bar per label
# instead of passing the strings through plt.hist's numeric binning, and the
# figure gets a title and axis labels so it can be read on its own.
sentiment_counts = df4['sentiment'].value_counts()
fig, ax = plt.subplots()
ax.bar(list(sentiment_counts.index), sentiment_counts.values)
ax.set(
    title='Review sentiment distribution',
    xlabel='Sentiment',
    ylabel='Number of reviews',
)
plt.show()

In [None]:
# Concatenate every processed review into one string and render it as a
# word cloud on a 10x8 figure.
allWords_ = ' '.join(df['Final_review'])
f, axes = plt.subplots(figsize=(10, 8))
wordcloud = WordCloud(background_color='black', width=1800, height=1400)
# generate() returns the same WordCloud instance, so `wordcloud` ends up
# bound to the generated cloud.
wordcloud.generate(allWords_)
plt.imshow(wordcloud)