# Sentiment Analysis

### Import necessary libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re, string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
import spacy
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.utils import shuffle
from matplotlib.colors import LinearSegmentedColormap
import seaborn as sns
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier
from nltk.stem.porter import PorterStemmer
from tqdm import tqdm
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import GridSearchCV
import time, datetime
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier

### Read the Dataset

In [None]:
# Locations of the three Amazon consumer-review CSV files inside the Kaggle
# input directory; each one is read with pd.read_csv further down.
data1_path = '/kaggle/input/consumer-reviews-of-amazon-products/1429_1.csv'
data2_path = "/kaggle/input/consumer-reviews-of-amazon-products/Datafiniti_Amazon_Consumer_Reviews_of_Amazon_Products_May19.csv"
data3_path = "/kaggle/input/consumer-reviews-of-amazon-products/Datafiniti_Amazon_Consumer_Reviews_of_Amazon_Products.csv"

In [None]:
# Load the first reviews file with all of its columns.
data1 = pd.read_csv(data1_path)

In [None]:
from matplotlib import pyplot as plt

# Bar chart of the number of reviews per star rating in the first dataset,
# ordered from the rarest rating to the most common one.
rating_counts = data1['reviews.rating'].value_counts()
rating_counts.sort_values().plot(kind = 'bar')

So there is a huge imbalance in the dataset.
We need to add more data from the low-rating classes.

In [None]:
# Only the star rating and the review body are needed from the two extra files.
review_columns = ['reviews.rating', 'reviews.text']

data2 = pd.read_csv(data2_path)[review_columns]
# From the second file, keep just the reviews rated 3 or lower.
data2 = data2.loc[data2["reviews.rating"] <= 3]

data3 = pd.read_csv(data3_path)[review_columns]

In [None]:
# Restrict both extra datasets to reviews whose rating is 3 or lower.
low_rated_in_data2 = data2["reviews.rating"] <= 3
low_rated_in_data3 = data3["reviews.rating"] <= 3
data2 = data2.loc[low_rated_in_data2]
data3 = data3.loc[low_rated_in_data3]

In [None]:
# Stack the three review tables row-wise into a single DataFrame.
frames_to_merge = [data1, data2, data3]
data = pd.concat(frames_to_merge)

In [None]:
# Working frame holding just the review text and its star rating, taken as an
# independent copy so later edits do not touch `data`.
df = data[['reviews.text', 'reviews.rating']].copy()
df.head()

In [None]:
# Number of reviews per star rating in the merged data, shown before the rows
# with missing values are dropped.
df['reviews.rating'].value_counts()

In [None]:
# Discard rows with a missing text or rating, then renumber the remaining rows
# 0..n-1 without keeping the old index as a column.
df = df.dropna().reset_index(drop=True)

In [None]:
# Number of reviews per star rating after rows with missing values were dropped.
df['reviews.rating'].value_counts()

## Data Preprocessing

In [None]:
# Binary label per star rating: 1-3 stars -> 0 (negative), 4-5 stars -> 1 (positive).
sentiment = {rating: int(rating >= 4) for rating in range(1, 6)}
df['sentiment'] = df['reviews.rating'].map(sentiment)

In [None]:
# Bar chart of the number of reviews per sentiment label, rarest label first.
sentiment_counts = df['sentiment'].value_counts()
sentiment_counts.sort_values().plot(kind = 'bar')

#### Positive Class

In [None]:
# Every whitespace-separated word from the reviews labelled positive (sentiment == 1).
positive_reviews = df.loc[df['sentiment'] == 1, 'reviews.text']
positiveWords = pd.Series(' '.join(positive_reviews).split())

# Build a 1000x500 word cloud from those words and show it without axes.
positive_cloud = WordCloud(width = 1000, height = 500)
wordcloud = positive_cloud.generate(' '.join(positiveWords))

plt.figure(figsize=(15,8))
plt.imshow(wordcloud)
plt.title("Most Positive Words Used ")
plt.axis("off")
plt.show()

#### Negative Class

In [None]:
# Every whitespace-separated word from the reviews labelled negative (sentiment == 0).
negative_reviews = df.loc[df['sentiment'] == 0, 'reviews.text']
words = pd.Series(' '.join(negative_reviews).split())
# Both names refer to the same Series.
negativeWords = words

# Build a 1000x500 word cloud from those words and show it without axes.
negative_cloud = WordCloud(width = 1000, height = 500)
wordcloud = negative_cloud.generate(' '.join(negativeWords))

plt.figure(figsize=(15,8))
plt.imshow(wordcloud)
plt.title("Most Negative Words Used ")
plt.axis("off")
plt.show()

## Text Preprocessing

Helper Functions for Data Preprocessing

In [None]:
# Review texts as a NumPy array; scanned below for URLs and HTML-like tags.
all_text_data = np.array(df['reviews.text'])

# Both patterns are compiled once, outside the loops. They are raw strings so
# the regex escapes (\( and \)) reach the regex engine unchanged instead of
# being flagged by Python as invalid string escape sequences.
url_pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
tag_pattern = re.compile(r'<.*?>')

# Collect every URL found in any review. extend() appends in place, so the
# list is not rebuilt from scratch on every iteration.
all_urls = []
for text in tqdm(all_text_data):
    all_urls.extend(url_pattern.findall(text))

print('Total URLS: ', len(all_urls),'\n\n')

# Collect every "<...>" fragment (shortest match) found in any review.
all_tags = []
for text in tqdm(all_text_data):
    all_tags.extend(tag_pattern.findall(text))

print('Total Tags: ', len(all_tags))

In [None]:
!pip install beautifulsoup4
from bs4 import BeautifulSoup
# 
def remove_tags(html):
    """Return the text content of *html* with all markup removed.

    The input is parsed with BeautifulSoup's "html.parser". Every <style> and
    <script> element is deleted together with its contents, and the remaining
    text fragments are stripped and joined with single spaces.
    """
    parsed = BeautifulSoup(html, "html.parser")

    # Styling and scripting blocks carry no review text, so drop them entirely.
    unwanted_nodes = parsed(['style', 'script'])
    for node in unwanted_nodes:
        node.decompose()

    text_fragments = parsed.stripped_strings
    return ' '.join(text_fragments)

# Case-insensitive URL matcher. A link may start with "http://" or "https://",
# with "www." (optionally followed by up to three digits, e.g. "www2."), or
# with a bare "domain.tld/" prefix. The pattern is not compiled in verbose
# mode, so it must not contain stray whitespace: every space is matched
# literally.
_URL_PATTERN = re.compile(
    r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))'
)


def remove_link_from(text):
    """Return *text* with every URL-like substring deleted.

    Matches are replaced by the empty string, so the whitespace that
    surrounded a link is left in place. Trailing sentence punctuation after a
    link (for example a final ".") is not treated as part of the link.
    """
    return _URL_PATTERN.sub('', text)

In [None]:
ps = PorterStemmer()

# Load the English stop words once, as a set: membership tests are O(1) and the
# word list is not re-read for every single token.
english_stopwords = set(stopwords.words("english"))

# Matches every character that is not an ASCII letter.
non_letter_pattern = re.compile("[^a-zA-Z]")


def _clean_review(raw_text):
    """Return the cleaned, stemmed form of one review text.

    Steps, in order: strip HTML tags, strip links, replace every non-letter
    with a space, lower-case, drop English stop words, stem the remaining
    words with the Porter stemmer, and join them with single spaces.
    """
    # Tags and links have to be removed while the punctuation that identifies
    # them ("<", ">", "://", ".") is still present; once non-letters are
    # replaced by spaces neither helper can match anything.
    text = remove_tags(raw_text)
    text = remove_link_from(text)

    # Keep letters only, then normalise the case.
    text = non_letter_pattern.sub(" ", text).lower()

    # Stop words are filtered before stemming, so they are compared in their
    # original (unstemmed) form.
    stemmed_words = [ps.stem(word) for word in text.split() if word not in english_stopwords]
    return ' '.join(stemmed_words)


# Cleaned text of every review, in row order.
corpus = [_clean_review(raw_text) for raw_text in tqdm(df['reviews.text'])]

# Assign whole columns at once rather than writing cell by cell through
# df['reviews.text'][i], which is chained assignment and is not guaranteed to
# update the DataFrame.
df['reviews.text'] = corpus
df['text'] = df['reviews.text']

# Saving Preprocessed Data

In [None]:
# Gather the cleaned text and its sentiment label, then write them to a CSV
# file without the row index.
processed_data = {column: df[column] for column in ("text", "sentiment")}
processed_df = pd.DataFrame(data=processed_data)
processed_df.to_csv(path_or_buf='preprocessed-dataset.csv', index=False)