In [131]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [132]:
# Load the reviews CSV from the Kaggle input directory into the DataFrame `data`.
data=pd.read_csv("../input/amazon-product-reviews/Reviews.csv")

In [133]:
data.describe()

In [134]:
data.shape

In [135]:
data.info()

In [136]:
data.columns

In [137]:
data.head()

In [138]:
data.tail()

Missing Value Handling

In [139]:
data.isna()

In [140]:
data.isna().sum()

In [141]:
# Drop every row that contains at least one missing value.
# inplace=True mutates `data` itself, so the raw frame is no longer available after this cell.
data.dropna(axis=0,inplace=True)

In [142]:
data.isna().sum()

Duplicate Values Handling

In [143]:
data.duplicated().sum()

In [144]:
# Select the analysis columns in an explicit order, with the target (`Score`) last.
# `.copy()` makes the result an independent DataFrame: columns of this frame are
# reassigned further down the notebook, and assigning into a frame derived from
# another one triggers pandas' SettingWithCopyWarning.
tdata=data[['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator',
       'HelpfulnessDenominator',  'Time', 'Summary', 'Text','Score']].copy()
tdata

In [145]:
#Creating a new variable X_train
# NOTE: plain assignment does not copy; `X_train` and `tdata` are two names for the
# same DataFrame object, so changes made through one are visible through the other.
X_train=tdata

In [146]:
X_train.info()

In [147]:
X_train.describe()

In [148]:
X_train.head()

In [149]:
#Features with 1 constant
!pip install fast_ml
from fast_ml.utilities import reduce_memory_usage, display_all
from fast_ml.feature_selection import get_constant_features
constant_features =get_constant_features(X_train)
constant_features.head(10)

In [150]:
#Features with quasi-constant values
# Re-runs the check with threshold=0.99 and dropna=False, overwriting `constant_features`.
# NOTE(review): presumably threshold is the share of rows taken by the most frequent value
# and dropna=False counts NaN as a value — confirm against the fast_ml documentation.
constant_features =get_constant_features(X_train, threshold= 0.99,dropna=False)
constant_features.head(10)

In [151]:
import matplotlib.pyplot as plt
import seaborn as sns

In [152]:
#Building correlation matrix
# X_train also holds text columns (ProductId, Summary, Text, ...). Correlation is only
# defined for numbers, and pandas >= 2.0 raises instead of silently skipping string
# columns, so restrict the frame to its numeric columns explicitly.
numeric_corr = X_train.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr)

In [153]:
X_train.head()

Select the numerical variables and store their names in a list called `numerical`

In [158]:
# Hand-written list of the columns treated as numerical in the plots and the VIF table.
# NOTE(review): 'Id' is included and 'Time' is not — confirm that this is intended.
numerical = ['Id','HelpfulnessNumerator','HelpfulnessDenominator','Score']

Histograms Showing Continuous Features

In [159]:
X_train[numerical].hist(bins=10, figsize=(15, 8),layout=(2,3));

In [163]:
#General Visualization
# One box per numerical column, all on a shared axis, to compare their spread.
fig, ax = plt.subplots(dpi=100, figsize=(15, 5))
sns.boxplot(data=X_train[numerical], ax=ax)
ax.grid(True)
ax.set_title("Distribution of each column")
plt.show()

Calculating multi-collinearity of numerical features: - Using Variance Inflation Factor

In [168]:
import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor
def calc_VIF(x):
    """Compute the variance inflation factor (VIF) of every column of ``x``.

    Parameters
    ----------
    x : pandas.DataFrame
        Numeric feature columns only. It should not already contain a constant
        column, because an intercept is added here.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``x`` with the columns ``variables`` (column
        name) and ``VIF``.
    """
    # statsmodels regresses each column on the remaining columns of the matrix it is
    # given and does not add an intercept itself. Without a constant column the
    # auxiliary R^2 is uncentered, which inflates the VIF of any feature whose mean
    # is far from zero, so prepend a column of ones.
    design = np.column_stack([np.ones(len(x)), np.asarray(x.values, dtype=float)])

    vif = pd.DataFrame()
    vif['variables'] = x.columns
    # Column 0 of `design` is the intercept, so feature i sits at index i + 1.
    vif["VIF"] = [variance_inflation_factor(design, i + 1) for i in range(x.shape[1])]

    return vif

In [169]:
x=X_train[numerical]
calc_VIF(x)

String/Object-Type Features

In [170]:
import pandas as pd
import numpy as np
# for data cleaning
import string
import re
# for removing accented and special characters
import unicodedata
# for stopwords Removal
from nltk.corpus import stopwords

from nltk.tokenize import word_tokenize
# for calculating Polarity and Subjectivity
from textblob import TextBlob
import matplotlib.pyplot as plt
import seaborn as sns
# function for making ngrams
from nltk.util import ngrams
# load in all the modules we're going to need
import nltk
import collections
# for Wordscloud
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer

In [171]:
X_train['Text']

In [173]:
X_train['Summary']

In [175]:
# First lets remove Punctuations from the Text
def punctuation_removal(messy_str):
    """Return ``messy_str`` without any character listed in ``string.punctuation``."""
    kept = filter(lambda ch: ch not in string.punctuation, messy_str)
    return ''.join(kept)

# Strip punctuation from both free-text columns, overwriting them.
# NOTE(review): string.punctuation contains '<', '>', '&', ';', '#' and '/', so after this
# step the tag and entity patterns that clean() searches for further down can no longer
# match — confirm whether clean() was meant to run before this.
X_train['Text'] = X_train['Text'].apply(punctuation_removal)
X_train['Summary'] = X_train['Summary'].apply(punctuation_removal)

In [176]:
 #lets make a function to remove Numbers from the Text
def drop_numbers(list_text):
    """Remove digits from text.

    ``list_text`` is iterated item by item (character by character when it is a
    ``str``); every item that contains a digit is dropped and the remaining
    items are joined into a single string.

    Parameters
    ----------
    list_text : str or iterable of str
        The text to filter.

    Returns
    -------
    str
        The concatenation of the items that contain no digit.
    """
    # Raw string: in a normal literal '\d' is an invalid escape sequence, which
    # recent Python versions report with a warning.
    return ''.join(item for item in list_text if not re.search(r'\d', item))

X_train['Text'] = X_train['Text'].apply(drop_numbers)
X_train['Summary'] = X_train['Summary'].apply(drop_numbers)


In [177]:
# lets show the Top 10 Reviews after Removal of Punctuations and Numbers
X_train['Text'].head(10)

In [179]:
X_train['Summary'].head(10)

Data Cleaning

In [180]:
#Remove hyperlinks and markup
def clean(raw):
    """Strip hyperlinks, a handful of HTML tags/entities and newlines from ``raw``."""
    # (regex, replacement) pairs, applied one after another in exactly this order.
    substitutions = (
        ("<[a][^>]*>(.+?)</[a]>", 'Link.'),
        ('&gt;', ""),
        ('&#x27;', "'"),
        ('&quot;', '"'),
        ('&#x2F;', ' '),
        ('<p>', ' '),
        ('</i>', ''),
        ('&#62;', ''),
        ('<i>', ' '),
        ("\n", ''),
    )
    result = raw
    for pattern, replacement in substitutions:
        result = re.sub(pattern, replacement, result)
    return result

In [181]:
from nltk.corpus import stopwords
from nltk import WordNetLemmatizer
nltk.download('stopwords')
from nltk.stem import PorterStemmer
# English stop words as a set; remove_stopword tests membership against it.
stop=set(stopwords.words("english"))
# NOTE(review): `stemmer` and `lemma` are created here but not referenced by any other
# cell visible in this notebook (Stemming builds its own SnowballStemmer).
stemmer=PorterStemmer()
lemma=WordNetLemmatizer()
#Removing stopwords
def remove_stopword(text):
    """Lower-case the whitespace-separated words of ``text``, drop those present in the
    module-level ``stop`` set, and join the rest with single spaces."""
    lowered_words = (word.lower() for word in text.split())
    return " ".join(word for word in lowered_words if word not in stop)

In [182]:
#Stemming with the Snowball stemmer (no lemmatizing is done here)
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
# Built once at definition time: the function is applied to every review, and
# constructing a new stemmer per call is wasted work.
_SNOWBALL_STEMMER = SnowballStemmer('english')


def Stemming(text):
    """Tokenize ``text`` with NLTK and return its Snowball-stemmed tokens joined by spaces.

    Parameters
    ----------
    text : str
        The text to stem.

    Returns
    -------
    str
        The stemmed tokens separated by single spaces.
    """
    word_tokens = nltk.word_tokenize(text)
    return ' '.join(_SNOWBALL_STEMMER.stem(word) for word in word_tokens)

In [183]:
#clean and remove emoji
def deEmojify(x):
    """Return ``x`` with every character in the listed emoji code-point ranges removed."""
    emoji_ranges = (
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
    )
    emoji_re = re.compile("[" + emoji_ranges + "]+", flags=re.UNICODE)
    return emoji_re.sub('', x)

In [184]:
#clean unnecessary whitespaces
def unify_whitespaces(x):
    """Collapse each run of one or more space characters in ``x`` into a single space."""
    space_run = re.compile(' +')
    return space_run.sub(' ', x)

In [185]:
#clean and remove symbols
def remove_symbols(x):
    """Replace each run of characters other than ASCII letters, digits and ``?!.,``
    with a single space."""
    disallowed_run = re.compile(r"[^a-zA-Z0-9?!.,]+")
    return disallowed_run.sub(' ', x)

In [186]:
def cleaning(df,review):
    """Run the text-cleaning pipeline on column ``review`` of ``df`` and write the
    result back into that column. Nothing is returned.

    Order of steps: clean, deEmojify, lower-casing, remove_symbols, remove_stopword,
    unify_whitespaces, Stemming.
    """
    column = df[review].apply(clean).apply(deEmojify).str.lower()
    for step in (remove_symbols, remove_stopword, unify_whitespaces, Stemming):
        column = column.apply(step)
    df[review] = column

In [187]:
# Clean both text columns. cleaning() returns nothing and overwrites the column
# on X_train itself.
cleaning(X_train,'Text')
cleaning(X_train,'Summary')

Analysis and Visualization:-Text and Summary

Wordclouds For all Text and Summary

In [190]:
# Word cloud built from all cleaned review texts joined into one string.
txt = ' '.join(X_train['Text'])
plt.figure(figsize=(15,8))

wordcloud = WordCloud(
    background_color='black',
    max_font_size=100,
    max_words=100,
    width=1000,
    height=600,
).generate(txt)

plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [189]:
# Word cloud built from all cleaned review summaries joined into one string.
txt = ' '.join(X_train['Summary'])
plt.figure(figsize=(15,8))

cloud_options = {
    'background_color': 'black',
    'max_font_size': 100,
    'max_words': 100,
    'width': 1000,
    'height': 600,
}
wordcloud = WordCloud(**cloud_options).generate(txt)

plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [191]:
X_train[['Text','Summary']].head()

In [192]:
X_train.head()

Split data into Training and Testing sets

In [195]:
# random split train and test
# One uniform draw in [0, 1) per row, so `<= 0.8` keeps about 80 % of the rows for
# training. (np.random.randn draws from a standard normal, for which 0.8 is not an
# 80 % cut.) The fixed seed makes the split identical on every run.
index = X_train.index
split_rng = np.random.default_rng(42)
X_train['random_number'] = split_rng.random(len(index))
train = X_train[X_train['random_number'] <= 0.8]
test = X_train[X_train['random_number'] > 0.8]

In [196]:
train.shape

In [197]:
test.shape

In [198]:
# count vectorizer:
# Bag-of-words counts of the cleaned summaries. The vocabulary is learned from the
# training rows only (fit_transform); the test rows are encoded with that same
# vocabulary (transform).
from sklearn.feature_extraction.text import CountVectorizer
# NOTE(review): this token_pattern presumably also keeps one-character tokens, which
# the scikit-learn default pattern would drop — confirm that this is intended.
vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')
train_matrix = vectorizer.fit_transform(train['Summary'])
test_matrix = vectorizer.transform(test['Summary'])

In [199]:
X_train1 = train_matrix
X_test = test_matrix
y_train = train['Score']
y_test = test['Score']

In [200]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression
# All hyper-parameters are left at their scikit-learn defaults.
# NOTE(review): if fitting reports a ConvergenceWarning, max_iter likely needs raising — confirm on a run.
lr = LogisticRegression()

In [201]:
lr.fit(X_train1,y_train)

In [202]:
# Model Prediction

predictions = lr.predict(X_test)
predictions

Model Evaluation


ROC, AUC, the confusion matrix and accuracy are widely used for evaluating a Logistic Regression model.


All of these metrics are based on comparing the y values predicted by the model with the actual y values of the test set, i.e. `predictions` and `y_test` in this notebook. For any single class there are four possible outcomes when comparing them:

    
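True Positive: the review belongs to the class and the model predicted that class


True Negative: the review does not belong to the class and the model did not predict it


False Positive: the review does not belong to the class but the model predicted it


False Negative: the review belongs to the class but the model did not predict it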

Confusion Matrix

In [203]:
from sklearn import metrics
#confusion matrix
# metrics.plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in 1.2.
# Building the ConfusionMatrixDisplay directly draws the same plot and also works on
# the older releases that still shipped plot_confusion_matrix.
cm_counts = metrics.confusion_matrix(y_test, lr.predict(X_test), labels=lr.classes_)
confusion_matrix = metrics.ConfusionMatrixDisplay(
    confusion_matrix=cm_counts, display_labels=lr.classes_
).plot(cmap="GnBu")
confusion_matrix

In [204]:
#Accuracy calculates the ratio of all correct predictions
print("Accuracy:",metrics.accuracy_score(y_test,predictions))

URL

In [None]:
#https://www.kaggle.com/code/danielbeltsazar/amazon-review-sentiment-analysis/notebook
#https://www.kaggle.com/code/tookiprotvic/netflix-visualizations-recommendation-eda/edit
#https://www.kaggle.com/code/mohamedbakrey/eda-for-amazon-product-review-sentiment-analysis
#https://www.analyticsvidhya.com/blog/2020/03/what-is-multicollinearity/
#https://medium.com/jovianml/exploratory-data-analysis-using-python-a-case-study-78aa34e5922e
