In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('Reviews.csv',nrows=10000,usecols=['Score','Summary','Text'])
df.head()

Unnamed: 0,Score,Summary,Text
0,5,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,1,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,4,"""Delight"" says it all",This is a confection that has been around a fe...
3,2,Cough Medicine,If you are looking for the secret ingredient i...
4,5,Great taffy,Great taffy at a great price. There was a wid...


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Score    10000 non-null  int64 
 1   Summary  10000 non-null  object
 2   Text     10000 non-null  object
dtypes: int64(1), object(2)
memory usage: 234.5+ KB


In [4]:
df.isnull().sum()

Score      0
Summary    0
Text       0
dtype: int64

In [5]:
df['Score'].value_counts(normalize=True)

5    0.6183
4    0.1433
1    0.0932
3    0.0862
2    0.0590
Name: Score, dtype: float64

In [6]:
df['Word_Count'] = df['Text'].apply(lambda x: len(str(x).split()))

In [7]:
df['Char_Count'] = df['Text'].apply(lambda x: len(x))

In [8]:
def get_avg_word(x):
    word_len = 0
    words = x.split()
    for i in words:
        word_len =  word_len + len(i)
    return word_len/len(words)

df['Mean'] = df['Text'].apply(lambda x:get_avg_word(x))

In [9]:
df.head()

Unnamed: 0,Score,Summary,Text,Word_Count,Char_Count,Mean
0,5,Good Quality Dog Food,I have bought several of the Vitality canned d...,48,263,4.479167
1,1,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,31,190,5.16129
2,4,"""Delight"" says it all",This is a confection that has been around a fe...,94,509,4.37234
3,2,Cough Medicine,If you are looking for the secret ingredient i...,41,219,4.317073
4,5,Great taffy,Great taffy at a great price. There was a wid...,27,140,4.111111


#### Stop word count

In [10]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
len(STOP_WORDS)

326

In [11]:
df['Stop_count'] = df['Text'].apply(lambda x: len([t for t in x.split() if t in STOP_WORDS]))
df.head()

Unnamed: 0,Score,Summary,Text,Word_Count,Char_Count,Mean,Stop_count
0,5,Good Quality Dog Food,I have bought several of the Vitality canned d...,48,263,4.479167,22
1,1,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,31,190,5.16129,12
2,4,"""Delight"" says it all",This is a confection that has been around a fe...,94,509,4.37234,43
3,2,Cough Medicine,If you are looking for the secret ingredient i...,41,219,4.317073,16
4,5,Great taffy,Great taffy at a great price. There was a wid...,27,140,4.111111,12


In [12]:
df['#tags_count'] = df['Text'].apply(lambda x: len([t for t in x.split() if t.startswith('#')])) 
df['@tags_count'] = df['Text'].apply(lambda x: len([t for t in x.split() if t.startswith('@')]))
df['Number_count'] = df['Text'].apply(lambda x: len([t for t in x.split() if t.isdigit]))

#### tokenization

In [13]:
from nltk.tokenize import word_tokenize, blankline_tokenize
df['Tokens'] = df['Text'].apply(word_tokenize)
df['blank_Tokens'] = df['Text'].apply(blankline_tokenize)

In [14]:
df.head()

Unnamed: 0,Score,Summary,Text,Word_Count,Char_Count,Mean,Stop_count,#tags_count,@tags_count,Number_count,Tokens,blank_Tokens
0,5,Good Quality Dog Food,I have bought several of the Vitality canned d...,48,263,4.479167,22,0,0,48,"[I, have, bought, several, of, the, Vitality, ...",[I have bought several of the Vitality canned ...
1,1,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,31,190,5.16129,12,0,0,31,"[Product, arrived, labeled, as, Jumbo, Salted,...",[Product arrived labeled as Jumbo Salted Peanu...
2,4,"""Delight"" says it all",This is a confection that has been around a fe...,94,509,4.37234,43,0,0,94,"[This, is, a, confection, that, has, been, aro...",[This is a confection that has been around a f...
3,2,Cough Medicine,If you are looking for the secret ingredient i...,41,219,4.317073,16,0,0,41,"[If, you, are, looking, for, the, secret, ingr...",[If you are looking for the secret ingredient ...
4,5,Great taffy,Great taffy at a great price. There was a wid...,27,140,4.111111,12,0,0,27,"[Great, taffy, at, a, great, price, ., There, ...",[Great taffy at a great price. There was a wi...


In [15]:
from nltk.probability import FreqDist
df['Probs'] = df['Tokens'].apply(FreqDist)

In [16]:
df.head()

Unnamed: 0,Score,Summary,Text,Word_Count,Char_Count,Mean,Stop_count,#tags_count,@tags_count,Number_count,Tokens,blank_Tokens,Probs
0,5,Good Quality Dog Food,I have bought several of the Vitality canned d...,48,263,4.479167,22,0,0,48,"[I, have, bought, several, of, the, Vitality, ...",[I have bought several of the Vitality canned ...,"{'I': 1, 'have': 2, 'bought': 1, 'several': 1,..."
1,1,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,31,190,5.16129,12,0,0,31,"[Product, arrived, labeled, as, Jumbo, Salted,...",[Product arrived labeled as Jumbo Salted Peanu...,"{'Product': 1, 'arrived': 1, 'labeled': 1, 'as..."
2,4,"""Delight"" says it all",This is a confection that has been around a fe...,94,509,4.37234,43,0,0,94,"[This, is, a, confection, that, has, been, aro...",[This is a confection that has been around a f...,"{'This': 1, 'is': 5, 'a': 4, 'confection': 1, ..."
3,2,Cough Medicine,If you are looking for the secret ingredient i...,41,219,4.317073,16,0,0,41,"[If, you, are, looking, for, the, secret, ingr...",[If you are looking for the secret ingredient ...,"{'If': 1, 'you': 1, 'are': 1, 'looking': 1, 'f..."
4,5,Great taffy,Great taffy at a great price. There was a wid...,27,140,4.111111,12,0,0,27,"[Great, taffy, at, a, great, price, ., There, ...",[Great taffy at a great price. There was a wi...,"{'Great': 1, 'taffy': 3, 'at': 1, 'a': 4, 'gre..."


In [17]:
from nltk.util import bigrams,trigrams, ngrams

In [18]:
df.head()

Unnamed: 0,Score,Summary,Text,Word_Count,Char_Count,Mean,Stop_count,#tags_count,@tags_count,Number_count,Tokens,blank_Tokens,Probs
0,5,Good Quality Dog Food,I have bought several of the Vitality canned d...,48,263,4.479167,22,0,0,48,"[I, have, bought, several, of, the, Vitality, ...",[I have bought several of the Vitality canned ...,"{'I': 1, 'have': 2, 'bought': 1, 'several': 1,..."
1,1,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,31,190,5.16129,12,0,0,31,"[Product, arrived, labeled, as, Jumbo, Salted,...",[Product arrived labeled as Jumbo Salted Peanu...,"{'Product': 1, 'arrived': 1, 'labeled': 1, 'as..."
2,4,"""Delight"" says it all",This is a confection that has been around a fe...,94,509,4.37234,43,0,0,94,"[This, is, a, confection, that, has, been, aro...",[This is a confection that has been around a f...,"{'This': 1, 'is': 5, 'a': 4, 'confection': 1, ..."
3,2,Cough Medicine,If you are looking for the secret ingredient i...,41,219,4.317073,16,0,0,41,"[If, you, are, looking, for, the, secret, ingr...",[If you are looking for the secret ingredient ...,"{'If': 1, 'you': 1, 'are': 1, 'looking': 1, 'f..."
4,5,Great taffy,Great taffy at a great price. There was a wid...,27,140,4.111111,12,0,0,27,"[Great, taffy, at, a, great, price, ., There, ...",[Great taffy at a great price. There was a wi...,"{'Great': 1, 'taffy': 3, 'at': 1, 'a': 4, 'gre..."


In [19]:
df.head()

Unnamed: 0,Score,Summary,Text,Word_Count,Char_Count,Mean,Stop_count,#tags_count,@tags_count,Number_count,Tokens,blank_Tokens,Probs
0,5,Good Quality Dog Food,I have bought several of the Vitality canned d...,48,263,4.479167,22,0,0,48,"[I, have, bought, several, of, the, Vitality, ...",[I have bought several of the Vitality canned ...,"{'I': 1, 'have': 2, 'bought': 1, 'several': 1,..."
1,1,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,31,190,5.16129,12,0,0,31,"[Product, arrived, labeled, as, Jumbo, Salted,...",[Product arrived labeled as Jumbo Salted Peanu...,"{'Product': 1, 'arrived': 1, 'labeled': 1, 'as..."
2,4,"""Delight"" says it all",This is a confection that has been around a fe...,94,509,4.37234,43,0,0,94,"[This, is, a, confection, that, has, been, aro...",[This is a confection that has been around a f...,"{'This': 1, 'is': 5, 'a': 4, 'confection': 1, ..."
3,2,Cough Medicine,If you are looking for the secret ingredient i...,41,219,4.317073,16,0,0,41,"[If, you, are, looking, for, the, secret, ingr...",[If you are looking for the secret ingredient ...,"{'If': 1, 'you': 1, 'are': 1, 'looking': 1, 'f..."
4,5,Great taffy,Great taffy at a great price. There was a wid...,27,140,4.111111,12,0,0,27,"[Great, taffy, at, a, great, price, ., There, ...",[Great taffy at a great price. There was a wi...,"{'Great': 1, 'taffy': 3, 'at': 1, 'a': 4, 'gre..."


In [26]:
from copy import deepcopy
data = deepcopy(df[['Text','Score']])
data['Score'] = data['Score'].replace({1:0, 2:0,3:0,4:1,5:1})
data = deepcopy(data.iloc[:1000])

In [27]:
data.head()

Unnamed: 0,Text,Score
0,I have bought several of the Vitality canned d...,1
1,Product arrived labeled as Jumbo Salted Peanut...,0
2,This is a confection that has been around a fe...,1
3,If you are looking for the secret ingredient i...,0
4,Great taffy at a great price. There was a wid...,1


In [28]:
!pip install texthero

Collecting texthero
  Downloading texthero-1.0.9-py3-none-any.whl (25 kB)
Collecting unidecode>=1.1.1
  Using cached Unidecode-1.1.2-py2.py3-none-any.whl (239 kB)
Collecting plotly>=4.2.0
  Downloading plotly-4.14.3-py2.py3-none-any.whl (13.2 MB)
Collecting retrying>=1.3.3
  Downloading retrying-1.3.3.tar.gz (10 kB)
Building wheels for collected packages: retrying
  Building wheel for retrying (setup.py): started
  Building wheel for retrying (setup.py): finished with status 'done'
  Created wheel for retrying: filename=retrying-1.3.3-py3-none-any.whl size=11435 sha256=d3d0bce75b212ff241692f87810e48d5e7fa6b7a4343cf200d47fbe51415c743
  Stored in directory: c:\users\mayyi\appdata\local\pip\cache\wheels\f9\8d\8d\f6af3f7f9eea3553bc2fe6d53e4b287dad18b06a861ac56ddf
Successfully built retrying
Installing collected packages: unidecode, retrying, plotly, texthero
Successfully installed plotly-4.14.3 retrying-1.3.3 texthero-1.0.9 unidecode-1.1.2


In [29]:
import texthero as hero