In [None]:
pip install textblob

In [2]:
import pandas
from textblob import TextBlob

In [3]:
dataset = pandas.read_csv("data/cleaned_amazon_alexa.csv")
dataset

Unnamed: 0.1,Unnamed: 0,rating,date,variation,verified_reviews,feedback
0,0,5,2018-07-31,Charcoal Fabric,Love my Echo!,1
1,1,5,2018-07-31,Charcoal Fabric,Loved it!,1
2,2,4,2018-07-31,Walnut Finish,"Sometimes while playing a game, you can answer...",1
3,3,5,2018-07-31,Charcoal Fabric,I have had a lot of fun with this thing. My 4 ...,1
4,4,5,2018-07-31,Charcoal Fabric,Music,1
...,...,...,...,...,...,...
3145,3145,5,2018-07-30,Black Dot,"Perfect for kids, adults and everyone in betwe...",1
3146,3146,5,2018-07-30,Black Dot,"Listening to music, searching locations, check...",1
3147,3147,5,2018-07-30,Black Dot,"I do love these things, i have them running my...",1
3148,3148,5,2018-07-30,White Dot,Only complaint I have is that the sound qualit...,1


In [4]:
# In case you have a problem reading your dataset , check its encoding

# import chardet
# with open("McDonalds-Yelp-Sentiment-DFE.csv", 'rb') as rawdata:
#     result = chardet.detect(rawdata.read(100000))
# print(result)
# dataset = pandas.read_csv("McDonalds-Yelp-Sentiment-DFE.csv",encoding="Windows-1252")

**Length**

In [5]:
dataset["length"] = dataset["verified_reviews"].apply(len)
dataset["length"][:10]

0     13
1      9
2    195
3    172
4      5
5    172
6    365
7    221
8     11
9    114
Name: length, dtype: int64

**Polarity and Subjectivity**

In [6]:
def get_polarity(text):
    textblob = TextBlob(str(text.encode("utf-8")))
    pol = textblob.sentiment.polarity
    return pol

In [7]:
def get_subjectivity(text):
    textblob = TextBlob(str(text.encode("utf-8")))
    pol = textblob.sentiment.subjectivity
    return pol

In [8]:
dataset["Polarity"] = dataset["verified_reviews"].apply(get_polarity)
dataset["Subjectivity"] = dataset["verified_reviews"].apply(get_subjectivity)

print(dataset[['length', 'Polarity', 'Subjectivity']].describe())

            length     Polarity  Subjectivity
count  3150.000000  3150.000000   3150.000000
mean    132.049524     0.349792      0.528922
std     182.099952     0.303362      0.256324
min       1.000000    -1.000000      0.000000
25%      30.000000     0.123852      0.419196
50%      74.000000     0.350000      0.585000
75%     165.000000     0.533333      0.695486
max    2851.000000     1.000000      1.000000


In [9]:
dataset.head()

Unnamed: 0.1,Unnamed: 0,rating,date,variation,verified_reviews,feedback,length,Polarity,Subjectivity
0,0,5,2018-07-31,Charcoal Fabric,Love my Echo!,1,13,0.625,0.6
1,1,5,2018-07-31,Charcoal Fabric,Loved it!,1,9,0.875,0.8
2,2,4,2018-07-31,Walnut Finish,"Sometimes while playing a game, you can answer...",1,195,-0.1,0.5125
3,3,5,2018-07-31,Charcoal Fabric,I have had a lot of fun with this thing. My 4 ...,1,172,0.35,0.45
4,4,5,2018-07-31,Charcoal Fabric,Music,1,5,0.0,0.0


**Words , Characters and punctuation**

In [10]:
dataset["charsCount"] = dataset["verified_reviews"].apply(len)
dataset["wordCount"] = dataset["verified_reviews"].apply(lambda x: len(x.split()))
dataset["word_density"] = dataset["charsCount"] / (dataset["wordCount"]+1)
print(dataset[['charsCount', 'wordCount', 'word_density']].describe())

        charsCount    wordCount  word_density
count  3150.000000  3150.000000   3150.000000
mean    132.049524    25.293016      4.605345
std     182.099952    34.584971      1.134737
min       1.000000     0.000000      0.500000
25%      30.000000     6.000000      4.269231
50%      74.000000    14.000000      4.805665
75%     165.000000    32.000000      5.207974
max    2851.000000   526.000000     32.500000


**Nouns , Verbs , pronoun , Adverbs , Adj**

In [27]:
def get_nouns(text):
    list_of_nouns=[]
    textblob = TextBlob(str(text.encode("utf-8")))
    tags = textblob.tags
    for i in tags:
        if i[1] =="NN":
            list_of_nouns.append(i[0])
    return list_of_nouns

In [28]:
def get_verbs(text):
    list_of_verbs=[]
    textblob = TextBlob(str(text.encode("utf-8")))
    tags = textblob.tags
    for i in tags:
        if "VB" in i[1]:
            list_of_verbs.append(i[0])
    return list_of_verbs

In [29]:
def get_pronouns(text):
    list_of_pronouns=[]
    textblob = TextBlob(str(text.encode("utf-8")))
    tags = textblob.tags
    for i in tags:
        if i[1] =="WP":
            list_of_pronouns.append(i[0])
    return list_of_pronouns

In [30]:
def get_adverbs(text):
    list_of_adverbs=[]
    textblob = TextBlob(str(text.encode("utf-8")))
    tags = textblob.tags
    for i in tags:
        if "RB" in i[1]:
            list_of_adverbs.append(i[0])
    return list_of_adverbs

In [31]:
def get_adj(text):
    list_of_adj=[]
    textblob = TextBlob(str(text.encode("utf-8")))
    tags = textblob.tags
    for i in tags:
        if "JJ" in i[1]:
            list_of_adj.append(i[0])
    return list_of_adj

In [32]:
dataset["nouns"] = dataset["verified_reviews"].apply(lambda x: get_nouns(x))
dataset["verbs"] = dataset["verified_reviews"].apply(lambda x: get_verbs(x))
dataset["pronouns"] = dataset["verified_reviews"].apply(lambda x: get_pronouns(x))
dataset["adverbs"] = dataset["verified_reviews"].apply(lambda x: get_adverbs(x))
dataset["adj"] = dataset["verified_reviews"].apply(lambda x: get_adj(x))
dataset[["verified_reviews","nouns" , "verbs","pronouns","adverbs","adj"]]

Unnamed: 0,verified_reviews,nouns,verbs,pronouns,adverbs,adj
0,Love my Echo!,[],[b'Love],[],[],[]
1,Loved it!,[],[b'Loved],[],[],[]
2,"Sometimes while playing a game, you can answer...","[game, question, home]","[playing, answer, says, got, answers, like, be...",[],"[correctly, away]","[wrong, same, able]"
3,I have had a lot of fun with this thing. My 4 ...,"[b, lot, fun, thing, music]","[have, had, control, play, sound, playing]",[],"[yr, nice, when, as, well]",[old]
4,Music,[b'Music],[],[],[],[]
...,...,...,...,...,...,...
3145,"Perfect for kids, adults and everyone in betwe...","[b'Perfect, everyone]",[],[],[],[]
3146,"Listening to music, searching locations, check...","[music, time, weather, task]","[b'Listening, searching, checking, looking, ar...",[],[],"[many, more, simple]"
3147,"I do love these things, i have them running my...","[b, home, TV, thermostat, door, bolt, value, h...","[do, love, have, running, i, love, listening, ...",[what],"[well, really, once]","[entire, front, dead, smart, entire, amazing, ..."
3148,Only complaint I have is that the sound qualit...,"[b, complaint, sound, quality, dot, stereo, sy...","[have, is, is, use, have, hooked, play, have, ...",[],"[Only, n't, mostly, up, n't]","[great, audio, full, sure, larger]"
