In [64]:
import pandas as pd
import re
import string
import nltk
from sklearn.feature_extraction.text import CountVectorizer

In [26]:
data = pd.read_csv('GOT_Reviews.csv', header=None)
data.columns = ['Comments', 'Ratings']

data.head()

Unnamed: 0,Comments,Ratings
0,"Finally, a Show that stays true to it's written roots",9.0
1,Excellent adaptation.,10.0
2,One of the most stunning shows on television,10.0
3,Lord of the Rings - I think not!,10.0
4,Fantasy has never seemed so real,10.0


In [8]:
data.shape

(1070, 2)

In [11]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1070 entries, 0 to 1069
Data columns (total 2 columns):
Comments    1070 non-null object
Ratings     1067 non-null float64
dtypes: float64(1), object(1)
memory usage: 16.8+ KB


Data Cleaning

Step 1 - Removing punctuation

In [28]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [54]:
#Remove punctuation in the string

def remove_punct(text):
    text_nonpunct = "".join([char for char in text if char not in string.punctuation])
    
    return text_nonpunct

data['Comments_clean'] = data['Comments'].apply(lambda x: remove_punct(x))

data.head()

Unnamed: 0,Comments,Ratings,Comments_clean,Comments_tokenized,Comments_nonstop
0,"Finally, a Show that stays true to it's written roots",9.0,Finally a Show that stays true to its written roots,"[, finally, a, show, that, stays, true, to, it, s, written, roots]","[, finally, show, stays, true, written, roots]"
1,Excellent adaptation.,10.0,Excellent adaptation,"[, excellent, adaptation, ]","[, excellent, adaptation, ]"
2,One of the most stunning shows on television,10.0,One of the most stunning shows on television,"[, one, of, the, most, stunning, shows, on, television]","[, one, stunning, shows, television]"
3,Lord of the Rings - I think not!,10.0,Lord of the Rings I think not,"[, lord, of, the, rings, i, think, not, ]","[, lord, rings, think, ]"
4,Fantasy has never seemed so real,10.0,Fantasy has never seemed so real,"[, fantasy, has, never, seemed, so, real]","[, fantasy, never, seemed, real]"


In [55]:
def tokenize(text):
    tokens = re.split('\W+', text)
    return tokens

data['Comments_tokenized'] = data['Comments'].apply(lambda x: tokenize(x.lower()))

data.head()

Unnamed: 0,Comments,Ratings,Comments_clean,Comments_tokenized,Comments_nonstop
0,"Finally, a Show that stays true to it's written roots",9.0,Finally a Show that stays true to its written roots,"[, finally, a, show, that, stays, true, to, it, s, written, roots]","[, finally, show, stays, true, written, roots]"
1,Excellent adaptation.,10.0,Excellent adaptation,"[, excellent, adaptation, ]","[, excellent, adaptation, ]"
2,One of the most stunning shows on television,10.0,One of the most stunning shows on television,"[, one, of, the, most, stunning, shows, on, television]","[, one, stunning, shows, television]"
3,Lord of the Rings - I think not!,10.0,Lord of the Rings I think not,"[, lord, of, the, rings, i, think, not, ]","[, lord, rings, think, ]"
4,Fantasy has never seemed so real,10.0,Fantasy has never seemed so real,"[, fantasy, has, never, seemed, so, real]","[, fantasy, never, seemed, real]"


In [66]:
#Remove stopwords
stopword = nltk.corpus.stopwords.words('english')

In [67]:
def remove_stopwords(tokenized_list):
    text = [word for word in tokenized_list if word not in stopword]
    return text

data['Comments_nonstop'] = data['Comments_tokenized'].apply(lambda x: remove_stopwords(x))

data.head()

Unnamed: 0,Comments,Ratings,Comments_clean,Comments_tokenized,Comments_nonstop,Comments_lemmatized
0,"Finally, a Show that stays true to it's written roots",9.0,Finally a Show that stays true to its written roots,"[, finally, a, show, that, stays, true, to, it, s, written, roots]","[, finally, show, stays, true, written, roots]","[, finally, show, stay, true, written, root]"
1,Excellent adaptation.,10.0,Excellent adaptation,"[, excellent, adaptation, ]","[, excellent, adaptation, ]","[, excellent, adaptation, ]"
2,One of the most stunning shows on television,10.0,One of the most stunning shows on television,"[, one, of, the, most, stunning, shows, on, television]","[, one, stunning, shows, television]","[, one, stunning, show, television]"
3,Lord of the Rings - I think not!,10.0,Lord of the Rings I think not,"[, lord, of, the, rings, i, think, not, ]","[, lord, rings, think, ]","[, lord, ring, think, ]"
4,Fantasy has never seemed so real,10.0,Fantasy has never seemed so real,"[, fantasy, has, never, seemed, so, real]","[, fantasy, never, seemed, real]","[, fantasy, never, seemed, real]"


Lemmatization

In [74]:
wn = nltk.WordNetLemmatizer()
ps = nltk.PorterStemmer()

In [86]:
def lemmatizing(tokenized_text):
    text = [wn.lemmatize(word) for word in tokenized_text]
    return text

data['Comments_lemmatized'] = data['Comments_nonstop'].apply(lambda x: lemmatizing(x))

#data.head(10)

In [76]:
def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [ps.stem(word) for word in tokens if word not in stopword]
    return text

In [85]:
count_vect = CountVectorizer(analyzer=clean_text)

x_counts = count_vect.fit_transform(data['Comments'])
print(x_counts.shape)
#print(count_vect.get_feature_names())

(1070, 1093)


In [80]:
x_counts_df = pd.DataFrame(x_counts.toarray())
x_counts_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1083,1084,1085,1086,1087,1088,1089,1090,1091,1092
0,1,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1065,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1066,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1067,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1068,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [84]:
x_counts_df.columns = count_vect.get_feature_names()
x_counts_df

Unnamed: 0,Unnamed: 1,0,1,10,100,100100,11,110,14,2,...,writer,written,wrong,wrote,ye,year,yet,your,youv,zero
0,1,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1065,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1066,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1067,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1068,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
