In [35]:
import pandas as pd
import nltk
from sklearn.utils import shuffle

# Loading our datafile as a DataFrame with pandas.
# usecols=[6, 9] picks the columns by position; the printed frame below shows
# that these come out as "Score" and "Text".
df = pd.read_csv('Reviewss.csv', usecols=[6,9], header=0)

# Work on the first 10,000 rows only, in shuffled order. A fixed random_state
# makes the shuffle (and every split / pickle derived from it) reproducible
# across kernel restarts.
shuf_red_df = shuffle(df.iloc[:10000], random_state=42)
print(shuf_red_df)

      Score                                               Text
7414      1  I am the biggest fan of mushrooms you'll ever ...
275       4  Can't say anything bad about Sugar in the Raw....
6480      5  My daughter is very picky, but she like this c...
8226      1  My one year old cat ate a few of these and see...
5325      5  I bought this product because my dogs were hav...
...     ...                                                ...
4037      5  I found out about these from a co-worker. They...
1285      5  If there is one thing our dog likes, it is the...
3921      4  This is good stuff, and hard to find in the lo...
6645      2  I was surprised at the ingredient list and "10...
9725      2  I tried the Frooties mBerry and the effect las...

[10000 rows x 2 columns]


In [45]:
# PoS-tagging our dataset
def tag_reviews(frame):
    """Return ``[[(token, universal_tag), ...], score]`` for every row of ``frame``.

    ``frame`` is expected to hold the score in its first column and the review
    text in its second column (the layout of ``shuf_red_df``). Each review is
    split on whitespace and the resulting token list is tagged in a single
    ``nltk.pos_tag`` call, so the tagger can use the neighbouring words as
    context instead of seeing every word in isolation.
    """
    tagged = []
    # itertuples yields plain (score, text) tuples in column order, avoiding
    # integer-key lookups on a label-indexed row Series.
    for score, text in frame.itertuples(index=False, name=None):
        tokens = text.split()
        tagged.append([nltk.pos_tag(tokens, tagset="universal"), score])
    return tagged


# 80/20 split of the shuffled rows. The test rows start where the training
# rows end, so no review appears in both sets.
train_df = shuf_red_df.iloc[:8000]
test_df = shuf_red_df.iloc[8000:]

train_tagged = tag_reviews(train_df)
test_tagged = tag_reviews(test_df)

In [46]:
import pickle

# Persist both tagged splits with pickle, one file per split.
for target_path, tagged_reviews in (
    ('train_tagged_reviews.txt', train_tagged),
    ('test_tagged_reviews.txt', test_tagged),
):
    with open(target_path, 'wb') as sink:
        pickle.dump(tagged_reviews, sink)

In [64]:
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords
# Lemmatizing our dataset + filtering for stopwords (except negation)
# One bucket of lemmatized reviews per star rating (1-5).
training_dicts = dict((k,[]) for k in [1,2,3,4,5])
test_dicts = dict((k,[]) for k in [1,2,3,4,5])

# NOTE: this rebinds the name of the imported corpus reader to a plain set of
# English stopwords; the code below (and later cells) use it as a set.
stopwords = set(stopwords.words('english'))

# Universal PoS tags that have a WordNet counterpart; any other tag falls back
# to the lemmatizer's default part of speech.
un2wn_mapping = {"VERB" : wn.VERB, "NOUN" : wn.NOUN, "ADJ" : wn.ADJ, "ADV" : wn.ADV}

lemmatized = []

# Build the lemmatizer once instead of once per token.
lemmatizer = nltk.WordNetLemmatizer()

for rev, score in train_tagged:
    lem_rev = []
    for w, tag in rev:
        # Keep purely alphabetic tokens of at least two characters.
        if len(w) <= 1 or not w.isalpha():
            continue
        w = w.lower()

        # Drop punctuation and "other" tags.
        if tag in [".", "X"]:
            continue

        if w in stopwords:
            # Stopwords are removed unless they carry negation. ("t" can never
            # match here because single-character tokens were filtered above.)
            if w not in ["not", "t", "no"]:
                continue
            lemma = w
            tag = "NEGATION"
        elif tag in un2wn_mapping:
            lemma = lemmatizer.lemmatize(w, pos = un2wn_mapping[tag])
        else:
            lemma = lemmatizer.lemmatize(w)

        # Each kept token is stored exactly once as "<lemma>-<tag>".
        lem_rev.append("-".join([lemma, tag]))
    training_dicts[score].append(lem_rev)

In [65]:
# Lemmatize the test reviews with the same filtering rules as the training set.
# Build the lemmatizer once instead of once per token.
lemmatizer = nltk.WordNetLemmatizer()

for rev, score in test_tagged:
    lem_rev = []
    for w, tag in rev:
        # Keep purely alphabetic tokens of at least two characters.
        if len(w) <= 1 or not w.isalpha():
            continue
        w = w.lower()

        # Drop punctuation and "other" tags.
        if tag in [".", "X"]:
            continue

        if w in stopwords:
            # Stopwords are removed unless they carry negation. ("t" can never
            # match here because single-character tokens were filtered above.)
            if w not in ["not", "t", "no"]:
                continue
            lemma = w
            tag = "NEGATION"
        elif tag in un2wn_mapping:
            lemma = lemmatizer.lemmatize(w, pos = un2wn_mapping[tag])
        else:
            lemma = lemmatizer.lemmatize(w)

        # Single append for every kept token, negations included, so a
        # negation word is not recorded twice.
        lem_rev.append("-".join([lemma, tag]))
    test_dicts[score].append(lem_rev)

In [66]:
# Sanity check on the 3-star bucket: show how many reviews it holds and only
# the first few of them, rather than dumping every review into the output.
print(len(training_dicts[3]))
print(training_dicts[3][:5])



In [9]:
#save lemmatized text to a pickle file
# The context managers guarantee each file is flushed and closed after the
# dump, instead of leaving an open handle behind.
with open("training_dicts.txt", "wb") as out_file:
    pickle.dump(training_dicts, out_file)
with open("test_dicts.txt", "wb") as out_file:
    pickle.dump(test_dicts, out_file)
