In [38]:
import pandas as pd
import nltk

# Loading our datafile as a DataFrame with pandas

df = pd.read_csv('Reviews.csv', usecols=[6,9], header=0)
print(df)

        Score                                               Text
0           5  I have bought several of the Vitality canned d...
1           1  Product arrived labeled as Jumbo Salted Peanut...
2           4  This is a confection that has been around a fe...
3           2  If you are looking for the secret ingredient i...
4           5  Great taffy at a great price.  There was a wid...
...       ...                                                ...
568449      5  Great for sesame chicken..this is a good if no...
568450      2  I'm disappointed with the flavor. The chocolat...
568451      5  These stars are small, so you can give 10-15 o...
568452      5  These are the BEST treats for training and rew...
568453      5  I am very satisfied ,product is as advertised,...

[568454 rows x 2 columns]


In [28]:
# PoS-tagging our dataset


tagged = []
reduced_df = df[:1000] # Using now the first 1000 reviews to be able to use them

for index, item in reduced_df.iterrows():
    score = item[0]
    review = item[1]
    review = review.split()
    rev = []
    for i in review:
        rev += nltk.pos_tag([i], tagset="universal")
    tagged.append(rev)


In [29]:
import pickle

with open('tagged_reviews.txt', 'wb') as writer:
    pickle.dump(tagged, writer)

In [33]:
with open('tagged_reviews.txt', 'rb') as file:
    reviews_PoSTagged = pickle.load(file)
    
print(reviews_PoSTagged[0])

[('I', 'PRON'), ('have', 'VERB'), ('bought', 'NOUN'), ('several', 'ADJ'), ('of', 'ADP'), ('the', 'DET'), ('Vitality', 'NOUN'), ('canned', 'VERB'), ('dog', 'NOUN'), ('food', 'NOUN'), ('products', 'NOUN'), ('and', 'CONJ'), ('have', 'VERB'), ('found', 'NOUN'), ('them', 'PRON'), ('all', 'DET'), ('to', 'PRT'), ('be', 'VERB'), ('of', 'ADP'), ('good', 'ADJ'), ('quality.', 'NOUN'), ('The', 'DET'), ('product', 'NOUN'), ('looks', 'NOUN'), ('more', 'ADV'), ('like', 'ADP'), ('a', 'DET'), ('stew', 'NOUN'), ('than', 'ADP'), ('a', 'DET'), ('processed', 'VERB'), ('meat', 'NOUN'), ('and', 'CONJ'), ('it', 'PRON'), ('smells', 'NOUN'), ('better.', 'NOUN'), ('My', 'PRON'), ('Labrador', 'NOUN'), ('is', 'VERB'), ('finicky', 'NOUN'), ('and', 'CONJ'), ('she', 'PRON'), ('appreciates', 'NOUN'), ('this', 'DET'), ('product', 'NOUN'), ('better', 'ADV'), ('than', 'ADP'), ('most.', 'NOUN')]


In [34]:
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords
# Lemmatizing our dataset + filtering for stopwords (except negation)


stopwords = set(stopwords.words('english'))

un2wn_mapping = {"VERB" : wn.VERB, "NOUN" : wn.NOUN, "ADJ" : wn.ADJ, "ADV" : wn.ADV}

lemmatized = []
for rev in tagged:
    lem_rev = []
    for tup in rev:
        w, tag = tup
        
        if len(w) > 1 and w.isalpha():
            w = w.lower()
        else:
            continue
        
        if tag in [".", "X"]:
            continue
        elif w.lower() in stopwords:
            if w.lower() in ["not", "t", "no"]:  
                lemma = w.lower()
                tag = "NEGATION"
                lem_rev.append("-".join([lemma, tag]))
            else: 
                continue
        
        elif tag in un2wn_mapping.keys():
            lemma = nltk.WordNetLemmatizer().lemmatize(w, pos = un2wn_mapping[tag])
        else:
            lemma = nltk.WordNetLemmatizer().lemmatize(w)
        lem_rev.append("-".join([lemma, tag]))
    lemmatized.append(lem_rev)

In [36]:
print(lemmatized)

[['bought-NOUN', 'several-ADJ', 'vitality-NOUN', 'can-VERB', 'dog-NOUN', 'food-NOUN', 'product-NOUN', 'found-NOUN', 'good-ADJ', 'product-NOUN', 'look-NOUN', 'like-ADP', 'stew-NOUN', 'process-VERB', 'meat-NOUN', 'smell-NOUN', 'labrador-NOUN', 'finicky-NOUN', 'appreciates-NOUN', 'product-NOUN', 'well-ADV'], ['product-NOUN', 'arrive-VERB', 'label-VERB', 'jumbo-ADJ', 'salt-VERB', 'peanut-NOUN', 'actually-ADV', 'small-ADJ', 'size-VERB', 'not-NEGATION', 'not-NEGATION', 'sure-NOUN', 'error-NOUN', 'vendor-NOUN', 'intend-VERB', 'represent-NOUN', 'product-NOUN'], ['confection-NOUN', 'around-ADP', 'pillowy-NOUN', 'citrus-NOUN', 'gelatin-NOUN', 'nut-NOUN', 'case-NOUN', 'cut-NOUN', 'tiny-ADJ', 'square-NOUN', 'liberally-ADV', 'coat-VERB', 'powder-VERB', 'tiny-ADJ', 'mouthful-NOUN', 'not-NEGATION', 'not-NEGATION', 'highly-ADV', 'recommend-NOUN', 'yummy-NOUN', 'familiar-ADJ', 'story-NOUN', 'treat-NOUN', 'seduces-NOUN', 'edmund-NOUN', 'sell-VERB', 'brother-NOUN', 'sister-NOUN'], ['look-VERB', 'secret-N

In [37]:
import pickle

#save lemmatized text to a pickle file
pickle.dump(lemmatized, open("lemmatized.txt", "wb"))
