In [None]:
import nltk
from nltk.util import ngrams
import pandas as pd
import gensim
import contractions
import pickle

In [None]:
# Load the review data set from the CSV file into a DataFrame.
df = pd.read_csv('unique_data.csv')

In [None]:
# Preview the first few rows of the loaded DataFrame.
df.head()


In [None]:
# Show pandas' default summary statistics for the DataFrame.
df.describe()

In [None]:
def _stars_to_sentiment(stars):
    """Map a star rating to 'positive' (> 3), 'negative' (< 3) or 'neutral' (otherwise)."""
    if stars > 3:
        return 'positive'
    if stars < 3:
        return 'negative'
    return 'neutral'


# Derive the sentiment label of every review from its star rating and store it in a new column.
df['sentiment'] = df['review_stars'].apply(_stars_to_sentiment)

In [None]:
# Preprocess the data
stop_list = nltk.corpus.stopwords.words('english')
# Set copy of the stop words so that the per-token membership test inside the loop is a
# hash lookup instead of a scan of the whole list. `stop_list` itself stays a list.
stop_set = frozenset(stop_list)
lemmatizer = nltk.stem.WordNetLemmatizer()

# Sequence of sentence labels, aligned position-by-position with `corpus`.
labels = []

# The sentences, where each sentence is a list of tokens (unigrams followed by bigrams).
corpus = []

# Walk the two needed columns in parallel. This visits the same values, in the same order,
# as df.iterrows() but does not build a Series object for every row.
for label, text in zip(df['sentiment'], df['review_text']):
    # Store the label into the list of labels.
    labels.append(label)

    # Tokenize the text and lowercase every token.
    sent = [w.lower() for w in nltk.word_tokenize(text)]

    # Drop stop words and any token that is not purely alphanumeric (punctuation etc.).
    sent = [w for w in sent if w not in stop_set and w.isalnum()]

    # Lemmatization
    sent = [lemmatizer.lemmatize(w) for w in sent]

    # Expand contractions.
    # NOTE(review): this runs after the isalnum() filter, so tokens containing an apostrophe
    # have already been removed by this point. Confirm whether expansion was meant to be
    # applied to the raw text before tokenizing; the order is kept as-is here so the
    # resulting features do not change.
    sent = [contractions.fix(w) for w in sent]

    # Append the space-joined bigrams of the cleaned tokens to the token list.
    bigrams = [' '.join(pair) for pair in ngrams(sent, 2)]
    sent.extend(bigrams)

    # Store the sentence into the corpus.
    corpus.append(sent)


print('Finished reading sentences.')



In [49]:
# Build a gensim Dictionary over the tokenized corpus.
dictionary = gensim.corpora.Dictionary(corpus)

# NOTE(review): `count` is not read anywhere in this cell; kept in case another cell uses it.
count = 0

# Pair every label with its sentence; each record has the form [label, tokens].
labeled_data = [[lbl, tokens] for lbl, tokens in zip(labels, corpus)]

print('Finished preparing the data.')


Finished preparing the data.
['positive', ['belgian', 'cafe', 'turned', 'someone', 'would', 'barely', 'look', 'mussel', 'someone', 'actually', 'eat', 'occasion', 'get', 'high', 'mark', 'french', 'fry', 'dipping', 'sauce', 'also', 'praised', 'course', 'serve', 'favorite', 'monk', 'sour', 'flemish', 'ale', 'overall', 'good', 'neighborhood', 'spot', 'meet', 'friend', 'drink', 'dinner', 'take', 'good', 'book', 'treat', 'belgian cafe', 'cafe turned', 'turned someone', 'someone would', 'would barely', 'barely look', 'look mussel', 'mussel someone', 'someone actually', 'actually eat', 'eat occasion', 'occasion get', 'get high', 'high mark', 'mark french', 'french fry', 'fry dipping', 'dipping sauce', 'sauce also', 'also praised', 'praised course', 'course serve', 'serve favorite', 'favorite monk', 'monk sour', 'sour flemish', 'flemish ale', 'ale overall', 'overall good', 'good neighborhood', 'neighborhood spot', 'spot meet', 'meet friend', 'friend drink', 'drink dinner', 'dinner take', 'take 

In [36]:
# Partition the records, in their existing order, into 60% training, 20% validation
# and the remainder (about 20%) test.
n_records = len(labeled_data)
train_size = int(n_records * 0.6)
val_size = int(n_records * 0.2)
val_end = train_size + val_size

train_data, val_data, test_data = (
    labeled_data[:train_size],
    labeled_data[train_size:val_end],
    labeled_data[val_end:],
)

print('Finished splitting the data.')

Finished splitting the data.


In [31]:
import nltk

pos_lexicon = 'positive-words.txt'
neg_lexicon = 'negative-words.txt'

# Read the positive sentiment lexicon: every stripped line becomes a key with value 1.
# The `with` block closes the file even if reading raises part-way through.
# NOTE(review): blank or header lines, if the file has any, also end up as keys - confirm
# against the lexicon file format.
with open(pos_lexicon, 'r', encoding="ISO-8859-1") as f:
    pos_dict = {line.strip(): 1 for line in f}

# Read the negative sentiment lexicon the same way.
with open(neg_lexicon, 'r', encoding="ISO-8859-1") as f:
    neg_dict = {line.strip(): 1 for line in f}

In [55]:
def _lexicon_score(tokens):
    """Return the lexicon score of a token list: +1 per token found in pos_dict,
    -1 per token found in neg_dict (a token in both counts as positive), 0 otherwise."""
    return sum(1 if w in pos_dict else -1 if w in neg_dict else 0 for w in tokens)


def _score_to_label(score):
    """Translate a lexicon score into 'positive' (> 0), 'negative' (< 0) or 'neutral' (0)."""
    if score > 0:
        return 'positive'
    if score < 0:
        return 'negative'
    return 'neutral'


# Predicted label for every test record; record[1] holds the record's token list.
predicted_labels = [_score_to_label(_lexicon_score(record[1])) for record in test_data]
        

In [78]:
from sklearn.metrics import f1_score

# Gold labels: the first element of every test record.
test_data_labels = [record[0] for record in test_data]

# Accuracy = share of test records whose predicted label equals the gold label.
total = len(test_data)
correct = sum(1 for tl, pl in zip(test_data_labels, predicted_labels) if tl == pl)
accu = correct / total
print("Accuracy: ", accu)

# Weighted F1 over the three sentiment classes.
f1 = f1_score(
    test_data_labels,
    predicted_labels,
    labels=["positive", "negative", "neutral"],
    average='weighted',
)

print("F1 Score:", f1)
print()
# Compare how many positive / negative labels were predicted with how many actually occur.
print("Lexicon Positive Reviews: ", predicted_labels.count('positive'))
print("Lexicon Negative Reviews: ", predicted_labels.count('negative'))
print()
print("Actual Positive Reviews: ", test_data_labels.count('positive'))
print("Actual Negative Reviews: ", test_data_labels.count('negative'))

Accuracy:  0.7746781115879828
F1 Score: 0.7273501263941857

Lexicon Positive Reviews:  425
Lexicon Negative Reviews:  25

Actual Positive Reviews:  366
Actual Negative Reviews:  29


Lexicon-based classifier as a reusable function

In [61]:
import nltk

pos_lexicon = 'positive-words.txt'
neg_lexicon = 'negative-words.txt'

# Load the positive lexicon for the classifier function below: each stripped line of the
# file becomes a dict key with value 1. Using `with` guarantees the file is closed even
# when reading fails.
with open(pos_lexicon, 'r', encoding="ISO-8859-1") as f:
    pos_dict = {line.strip(): 1 for line in f}

# Load the negative lexicon in the same way.
with open(neg_lexicon, 'r', encoding="ISO-8859-1") as f:
    neg_dict = {line.strip(): 1 for line in f}

In [64]:
def lexicon(test):
    """Classify a raw review string as 'positive', 'negative' or 'neutral'.

    The text is preprocessed with the same steps, in the same order, as the corpus
    preprocessing loop (tokenize, lowercase, stop-word removal, alphanumeric filter,
    lemmatization, contraction expansion, bigram generation). The resulting tokens are
    then scored against the notebook-level lexicons: +1 for every token found in
    ``pos_dict``, -1 for every token found in ``neg_dict``.

    Relies on these notebook-level names: ``nltk``, ``ngrams``, ``contractions``,
    ``stop_list``, ``lemmatizer``, ``pos_dict`` and ``neg_dict``.

    Args:
        test: The raw review text to classify.

    Returns:
        'positive' if the score is above zero, 'negative' if it is below zero,
        and 'neutral' if it is exactly zero.
    """
    # Tokenize the function argument itself rather than any notebook-level variable.
    sent = nltk.word_tokenize(test)

    # Lowercase conversion, so tokens can match the lexicon entries regardless of case.
    sent = [w.lower() for w in sent]

    # Stop word removal (set copy gives constant-time membership tests).
    stop_words = set(stop_list)
    sent = [w for w in sent if w not in stop_words]

    # Remove punctuation and any other token that is not purely alphanumeric.
    sent = [w for w in sent if w.isalnum()]

    # Lemmatization
    sent = [lemmatizer.lemmatize(w) for w in sent]

    # Expand contractions
    sent = [contractions.fix(w) for w in sent]

    # Create bigrams and append them after the unigrams.
    bigrams = [' '.join(pair) for pair in ngrams(sent, 2)]
    sent.extend(bigrams)

    # Score the preprocessed tokens (not the raw whitespace split of the input, whose
    # casing and attached punctuation would prevent lexicon matches).
    score = 0
    for word in sent:
        if word in pos_dict:
            score += 1
        elif word in neg_dict:
            score -= 1

    if score > 0:
        return 'positive'
    elif score < 0:
        return 'negative'
    else:
        return 'neutral'


In [68]:
# Run the classifier on one sample review string and print the predicted label.
print(lexicon("BEST experience I have ever had at the KT on 6/14/2015.  John, our server, was amazing.  Really awesome.  The food was awesome as well.  I was sooooo happy we went.  Great job guys!!  Keep up the great work.  BTW the burgers are amazing.  Thanks again!"))

positive
