# Sentiment Analysis

By Gurleen, Kyleigh, Yolanda

Used DigitalOcean tutorial to implement sentiment analysis.
https://www.digitalocean.com/community/tutorials/how-to-perform-sentiment-analysis-in-python-3-using-the-natural-language-toolkit-nltk#splitting-the-dataset-for-training-and-testing-the-model

### Extract file with review, run this once!

In [120]:
import tarfile

filename = 'dataset/review_polarity.tar.gz'

# open and extract
with tarfile.open(filename, 'r:gz') as tar:
    tar.extractall(path='dataset')

  tar.extractall(path='dataset')


### Installing NLTK

In [91]:
import nltk
import re, string
import random

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import FreqDist
from nltk import classify
from nltk import NaiveBayesClassifier
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\gurle\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\gurle\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\gurle\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gurle\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Downloading the Data

In [92]:
import os

def load_reviews(pos_folder, neg_folder):
    texts, labels = [], []

    # Load positive reviews
    for file in os.listdir(pos_folder):
        if file.endswith('.txt'):
            with open(os.path.join(pos_folder, file), 'r', encoding='utf-8') as f:
                texts.append(f.read())
                labels.append(1)  # Positive label

    # Load negative reviews
    for file in os.listdir(neg_folder):
        if file.endswith('.txt'):
            with open(os.path.join(neg_folder, file), 'r', encoding='utf-8') as f:
                texts.append(f.read())
                labels.append(0)  # Negative label

    return texts, labels

pos_folder_path = 'dataset/txt_sentoken/pos' 
neg_folder_path = 'dataset/txt_sentoken/neg' 

texts, labels = load_reviews(pos_folder_path, neg_folder_path)


### Tokenizing the Data

In [93]:
text_tokens = [word_tokenize(text.lower()) for text in texts]

### Normalizing the Data

In [95]:
# stemming (lemmatization)
def lemmatize_sentence(tokens):
    lemmatizer = WordNetLemmatizer()
    lemmatized_sentence = []
    for word, tag in pos_tag(tokens):
        if tag.startswith('NN'):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
             pos = 'a'
        lemmatized_sentence.append(lemmatizer.lemmatize(word, pos))
    return lemmatized_sentence

### Removing Noise from the Data (using regular expressions)

In [103]:
def remove_noise(tokens, stop_words = ()):
    cleaned_tokens = []

    for token, tag in pos_tag(tokens):
        # Remove URLs
        token = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', token)
        # Remove mentions
        token = re.sub(r"(@[A-Za-z0-9_]+)", "", token)

        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'

        lemmatizer = WordNetLemmatizer()
        token = lemmatizer.lemmatize(token, pos)

        if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_words:
            cleaned_tokens.append(token.lower())

    return cleaned_tokens

stop_words = stopwords.words('english')

# clean the tokenized movie reviews
cleaned_tokens_list = [remove_noise(tokens, stop_words) for tokens in text_tokens]

# print a few cleaned tokens to verify
print(cleaned_tokens_list[:3]) 

[['film', 'adapt', 'comic', 'book', 'plenty', 'success', 'whether', "'re", 'superheroes', 'batman', 'superman', 'spawn', 'gear', 'toward', 'kid', 'casper', 'arthouse', 'crowd', 'ghost', 'world', "'s", 'never', 'really', 'comic', 'book', 'like', 'hell', 'starter', 'create', 'alan', 'moore', 'eddie', 'campbell', 'bring', 'medium', 'whole', 'new', 'level', 'mid', "'80s", '12-part', 'series', 'call', 'watchman', 'say', 'moore', 'campbell', 'thoroughly', 'research', 'subject', 'jack', 'ripper', 'would', 'like', 'say', 'michael', 'jackson', 'start', 'look', 'little', 'odd', 'book', '``', 'graphic', 'novel', '``', '500', 'page', 'long', 'include', 'nearly', '30', 'consist', 'nothing', 'footnote', 'word', "n't", 'dismiss', 'film', 'source', 'get', 'past', 'whole', 'comic', 'book', 'thing', 'might', 'find', 'another', 'stumble', 'block', 'hell', "'s", 'director', 'albert', 'allen', 'hughes', 'get', 'hughes', 'brother', 'direct', 'seem', 'almost', 'ludicrous', 'cast', 'carrot', 'top', 'well', 'a

### Preparing Data for the Model

In [118]:
# convertings tokens to a dictionary

def get_reviews_for_model(cleaned_tokens_list):
    for tokens in cleaned_tokens_list:
        yield dict([token, True] for token in tokens)

review_tokens_for_model = get_reviews_for_model(cleaned_tokens_list)

# splitting the dataset for training and testing the model
dataset = [(review_dict, "Positive" if label == 1 else "Negative")
           for review_dict, label in zip(review_tokens_for_model, labels)]

random.shuffle(dataset)

split_index = int(0.8 * len(dataset)) # 80% for training, 20% for testing
train_data = dataset[:split_index]
test_data = dataset[split_index:]

### Building and Testing the Model

In [119]:
classifier = NaiveBayesClassifier.train(train_data)
print("Accuracy is:", classify.accuracy(classifier, test_data))
print(classifier.show_most_informative_features(10))

Accuracy is: 0.7575
Most Informative Features
             outstanding = True           Positi : Negati =     14.2 : 1.0
               ludicrous = True           Negati : Positi =     13.0 : 1.0
                    plod = True           Negati : Positi =     11.6 : 1.0
                 b-movie = True           Negati : Positi =     11.0 : 1.0
                   poker = True           Positi : Negati =     10.4 : 1.0
              degenerate = True           Negati : Positi =     10.3 : 1.0
               stupidity = True           Negati : Positi =     10.2 : 1.0
                  turkey = True           Negati : Positi =      9.8 : 1.0
            exploitation = True           Negati : Positi =      9.6 : 1.0
               insulting = True           Negati : Positi =      9.6 : 1.0
None
