In [16]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\liuxi\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


In [23]:
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\liuxi\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [20]:
import re

# Step 1: Load Data

In [6]:
def load_data(filepath):
    review_list, label_list = [], []
    with open(filepath, "r") as fin:
        for line in fin:
            sample = line.strip().split("\t")
            label = sample[-1]
            input_text = " ".join(sample[:-1])
            review_list.append(input_text)
            label_list.append(label)
        return review_list, label_list

In [12]:
train_data_path = "./Downloads/train.tsv"
train_text, train_label = load_data(train_data_path)
print(train_text[:10], train_label[:10])

["The Rock is destined to be the 21st Century 's new `` Conan '' and that he 's going to make a splash even greater than Arnold Schwarzenegger , Jean-Claud Van Damme or Steven Segal .", "The gorgeously elaborate continuation of `` The Lord of the Rings '' trilogy is so huge that a column of words can not adequately describe co-writer\\/director Peter Jackson 's expanded vision of J.R.R. Tolkien 's Middle-earth .", 'Singer\\/composer Bryan Adams contributes a slew of songs -- a few potential hits , a few more simply intrusive to the story -- but the whole package certainly captures the intended , er , spirit of the piece .', 'Yet the act is still charming here .', "Whether or not you 're enlightened by any of Derrida 's lectures on `` the other '' and `` the self , '' Derrida is an undeniably fascinating and playful fellow .", 'Just the labour involved in creating the layered richness of the imagery in this chiaroscuro of madness and light is astonishing .', 'Part of the charm of Satin 

# Step 2: Extract Features

In [30]:
nltk.download('opinion_lexicon')
from nltk.corpus import opinion_lexicon

[nltk_data] Downloading package opinion_lexicon to
[nltk_data]     C:\Users\liuxi\AppData\Roaming\nltk_data...
[nltk_data]   Package opinion_lexicon is already up-to-date!


In [32]:
pos_list=set(opinion_lexicon.positive())
neg_list=set(opinion_lexicon.negative())
print(len(pos_list), len(neg_list))

2006 4783


In [48]:
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.stem import WordNetLemmatizer
ps = WordNetLemmatizer()

rocks : rock
corpora : corpora
greater : greater


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\liuxi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\liuxi\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [58]:
def extract_features(sentence):
    # Text preprocessing, lowercase, removing stop words, lemmatization etc.
    sentence = re.sub(r'\W+', ' ', sentence)
    sentence = sentence.lower()
    sentence = sentence.replace("[^a-zA-Z]", " ")
    sentence = word_tokenize(sentence)
    text = [w for w in sentence if not w in stop_words]
    lem = WordNetLemmatizer()
    lem_text = [lem.lemmatize(w, pos='a') for w in text]
    # Count positive words and negative words of a sentence.
    count_pos_words, count_neg_words = 0, 0
    for idx in range(len(text)):
        if text[idx] in pos_list or lem_text[idx] in pos_list:
            count_pos_words += 1
        if text[idx] in neg_list or lem_text[idx] in neg_list:
            count_neg_words += 1
    print(text)
    return {"num_positive_words": count_pos_words, "num_negative_words": count_neg_words}

In [60]:
result_dict = extract_features(train_text[1])
print(result_dict)

The gorgeously elaborate continuation of `` The Lord of the Rings '' trilogy is so huge that a column of words can not adequately describe co-writer\/director Peter Jackson 's expanded vision of J.R.R. Tolkien 's Middle-earth .
['gorgeously', 'elaborate', 'continuation', 'lord', 'rings', 'trilogy', 'huge', 'column', 'words', 'adequately', 'describe', 'co', 'writer', 'director', 'peter', 'jackson', 'expanded', 'vision', 'j', 'r', 'r', 'tolkien', 'middle', 'earth']
{'num_positive_words': 1, 'num_negative_words': 0}


# Step 3: Rule-based Classifier

In [61]:
def classify(feature_dict):
    if feature_dict["num_positive_words"] > feature_dict["num_negative_words"]:
        return 1 # "Positive"
    else:
        return 0 # "Negative"

In [62]:
classify(extract_features(train_text[0]))

The Rock is destined to be the 21st Century 's new `` Conan '' and that he 's going to make a splash even greater than Arnold Schwarzenegger , Jean-Claud Van Damme or Steven Segal .
['rock', 'destined', '21st', 'century', 'new', 'conan', 'going', 'make', 'splash', 'even', 'greater', 'arnold', 'schwarzenegger', 'jean', 'claud', 'van', 'damme', 'steven', 'segal']


1