In [56]:
from nltk.corpus import stopwords
from nltk.tokenize import WordPunctTokenizer, TweetTokenizer, sent_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tag import pos_tag
from nltk import classify, NaiveBayesClassifier
import contractions, re, string, random
import pandas as pd
import pickle

In [57]:
# access the trainung dataset and store in a dataframe
train_df = pd.read_csv("Training_data.csv").dropna()
train_df2 = pd.read_csv("IMDB Dataset.csv").dropna()
train_df = pd.concat([train_df, train_df2])
test_df = pd.read_csv("Testing_data.csv").dropna()
train_data_text = [str(val) for val in train_df["Review"].values]
test_data_text = [str(val) for val in test_df["Review"].values]
train_data_sentiment = [str(val) for val in train_df["Sentiment"].values]
test_data_sentiment = [str(val) for val in test_df["Sentiment"].values]

In [58]:
# positive_tweets
def clean(text: str):
    # remove numbers
    text = re.sub(r"\d+", "", text)
    # expand contraction
    text = contractions.fix(text)
    #remove the retweet sign
    text = re.sub(r"^(RT)", "", text)
    # remove links
    text = re.sub(r"(ftp|http[s]?)://\S+", "", text)
    # remove mentions
    text = re.sub(r"@[A-Za-z0-9_]+", "", text)
    # remove hashtags
    text = re.sub(r"#\w+", "", text)
    # remove punctuations
    sub_txt = r"[" + re.escape(string.punctuation) + r"]"
    text = re.sub(sub_txt, "", text)
    
    return text.strip()

In [59]:
train_data_text = [clean(tweet) for tweet in train_data_text]
test_data_text = [clean(tweet) for tweet in test_data_text]

In [60]:
tokenizer = WordPunctTokenizer()
def lemmatize_sentence(token: list):
    lemmatizer = WordNetLemmatizer()
    sentence = []
    for word, tag in pos_tag(token):
        if tag.startswith("NN"):
            tag = "n" # noun
        elif tag.startswith("V"):
            tag = "v" # verb
        elif tag.startswith(("RB")):
            tag = "r" # adverb
        else:
            tag = "a" # adjective
        sentence.append(lemmatizer.lemmatize(word, tag))
    return sentence

# split texts into tokens
tokenized_train_text = [tokenizer.tokenize(text) for text in train_data_text]
lemmatized_train_text = [lemmatize_sentence(token) for token in tokenized_train_text]
# lemmatized_data_text
tokenized_test_text = [tokenizer.tokenize(text) for text in test_data_text]
lemmatized_test_text = [lemmatize_sentence(token) for token in tokenized_test_text]

In [61]:
stop_words = set(stopwords.words("english"))
def remove_stopwords(token: list):
    cleaned_up = []
    for word in token:
        if len(word) > 1 and word not in string.punctuation and word.lower() not in stop_words:
            cleaned_up.append(word.lower())
    return cleaned_up
    
cleaned_train_text = [remove_stopwords(text) for text in lemmatized_train_text]
cleaned_test_text = [remove_stopwords(text) for text in lemmatized_test_text]

In [62]:
# prepare text for model
# the model requires a dictionary of word with with values as True
def get_dict_for_model(cleaned_tokenized_text: list):
    for tokens in cleaned_tokenized_text:
        yield dict((word, True) for word in tokens)

train_data = [(text_dict, train_data_sentiment[n]) for n, text_dict in enumerate(get_dict_for_model(cleaned_train_text))]
test_data = [(text_dict, test_data_sentiment[n]) for n, text_dict in enumerate(get_dict_for_model(cleaned_test_text))]

In [63]:
random.shuffle(train_data)
random.shuffle(test_data)

In [64]:
analyzer = NaiveBayesClassifier.train(train_data)
print("Accuracy: " + str(classify.accuracy(analyzer, test_data)))
with open("NBModel", 'wb') as f:
    pickle.dump(analyzer, f)
analyzer.show_most_informative_features(10)
print(analyzer.labels())

Accuracy: 0.91736
Most Informative Features
                     uwe = True           negati : positi =     59.8 : 1.0
                    boll = True           negati : positi =     41.6 : 1.0
                    tsui = True           positi : negati =     32.3 : 1.0
            interminable = True           negati : positi =     30.2 : 1.0
                 antwone = True           positi : negati =     29.7 : 1.0
                    icet = True           negati : positi =     29.7 : 1.0
               deathtrap = True           positi : negati =     27.0 : 1.0
                   ronny = True           positi : negati =     27.0 : 1.0
                   hayao = True           positi : negati =     25.7 : 1.0
               toughness = True           positi : negati =     25.7 : 1.0
['negative', 'positive']


In [65]:
def analyze(text: str):
    text = clean(text)
    tokenized_text = tokenizer.tokenize(text)
    lemmatized_text = lemmatize_sentence(tokenized_text)
    cleaned_text = remove_stopwords(lemmatized_text)
    return analyzer.classify(dict([(token, True) for token in cleaned_text]))