# Model training

In [36]:
import nltk
nltk.download("subjectivity")

[nltk_data] Downloading package subjectivity to
[nltk_data]     /Users/nicolasantacroce/nltk_data...
[nltk_data]   Package subjectivity is already up-to-date!


True

In [37]:
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import subjectivity
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *
from nltk.tokenize import SpaceTokenizer
import re

## Tutorial part

n_instances = 100
subj_docs = [(sent, 'subj') for sent in subjectivity.sents(categories='subj')[:n_instances]]
obj_docs = [(sent, 'obj') for sent in subjectivity.sents(categories='obj')[:n_instances]]
#subj_docs[3][0]

train_subj_docs = subj_docs[:80]
test_subj_docs = subj_docs[80:100]
train_obj_docs = obj_docs[:80]
test_obj_docs = obj_docs[80:100]
training_docs = train_subj_docs+train_obj_docs
testing_docs = test_subj_docs+test_obj_docs

sentim_analyzer = SentimentAnalyzer()
all_words_neg = sentim_analyzer.all_words(training_docs,labeled= True)
#all_words_neg

unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg, min_freq=4)
sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)
#unigram_feats

training_set = sentim_analyzer.apply_features(training_docs)
test_set = sentim_analyzer.apply_features(testing_docs)
#test_set

print(testing_docs[0][1],testing_docs[9][1],testing_docs[19][1])

#trainer = NaiveBayesClassifier.train
#classifier = sentim_analyzer.train(trainer, training_set)
NBC= NaiveBayesClassifier.train(training_set)
NBC.classify_many([test_set[0][0],test_set[9][0],test_set[19][0]])

## Formal/imformal model training

In [None]:
informal_corpus= "./soap-text.txt"
formal_corpus= "./wikipedia-text.txt"
model_name= "NBC"

In [45]:
# reading soap operas text
soap_file = open(informal_corpus, "r")
soap_text = soap_file.read() # string
soap_file.close()
# tokenizing sentences using punctuations
punkt_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
soap_sentences= (punkt_tokenizer.tokenize(soap_text))
# tokenizing sentences using spaces (and removal of words containing "@")
space_tokenizer = SpaceTokenizer()
informal_docs= [] # list of tuples, tuples contain tokens and label
for sentence in soap_sentences:
    tokenized_sentence= space_tokenizer.tokenize(sentence)
    if len(tokenized_sentence)>1:
        informal_docs.append(([ele for ele in tokenized_sentence if not (re.search('[@#/<>*]', ele))],"informal"))

In [46]:
# reading wikipedia articles text
wiki_file = open(formal_corpus, "r")
wiki_text = wiki_file.read() # string
wiki_file.close()
# tokenizing sentences using punctuations
punkt_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
wiki_sentences= (punkt_tokenizer.tokenize(wiki_text))
# tokenizing sentences using spaces (and removal of words containing "@")
space_tokenizer = SpaceTokenizer()
formal_docs= [] # list of tuples, tuples contain tokens and label
for sentence in wiki_sentences:
    tokenized_sentence= space_tokenizer.tokenize(sentence)
    if len(tokenized_sentence)>1: # no single word sentences
        formal_docs.append(([ele for ele in tokenized_sentence if not (re.search('[@#/<>*]', ele))],"formal"))

In [47]:
train_formal, test_formal = nltk.sentiment.util.split_train_test(formal_docs, n=None)
train_informal, test_informal = nltk.sentiment.util.split_train_test(informal_docs, n=None)
train_docs= train_formal+train_informal
test_docs= test_formal+test_informal

In [48]:
sentim_analyzer = SentimentAnalyzer()
all_words = sentim_analyzer.all_words(train_docs,labeled= True)

In [86]:
unigram_feats = sentim_analyzer.unigram_word_feats(all_words, min_freq=100)

In [87]:
sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)

In [51]:
train_set = sentim_analyzer.apply_features(train_docs)
test_set = sentim_analyzer.apply_features(test_docs)

In [56]:
NBC= NaiveBayesClassifier.train(train_set)

KeyboardInterrupt: 

In [60]:
results= []
for i in range(1000):
    if NBC.classify(test_set[i][0])==test_docs[i][1]:
        results.append(1)
    else:
        results.append(0)

In [62]:
sum(results)/len(results)

0.905

In [88]:
import pickle
with open(model_name+'_model.pkl', 'wb') as file:
    # A new file will be created
    pickle.dump(NBC, file)

In [89]:
with open(model_name+"_features.pkl","wb") as file:
    pickle.dump(sentim_analyzer, file)