In [19]:
import sklearn as sk
from sklearn.naive_bayes import GaussianNB, MultinomialNB, ComplementNB, BernoulliNB
from sklearn.model_selection import train_test_split

import nltk
from nltk.corpus import stopwords

import gensim as gs
import numpy as np
import pandas as pd

import string
import time

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/david/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
#Import Data

phrases_raw = pd.read_csv('./stanfordSentimentTreebank/dictionary.txt', 
                         delimiter = '|', 
                         header=None, 
                         names = ['phrase', 'phrase_id'], 
                         index_col='phrase_id')
labels_raw = pd.read_csv('./stanfordSentimentTreebank/sentiment_labels.txt', 
                     delimiter = '|',
                     header=0, 
                     names = ['phrase_id', 'sentiment'], 
                     index_col='phrase_id')

labeled_phrases = labels_raw.join(phrases_raw)
labeled_phrases.head()

Unnamed: 0_level_0,sentiment,phrase
phrase_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.5,!
1,0.5,'
2,0.44444,' (
3,0.5,' ( the cockettes
4,0.42708,' ( the cockettes )


In [3]:
#Quantize Labels

num_classes = 5
classes = [0]
class_names = [0]
for i in range(1, num_classes):
    classes.append(classes[i-1] + 1/(num_classes-1))
    class_names.append(i)
    
print("Classes: ", classes)

labels = []
small_dist = 1
for label in labels_raw.values:
    for i,cls in enumerate(classes):
        dist = abs(float(label) - cls)
        if (dist < small_dist):
            small_dist = dist
            small_label = i
    labels.append(small_label)
    
print("Labels:  ", labels[0:5])


Classes:  [0, 0.25, 0.5, 0.75, 1.0]
Labels:   [2, 2, 2, 2, 2]


## Vectorize Words

In [4]:
#for phrase in dictionary:
#    gs.utils.simple_preprocess(phrase)
    
def get_words(phraseList):
    words = []
    for phrase in phraseList:
        tokList = phrase.split()
        for word in tokList:
            words.append(word)
    return words

#def get_frequency(words):
#    freqDict = {}
#    for word in words:
#        if word not in freqDict:
#            freqDict[word] = 1
#        else:
#            freqDict[word] += 1
#    return freqDict
#
#freqDict = get_frequency(words)


phraseList = phrases_raw.phrase.tolist()
words = get_words(phraseList)
vocab = set(words)

In [8]:
print("Original Size of Vocabulary: ", len(vocab))

Initial Size of Vocabulary:  22346


In [5]:
#phrases = []
#for phrase in phrases_raw["phrase"]:
#    word_list = phrase.lower().translate(string.punctuation).split(' ')
#    phrases.append([word for word in word_list if word not in stopwords.words('english')])
    
#freq_matrix = np.zeros((len(phrases),len(vocab)))
#for i,phrase in enumerate(phrases):
#    for j,word in enumerate(vocab):
#        if (word in phrase):
#            freq_matrix[i][j] = 1
            
            
            

In [6]:
#sorted_by_value = sorted(freqDict.items(), reverse = True, key=lambda kv: kv[1])
    
#for tup in sorted_by_value:
#    wordFreqL.add(tup)
    
#pd.DataFrame.from_records()  

In [11]:
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer


token_dict = {}
stemmer = PorterStemmer()

def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

def tokenize(text):
    tokens = nltk.word_tokenize(text)
    stems = stem_tokens(tokens, stemmer)
    return stems


phrases = []
for phrase in phrases_raw['phrase']:
    phrase_clean = phrase.lower().translate(string.punctuation)
    phrases.append(phrase_clean)


tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words='english')
tfs = tfidf.fit_transform(phrases)

  'stop_words.' % sorted(inconsistent))


In [13]:
print("Number of Phrases: ", tfs.shape[0])
print("Reduced Size of Vocabulary", tfs.shape[1])

Number of Phrases:  239232
Reduced Size of Vocabulary 14171


In [14]:
tfs


<239232x14171 sparse matrix of type '<class 'numpy.float64'>'
	with 1095126 stored elements in Compressed Sparse Row format>

In sparse format, this data matrix contains just over a million elements

If this data matrix was represented as a dense dataset it would be over 3 billion elements. This is not able to fit in memory.

The partial_fit method enables training a classifier incrementally in minibatches.

## Determine Baseline Accuracy - Naive Bayes

In [15]:
X_train, X_test_val, y_train, y_test_val = train_test_split(tfs, labels, test_size=0.30) # 70% Train, 30% Test/Val
X_val, X_test, y_val, y_test = train_test_split(X_test_val, y_test_val, test_size=0.33) # 10% Val, 20% Test

In [21]:
partial_fit_classifiers = {
    #'NB Gaussian': GaussianNB(),
    'NB Multinomial': MultinomialNB(alpha=0.01),
    #'NB Complement': ComplementNB(alpha=0.01),
    #'NB Bernoulli': BernoulliNB(alpha=0.01),
}

clf_stats = {}

for clf_name in partial_fit_classifiers:
    stats = {'n_train': 0, 'n_train_pos': 0,
             'accuracy': 0.0, 
        'accuracy_history': [(0, 0)], 't0': time.time(),
             'runtime_history': [(0, 0)], 'total_fit_time': 0.0
            }
    clf_stats[clf_name] = stats

In [23]:
def progress(clf_name, stats):
    """Report progress information, return a string."""
    duration = time.time() - stats['t0']
    s = "%20s classifier : \t" % clf_name
    s += "%(n_train)6d train docs (%(n_train_pos)6d positive) " % stats
    #s += "%(n_test)6d test docs (%(n_test_pos)6d positive) " % test_stats
    s += "accuracy: %(accuracy).3f " % stats
    s += "in %.2fs (%5d docs/s)" % (duration, stats['n_train'] / duration)
    return s


def get_minibatch(start_row, size, X, y):
    """Extract a minibatch of examples, return a tuple X_batch, y_batch."""
    end_row = size + start_row
    max_row = X.shape[1]
    if (end_row > max_row):
        end_row = max_row
        
    X_batch = X[start_row:end_row, :]
    y_batch = y[start_row:end_row]
    return X_batch, y_batch

def iter_minibatches(start, size, X, y):
    """Generator of minibatches."""
    X_batch, y_batch = get_minibatch(start, size, X, y)
    while X_batch.shape[0]:
        yield X_batch, y_batch
        X_batch, y_batch = get_minibatch(start, size, X, y)


In [24]:
minibatch_size = 1000
minibatch_iterators = iter_minibatches(0, minibatch_size, X_train, y_train)
total_vect_time = 0.0


for i, (X_train_batch, y_train_batch) in enumerate(minibatch_iterators):
    for clf_name, clf in partial_fit_classifiers.items():
        
        tick = time.time()
        total_vect_time += time.time() - tick
        
        clf.partial_fit(X_train_batch.toarray(), y_train_batch, classes = np.array(class_names))
        
        clf_stats[clf_name]['total_fit_time'] += time.time() - tick
        clf_stats[clf_name]['n_train'] += X_train.shape[0]
        clf_stats[clf_name]['n_train_pos'] += sum(y_train)
        tick = time.time()
        
        clf_stats[clf_name]['accuracy'] = clf.score(X_val.toarray(), y_val)
        clf_stats[clf_name]['prediction_time'] = time.time() - tick
        acc_history = (clf_stats[clf_name]['accuracy'],
                       clf_stats[clf_name]['n_train'])
        clf_stats[clf_name]['accuracy_history'].append(acc_history)
        run_history = (clf_stats[clf_name]['accuracy'],
                       total_vect_time + clf_stats[clf_name]['total_fit_time'])
        clf_stats[clf_name]['runtime_history'].append(run_history)
            
        if i % 3 == 0:
            print(progress(clf_name, clf_stats[clf_name]))
    if i % 3 == 0:
        print('\n')


  self.class_log_prior_ = (np.log(self.class_count_) -


      NB Multinomial classifier : 	334924 train docs (669848 positive) accuracy: 1.000 in 39.34s ( 8514 docs/s)




  self.class_log_prior_ = (np.log(self.class_count_) -
  self.class_log_prior_ = (np.log(self.class_count_) -
  self.class_log_prior_ = (np.log(self.class_count_) -


      NB Multinomial classifier : 	837310 train docs (1674620 positive) accuracy: 1.000 in 48.70s (17194 docs/s)




  self.class_log_prior_ = (np.log(self.class_count_) -
  self.class_log_prior_ = (np.log(self.class_count_) -
  self.class_log_prior_ = (np.log(self.class_count_) -


      NB Multinomial classifier : 	1339696 train docs (2679392 positive) accuracy: 1.000 in 57.05s (23484 docs/s)




  self.class_log_prior_ = (np.log(self.class_count_) -
  self.class_log_prior_ = (np.log(self.class_count_) -
  self.class_log_prior_ = (np.log(self.class_count_) -


      NB Multinomial classifier : 	1842082 train docs (3684164 positive) accuracy: 1.000 in 62.95s (29263 docs/s)




  self.class_log_prior_ = (np.log(self.class_count_) -
  self.class_log_prior_ = (np.log(self.class_count_) -
  self.class_log_prior_ = (np.log(self.class_count_) -


      NB Multinomial classifier : 	2344468 train docs (4688936 positive) accuracy: 1.000 in 69.61s (33680 docs/s)




  self.class_log_prior_ = (np.log(self.class_count_) -
  self.class_log_prior_ = (np.log(self.class_count_) -
  self.class_log_prior_ = (np.log(self.class_count_) -


      NB Multinomial classifier : 	2846854 train docs (5693708 positive) accuracy: 1.000 in 76.51s (37209 docs/s)




  self.class_log_prior_ = (np.log(self.class_count_) -
  self.class_log_prior_ = (np.log(self.class_count_) -
  self.class_log_prior_ = (np.log(self.class_count_) -


      NB Multinomial classifier : 	3349240 train docs (6698480 positive) accuracy: 1.000 in 83.33s (40192 docs/s)




  self.class_log_prior_ = (np.log(self.class_count_) -
  self.class_log_prior_ = (np.log(self.class_count_) -
  self.class_log_prior_ = (np.log(self.class_count_) -


KeyboardInterrupt: 

In [25]:
clf.score(X_test, y_test)

1.0

In [1]:
import pytreebank
# load the sentiment treebank corpus in the parenthesis format,
# e.g. "(4 (2 very ) (3 good))"
dataset = pytreebank.load_sst()
# add Javascript and CSS to the Ipython notebook
pytreebank.LabeledTree.inject_visualization_javascript()
# select and example to visualize
example = dataset["train"][0]
# display it in the page
example.display()

FileNotFoundError: [Errno 2] No such file or directory: '/home/david/stanford_sentiment_treebank/trainDevTestTrees_PTB/trees/train.txt'