In [1]:
from collections import Counter
import numpy as np
import time 
import sys

In [2]:
# open file reviews
# Load the reviews, one per line. Only the line terminator is stripped, so a
# final line that lacks a trailing newline keeps its last character (slicing
# with [:-1] would have cut a real character off in that case).
# The `with` block closes the file on exit, so no explicit close() is needed.
with open('reviews.txt', 'r', encoding='utf-8') as rev:
    reviews = [line.rstrip('\n') for line in rev]

# Load the labels the same way; labels[i] is used as the label of reviews[i].
with open('labels.txt', 'r', encoding='utf-8') as lab:
    labels = [line.rstrip('\n') for line in lab]
    

In [3]:
# Report how many reviews and labels were loaded; the two counts should match.
for caption, loaded in (('len(reviews)', reviews), ('\nlen(labels)', labels)):
    print(caption, len(loaded))

len(reviews) 25000

len(labels) 25000


In [4]:
# Peek at the first example: the first 80 characters of the review, then its label.
first_review_preview = reviews[0][:80]
print('reviews[0]=', first_review_preview)

first_label = labels[0]
print('\nlabels[0]=', first_label)

reviews[0]= bromwell high is a cartoon comedy . it ran at the same time as some other progra

labels[0]= positive


In [5]:
# Word frequencies in positive reviews, in negative reviews, and overall.
# Three separate Counter objects (not aliases of one another).
positive_counts, negative_counts, total_counts = Counter(), Counter(), Counter()

In [6]:
# Tally word frequencies per sentiment class and overall.
# Iterating over (review, label) pairs instead of `for _ in range(...)` keeps
# `_` free: in IPython `_` holds the last cell output, and assigning to it
# stops that variable from being updated.
for review, label in zip(reviews, labels):
    # Any label other than 'positive' is counted as negative.
    class_counts = positive_counts if label == 'positive' else negative_counts
    for word in review.split(" "):
        class_counts[word] += 1
        total_counts[word] += 1
            

In [7]:
# Per-word positive/negative ratio; starts empty and is filled in by a later cell.
pos_neg_ratios = Counter()

In [8]:
# Pass 1: raw positive/negative ratio for every word seen more than 100 times.
# The +1 in the denominator avoids dividing by zero for words that never
# appear in a negative review.
for term, cnt in total_counts.most_common():
    if cnt <= 100:
        continue
    pos_neg_ratios[term] = positive_counts[term] / float(negative_counts[term] + 1)

# Pass 2: move the ratios onto a log scale so that positive-leaning words get
# positive scores and negative-leaning words get negative scores.
# The 0.01 offset keeps the logarithm finite when the ratio is 0.
for word, ratio in pos_neg_ratios.most_common():
    if ratio > 1:
        log_ratio = np.log(ratio)
    else:
        log_ratio = -np.log((1 / (ratio + 0.01)))
    pos_neg_ratios[word] = log_ratio
    

In [9]:
# The vocabulary is every distinct word that was counted.
vocab = set(total_counts)
vocab_size = len(vocab)
print('vocab_size=', vocab_size)

vocab_size= 74074


In [10]:
# Input layer: a single row with one slot per vocabulary word, initially all zeros.
# (The stray `list(vocab)` statement was removed: it was not the cell's last
# expression, so it displayed nothing and only built and discarded a list.)
layer_0 = np.zeros((1, vocab_size))


In [11]:
# Assign each vocabulary word a column index in the input layer.
word2index = {word: i for i, word in enumerate(vocab)}

In [12]:
def update_input_layer(review):
    """Overwrite the global ``layer_0`` with the word counts of ``review``.

    The review is split on single spaces; each token increments the slot that
    ``word2index`` assigns to it. A token missing from ``word2index`` raises
    ``KeyError``.
    """
    global layer_0

    # Reset in place so counts from the previous review do not carry over.
    layer_0 *= 0
    counts_row = layer_0[0]  # view onto the only row; writes go to layer_0
    for token in review.split(' '):
        counts_row[word2index[token]] += 1

# Encode the first review into layer_0 (modifies layer_0 in place).
update_input_layer(reviews[0])        

In [13]:
def get_target_for_label(label):
    """Return the numeric training target: 1 for 'positive', 0 for any other label."""
    return 1 if label == 'positive' else 0

In [33]:
class SentimentNetwork:
    """Two-layer sentiment classifier over binary bag-of-words review vectors.

    The input layer has one node per vocabulary word (1 if the word occurs in
    the review, else 0), the hidden layer is linear (no activation) and the
    single output node uses a sigmoid. Outputs above 0.5 are read as
    "positive", everything else as "negative".
    """

    def __init__(self, reviews, labels, hidden_nodes = 10, lr = 0.001):
        """Build the vocabulary from ``reviews``/``labels`` and create the weights.

        Args:
            reviews: iterable of review strings; words are separated by single spaces.
            labels: iterable of label strings, one per review.
            hidden_nodes: number of nodes in the hidden layer.
            lr: learning rate used for the weight updates in ``train``.
        """
        # Seed NumPy's global generator so the random W2 initialisation is repeatable.
        np.random.seed(1)

        self.pre_process_data(reviews, labels)

        self.init_network(len(self.review_vocab), hidden_nodes, 1, lr)

    def pre_process_data(self, reviews, labels):
        """Collect the word and label vocabularies and their index lookups.

        Sets ``review_vocab``, ``label_vocab``, their sizes, ``word2index`` and
        ``label2index`` on the instance.
        """
        review_vocab = set()
        for review in reviews:
            review_vocab.update(review.split(' '))
        self.review_vocab = list(review_vocab)

        self.label_vocab = list(set(labels))

        self.review_vocab_size = len(self.review_vocab)
        self.label_vocab_size  = len(self.label_vocab)

        # Position of each word / label in its vocabulary list.
        self.word2index  = {word: i for i, word in enumerate(self.review_vocab)}
        self.label2index = {label: i for i, label in enumerate(self.label_vocab)}

    def init_network(self, input_nodes, hidden_nodes, outpu_nodes, lr):
        """Store the layer sizes and learning rate and allocate weights and input layer.

        The parameter name ``outpu_nodes`` (and the attribute of the same
        name) is kept as-is so existing keyword callers keep working.
        """
        self.input_nodes  = input_nodes
        self.hidden_nodes = hidden_nodes
        self.outpu_nodes  = outpu_nodes

        # Input->hidden weights start at zero; hidden->output weights are drawn
        # from a normal distribution with std = outpu_nodes ** -0.5.
        self.W1 = np.zeros((self.input_nodes, self.hidden_nodes))
        self.W2 = np.random.normal(0.0, self.outpu_nodes**-0.5,
                                  (self.hidden_nodes, self.outpu_nodes))

        self.lr = lr

        # Reused 1 x input_nodes buffer holding the encoding of the current review.
        self.layer_0 = np.zeros((1, input_nodes))

    def update_input_layer(self, review):
        """Encode ``review`` into ``self.layer_0`` as word presence (0/1).

        Words that are not in the vocabulary are ignored.
        """
        # Clear out the previous review's encoding in place.
        self.layer_0 *= 0
        for word in review.split(" "):
            if word in self.word2index:
                self.layer_0[0][self.word2index[word]] = 1

    def get_target_for_label(self, label):
        """Return 1 for the label 'positive' and 0 for any other label."""
        return 1 if label == 'positive' else 0

    def sigmoid(self, x, der = False):
        """Return the sigmoid of ``x``.

        With ``der=True`` return ``x * (1 - x)`` instead, i.e. the sigmoid
        derivative expressed in terms of an already-computed sigmoid output
        (``train`` passes the output layer's activation here).
        """
        if der:
            return x * (1 - x)
        return 1.0 / (1.0 + np.exp(-x))

    @staticmethod
    def _reviews_per_second(processed, start):
        """Throughput since ``start``; 0.0 when no measurable time has elapsed.

        Guards the division: two ``time.time()`` readings can be equal, and
        dividing by a zero elapsed time would raise ZeroDivisionError.
        """
        elapsed = float(time.time() - start)
        return processed / elapsed if elapsed > 0 else 0.0

    def _forward(self):
        """Run the forward pass on the current ``layer_0``; return (layer_1, layer_2)."""
        layer_1 = self.layer_0.dot(self.W1)
        layer_2 = self.sigmoid(layer_1.dot(self.W2))
        return layer_1, layer_2

    def train(self, training_reviews, training_labels):
        """Train for one pass over the data, updating weights after every review.

        Writes a progress line (speed, running accuracy) to stdout for each
        review and starts a new line every 2500 reviews.

        Raises:
            AssertionError: if the two sequences differ in length.
        """
        assert(len(training_reviews) == len(training_labels))

        correct_so_far = 0

        start = time.time()

        for i in range(len(training_reviews)):
            review = training_reviews[i]
            label  = training_labels[i]

            # Forward pass
            self.update_input_layer(review)
            layer_1, layer_2 = self._forward()

            # Backward pass: output error and its delta through the sigmoid.
            layer_2_error = layer_2 - self.get_target_for_label(label)
            layer_2_delta = layer_2_error * self.sigmoid(layer_2, True)

            # Propagate the error back to the hidden layer. The hidden layer
            # has no nonlinearity, so its delta equals its error.
            layer_1_error = layer_2_delta.dot(self.W2.T)
            layer_1_delta = layer_1_error

            # Gradient-descent step on both weight matrices.
            self.W2 -= layer_1.T.dot(layer_2_delta) * self.lr
            self.W1 -= self.layer_0.T.dot(layer_1_delta) * self.lr

            # A prediction counts as correct when it is on the right side of 0.5.
            if(np.abs(layer_2_error) < 0.5):
                correct_so_far += 1

            reviews_per_second = self._reviews_per_second(i, start)

            sys.stdout.write("\rProgress:" + str(100 * i/float(len(training_reviews)))[:4] + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5] + " #Correct:" + str(correct_so_far) + " #Trained:" + str(i+1) + " Training Accuracy:" + str(correct_so_far * 100 / float(i+1))[:4] + "%")
            if (i % 2500 ==  0):
                print('')

    def test(self, testing_reviews, testing_labels):
        """Predict each review with ``run`` and report running accuracy on stdout."""
        correct = 0
        start = time.time()

        for i in range(len(testing_reviews)):
            pred = self.run(testing_reviews[i])
            if(pred == testing_labels[i]):
                correct += 1

            reviews_per_second = self._reviews_per_second(i, start)

            # The speed value is not a percentage, so no "%" follows it.
            sys.stdout.write("\rProgress:" + str(100 * i/float(len(testing_reviews)))[:4] \
                             + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5] \
                            + " #Correct:" + str(correct) + " #Tested:" + str(i+1) + " Testing Accuracy:" + str(correct * 100 / float(i+1))[:4] + "%")

    def run(self, review):
        """Return "positive" or "negative" for a single review string.

        NOTE(review): the review is lowercased here, but ``train`` and
        ``pre_process_data`` use the text as given. With mixed-case training
        text, capitalised vocabulary entries could never match at prediction
        time. The data shown in this notebook looks lowercase already -
        confirm before reusing the class on other corpora.
        """
        self.update_input_layer(review.lower())

        _, layer_2 = self._forward()

        if(layer_2[0][0] > 0.5):
            return "positive"
        else:
            return "negative"
        

In [34]:
# %%debug
# Build the network; its vocabulary comes from all reviews except the last 1000.
mlp = SentimentNetwork(reviews[:-1000],labels[:-1000], lr=0.001)

In [35]:
# %%debug

# Train for one pass over all reviews except the last 1000.
mlp.train(reviews[:-1000],labels[:-1000])

Progress:0.0% Speed(reviews/sec):0.0 #Correct:0 #Trained:1 Training Accuracy:0.0%
Progress:10.4% Speed(reviews/sec):117.7 #Correct:1940 #Trained:2501 Training Accuracy:77.5%
Progress:20.8% Speed(reviews/sec):114.0 #Correct:3987 #Trained:5001 Training Accuracy:79.7%
Progress:31.2% Speed(reviews/sec):113.1 #Correct:6085 #Trained:7501 Training Accuracy:81.1%
Progress:41.6% Speed(reviews/sec):112.8 #Correct:8204 #Trained:10001 Training Accuracy:82.0%
Progress:52.0% Speed(reviews/sec):112.6 #Correct:10337 #Trained:12501 Training Accuracy:82.6%
Progress:62.5% Speed(reviews/sec):111.5 #Correct:12423 #Trained:15001 Training Accuracy:82.8%
Progress:72.9% Speed(reviews/sec):111.6 #Correct:14524 #Trained:17501 Training Accuracy:82.9%
Progress:83.3% Speed(reviews/sec):111.5 #Correct:16697 #Trained:20001 Training Accuracy:83.4%
Progress:93.7% Speed(reviews/sec):111.4 #Correct:18856 #Trained:22501 Training Accuracy:83.8%
Progress:99.9% Speed(reviews/sec):111.4 #Correct:20172 #Trained:24000 Training 

In [17]:
# %%debug

# Evaluate the model on the last 1000 reviews.
# NOTE(review): this cell was originally described as evaluating the model
# "before training", but it sits below the training cell, so a top-to-bottom
# run evaluates the trained model. Its recorded execution count (In [17]) is
# also older than the class and training cells (In [33]-[35]), so the saved
# 50.0% output is likely stale - re-run to confirm.

mlp.test(reviews[-1000:],labels[-1000:])

Progress:99.9% Speed(reviews/sec):370.0% #Correct:500 #Tested:1000 Testing Accuracy:50.0%

In [18]:
# Square every item with a list comprehension.
items = [1, 2, 3, 4, 5]
squared = [value ** 2 for value in items]

print(squared)

[1, 4, 9, 16, 25]


In [19]:
def square(x):
    """Return x raised to the power of two."""
    return x ** 2

# Square every item via map(). Note: despite its name, `sqrt` holds the
# squares of the items, not their square roots.
sqrt = list(map(square, items))
print(sqrt)

[1, 4, 9, 16, 25]


In [20]:
# Check the Python type of a single review.
type(reviews[0])

str

In [21]:
# Two tiny space-separated documents used as a toy corpus in the cells below.
lista = ["paulo paulo paulo", "eli eli eli eli"]

In [22]:
# Word counter for the toy corpus; starts empty.
tot_count = Counter()

In [23]:
# Count every word across the toy documents.
# Iterating over the documents directly keeps `_` free: in IPython `_` holds
# the last cell output, and assigning to it stops it from being updated.
for document in lista:
    for word in document.split(' '):
        tot_count[word] += 1

# Last expression of the cell: display the resulting counts.
tot_count

Counter({'eli': 4, 'paulo': 3})

In [24]:
# Vocabulary of the toy corpus: its distinct words.
vokab = set(tot_count)
print('set(tot_count.keys()) = ', vokab)
vokab_size = len(vokab)
print('vokab_size = ', vokab_size)



set(tot_count.keys()) =  {'paulo', 'eli'}
vokab_size =  2


In [25]:
# One-row input layer with a slot for each word of the toy vocabulary.
lay_0 = np.zeros((1, vokab_size))

print('lay_0.shape=', lay_0.shape)

# Give each toy-vocabulary word a column index in lay_0.
wrd2index = {word: i for i, word in enumerate(vokab)}

print('wrd2index=', wrd2index)

lay_0.shape= (1, 2)
wrd2index= {'paulo': 0, 'eli': 1}


In [26]:
# Read row 0, column 1 of the toy input layer.
lay_0[0][1]

0.0

In [27]:
# A 1 x 12 row vector holding 0..11, used to practise 2-D indexing.
a = np.arange(12).reshape(1, -1)

print(a)


[[ 0  1  2  3  4  5  6  7  8  9 10 11]]


In [28]:
# Read row 0, column 3 of the demo array.
a[0][3]

3

In [29]:
# Display the word -> column-index mapping of the toy vocabulary.
wrd2index

{'eli': 1, 'paulo': 0}

In [30]:
# Look up the column index assigned to the word 'eli'.
wrd2index['eli']

1

In [31]:
# Split the first toy document into words on single spaces.
lista[0].split(' ')

['paulo', 'paulo', 'paulo']

In [32]:
#clear out previous state, reste the layer
# Clear the previous state, then count the words of the first toy document.
lay_0 *= 0
first_row = lay_0[0]  # view onto the only row; writes go to lay_0
for word in lista[0].split(' '):
    first_row[wrd2index[word]] += 1

# Last expression of the cell: display the encoded layer.
lay_0



array([[ 3.,  0.]])