In [2]:
import pandas as pd
import numpy as np

s140 = pd.read_pickle('ds_28nov/s140_clean_28nov.pkl')
s140 = s140[s140.target != 'neutral']
cc = pd.read_pickle('ds_28nov/cc_clean_28nov.pkl')
cc = cc[pd.notna(cc.target)]
mr = pd.read_pickle('ds_28nov/mr_clean_28nov.pkl')

In [20]:
from collections import Counter
# Get a list of all the words
words = (' '.join(s140.trimmed.tolist())).split()

# Count all the words using Counter Method
count_words = Counter(words)

total_words = len(words)
sorted_words = count_words.most_common(total_words)

['switchfoot', '_link_', 'aw', "'s", 'bummer', 'shoulda', 'got', 'david', 'carr', 'third']


In [45]:
vocab_to_int = {w:i+1 for i, (w,c) in enumerate(sorted_words)}

text_int = np.array([[vocab_to_int[w] for w in text] for text in s140.lemma])
encoded_target = np.array([int(target == 'positive') for target in s140.target])

In [49]:
def pad_features(reviews_int, seq_length):
    ''' Return features of review_ints, where each review is padded with 0's or truncated to the input seq_length.
    '''
    features = np.zeros((len(reviews_int), seq_length), dtype = int)
    
    for i, review in enumerate(reviews_int):
        review_len = len(review)
        
        if review_len <= seq_length:
            zeroes = list(np.zeros(seq_length-review_len))
            new = zeroes+review
        elif review_len > seq_length:
            new = review[0:seq_length]
        
        features[i,:] = np.array(new)
    
    return features

features_size = 1000
features = pad_features(text_int, features_size)

In [75]:
from sklearn.model_selection import train_test_split

train_x, test_x, train_y, test_y = train_test_split(features, encoded_target, test_size=0.2, stratify=encoded_target, random_state=42)

In [52]:
import torch
from torch.utils.data import DataLoader, TensorDataset

# create Tensor datasets
train_data = TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y))
valid_data = TensorDataset(torch.from_numpy(val_x), torch.from_numpy(val_y))
test_data = TensorDataset(torch.from_numpy(test_x), torch.from_numpy(test_y))

# dataloaders
batch_size = 64
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
valid_loader = DataLoader(valid_data, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_data, shuffle=True, batch_size=batch_size)

In [135]:
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()

        self.hidden_size = hidden_size

        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.tanh = nn.Tanh()
        self.i2o = nn.Linear(input_size + hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        combined = torch.cat((input, hidden), 1)
        hidden = self.tanh(self.i2h(combined))
        output = self.softmax(self.i2o(combined))
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, self.hidden_size)

n_hidden = 128
rnn = RNN(1, n_hidden, 2)

In [136]:
def output_rnn(rnn, sequence):
    hidden = rnn.initHidden()
    print(sequence)
    for i in range(sequence.size()[0]):
        print(sequence[i])
        output, hidden = rnn(sequence[i], hidden)
    return output

In [137]:
def train(model, train_loader, optimizer, epoch ):
    """Perform one epoch of training."""
    model.train()
    
    for batch_idx, (inputs, target) in enumerate(train_loader):
        inputs, target = inputs.to(device), target.to(device)
        
        optimizer.zero_grad()
        for i, t in zip(inputs, targets):
            output = output_rnn(model, i)
            loss = criterion(output, t)
            loss.backward()
            optimizer.step()
            current_loss += loss.item()
        
        if batch_idx % 10 == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(inputs), len(train_loader) *len(inputs) ,
                100. * batch_idx / len(train_loader), loss.item()))

In [138]:
def test(model, test_loader):
    """Evaluate the model by doing one pass over a dataset"""
    model.eval()
    
    test_loss = 0
    correct = 0
    test_size = 0
    
    with torch.no_grad():
        for inputs, target in test_loader:
            inputs, target = inputs.to(device), target.to(device)
            
            # TODO: code the evaluation loop
            output = model(inputs)
            test_size += len(inputs)
            test_loss += test_loss_fn(output, target).item() # sum up batch loss
            # output = batch size * n_classes
            pred = output.argmax(dim=1)
            correct += (pred == target).sum().item()
            # pred = output.max(1, keepdim=True)
            # pred = pred[1] # get the index of the max log-probability

            # correct += pred.eq(target.view_as(pred)).sum().item()
            # #

    test_loss /= test_size
    accuracy = correct / test_size
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, test_size,
        100. * accuracy))
    
    return test_loss, accuracy

In [139]:
import os

device = 'cuda'

savedir = 'results'
if not os.path.exists(savedir):
    os.makedirs(savedir)

model = rnn.to(device)

lr = 0.0005
criterion = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9)

results = {'name':'basic', 'lr': lr, 'loss': [], 'accuracy':[]}
savefile = os.path.join(savedir, results['name']+str(results['lr'])+'.pkl' )

for epoch in range(1, 200):
    train(model, train_loader, optimizer, epoch)
    loss, acc = test(model, train_loader)
    
    # save results every epoch
    results['loss'].append(loss)
    results['accuracy'].append(acc)
    with open(savefile, 'wb') as fout:
        pickle.dump(results, fout)

tensor([    0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0, 

RuntimeError: zero-dimensional tensor (at position 0) cannot be concatenated

In [74]:
for a in train_loader:
    print(len(a[1]))
    break

64


In [70]:
encoded_target

array([0, 0, 0, ..., 1, 1, 1])