# Sentiment Analysis in torchtext of OASIS datasets captions

In [18]:
# Imports

%matplotlib inline
import os, sys
import re
import string
import pathlib
import random
from collections import Counter, OrderedDict
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import spacy
from tqdm import tqdm, tqdm_notebook, tnrange
tqdm.pandas(desc='Progress')

import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

import torchtext
from torchtext import data
from torchtext import vocab

from sklearn.model_selection import StratifiedShuffleSplit, train_test_split
from sklearn.metrics import accuracy_score

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity='all'

import warnings
from utils.scoring_utils import *
from utils.data_utils import *
from utils.plotting_utils import *

# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

warnings.filterwarnings('ignore')

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

print('device:',device)
print('Python version:',sys.version)
print('Pandas version:',pd.__version__)
print('Pytorch version:', torch.__version__)
print('Torch Text version:', torchtext.__version__)
print('Spacy version:', spacy.__version__)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
device: cuda:0
Python version: 3.6.4 |Anaconda custom (64-bit)| (default, Jan 16 2018, 18:10:19) 
[GCC 7.2.0]
Pandas version: 0.22.0
Pytorch version: 0.4.0
Torch Text version: 0.2.3
Spacy version: 2.0.11


## Load data

In [41]:
captions_root = "/home/elkhand/git-repos/human-emotions-classifier/dataset/metadata"
oasis_csv_path = "dataset/metadata/OASIS.csv"
captions_root_path = pathlib.Path(captions_root)
dfHuman = read_caption_csv_into_dataframe(captions_root_path/'captions.csv')
#pd.read_csv(data_root/'captions.csv', error_bad_lines=False)
dfHuman["id"] = dfHuman["id"].apply(lambda x: "I"+str(x))
dfHuman.columns = ['id', 'image_title_human', 'caption_human']
dfHuman.shape
dfHuman.head(1)

dfAuto = read_caption_csv_into_dataframe(data_root/'auto_generated_captions.csv', delimeter='|')
#pd.read_csv(data_root/'auto_generated_captions.csv', sep = "|", error_bad_lines=False)
dfAuto.columns = ['id', 'image_title_auto', 'caption_auto']
dfAuto.shape
dfAuto.head(1)


dfOasis = read_oasis_csv_into_dataframe(oasis_csv_path)
dfOasis.shape
dfOasis.head(1)

# frames = [dfHuman, dfAuto, dfOasis]
df = pd.merge(dfHuman,dfAuto, on= 'id')
df = pd.merge(df,dfOasis, on= 'id')
## Select the ones you want
df = df[['id','image_title_human','caption_human','image_title_auto','caption_auto','theme','valence_mean']]
df.shape
df.head(3)

dfHuman = df[['id','image_title_human','caption_human','valence_mean']]
dfHuman.columns = ['id', 'image_title', 'caption', 'valence_mean']
dfHuman.shape
dfHuman.head(1)

dfAuto = df[['id','image_title_auto','caption_auto','valence_mean']]
dfAuto.columns = ['id', 'image_title', 'caption','valence_mean']
dfAuto.shape
dfAuto.head(1)


humanCaptionWithScorePath = captions_root_path/'humanCaptionWithScoredf.csv'
autoCaptionWithScorePath = captions_root_path/'autoCaptionWithScoredf.csv'

dfHuman.to_csv(humanCaptionWithScorePath, index=False)
dfAuto.to_csv(autoCaptionWithScorePath, index=False)

# dfDiff = df[df["image_title_human"] != df["image_title_auto"]]
# dfDiff.shape
# dfDiff.head(20)

(900, 3)

Unnamed: 0,id,image_title_human,caption_human
0,I109,Camping 6.jpg,A man wearing a blue jacket and a headlamp lig...


(900, 3)

Unnamed: 0,id,image_title_auto,caption_auto
0,I672,Rocks 5.jpg,a bunch of carrots are sitting on the ground.


(900, 10)

Unnamed: 0,id,theme,category,source,valence_mean,valence_std,valence_n,arousal_mean,arousal_std,arousal_n
0,I1,Acorns 1,Object,Pixabay,4.686275,0.954203,102,2.346535,1.60272,101


(900, 7)

Unnamed: 0,id,image_title_human,caption_human,image_title_auto,caption_auto,theme,valence_mean
0,I109,Camping 6.jpg,A man wearing a blue jacket and a headlamp lig...,Camping 7.jpg,a group of people on a small boat in the water.,Camping 7,5.215686
1,I56,Bark 5.jpg,The trunk of an old tree with a rough bark cov...,BDSM 1.jpg,a group of people standing next to each other.,BDSM 1,4.333333
2,I61,Beach 3.jpg,A person walking along a sandy seashore at dawn.,Beach 3.jpg,a person on a beach with a surfboard.,Beach 3,5.514851


(900, 4)

Unnamed: 0,id,image_title,caption,valence_mean
0,I109,Camping 6.jpg,A man wearing a blue jacket and a headlamp lig...,5.215686


(900, 4)

Unnamed: 0,id,image_title,caption,valence_mean
0,I109,Camping 7.jpg,a group of people on a small boat in the water.,5.215686


## Define how to process data

In [52]:
nlp = spacy.load('en',disable=['parser', 'tagger', 'ner'])
def tokenizer(s): return [w.text.lower() for w in nlp(caption_clean(s))]
def caption_clean(caption):
    caption = re.sub(r'[^A-Za-z0-9]+', ' ', caption) # remove non alphanumeric character
    caption = remove_stop_words(caption) # remove links
    return caption

def remove_stop_words(caption):
    cleanedCaption = ""
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(caption)
    for word in words:
        if word not in stop_words:
            cleanedCaption += word + " "
    return cleanedCaption.strip()


txt_field = data.Field(sequential=True, tokenize=tokenizer, include_lengths=True, use_vocab=True)
label_field = data.Field(sequential=False, use_vocab=False, pad_token=None, unk_token=None)

test_fields = [
    ('id', None),
    ('image_title', None),
     ('caption', txt_field),
    ('valence_mean', label_field)   
]


# trainds, valds = data.TabularDataset.splits(path=datasets_root, format='csv', train='traindf.csv', validation='valdf.csv', fields=train_val_fields, skip_header=True)
# %%time
testHumanCaption = data.TabularDataset(
        path=humanCaptionWithScorePath, format='csv', 
        skip_header=True,
        fields=test_fields)

testAutoCaption = data.TabularDataset(
        path=autoCaptionWithScorePath, format='csv', 
        skip_header=True,
        fields=test_fields)

print(type(testHumanCaption))
print(len(testHumanCaption))
ex = testHumanCaption[0]
type(ex)
testHumanCaption.fields.items()
ex.valence_mean
ex.caption

<class 'torchtext.data.dataset.TabularDataset'>
900


torchtext.data.example.Example

dict_items([('id', None), ('image_title', None), ('caption', <torchtext.data.field.Field object at 0x7f8c1f15dfd0>), ('valence_mean', <torchtext.data.field.Field object at 0x7f8c1f15de80>)])

'5.2156862745098005'

['a',
 'man',
 'wearing',
 'blue',
 'jacket',
 'headlamp',
 'lighting',
 'fire',
 'pile',
 'wood',
 'kindling',
 'front']

## Load pretrained word vectors and building vocabulary

In [56]:
%%time
glove_dataset_root = "/home/elkhand/datasets/glove-vectors"
vec = vocab.Vectors('glove.twitter.27B.100d.txt', glove_dataset_root)


CPU times: user 471 ms, sys: 144 ms, total: 616 ms
Wall time: 592 ms


In [57]:
txt_field.build_vocab(testHumanCaption, testAutoCaption, max_size=100000, vectors=vec)
label_field.build_vocab(testHumanCaption)
txt_field.vocab.vectors.shape
txt_field.vocab.vectors[txt_field.vocab.stoi['dog']]

torch.Size([2367, 100])

tensor([ 0.5078, -1.0274,  0.4814, -0.0942,  0.4484, -0.5229,  0.5150,
        -0.0389,  0.3587, -0.0660, -0.8288,  0.7618, -3.8030, -0.0106,
         0.2165,  0.5971,  0.3742, -0.0226, -0.0103, -0.3397,  0.0943,
         0.2625, -0.4016, -0.0080,  1.0206, -0.3579, -0.5650,  0.5882,
        -0.8185,  0.3029,  0.4720, -0.0974, -0.6123, -0.1780, -0.1162,
         0.3259,  0.1150, -0.1903,  0.0116,  0.4648, -0.1681,  0.2197,
        -0.2594, -0.0135,  0.7071,  0.7811,  0.7992,  1.0389,  0.5279,
        -0.1116, -0.6227,  0.0307,  0.3385, -0.5309, -0.0997,  0.2160,
         0.6052,  1.2356, -0.0035, -0.0975, -0.2494,  0.2154,  0.4464,
         0.0954, -0.2737, -0.2854, -0.4089,  0.4822,  0.3032,  0.1944,
         0.8324, -0.5038,  0.3009, -0.4979,  0.5030,  0.0327, -0.5179,
        -0.2354,  0.2296, -0.6359,  1.6270,  0.6283, -0.7485,  0.6007,
        -0.0112, -0.3211,  0.1434, -0.0608,  0.0882,  0.6594, -0.4613,
        -0.3764, -0.1133,  0.1587,  0.3912,  0.6766, -0.0712,  0.1746,
      

## Load saved model state


In [60]:
vocab_size = len(txt_field.vocab)
embedding_dim = 100
n_hidden = 64
n_out = 2

class ConcatPoolingGRUAdaptive(nn.Module):
    def __init__(self, vocab_size, embedding_dim, n_hidden, n_out, pretrained_vec, bidirectional=True):
        super().__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.n_hidden = n_hidden
        self.n_out = n_out
        self.bidirectional = bidirectional
        
        self.emb = nn.Embedding(self.vocab_size, self.embedding_dim)
        self.emb.weight.data.copy_(pretrained_vec)
        self.emb.weight.requires_grad = False
        self.gru = nn.GRU(self.embedding_dim, self.n_hidden, bidirectional=bidirectional)
        if bidirectional:
            self.out = nn.Linear(self.n_hidden*2*2, self.n_out)
        else:
            self.out = nn.Linear(self.n_hidden*2, self.n_out)
        
    def forward(self, seq, lengths):
        bs = seq.size(1)
        self.h = self.init_hidden(bs)
        seq = seq.transpose(0,1)
        embs = self.emb(seq)
        embs = embs.transpose(0,1)
        embs = pack_padded_sequence(embs, lengths)
        gru_out, self.h = self.gru(embs, self.h)
        gru_out, lengths = pad_packed_sequence(gru_out)        
        
        avg_pool = F.adaptive_avg_pool1d(gru_out.permute(1,2,0),1).view(bs,-1)
        max_pool = F.adaptive_max_pool1d(gru_out.permute(1,2,0),1).view(bs,-1)        
        outp = self.out(torch.cat([avg_pool,max_pool],dim=1))
        return F.log_softmax(outp)
    
    def init_hidden(self, batch_size): 
        if self.bidirectional:
            return torch.zeros((2,batch_size,self.n_hidden)).to(device)
        else:
            return torch.zeros((1,batch_size,self.n_hidden)).cuda().to(device)

def save_checkpoint(checkpoint_path, model, optimizer):
    state = {'state_dict': model.state_dict(),
             'optimizer' : optimizer.state_dict()}
    torch.save(state, checkpoint_path)
    print('model saved to %s' % checkpoint_path)
    
def load_checkpoint(checkpoint_path, model, optimizer):
    state = torch.load(checkpoint_path)
    model.load_state_dict(state['state_dict'])
    optimizer.load_state_dict(state['optimizer'])
    print('model loaded from %s' % checkpoint_path)
    
# Load model
m = ConcatPoolingGRUAdaptive(vocab_size, embedding_dim, n_hidden, n_out, testHumanCaption.fields['caption'].vocab.vectors).to(device)
opt = optim.Adam(filter(lambda p: p.requires_grad, m.parameters()), 1e-3)
load_checkpoint('/home/elkhand/git-repos/sentiment-analysis-torchtext/model/twitter-%i.pth' % 5, m, opt)    

RuntimeError: Error(s) in loading state_dict for ConcatPoolingGRUAdaptive:
	While copying the parameter named "emb.weight", whose dimensions in the model are torch.Size([2367, 100]) and whose dimensions in the checkpoint are torch.Size([100002, 100]).