# Exercise 13-2: Sentiment analysis on movie reviews
https://www.kaggle.com/c/sentiment-analysis-on-movie-reviews<br>
The sentiment labels are:<br>
<br>
0 - negative<br>
1 - somewhat negative<br>
2 - neutral<br>
3 - somewhat positive<br>
4 - positive<br>

In [4]:
from tqdm import tqdm_notebook as tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd

In [82]:
# Read both tab-separated splits, then stack them into a single frame.
train_df, test_df = (pd.read_csv(path, sep='\t') for path in ('train.tsv', 'test.tsv'))
df = pd.concat((train_df, test_df), ignore_index=True, sort=True)

# Analyse Dataset

## Observe some records

In [83]:
# Show the first phrase of the combined frame.
df['Phrase'].iloc[0]

'A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .'

In [84]:
# Show the second phrase for comparison with the first.
df['Phrase'].iloc[1]

'A series of escapades demonstrating the adage that what is good for the goose'

### Strategy
1. Convert the `Phrase` column to lowercase
2. Find the `input_size` by taking the maximum word count over `Phrase`
3. Build `word2idx` for word embedding
4. Check whether the dataset is imbalanced

#### 1. lowercase()

In [85]:
# Normalize case so that words differing only in capitalization are identical.
df['Phrase'] = df['Phrase'].map(lambda phrase: phrase.lower())

In [86]:
# Keep the first phrase as a sample string to experiment with.
msg = df['Phrase'].iloc[0]

In [87]:
# Display the sample phrase.
msg

'a series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .'

In [88]:
# Tokenize the sample phrase by splitting on single spaces.
alist = msg.split(' ')

In [89]:
# Tokenize every phrase by splitting on single spaces.
df['tokenized'] = df['Phrase'].apply(lambda phrase: phrase.split(' '))

In [90]:
# Number of tokens in each phrase.
df['phrase_length'] = df['tokenized'].apply(len)

#### 2. Find maximum input_size

In [91]:
# Select the row(s) whose token count equals the maximum.
df.loc[df['phrase_length'] == df['phrase_length'].max()]

Unnamed: 0,Phrase,PhraseId,SentenceId,Sentiment,tokenized,phrase_length
159836,the film is faithful to what one presumes are ...,159837,8685,,"[the, film, is, faithful, to, what, one, presu...",56


#### 3. Prepare word embedding

In [97]:
from torch.utils.data import Dataset, DataLoader

In [96]:
class Dictionary:
    """Two-way vocabulary that assigns consecutive integer ids to words.

    ``word2idx`` maps a word to its id, ``idx2word`` maps an id back to its
    word, and ``idx`` holds the id that the next new word will receive.
    """

    def __init__(self):
        self.word2idx, self.idx2word = {}, {}
        self.idx = 0

    def add_word(self, word: str):
        """Register ``word`` under the next free id; known words are ignored."""
        if word in self.word2idx:
            return
        new_id = self.idx
        self.word2idx[word] = new_id
        self.idx2word[new_id] = word
        self.idx = new_id + 1

    def __len__(self):
        """Return the number of distinct words registered so far."""
        return len(self.word2idx)

In [107]:
class WordDataset(Dataset):
    """Dataset of tokenized phrases read from a tab-separated file.

    The ``Phrase`` column is lowercased and split on single spaces into a
    ``tokenized`` column. Each item is a ``(tokens, sentiment)`` pair taken
    from the ``tokenized`` and ``Sentiment`` columns of one row.
    """

    def __init__(self, filename='train.tsv'):
        super().__init__()
        frame = pd.read_csv(filename, sep='\t')
        # Normalize case first, then split each phrase on single spaces.
        frame['Phrase'] = frame['Phrase'].apply(lambda text: text.lower())
        frame['tokenized'] = frame['Phrase'].apply(lambda text: text.split(' '))

        self.df = frame
        self.len = len(frame)

    def __getitem__(self, index):
        """Return ``(tokens, sentiment)`` for the row at position ``index``."""
        row = self.df.iloc[index]
        tokens, sentiment = row.tokenized, row.Sentiment
        return tokens, sentiment

    def __len__(self):
        """Return the number of rows that were loaded."""
        return self.len

In [108]:
class RawDataset(Dataset):
    """Dataset of tokenized phrases built from an already-loaded DataFrame.

    The ``Phrase`` column is lowercased and split on single spaces into a
    ``tokenized`` column. Each item is a ``(tokens, sentiment)`` pair taken
    from the ``tokenized`` and ``Sentiment`` columns of one row.

    Args:
        df: DataFrame with at least the ``Phrase`` and ``Sentiment`` columns.
            It is copied, so the caller's frame is left untouched.
    """

    def __init__(self, df: pd.DataFrame):
        super().__init__()
        # Work on a copy: assigning columns below would otherwise overwrite
        # ``Phrase`` and add ``tokenized`` on the caller's DataFrame.
        df = df.copy()
        df['Phrase'] = df['Phrase'].apply(lambda x: x.lower())
        df['tokenized'] = df['Phrase'].apply(lambda x: x.split(' '))

        self.len = len(df)
        self.df = df

    def __getitem__(self, index):
        """Return ``(tokens, sentiment)`` for the row at position ``index``."""
        record = self.df.iloc[index]
        return record.tokenized, record.Sentiment

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return self.len

In [126]:
# Start with an empty vocabulary.
my_dict = Dictionary()

In [127]:
# Sanity check on cell execution order: the vocabulary must still be empty here.
assert len(my_dict) == 0

In [109]:
# Wrap the combined DataFrame so it can be iterated through a DataLoader.
total_dataset = RawDataset(df)

In [110]:
total_loader = DataLoader(dataset=total_dataset, num_workers=2) # No shuffling needed: this pass is only for analysis

In [128]:
# Build the vocabulary from every phrase in the loader.
for data, _target in tqdm(total_loader):
    for token in data:
        # The DataLoader's default collation (batch_size=1) hands each token
        # back wrapped in a 1-element sequence; unwrap it so the vocabulary
        # is keyed by plain strings rather than 1-tuples.
        if isinstance(token, (list, tuple)):
            token = token[0]
        my_dict.add_word(token)

HBox(children=(IntProgress(value=0, max=222352), HTML(value='')))

##### Save the dictionary to a binary file

In [130]:
import pickle

# Persist the vocabulary to disk using the newest pickle protocol available.
with open('dictioanry_data.pkl', 'wb') as pkl_file:
    pickle.dump(my_dict, pkl_file, protocol=pickle.HIGHEST_PROTOCOL)

##### Reload the saved `my_dict`, because building the dictionary takes 5 minutes

In [133]:
# Restore the pickled vocabulary from disk.
# NOTE: unpickling can execute arbitrary code; only load files you created yourself.
# The handle is not named `input`, which would rebind the builtin input()
# for the rest of the session.
with open('dictioanry_data.pkl', 'rb') as pkl_file:
    my_dict = pickle.load(pkl_file)