<a href="https://colab.research.google.com/github/greenkode/pytorch/blob/master/Chapter_5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [43]:
import torch 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
from torchtext import data 
import torchtext
from pathlib import Path
import pandas as pd
import spacy

In [3]:
!wget http://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip
!unzip trainingandtestdata.zip
!rm trainingandtestdata.zip

--2020-09-06 12:10:28--  http://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip
Resolving cs.stanford.edu (cs.stanford.edu)... 171.64.64.64
Connecting to cs.stanford.edu (cs.stanford.edu)|171.64.64.64|:80... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip [following]
--2020-09-06 12:10:28--  https://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip
Connecting to cs.stanford.edu (cs.stanford.edu)|171.64.64.64|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 81363704 (78M) [application/zip]
Saving to: ‘trainingandtestdata.zip’


2020-09-06 12:10:29 (79.4 MB/s) - ‘trainingandtestdata.zip’ saved [81363704/81363704]

Archive:  trainingandtestdata.zip
  inflating: testdata.manual.2009.06.14.csv  
  inflating: training.1600000.processed.noemoticon.csv  


In [5]:
tweetsDF = pd.read_csv('training.1600000.processed.noemoticon.csv', header=None, engine='python')

In [6]:
tweetsDF.head()

Unnamed: 0,0,1,2,3,4,5
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [7]:
tweetsDF["sentiment_cat"] = tweetsDF[0].astype('category')

In [8]:
tweetsDF['sentiment'] = tweetsDF['sentiment_cat'].cat.codes

In [9]:
tweetsDF.to_csv('train-processed.csv', header=None, index=None)

In [10]:
tweetsDF.sample(10000).to_csv('train-processed-sample.csv',header=None, index=None)

In [47]:
LABEL = data.LabelField()
TWEET = data.Field(tokenize='spacy', lower=True)

In [48]:
fields = [('score', None), 
          ('id', None), 
          ('date', None), 
          ('query', None), 
          ('name', None), 
          ('tweet', TWEET), 
          ('category', None), 
          ('label', LABEL)]

In [49]:
twitterDataset = torchtext.data.TabularDataset(path='train-processed-sample.csv', format='csv', fields=fields, skip_header=False)

In [73]:
(train, test, valid) = twitterDataset.split(split_ratio=[0.8, 0.1, 0.1])

In [74]:
vars(train.examples[7])

{'label': '1',
 'tweet': ['@jcinqc',
  'lol',
  'what',
  'is',
  'a',
  'kaffeklatch',
  '?',
  'i',
  'have',
  'been',
  'friends',
  'with',
  'kelly',
  'for',
  'a',
  'while',
  ',',
  'she',
  'is',
  'good',
  'people',
  '  ',
  'she',
  'is',
  'just',
  'on',
  'fb',
  'more']}

In [75]:
device = 'cuda'
vocab_size = 16654
TWEET.build_vocab(train, max_size=vocab_size)
LABEL.build_vocab(train)

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits((train, valid, test), batch_size=32, device=device,
                                                                           sort_key = lambda x: len(x.tweet), sort_within_batch=False)

In [76]:
len(TWEET.vocab)
TWEET.vocab.freqs.most_common(10)

[('i', 5012),
 ('!', 4656),
 ('.', 4105),
 (' ', 2984),
 ('to', 2819),
 ('the', 2629),
 (',', 2378),
 ('a', 1814),
 ('my', 1634),
 ('you', 1509)]

In [77]:
class OurFirstLSTM(nn.Module):
  def __init__(self, hidden_size, embedding_dim, vocab_size):
    super(OurFirstLSTM, self).__init__()

    self.embedding = nn.Embedding(vocab_size, embedding_dim)
    self.encoder = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_size, num_layers=1)
    self.predictor = nn.Linear(hidden_size, 2)

  def forward(self, seq):
    output, (hidden,_) = self.encoder(self.embedding(seq))
    preds = self.predictor(hidden.squeeze(0))
    return preds

model = OurFirstLSTM(100, 300, 16654)
model.to(device)

OurFirstLSTM(
  (embedding): Embedding(16654, 300)
  (encoder): LSTM(300, 100)
  (predictor): Linear(in_features=100, out_features=2, bias=True)
)

In [78]:
optimizer = optim.Adam(model.parameters(), lr=2e-2)
criterion = nn.CrossEntropyLoss()

def train(epochs, model, optimizer, criterion, train_iterator, valid_iterator):
  for epoch in range(1, epochs + 1):

    training_loss = 0.0
    valid_loss = 0.0
    model.train()

    for batch_idx, batch in enumerate(train_iterator):
      optimizer.zero_grad()
      predict = model(batch.tweet)
      loss = criterion(predict, batch.label)
      loss.backward()
      optimizer.step()
      training_loss += loss.data.item() * batch.tweet.size(0)
    training_loss /= len(train_iterator)

    model.eval()
    for batch_idx, batch in enumerate(valid_iterator):
      predict = model(batch.tweet)
      loss = criterion(predict, batch.label)
      valid_loss += loss.data.item() * batch.tweet.size(0)
    valid_loss /= len(valid_iterator)
    print('Epoch: {}, Training Loss: {:.2f}, Validation Loss: {:.2f}'.format(epoch, training_loss, valid_loss))

In [79]:
train(5, model, optimizer, criterion, train_iterator, valid_iterator)

Epoch: 1, Training Loss: 25.22, Validation Loss: 13.34
Epoch: 2, Training Loss: 22.83, Validation Loss: 13.06
Epoch: 3, Training Loss: 20.72, Validation Loss: 14.58
Epoch: 4, Training Loss: 19.32, Validation Loss: 15.03
Epoch: 5, Training Loss: 18.38, Validation Loss: 15.05


In [84]:
def classify_tweet(tweet):
  categories = {0: 'Negative', 1: 'Positive'}
  processed = TWEET.process([TWEET.preprocess(tweet)])
  processed = processed.to(device)
  return categories[model(processed).argmax().item()]

In [85]:
classify_tweet('Reading my kindle2... Love it... Lee childs is good read.')

'Negative'

In [137]:
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS
import random
import nltk
from nltk.corpus import wordnet

nltk.download('wordnet')
nlp = English()

def remove_stopwords(sentence):
  doc = nlp(sentence)
  token_list = []
  for token in doc:
    token_list.append(token.text)

    filtered_sentence = []

    for word in token_list:
      lexeme = nlp.vocab[word]
      if lexeme.is_stop == False:
        filtered_sentence.append(word)
  return " ".join(filtered_sentence)

def get_synonyms(word):
  synonyms = wordnet.synsets(word)
  return word if len(synonyms) == 0 else synonyms[0]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [138]:
def random_insertion(sentence, n):
  words = remove_stopwords(sentence)
  print(words)
  for _ in range(n):
    new_synonym = get_synonyms(random.choice(words))
    sentence.insert(randrange(len(sentence)+1), new_synonym)
  return sentence

In [140]:
def random_swap(sentence, n=5):
    length = range(len(sentence))
    for _ in range(n):
        idx1, idx2 = random.sample(length, 2)
        sentence[idx1], sentence[idx2] = sentence[idx2], sentence[idx1]
    return sentence

In [141]:
def random_insertion(sentence,n):
    words = remove_stopwords(sentence)
    for _ in range(n):
        new_synonym = get_synonyms(random.choice(words))
        sentence.insert(randrange(len(sentence)+1), new_synonym)
    return sentence

In [None]:
# !pip install googletrans

In [147]:
import googletrans

translator = googletrans.Translator()
sentences = ['The cat sat on the mat']

translations_fr = translator.translate(sentences, dest='fr')
fr_text = [t.text for t in translations_fr]
translations_en = translator.translate(fr_text, dest='en')
en_text = [t.text for t in translations_en]
print(en_text)

['The cat sat on the carpet']


In [149]:
for i in range(5):
  available_langs = list(googletrans.LANGUAGES.keys())
  tr_lang = random.choice(available_langs)
  print(f"Translating to {googletrans.LANGUAGES[tr_lang]}")

  translations = translator.translate(sentences, dest=tr_lang)
  t_text = [t.text for t in translations]
  print(t_text)

  translations_en_random = translator.translate(t_text, src=tr_lang, dest='en')
  en_text = [t.text for t in translations_en_random]
  print(en_text)

Translating to thai
['แมวนั่งอยู่บนเสื่อ']
['A cat sitting on a mat']
Translating to hungarian
['A macska leült a szőnyegre']
['The cat sat down on the carpet']
Translating to igbo
['Pusi nọdụrụ n’ute']
['The cat sat on the mat']
Translating to slovenian
['Mačka je sedela na preprogi']
['The cat was sitting on the carpet']
Translating to esperanto
['La kato sidis sur la mato']
['The cat was sitting on the mat']
