<a href="https://colab.research.google.com/github/chinmay002/Pytorch/blob/main/simple_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [19]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import torch.nn as nn
import torchvision
import torch.nn.functional as f
from torch.utils.data import DataLoader,Dataset

import torch.optim as optim
from sklearn.feature_extraction.text import CountVectorizer
from tqdm import tqdm, tqdm_notebook

In [17]:
df = pd.read_csv('/content/imdb_reviews.txt',sep='\t',header = None)
df.columns = ['text','target']

In [18]:
df.head()

Unnamed: 0,text,target
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [76]:
cv = CountVectorizer(max_df = 0.99,min_df=0.02,stop_words='english')
seq = cv.fit_transform(df['text'].to_list())
seq

<748x38 sparse matrix of type '<class 'numpy.int64'>'
	with 1122 stored elements in Compressed Sparse Row format>

In [77]:
seq.toarray()
cv.get_feature_names_out()

array(['10', 'acting', 'actors', 'bad', 'best', 'better', 'cast',
       'character', 'characters', 'didn', 'don', 'film', 'films', 'funny',
       'good', 'great', 'just', 'like', 'little', 'look', 'love', 'make',
       'movie', 'movies', 'plot', 'real', 'really', 'scenes', 'script',
       'seen', 'story', 'think', 'time', 'watch', 'watching', 'way',
       'wonderful', 'work'], dtype=object)

In [84]:
seq.[1,:]

<1x38 sparse matrix of type '<class 'numpy.int64'>'
	with 1 stored elements in Compressed Sparse Row format>

In [78]:
cv.vocabulary_ #gives token to index


{'movie': 22,
 'characters': 8,
 'acting': 1,
 'plot': 24,
 'little': 18,
 'best': 4,
 'good': 14,
 'look': 19,
 'film': 11,
 'think': 31,
 'films': 12,
 'love': 20,
 'just': 16,
 'time': 32,
 'character': 7,
 'make': 21,
 'funny': 13,
 'don': 10,
 'better': 5,
 'like': 17,
 'story': 30,
 'real': 25,
 'work': 37,
 'scenes': 27,
 'seen': 29,
 'didn': 9,
 'cast': 6,
 '10': 0,
 'actors': 2,
 'really': 26,
 'great': 15,
 'movies': 23,
 'script': 28,
 'way': 35,
 'watch': 33,
 'bad': 3,
 'watching': 34,
 'wonderful': 36}

In [168]:
class Sequences(Dataset):
    def __init__(self):
        df = pd.read_csv('/content/imdb_reviews.txt',sep='\t',header = None)
        df.columns = ['review','label']
        self.vectorizer = CountVectorizer(stop_words='english', max_df=0.99, min_df=0.005)
        self.sequences = self.vectorizer.fit_transform(df.review.tolist())
        self.labels = df.label.tolist()
        self.token2idx = self.vectorizer.vocabulary_
        self.idx2token = {idx: token for token, idx in self.token2idx.items()}
        
    def __getitem__(self, i):
        return self.sequences[i, :].toarray(), self.labels[i]
    
    def __len__(self):
        return self.sequences.shape[0]

dataset = Sequences()
train_loader = DataLoader(dataset, batch_size=16)

print(dataset[5][0].shape)

(1, 320)


In [169]:

next(iter(train_loader))

[tensor([[[0, 0, 0,  ..., 0, 0, 1]],
 
         [[0, 0, 0,  ..., 0, 0, 0]],
 
         [[0, 0, 0,  ..., 0, 0, 0]],
 
         ...,
 
         [[0, 0, 0,  ..., 0, 0, 0]],
 
         [[0, 0, 0,  ..., 0, 0, 0]],
 
         [[0, 0, 0,  ..., 0, 0, 0]]]),
 tensor([0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0])]

In [170]:
class NN(nn.Module):
  
  def __init__(self, vocab_size,hidden_units,num_classes):
    super(NN,self).__init__()
    self.linear1 = nn.Linear(vocab_size,hidden_units)
    self.linear2 = nn.Linear(hidden_units,num_classes)

  def forward(self,x_in)  :
    '''
    dataloader return size in [16,1,59]
    #for pred in loop we need [16,59] hence we need to squeeze dim 1
    '''


    return self.linear2(f.relu(self.linear1(x_in.squeeze(1).float())))

In [175]:
model = NN(320,100,1)
model

NN(
  (linear1): Linear(in_features=320, out_features=100, bias=True)
  (linear2): Linear(in_features=100, out_features=1, bias=True)
)

In [171]:
loss = nn.BCEWithLogitsLoss()
optimizer = optim.SGD(model.parameters(),lr=0.5)

In [176]:
model.train()
train_losses = []
for epoch in range(5):
  progress_bar = tqdm_notebook(train_loader, leave=False)
  losses = []
  b_size= 0
  for inputs, target in progress_bar:
    #print(inputs.shape)
    #print(inputs.squeeze(1).shape)
    #print(inputs)
    pred = model(inputs)

    y_loss = loss(pred.squeeze(),target.float())

    optimizer.zero_grad()
    y_loss.backward()
    optimizer.step()

    losses.append(y_loss)
    b_size +=1
  epoch_loss = sum(losses)/b_size
  train_losses.append(epoch_loss)
  
  print(f'loss at each epoch {epoch} {epoch_loss}')






Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  progress_bar = tqdm_notebook(train_loader, leave=False)


  0%|          | 0/47 [00:00<?, ?it/s]

loss at each epoch 0 0.6948881149291992


  0%|          | 0/47 [00:00<?, ?it/s]

loss at each epoch 1 0.6948881149291992


  0%|          | 0/47 [00:00<?, ?it/s]

loss at each epoch 2 0.6948881149291992


  0%|          | 0/47 [00:00<?, ?it/s]

loss at each epoch 3 0.6948881149291992


  0%|          | 0/47 [00:00<?, ?it/s]

loss at each epoch 4 0.6948881149291992


In [185]:
import torch
def predict_snetiment(text):
  with torch.no_grad():
    get_vector = torch.LongTensor(dataset.vectorizer.transform([text]).toarray())
    output = model(get_vector)
    pred = torch.sigmoid(output)
    print(pred)

predict_snetiment(text)    

tensor([[0.5075]])


In [181]:
text ="""
This poor excuse for a movie is terrible. It has been 'so good it's bad' for a
while, and the high ratings are a good form of sarcasm, I have to admit. But
now it has to stop. Technically inept, spoon-feeding mundane messages with the
artistic weight of an eighties' commercial, hypocritical to say the least, it
deserves to fall into oblivion. Mr. Derek, I hope you realize you are like that
weird friend that everybody know is lame, but out of kindness and Christian
duty is treated like he's cool or something. That works if you are a good
decent human being, not if you are a horrible arrogant bully like you are. Yes,
Mr. 'Daddy' Derek will end on the history books of the internet for being a
delusional sour old man who thinks to be a good example for kids, but actually
has a poster of Kim Jong-Un in his closet. Destroy this movie if you all have a
conscience, as I hope IHE and all other youtube channel force-closed by Derek
out of SPITE would destroy him in the courts.This poor excuse for a movie is
terrible. It has been 'so good it's bad' for a while, and the high ratings are
a good form of sarcasm, I have to admit. But now it has to stop. Technically
inept, spoon-feeding mundane messages with the artistic weight of an eighties'
commercial, hypocritical to say the least, it deserves to fall into oblivion.
Mr. Derek, I hope you realize you are like that weird friend that everybody
know is lame, but out of kindness and Christian duty is treated like he's cool
or something. That works if you are a good decent human being, not if you are a
horrible arrogant bully like you are. Yes, Mr. 'Daddy' Derek will end on the
history books of the internet for being a delusional sour old man who thinks to
be a good example for kids, but actually has a poster of Kim Jong-Un in his
closet. Destroy this movie if you all have a conscience, as I hope IHE and all
other youtube channel force-closed by Derek out of SPITE would destroy him in
the courts.
"""
dataset.vectorizer.transform([text])

<1x320 sparse matrix of type '<class 'numpy.int64'>'
	with 20 stored elements in Compressed Sparse Row format>

In [179]:
dataset.vectorizer.get_feature_names_out()

array(['10', '90', 'absolutely', 'acting', 'action', 'actor', 'actors',
       'actually', 'adorable', 'age', 'amazing', 'annoying', 'art',
       'attempt', 'audience', 'avoid', 'away', 'awful', 'bad', 'barely',
       'beautiful', 'beginning', 'believable', 'believe', 'best',
       'better', 'big', 'bit', 'black', 'book', 'bored', 'boring',
       'brilliant', 'budget', 'came', 'camera', 'care', 'cast', 'casting',
       'certainly', 'character', 'characters', 'cheap', 'chemistry',
       'child', 'cinema', 'cinematography', 'classic', 'clever', 'come',
       'comedy', 'comes', 'completely', 'consider', 'convincing', 'cool',
       'couldn', 'cover', 'crap', 'created', 'cult', 'dance', 'day',
       'death', 'definitely', 'depth', 'dialogue', 'did', 'didn',
       'different', 'directing', 'direction', 'director', 'disappointed',
       'does', 'doesn', 'don', 'drama', 'editing', 'effects', 'end',
       'ending', 'energy', 'enjoy', 'enjoyed', 'entertaining', 'entire',
       'espe

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as fn
from torch.utils.data import Dataset,DataLoader

import torch.optim as optim
import torchvision
from sklearn.feature_extraction.text import CountVectorizer
from tqdm import tqdm, tqdm_notebook

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
sentence = 'quick brown fox jumped into water'
token2idx = {word:i for i, word in enumerate(set(sentence.split(' ')))}

In [None]:
token2idx

In [None]:
idx_tensor = torch.LongTensor([token2idx[word] for word  in sentence.split(' ')])
idx_tensor

create a **which is of size **(n_vocab , emb_dim)**.
here we have vpcab length of 6 and emb_dim =f 200
emb_dim is randomly assigned wieghts which gets updatde on each epoch during triannig

In [None]:
emb_layer = nn.Embedding(num_embeddings = len(token2idx),embedding_dim=10)
emb_layer(idx_tensor),emb_layer(idx_tensor).shape

In [None]:
GLOVE_FILENAME = '/content/glove.6B.50d.txt'
glove_dict = {}
with open(GLOVE_FILENAME, encoding="utf8") as f:
  for line in f:
    values = line.split()
    word = values[0]
    vectors = np.array(values[1:],float)
    glove_dict[word] = vectors

In [None]:
from scipy import spatial
from sklearn.manifold import TSNE
from torch.nn.functional import cosine_similarity

In [None]:
def similarity(glove_dict,w1,w2):
  
  v1 = torch.from_numpy(glove_dict[w1]).unsqueeze(0)
  v2 = torch.from_numpy(glove_dict[w2]).unsqueeze(0)
  return cosine_similarity(v1,v2)

similarity(glove_dict,'dog','cat')

In [None]:
word_pairs = [
    ('dog', 'cat'),
    ('tree', 'cat'),
    ('tree', 'leaf'),
    ('king', 'queen'),
]

for word1, word2 in word_pairs:
    print(similarity(glove_dict,word1,word2))