In [1]:
!apt-get install wget


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
wget is already the newest version (1.21.2-2ubuntu1.1).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.


In [2]:
!wget https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz


--2024-09-02 03:07:27--  https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Resolving ai.stanford.edu (ai.stanford.edu)... 171.64.68.10
Connecting to ai.stanford.edu (ai.stanford.edu)|171.64.68.10|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84125825 (80M) [application/x-gzip]
Saving to: ‘aclImdb_v1.tar.gz’


2024-09-02 03:07:34 (13.1 MB/s) - ‘aclImdb_v1.tar.gz’ saved [84125825/84125825]



In [3]:
!tar -xf aclImdb_v1.tar.gz


In [4]:
import os
import pandas as pd

def load_imdb_data(directory):
    data = {'review': [], 'sentiment': []}

    for label in ['pos', 'neg']:
        sentiment_dir = os.path.join(directory, label)
        for review_file in os.listdir(sentiment_dir):
            with open(os.path.join(sentiment_dir, review_file), 'r', encoding='utf-8') as file:
                data['review'].append(file.read())
                data['sentiment'].append(1 if label == 'pos' else 0)

    return pd.DataFrame(data)
train_data = load_imdb_data('aclImdb/train')
test_data = load_imdb_data('aclImdb/test')

train_data.head()


Unnamed: 0,review,sentiment
0,This movie took the Jerry Springer approach to...,1
1,Twisted Desire (1996) was a TV movie starring ...,1
2,Other reviewers have summarized this film noir...,1
3,"My wife, Kate and I absolutely loved the serie...",1
4,A wonderful early musical film from Rene Clair...,1


In [5]:
len(train_data)

25000

In [6]:
len(test_data)

25000

In [7]:
train_data['review'][24995]

'This anime series starts out great: Interesting story, exciting events, interesting characters, beautifully rendered and executed. Not everything is explained right away, dangling a proverbial carrot before the viewer, enticing the viewer to watch each succeeding episode. But imagine the disappointment to find that the sci-fi thriller/giant robot adventure is only a backdrop for psycho-babble and quasi-religious preachy exploitation. If you want to hear "You\'re OK. It\'s good to be you." after being embattled with negative slogans and the characters\' negative emotions, then this is for you. If you want a good sci-fi flick that is simply fun to watch, forget this one. Both the original, and the alternate endings were grossly disappointing to me. All that, AND this movie was too preachy.'

In [8]:
import numpy as np
print(np.sum(len(train_data['review'][i]) for i in range(len(train_data)))//len(train_data))

  print(np.sum(len(train_data['review'][i]) for i in range(len(train_data)))//len(train_data))


1325


In [9]:
len(np.unique(train_data['review']))

24904

In [10]:
df = train_data.drop_duplicates(subset=['review'])#Taking training data as dataset for computational reasons


In [11]:
len(df)

24904

In [12]:
df

Unnamed: 0,review,sentiment
0,This movie took the Jerry Springer approach to...,1
1,Twisted Desire (1996) was a TV movie starring ...,1
2,Other reviewers have summarized this film noir...,1
3,"My wife, Kate and I absolutely loved the serie...",1
4,A wonderful early musical film from Rene Clair...,1
...,...,...
24995,This anime series starts out great: Interestin...,0
24996,Do you like stand up? Then stay away from this...,0
24997,"In the ravaged wasteland of the future, mankin...",0
24998,This is another typical unbelievable and non-s...,0


In [13]:
df['sentiment'].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
1,12472
0,12432


In [14]:
!pip install transformers torch



In [15]:
df

Unnamed: 0,review,sentiment
0,This movie took the Jerry Springer approach to...,1
1,Twisted Desire (1996) was a TV movie starring ...,1
2,Other reviewers have summarized this film noir...,1
3,"My wife, Kate and I absolutely loved the serie...",1
4,A wonderful early musical film from Rene Clair...,1
...,...,...
24995,This anime series starts out great: Interestin...,0
24996,Do you like stand up? Then stay away from this...,0
24997,"In the ravaged wasteland of the future, mankin...",0
24998,This is another typical unbelievable and non-s...,0


In [16]:
from torch.utils.data import Dataset, DataLoader,random_split
import torch.nn as nn
import torch
from transformers import DistilBertTokenizer,DistilBertModel,AdamW


In [17]:
class IMBDDataset(Dataset):
  def __init__(self,df,tokenizer,max_length):
    self.dataset=df
    self.tokenizer=tokenizer
    self.max_length=max_length
  def __len__(self):
    return len(self.dataset)
  def __getitem__(self, index):
    text=self.dataset.iloc[index]['review']
    sentiment=self.dataset.iloc[index]['sentiment']
    encoding=self.tokenizer(text, return_tensors='pt',max_length=self.max_length,truncation=True,padding='max_length')

    return {
      'review': text,
      'input_ids': encoding['input_ids'].flatten(),
      'attention_mask': encoding['attention_mask'].flatten(),
      'sentiment': torch.tensor(self.dataset.iloc[index]['sentiment'], dtype=torch.long)
    }



In [18]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
dataset = IMBDDataset(df, tokenizer, 512)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [19]:
dataset[7770]

{'review': 'Let me start off by saying that after watching this episode for the first time on DVD at 10 o\'clock P.M. one night, I could not fall asleep until about 3:00 A.M.<br /><br />This brief review may contain spoilers.<br /><br />I\'m a long-time fan of The Sopranos and I can safely say this is the best episode I\'ve seen. I\'m not saying everyone should feel this way, but I do. This episode is identical to the weekend I spent with my family, watching over my own father, comatose in the ICU before he passed.<br /><br />The episode begins with Tony in an alternate reality: he is a salesman who\'s identity has been mistaken for that of a man named Kevin Finnerty.<br /><br />By the time ten minutes had gone by, I knew either Tony was dreaming, or I was watching some other show. It wasn\'t like the normal Sopranos and I loved it.<br /><br />Option 1 is confirmed when Anthony (or "Kevin") looks into the sky at a "helicopter spotlight" and we see prodding through it, a doctor with a f

In [20]:
tokenizer.decode(dataset[7770]['input_ids'])

'[CLS] let me start off by saying that after watching this episode for the first time on dvd at 10 o\'clock p. m. one night, i could not fall asleep until about 3 : 00 a. m. < br / > < br / > this brief review may contain spoilers. < br / > < br / > i\'m a long - time fan of the sopranos and i can safely say this is the best episode i\'ve seen. i\'m not saying everyone should feel this way, but i do. this episode is identical to the weekend i spent with my family, watching over my own father, comatose in the icu before he passed. < br / > < br / > the episode begins with tony in an alternate reality : he is a salesman who\'s identity has been mistaken for that of a man named kevin finnerty. < br / > < br / > by the time ten minutes had gone by, i knew either tony was dreaming, or i was watching some other show. it wasn\'t like the normal sopranos and i loved it. < br / > < br / > option 1 is confirmed when anthony ( or " kevin " ) looks into the sky at a " helicopter spotlight " and we

In [21]:
dataset[900]

{'review': 'TESS OF THE STORM COUNTRY is possibly the best movie of all of Mary Pickford\'s films. At two hours, it was quite long for a 1922 silent film yet continues to hold your interest some 80 years after it was filmed. Mary gives one of her finest performances at times the role seems like a "greatest hits" performance with bits of Mary the innocent, Mary the little devil, Mary the little mother, Mary the spitfire, Mary the romantic heroine, etc. characteristics that often were used throughout a single film in the past. The movie is surprisingly frank about one supporting character\'s illegitimate child for 1922 and at one point our Little Mary is thought the unwed mother in question! If the Academy Awards had been around in 1922, no doubt the Best Actress Oscar for the year would have been Mary\'s.',
 'input_ids': tensor([  101, 15540,  1997,  1996,  4040,  2406,  2003,  4298,  1996,  2190,
          3185,  1997,  2035,  1997,  2984,  4060,  3877,  1005,  1055,  3152,
          1

In [22]:
train_size = int(0.8 * len(df))
val_size = len(df) - train_size
train_dataset, test_dataset = random_split(dataset, [train_size, val_size])

In [23]:
train_loader=DataLoader(train_dataset,batch_size=10,shuffle=True)
test_loader=DataLoader(test_dataset,batch_size=10,shuffle=True)

In [24]:
len(train_loader)+len(test_loader)

2492

In [25]:
class SentimentModel(nn.Module):
  def __init__(self,num_labels=2):
    super().__init__()
    self.distilbert=DistilBertModel.from_pretrained('distilbert-base-uncased')
    self.layer=nn.Linear(768,768)
    self.dropout=nn.Dropout(0.2)
    self.layer2=nn.Linear(768,num_labels)

  def forward(self,input_ids,attention_mask):
    output=self.distilbert(input_ids=input_ids,attention_mask=attention_mask)
    cls=output.last_hidden_state[:,0]
    o1=self.layer(cls)
    o2=nn.ReLU()(o1)
    logits=self.layer2(o2)



    return logits
    # print(output.last_hidden_state.shape)
    # print(output)
    print("*********************")
    # print(output[0]==output.last_hidden_state)




In [26]:
# len(z['input_ids'].flatten())
for i in train_loader:
  z=i
  break

In [27]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device
z5=SentimentModel()
z5.to(device)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

SentimentModel(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Li

In [30]:

for param in z5.distilbert.parameters():
    param.requires_grad = False

trainable_params = sum(p.numel() for p in z5.parameters() if p.requires_grad)
print(f"Number of trainable parameters: {trainable_params}")


Number of trainable parameters: 592130


In [31]:
z

{'review': ["I go to the cinema to be entertained. There is absolutely nothing entertaining about this film. From beginning to end, there is no respite from the gray, grinding reality of this woman's life. It is one-paced, with no change of mood. I remained until the end only because I was convinced that things must get better. They don't, and I don't think I was the only one, as evidenced by the many groans ringing around the cinema as the film drew mercifully to a close. Honestly depicting social depravation is no crime, but boring your audience to groans is not the way to win the sympathy of the public. A dreadful film.",
  'Holy crap this is so hysterical! Why aren\'t American comedies written like this? For anybody who thinks comedy has to be dumb-- there is more wit and intelligence in the six episodes of this series than in a shelf of novels! Hugh Laurie is a complete hoot. I couldn\'t believe it was the same guy as House! There are so many great lines and gags in this series yo

In [32]:
y=z5(z['input_ids'].to(device),z['attention_mask'].to(device))

In [33]:
y

tensor([[-0.0137, -0.0536],
        [-0.0401, -0.0352],
        [-0.0348, -0.0512],
        [-0.0130, -0.0379],
        [-0.0279, -0.0303],
        [-0.0397, -0.0028],
        [-0.0122, -0.0465],
        [-0.0337, -0.0754],
        [-0.0062, -0.0442],
        [ 0.0053, -0.0843]], device='cuda:0', grad_fn=<AddmmBackward0>)

In [30]:
for i in train_loader:
  z=i
  break

In [31]:
print(y)

None


In [32]:
z['input_ids']

tensor([[  101,  1045,  2001,  ...,     0,     0,     0],
        [  101,  1045,  1012,  ...,     0,     0,     0],
        [  101,  2004, 15444,  ...,     0,     0,     0],
        ...,
        [  101,  1996,  6819,  ...,     0,     0,     0],
        [  101,  2520,  8997,  ...,     0,     0,     0],
        [  101,  1045,  2288,  ...,     0,     0,     0]])

In [33]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [61]:
z5=SentimentModel()

In [36]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
z5.to(device)

optimizer = AdamW(z5.parameters(), lr=5e-5)

z5.train()
for epoch in range(1):
    for i, batch in enumerate(train_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['sentiment'].to(device)

        optimizer.zero_grad()
        logits = z5(input_ids=input_ids, attention_mask=attention_mask)
        # predictions = torch.argmax(logits[0], dim=1)
        # print(logits)
        # print(labels)
        loss = nn.CrossEntropyLoss()(logits, labels)
        loss.backward()
        optimizer.step()

        if (i + 1) % 100 == 0:
            print(f"Epoch {epoch + 1}, Batch {i + 1}, Loss: {loss.item():.4f}")

Epoch 1, Batch 100, Loss: 0.6696
Epoch 1, Batch 200, Loss: 0.5470
Epoch 1, Batch 300, Loss: 0.5578
Epoch 1, Batch 400, Loss: 0.6730
Epoch 1, Batch 500, Loss: 0.4268
Epoch 1, Batch 600, Loss: 0.5155
Epoch 1, Batch 700, Loss: 0.3114
Epoch 1, Batch 800, Loss: 0.2881
Epoch 1, Batch 900, Loss: 0.2297
Epoch 1, Batch 1000, Loss: 0.4431
Epoch 1, Batch 1100, Loss: 0.3747
Epoch 1, Batch 1200, Loss: 0.3751
Epoch 1, Batch 1300, Loss: 0.3936
Epoch 1, Batch 1400, Loss: 0.3803
Epoch 1, Batch 1500, Loss: 0.4007
Epoch 1, Batch 1600, Loss: 0.5442
Epoch 1, Batch 1700, Loss: 0.5525
Epoch 1, Batch 1800, Loss: 0.2854
Epoch 1, Batch 1900, Loss: 0.3458


In [38]:
z5.eval()
total_correct = 0
total = 0
for batch in test_loader:
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['sentiment'].to(device)

    with torch.inference_mode():
        logits = z5(input_ids=input_ids, attention_mask=attention_mask)
    predictions = torch.argmax(logits, dim=1)
    total_correct += (predictions == labels).sum().item()
    total += predictions.size(0)

print(f'Test Accuracy: {total_correct / total:.4f}')

Test Accuracy: 0.8508


In [39]:
def predict_sentiment(text, model, tokenizer, max_length = 512):
    model.eval()
    encoding=tokenizer(text, return_tensors='pt',max_length=max_length,truncation=True,padding='max_length')
    # print(encoding)
    input_ids = encoding['input_ids']
    attention_mask = encoding['attention_mask']
    input_ids = input_ids.to(device)
    attention_mask = attention_mask.to(device)

    with torch.inference_mode():
        logits = model(input_ids=input_ids, attention_mask=attention_mask)
    prediction = torch.argmax(logits, dim=1).item()
    label_dict = {0: 'negative', 1: 'positive'}
    sentiment = label_dict[prediction]
    return sentiment


In [40]:

print(predict_sentiment("that movie was good.", z5, tokenizer))

positive


In [44]:
print(predict_sentiment("that movie was not bad.", z5, tokenizer))

negative
