## Reading combined csv

In [2]:
!pip install datasets



In [3]:
import pandas as pd

# Load the combined YouTube text/sentiment table from the working directory.
df= pd.read_csv('combined_youtube_data.csv')
# Display the loaded frame.
df

Unnamed: 0,sentiment,combined_text
0,1.0,focus others motivational video grab power wor...
1,1.0,dont waste life powerful motivational speech v...
2,1.0,kill excuse motivational speech download video...
3,1.0,one greatest speech ever steve job steve job e...
4,1.0,confidence motivational speech confidence spok...
...,...,...
360,0.0,many child survive gun violence face barrier m...
361,0.0,texas student protest gun violence here whats ...
362,0.0,pattern gun violence united state annual revie...
363,0.0,gun violence chicago president obama issue exe...


In [4]:
# Pick 115 rows reproducibly (random_state=1) and keep only their index labels.
sample_df = df.sample(n=115, random_state=1).index
# Remove those rows and renumber the remaining ones from 0.
df = df.drop(index=sample_df).reset_index(drop=True)
df

Unnamed: 0,sentiment,combined_text
0,1.0,focus others motivational video grab power wor...
1,1.0,dont waste life powerful motivational speech v...
2,1.0,kill excuse motivational speech download video...
3,1.0,one greatest speech ever steve job steve job e...
4,1.0,english speech muniba mazari motivational word...
...,...,...
245,0.0,mapping global gun violence el salvador venezu...
246,0.0,serbian turn outrage action recent gun violenc...
247,0.0,addressing america gun violence crisis beyond ...
248,0.0,many child survive gun violence face barrier m...


In [5]:
# Map the labels
def map_labels(label):
    """Shift a raw sentiment code from {-1, 0, 1} to a class id in {0, 1, 2}.

    Args:
        label: Raw sentiment value; ints and equal-valued floats (e.g. 1.0)
            are both accepted because the comparison is by value.

    Returns:
        int: 0 for -1, 1 for 0, 2 for 1.

    Raises:
        ValueError: If ``label`` is none of -1, 0 or 1 (including NaN).
            Previously such values silently became ``None``, which would
            only surface later as a corrupt label column.
    """
    if label == -1:
        return 0  # Map -1 to 0
    if label == 0:
        return 1  # Map 0 to 1
    if label == 1:
        return 2  # Map 1 to 2
    raise ValueError(f"Unexpected sentiment label: {label!r}")

# Overwrite the sentiment column with the shifted class ids.
# NOTE: this mutates `df` in place, so running the cell twice shifts the labels twice.
df['sentiment'] = df['sentiment'].apply(map_labels)

In [6]:
# Display the frame after label mapping.
df

Unnamed: 0,sentiment,combined_text
0,2,focus others motivational video grab power wor...
1,2,dont waste life powerful motivational speech v...
2,2,kill excuse motivational speech download video...
3,2,one greatest speech ever steve job steve job e...
4,2,english speech muniba mazari motivational word...
...,...,...
245,1,mapping global gun violence el salvador venezu...
246,1,serbian turn outrage action recent gun violenc...
247,1,addressing america gun violence crisis beyond ...
248,1,many child survive gun violence face barrier m...


## Model training

In [7]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch

# Both the tokenizer and the model are loaded from the same pretrained checkpoint.
checkpoint='bert-base-uncased'
tokenizer= BertTokenizer.from_pretrained(checkpoint)
# num_labels=3 matches the three class ids (0, 1, 2) produced by map_labels.
model= BertForSequenceClassification.from_pretrained(checkpoint, num_labels=3)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Tokenize function

In [8]:
def tokenize_data(data):
  """Tokenize the 'combined_text' entry of one record with the notebook's tokenizer.

  Truncation is enabled; no padding is requested here.
  """
  text = data['combined_text']
  encoding = tokenizer(text, truncation=True)
  return encoding

## Tokenize dataset

In [9]:
# Tokenize every row (truncation on, no padding). The result is stored under the
# name `tokenize_data`, which later cells read. The tokenizer call is inlined so
# this cell no longer invokes the name it is about to overwrite and can be re-run.
tokenize_data = df.apply(
    lambda row: tokenizer(row['combined_text'], truncation=True),
    axis=1,
)

In [37]:
# Keep a handle on the label column.
# NOTE(review): `sentiment` is not referenced by any other cell shown here — confirm it is still needed.
sentiment=df['sentiment']

In [10]:
# Inspect the encoding produced for the first row.
tokenize_data[0]

{'input_ids': [101, 3579, 2500, 14354, 2389, 2678, 6723, 2373, 2773, 3942, 3573, 3579, 2500, 14354, 2389, 2678, 2651, 2342, 9979, 2272, 2524, 2154, 2342, 5138, 4119, 2651, 2342, 2991, 2293, 2832, 7564, 4495, 2131, 22585, 4558, 6896, 2228, 3214, 2115, 2063, 2183, 3362, 10461, 2081, 6279, 2568, 2507, 2034, 2051, 3587, 2051, 16215, 2051, 2428, 2812, 2134, 2102, 2215, 2919, 2438, 2242, 2115, 2063, 8929, 21660, 2333, 5263, 2298, 2115, 2063, 22585, 2146, 2635, 2123, 2102, 2507, 2111, 2409, 2115, 2063, 7729, 2903, 2878, 2166, 2183, 2644, 8929, 2183, 2202, 3437, 10047, 2183, 7392, 19960, 3695, 26775, 3012, 10047, 2183, 2562, 11828, 2404, 2540, 3198, 14635, 2884, 2546, 2428, 2215, 2428, 2215, 4339, 3015, 3426, 3241, 3241, 9005, 3746, 2131, 3746, 2131, 3746, 3652, 2311, 4432, 2568, 28036, 2008, 2015, 2904, 2088, 2228, 2562, 6975, 3341, 2562, 2327, 3754, 2228, 7374, 10597, 2303, 22718, 2568, 3579, 2172, 2175, 3300, 5293, 2707, 2673, 2707, 2115, 2063, 10597, 3201, 2115, 2063, 2196, 2428, 8186, 481

In [11]:
# Walk the per-row encodings once, collecting token ids and attention masks
# into two parallel lists.
input_ids, attention_masks = [], []
for encoding in tokenize_data:
    input_ids.append(encoding['input_ids'])
    attention_masks.append(encoding['attention_mask'])

In [12]:
# Assemble the model inputs and the class ids into one frame; the column names
# are the keyword names the model's forward pass is later called with.
tokenized_df = pd.DataFrame(
    dict(
        input_ids=input_ids,
        attention_mask=attention_masks,
        labels=df['sentiment'],
    )
)

In [13]:
# Display the tokenized frame.
tokenized_df

Unnamed: 0,input_ids,attention_mask,labels
0,"[101, 3579, 2500, 14354, 2389, 2678, 6723, 237...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",2
1,"[101, 2123, 2102, 5949, 2166, 3928, 14354, 238...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",2
2,"[101, 3102, 8016, 14354, 2389, 4613, 8816, 267...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",2
3,"[101, 2028, 4602, 4613, 2412, 3889, 3105, 3889...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",2
4,"[101, 2394, 4613, 14163, 3490, 3676, 5003, 905...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",2
...,...,...,...
245,"[101, 12375, 3795, 3282, 4808, 3449, 10582, 83...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1
246,"[101, 6514, 2735, 19006, 2895, 3522, 3282, 480...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1
247,"[101, 12786, 2637, 3282, 4808, 5325, 3458, 374...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1
248,"[101, 2116, 2775, 5788, 3282, 4808, 2227, 8803...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1


## Dynamic Padding

In [14]:
from transformers import DataCollatorWithPadding

# Collator built around the same tokenizer; it is passed as `collate_fn` /
# `data_collator` further down so padding happens per batch.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## Splitting into train and validation data

In [15]:
from sklearn.model_selection import train_test_split

# 80/20 split with a fixed seed; each part is renumbered from 0.
train_data, val_data = (
    part.reset_index(drop=True)
    for part in train_test_split(tokenized_df, test_size=0.2, random_state=42)
)

In [16]:
# Display the training split.
train_data

Unnamed: 0,input_ids,attention_mask,labels
0,"[101, 3188, 25090, 6774, 5177, 2740, 3145, 134...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1
1,"[101, 4254, 3282, 4808, 12084, 2637, 17870, 10...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1
2,"[101, 6734, 3282, 4808, 3282, 2746, 2610, 2903...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1
3,"[101, 6986, 19895, 7911, 6319, 3233, 6279, 261...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",2
4,"[101, 2893, 5881, 4997, 2131, 4895, 3367, 1272...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1
...,...,...,...
195,"[101, 2707, 2154, 8404, 2296, 2154, 5630, 3407...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",2
196,"[101, 2373, 2894, 6517, 25619, 14129, 14855, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",2
197,"[101, 4088, 2154, 2643, 4952, 2707, 2154, 2851...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",2
198,"[101, 2678, 2265, 2450, 16939, 2743, 2102, 247...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1


In [17]:
# Display the validation split.
val_data

Unnamed: 0,input_ids,attention_mask,labels
0,"[101, 3422, 7226, 2368, 8847, 5604, 4997, 2522...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1
1,"[101, 10047, 2183, 2663, 2190, 14354, 2389, 46...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",2
2,"[101, 2643, 4471, 2412, 20744, 10303, 2775, 26...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",2
3,"[101, 5304, 2147, 8075, 2126, 9223, 3112, 2466...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",2
4,"[101, 6473, 3077, 2155, 3745, 3893, 4471, 2966...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",2
5,"[101, 4854, 2743, 2102, 23902, 2792, 28667, 93...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1
6,"[101, 4187, 4106, 8065, 3076, 3722, 3357, 6235...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1
7,"[101, 21625, 22226, 2015, 4854, 2743, 2102, 26...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1
8,"[101, 3959, 14354, 2389, 2678, 4942, 29234, 23...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",2
9,"[101, 6366, 4997, 2245, 6517, 25619, 14129, 14...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",2


In [46]:
# !pip install datasets

In [18]:
from datasets import Dataset

# Wrap both pandas splits as Hugging Face datasets.
train_dataset, val_dataset = (
    Dataset.from_pandas(frame) for frame in (train_data, val_data)
)

In [19]:
# Have both datasets return torch tensors when indexed.
for split in (train_dataset, val_dataset):
  split.set_format('torch')

In [20]:
# Show the features and row counts of both datasets.
train_dataset, val_dataset

(Dataset({
     features: ['input_ids', 'attention_mask', 'labels'],
     num_rows: 200
 }),
 Dataset({
     features: ['input_ids', 'attention_mask', 'labels'],
     num_rows: 50
 }))

In [21]:
# Inspect the token ids of the first training example.
train_dataset['input_ids'][0]

tensor([  101,  3188, 25090,  6774,  5177,  2740,  3145, 13463,  4997,  2739,
         3167,  4119,  3191,  4265, 17079,  6357,  2192,  2553,  2145,  4812,
         2028,  9968,  2051,  2739, 20110,  5266,  5177,  2740,  8476,  2802,
         3204,  2089,  5177,  2740,  7073,  3204,  2733,  2559,  8304,  5377,
        11265, 20697,  7730,  2088, 27363, 13160,  5177,  2110,  2739, 20110,
         2085,  7648, 12940,  2162,  2331,  4808,  2156,  5396,  2963,  7249,
         2175, 11529,  6748,  4254, 11265, 20697,  7730,  5177,  2740,  2568,
         3993,  3189,  6517,  3773,  2066,  9202,  2518,  2183,  2105,  5396,
         2823,  2064,  2102,  2903,  2088,  2272,  2694,  7249, 27696,  2739,
         2095,  2214,  2158,  4187,  6540,  4650, 16061,  2166,  2729,  3282,
         2915,  6357,  2028,  2711,  8760,  2512, 15509,  8701,  4544, 11585,
        21690,  8545,  4088,  3892,  2158,  5307,  3715,  4465,  3944, 21690,
         2741,  2711,  2902,  2468, 10827,  3426, 12039,  2926, 

## Dividing the data into batches using DataLoader

In [22]:
from torch.utils.data import DataLoader

# Training batches are shuffled; both loaders pad per batch via `data_collator`.
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=8, collate_fn=data_collator)
# The validation loader must read the held-out split. It previously wrapped
# `train_dataset`, so validation metrics were computed on training data.
val_dataloader = DataLoader(val_dataset, batch_size=8, collate_fn=data_collator)

In [32]:
# for i in train_dataloader:
#   output= model(**i)
#   break

In [33]:
# output.logits

## Optimizer

In [23]:
# `transformers.AdamW` is deprecated and missing from newer transformers releases;
# use PyTorch's implementation (`torch` is imported alongside the model above).
# eps and weight_decay are set explicitly to the values the transformers class
# defaulted to, so the optimisation hyperparameters stay the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5, eps=1e-6, weight_decay=0.0)



## Initializing Accelerator

In [24]:
from accelerate import Accelerator

accelerator = Accelerator()

# Hand the loaders, model and optimizer to the accelerator and take back its
# prepared counterparts (the model and optimizer names are rebound).
prepared = accelerator.prepare(train_dataloader, val_dataloader, model, optimizer)
train_dl, eval_dl, model, optimizer = prepared

## Some initialization

In [25]:
# Training-length settings used by the scheduler, the progress bar and the manual loop.
num_epochs = 3
gradient_accumulation_steps = 1
# One scheduler step per batch, for every epoch.
num_training_steps = len(train_dataloader) * num_epochs

## Scheduler and loss function

In [26]:
from transformers import get_scheduler


# "linear" schedule with zero warm-up steps, spanning every training step.
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
# Standalone cross-entropy loss object.
criterion = torch.nn.CrossEntropyLoss()

In [27]:
from tqdm.auto import tqdm

# One tick per optimisation step of the manual training loop.
# (`criterion` is already created in the scheduler cell, so it is not redefined here.)
progress_bar = tqdm(range(num_training_steps))

  0%|          | 0/75 [00:00<?, ?it/s]

In [28]:
# Display the loss object.
criterion

CrossEntropyLoss()

In [29]:
!pip install evaluate



In [32]:
import evaluate
import numpy as np
from datasets import Dataset
from transformers import BertForSequenceClassification, BertTokenizer, Trainer, TrainingArguments

# Newer `datasets` releases no longer ship `load_metric`; `evaluate.load` is the
# replacement and is called the same way here, so fall back to it when the
# import fails instead of breaking the cell.
try:
    from datasets import load_metric
except ImportError:
    load_metric = evaluate.load

# Define training arguments
# NOTE(review): recent transformers releases spell the last argument
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir='./results',          # checkpoints are written here
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=0,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch"      # run evaluation once per epoch
)






In [33]:
# Load the metrics through `evaluate` (already imported in this notebook);
# `datasets.load_metric` is deprecated in favour of it.
accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")

def compute_metrics(p):
    """Compute accuracy and weighted F1 for one evaluation pass.

    Args:
        p: A (predictions, labels) pair as handed over by the Trainer;
            predictions are per-class scores, reduced to class ids with argmax
            over axis 1.

    Returns:
        dict: ``{"accuracy": ..., "f1": ...}``.
    """
    predictions, labels = p
    preds = np.argmax(predictions, axis=1)
    accuracy = accuracy_metric.compute(predictions=preds, references=labels)
    f1 = f1_metric.compute(predictions=preds, references=labels, average='weighted')
    return {"accuracy": accuracy["accuracy"], "f1": f1["f1"]}

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
    tokenizer=tokenizer,

)

trainer.train()


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.6817,0.433775,0.84,0.84
2,0.3869,0.217647,0.94,0.939879


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.6817,0.433775,0.84,0.84
2,0.3869,0.217647,0.94,0.939879
3,0.141,0.116864,0.98,0.980024


TrainOutput(global_step=75, training_loss=0.39721564133961995, metrics={'train_runtime': 4288.4418, 'train_samples_per_second': 0.14, 'train_steps_per_second': 0.017, 'total_flos': 157851606048192.0, 'train_loss': 0.39721564133961995, 'epoch': 3.0})

## Another way to train

In [None]:
import evaluate

# Manual train/evaluate loop driven by the accelerator-prepared objects.
for epoch in range(num_epochs):

  # ---- training pass ----
  total_loss = 0
  model.train()

  for batch in train_dl:
    # Passing `labels` makes the model return the loss alongside the logits.
    outputs = model(
        input_ids=batch['input_ids'],
        attention_mask=batch['attention_mask'],
        labels=batch['labels'],
    )
    loss = outputs.loss
    total_loss += loss.item()
    accelerator.backward(loss) #Compute the gradients by backpropagation.

    optimizer.step()
    lr_scheduler.step()
    optimizer.zero_grad()
    progress_bar.update(1)

  # Report once per epoch, after every batch has been seen. This used to sit
  # inside the batch loop and print a partial sum divided by the full length.
  avg_train_loss = total_loss / len(train_dl)
  print(f"Epoch {epoch+1}/{num_epochs}, Training Loss: {avg_train_loss}")

  # ---- validation pass ----
  model.eval()
  accuracy_metric = evaluate.load("accuracy")
  f1_metric = evaluate.load("f1")
  total_val_loss = 0

  for batch in eval_dl:
    with torch.no_grad():
      outputs = model(
          input_ids=batch['input_ids'],
          attention_mask=batch['attention_mask'],
          labels=batch['labels'],
      )

    total_val_loss += outputs.loss.item()
    predictions = torch.argmax(outputs.logits, dim=-1)

    accuracy_metric.add_batch(predictions=predictions, references=batch["labels"])
    f1_metric.add_batch(predictions=predictions, references=batch["labels"])

  # `compute()` returns the scores as a dict; the metric objects themselves are
  # not subscriptable. The F1 averaging mode is an argument of `compute`.
  val_accuracy = accuracy_metric.compute()["accuracy"]
  val_f1 = f1_metric.compute(average="weighted")["f1"]

  avg_val_loss = total_val_loss / len(eval_dl)
  print(f"Validation Loss: {avg_val_loss}, Validation Accuracy: {val_accuracy}, Validation F1 Score: {val_f1}")

## Downloading files

In [34]:
from google.colab import drive
# Mount Google Drive at /content/gdrive so artefacts can be copied there.
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [35]:
# Make the project folder on Drive the working directory for the cells below.
%cd /content/gdrive/MyDrive/NLP/Sentiment_analysis

/content/gdrive/MyDrive/NLP/Sentiment_analysis


In [36]:
# Move the Trainer output directory (output_dir='./results') onto Drive.
!mv /content/results /content/gdrive/MyDrive/NLP/Sentiment_analysis

In [37]:
# List the current working directory.
!ls

results


In [38]:
!zip -r folder.zip

  adding: results/ (stored 0%)
  adding: results/checkpoint-75/ (stored 0%)
  adding: results/checkpoint-75/config.json (deflated 51%)
  adding: results/checkpoint-75/model.safetensors (deflated 7%)
  adding: results/checkpoint-75/tokenizer_config.json (deflated 75%)
  adding: results/checkpoint-75/special_tokens_map.json (deflated 42%)
  adding: results/checkpoint-75/vocab.txt (deflated 53%)
  adding: results/checkpoint-75/training_args.bin (deflated 51%)
  adding: results/checkpoint-75/optimizer.pt (deflated 22%)
  adding: results/checkpoint-75/scheduler.pt (deflated 56%)
  adding: results/checkpoint-75/rng_state.pth (deflated 24%)
  adding: results/checkpoint-75/trainer_state.json (deflated 69%)


In [39]:
from google.colab import files
# Trigger a browser download of the archive created above.
files.download('folder.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [40]:
# Move the Trainer logging directory (logging_dir='./logs') onto Drive.
!mv /content/logs /content/gdrive/MyDrive/NLP/Sentiment_analysis

In [41]:
# Switch the working directory to the moved logs folder.
%cd /content/gdrive/MyDrive/NLP/Sentiment_analysis/logs

/content/gdrive/MyDrive/NLP/Sentiment_analysis/logs


In [43]:
# Persist the model weights/config and the tokenizer files side by side.
# NOTE(review): the path is relative and the previous cell changed into the
# `logs` folder, so `saved_model` is created inside it — confirm that is intended.
model.save_pretrained('./saved_model')
tokenizer.save_pretrained('./saved_model')

('./saved_model/tokenizer_config.json',
 './saved_model/special_tokens_map.json',
 './saved_model/vocab.txt',
 './saved_model/added_tokens.json')