<a href="https://colab.research.google.com/github/evarda17/Natural-Language-Processing/blob/main/BERT_for_NLP_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Reading the Data

In [None]:
!pip install "tf-models-official==2.13.*"



In [None]:
import os
import shutil

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization  # to create AdamW optimizer

import matplotlib.pyplot as plt

tf.get_logger().setLevel('ERROR')
import pandas as pd


In [None]:
import pandas as pd
import requests
from io import StringIO

# Shared Box link to the dataset CSV (columns: clean_text, category)
url = 'https://tufts.box.com/shared/static/423pwoe2cbf5hrw6wsfdo4pn83cynb2v.csv'

# If authentication is required, add the appropriate headers or tokens.
# The timeout keeps this cell from hanging indefinitely when the server
# accepts the connection but never sends a response.
response = requests.get(url, timeout=60)
response.raise_for_status()  # This will raise an error if the download failed

# Convert to a pandas DataFrame
data = StringIO(response.text)
df = pd.read_csv(data)





# Data Preprocessing

In [None]:
#missing values - checking

# Count the missing (NaN) entries in each column of df and show the totals
missing_values = df.isnull().sum()
print(missing_values)


clean_text    4
category      7
dtype: int64


In [None]:
#handling missing values
# Drop every row in which at least one column is missing
df = df.dropna()


In [None]:
# Cast the 'category' labels to int. This has to run after the rows with
# missing values have been dropped, because NaN cannot be cast to int.
df['category'] = df['category'].astype(int)


In [None]:
#text preprocessing

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Download necessary NLTK data
nltk.download('stopwords')

# Stop-word set and stemmer are built once and reused. Rebuilding them on every
# call is pure overhead when the function is applied to each row of the frame.
_TEXT_RESOURCES = {}


def _get_text_resources():
    """Return the cached ``(stop_words, stemmer)`` pair, creating it on first use."""
    if not _TEXT_RESOURCES:
        _TEXT_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _TEXT_RESOURCES['stemmer'] = PorterStemmer()
    return _TEXT_RESOURCES['stop_words'], _TEXT_RESOURCES['stemmer']


# Define a function for text preprocessing
def preprocess_text(text):
    """Normalize one document for bag-of-words style features.

    Steps: lowercase, replace every character outside ``a-z`` with a space,
    drop English stop words, and Porter-stem the remaining tokens.

    Args:
        text: The raw document as a ``str``.

    Returns:
        The surviving stemmed tokens joined by single spaces (an empty string
        when no token survives).
    """
    stop_words, stemmer = _get_text_resources()
    # Lowercase, then blank out punctuation, digits and any non a-z character
    text = re.sub(r'[^a-z]', ' ', text.lower())
    # str.split() with no argument already collapses runs of whitespace and
    # ignores leading/trailing spaces, so no separate clean-up pass is needed.
    # Stop words are filtered before stemming, in a single pass over the tokens.
    return ' '.join(stemmer.stem(word) for word in text.split() if word not in stop_words)

# Apply the preprocessing function to the 'clean_text' column.
# Entries that are not strings are passed through unchanged.
df['clean_text'] = df['clean_text'].apply(lambda x: preprocess_text(x) if isinstance(x, str) else x)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 162969 entries, 0 to 162979
Data columns (total 2 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   clean_text  162969 non-null  object
 1   category    162969 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 3.7+ MB


In [None]:
df

Unnamed: 0,clean_text,category
0,modi promis minimum govern maximum govern expe...,-1
1,talk nonsens continu drama vote modi,0
2,say vote modi welcom bjp told rahul main campa...,1
3,ask support prefix chowkidar name modi great s...,1
4,answer among power world leader today trump pu...,1
...,...,...
162975,crore paid neerav modi recov congress leader h...,-1
162976,dear rss terrorist payal gawar modi kill plu m...,-1
162977,cover interact forum left,0
162978,big project came india modi dream project happ...,0


# Feature Selection - Using Chi-Square Feature selection

Chi-square feature selection is typically used when both the input features and the target variable are categorical. Here the input is free text in the 'clean_text' column and the target is the 'category' column, which holds numeric class codes. The text therefore has to be converted into a numerical representation first, so that a Chi-square score can be computed for each resulting feature against the target.

In [None]:
#1. Vectorize the Text Data
#We'll use TF-IDF Vectorization as an example. It's a common technique to convert text to a numeric form.

from sklearn.feature_extraction.text import TfidfVectorizer

# Create a TF-IDF Vectorizer limited to 1000 features
tfidf = TfidfVectorizer(max_features=1000)  # you can choose the number of features

# Fit the vectorizer on the 'clean_text' column and transform it in one step.
# fillna('') substitutes an empty string for any missing text before vectorizing.
X_tfidf = tfidf.fit_transform(df['clean_text'].fillna(''))


KeyboardInterrupt: ignored

In [None]:
#2. Apply Chi-Square Test
#Now, we'll apply the Chi-square test to select the features that have the strongest relationship with the target variable.


from sklearn.feature_selection import chi2
import pandas as pd

# Apply the Chi-square test of every TF-IDF feature against the 'category' labels.
# The result is indexed below as [0] = Chi-square scores and [1] = p-values.
chi_scores = chi2(X_tfidf, df['category'])

# Create a DataFrame with feature names and their corresponding Chi-square scores
# and p-values (one row per TF-IDF feature)
chi_scores_df = pd.DataFrame({'Feature': tfidf.get_feature_names_out(), 'Chi2Score': chi_scores[0], 'P-value': chi_scores[1]})


In [None]:
#Select top features

# Selecting features with the highest Chi-square scores:
# sort by score in descending order and keep the first 100 rows
top_features = chi_scores_df.sort_values(by='Chi2Score', ascending=False).head(100)  # top 100 features

# Alternative selection by p-value threshold:
# keep every feature whose p-value is below 0.05
significant_features = chi_scores_df[chi_scores_df['P-value'] < 0.05]


In [None]:
#normalization and standardization
from sklearn.preprocessing import StandardScaler

# 'category' holds the class labels (-1, 0, 1), not a numeric feature.
# Standardizing it in place turns the labels into arbitrary floats, and the
# {-1: 0, 0: 1, 1: 2} label mapping used by the BERT cells then maps every row
# to NaN. The standardized values are therefore kept in a separate array and
# df['category'] is left untouched.
scaler = StandardScaler()
category_scaled = scaler.fit_transform(df[['category']])


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 162969 entries, 0 to 162979
Data columns (total 2 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   clean_text  162969 non-null  object 
 1   category    162969 non-null  float64
dtypes: float64(1), object(1)
memory usage: 3.7+ MB


# BERT

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
import torch

# Assuming 'df' is your DataFrame
# Map the categories from -1, 0, 1 to 0, 1, 2.
# The mapping is applied only while raw -1 labels are still present, so running
# this cell twice (or mapping again elsewhere) cannot shift already-mapped
# labels or turn them into NaN.
# Assumption: the raw data always contains at least one -1 label.
label_mapping = {-1: 0, 0: 1, 1: 2}
if (df['category'] == -1).any():
    df['category'] = df['category'].map(label_mapping)
if not df['category'].isin([0, 1, 2]).all():
    raise ValueError(
        "df['category'] must contain only the class ids 0, 1, 2 after mapping; "
        "check that the labels were not rescaled or mapped with a different scheme"
    )
df['category'] = df['category'].astype(int)

# Split the data. The seed makes the split reproducible; stratifying keeps the
# class proportions the same in the training and validation parts.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['clean_text'],
    df['category'],
    test_size=0.2,
    random_state=42,
    stratify=df['category'],
)

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the text for BERT (truncate at 512 tokens, pad to a common length)
train_encodings = tokenizer(list(train_texts), truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(list(val_texts), truncation=True, padding=True, max_length=512)

# Convert to torch tensors
train_seq = torch.tensor(train_encodings['input_ids'])
train_mask = torch.tensor(train_encodings['attention_mask'])
train_y = torch.tensor(train_labels.tolist())

val_seq = torch.tensor(val_encodings['input_ids'])
val_mask = torch.tensor(val_encodings['attention_mask'])
val_y = torch.tensor(val_labels.tolist())

# Create data loaders
batch_size = 32

# Training batches are drawn in random order
train_data = TensorDataset(train_seq, train_mask, train_y)
train_sampler = torch.utils.data.RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Validation batches are read in their stored order
val_data = TensorDataset(val_seq, val_mask, val_y)
val_sampler = torch.utils.data.SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
from transformers import BertForSequenceClassification, AdamW
from torch.cuda.amp import GradScaler, autocast
import torch

# Setup for mixed precision
scaler = GradScaler()

# Check if GPU is available and use it; otherwise, use CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)
model.to(device)

# Optimizer
# transformers.AdamW is deprecated in favour of torch.optim.AdamW, which the
# training cell of this notebook already uses. weight_decay=0.0 keeps the
# default of the optimizer that was used here before.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, weight_decay=0.0)




model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
!pip install torchmetrics

Collecting torchmetrics
  Downloading torchmetrics-1.2.1-py3-none-any.whl (806 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/806.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m256.0/806.1 kB[0m [31m7.5 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m798.7/806.1 kB[0m [31m11.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m806.1/806.1 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
Collecting lightning-utilities>=0.8.0 (from torchmetrics)
  Downloading lightning_utilities-0.10.0-py3-none-any.whl (24 kB)
Installing collected packages: lightning-utilities, torchmetrics
Successfully installed lightning-utilities-0.10.0 torchmetrics-1.2.1


In [None]:
torch.cuda.empty_cache()


In [None]:
from transformers import BertForSequenceClassification, BertTokenizer
import torch
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score

# Load your model and tokenizer
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)
model.to(device)  # a freshly loaded model is on the CPU until it is moved
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Create DataLoader for training data.
# train_data is the TensorDataset of (input_ids, attention_mask, labels)
# tensors built in the data-preparation cell; batches are therefore tuples.
train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True)

# Define loss function and optimizer
learning_rate = 2e-5  # same rate as the optimizer configured in the setup cell
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# Autocast is only enabled when the model actually runs on a CUDA device
use_amp = device.type == "cuda"

# Training loop with mixed precision
epochs = 4
for epoch in range(epochs):
    model.train()
    total_loss = 0
    total_correct = 0
    total_samples = 0

    for input_ids, attention_mask, labels in train_dataloader:
        # Move batch to device
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        # Reset the gradients left over from the previous step
        optimizer.zero_grad()

        # Mixed precision
        with autocast(enabled=use_amp):
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = criterion(outputs.logits, labels)

        # Calculate accuracy
        preds = torch.argmax(outputs.logits, dim=1)
        total_correct += (preds == labels).sum().item()
        total_samples += labels.size(0)

        # Scaled backpropagation
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        total_loss += loss.item()

    # Calculate and print training accuracy
    training_accuracy = total_correct / total_samples
    print(f"Epoch {epoch+1} completed. Total loss: {total_loss:.4f}, Training Accuracy: {training_accuracy:.4f}")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


NameError: ignored

In [None]:
# # Training loop with mixed precision
# epochs = 4
# for epoch in range(epochs):
#     model.train()
#     total_loss = 0
#     for batch in train_dataloader:
#         # Move batch to device
#         batch = [r.to(device) for r in batch]
#         inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}

#         # Mixed precision
#         with autocast():
#             outputs = model(**inputs)
#             loss = outputs.loss

#         # Scaled backpropagation
#         optimizer.zero_grad()  # Reset gradients accumulation
#         scaler.scale(loss).backward()
#         scaler.step(optimizer)
#         scaler.update()

#         total_loss += loss.item()
#     print(f"Epoch {epoch+1} completed. Total loss: {total_loss}")



In [None]:
# # Evaluation with mixed precision
# model.eval()
# total_eval_loss = 0
# for batch in val_dataloader:
#     batch = [t.to(device) for t in batch]
#     inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}

#     with torch.no_grad(), autocast():
#         outputs = model(**inputs)
#         loss = outputs.loss

#     total_eval_loss += loss.item()

# print("Validation completed. Total loss: ", total_eval_loss)

# BERT

In [None]:
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score



In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset
import torch

# Assuming your DataFrame is named df
# df = pd.read_csv("your_dataset.csv") # Replace with your file path

# Tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Dataset Preparation
class TextDataset(Dataset):
    """Torch dataset pairing tokenized texts with integer class labels.

    Args:
        texts: List of raw strings to tokenize.
        labels: Sequence of class ids, one per text. Floats that hold whole
            numbers (e.g. ``2.0``) are accepted and returned as integers.
        text_tokenizer: Optional callable with the Hugging Face tokenizer call
            signature. When omitted, the notebook-level ``tokenizer`` is used.
        max_length: Maximum sequence length passed to the tokenizer.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length, or if a label
            is missing (``None``/NaN) or not a whole number.
    """

    def __init__(self, texts, labels, text_tokenizer=None, max_length=128):
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, got {len(texts)} and {len(labels)}"
            )
        # Fail early with a clear message: a NaN or fractional label would
        # otherwise only surface as an opaque error inside the loss function.
        for position, label in enumerate(labels):
            if label is None or label != label or int(label) != label:
                raise ValueError(f"label at position {position} is not a valid class id: {label!r}")

        encode = tokenizer if text_tokenizer is None else text_tokenizer
        self.encodings = encode(texts, truncation=True, padding=True, max_length=max_length)
        self.labels = labels

    def __getitem__(self, idx):
        """Return the encoded fields of sample ``idx`` plus its label as tensors."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Classification losses expect integer (long) class indices
        item['labels'] = torch.tensor(int(self.labels[idx]), dtype=torch.long)
        return item

    def __len__(self):
        """Return the number of samples."""
        return len(self.labels)

# Splitting data into train and test (fixed seed so the split is reproducible across runs)
train_texts, test_texts, train_labels, test_labels = train_test_split(df['clean_text'], df['category'], test_size=0.2, random_state=42)



In [None]:
!pip install transformers torch pandas




In [None]:
# Create Dataset
train_dataset = TextDataset(train_texts.tolist(), train_labels.tolist())
test_dataset = TextDataset(test_texts.tolist(), test_labels.tolist())





In [None]:
import accelerate
print(accelerate.__version__)


0.25.0


In [None]:
# Upgrade accelerate and transformers (with its torch extra) in one step.
# This replaces the repeated install commands that requested the same packages.
!pip install -U accelerate "transformers[torch]"





In [None]:
# from transformers import BertForSequenceClassification, BertTokenizer
# import torch
# from torch.utils.data import DataLoader
# from sklearn.metrics import accuracy_score

# # Model
# model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)

# # Training arguments
# training_args = TrainingArguments(
#     output_dir='./results',
#     num_train_epochs=3,
#     per_device_train_batch_size=16,
#     per_device_eval_batch_size=64,
#     warmup_steps=500,
#     weight_decay=0.01,
#     logging_dir='./logs',
#     logging_steps=10,
# )


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# # Trainer
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_dataset,
#     eval_dataset=test_dataset
# )

# # Train
# trainer.train()

# # Evaluation
# trainer.evaluate()

RuntimeError: ignored

In [None]:
!pip install torch -U
!pip install transformers -U


In [None]:
import torch
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
import pandas as pd
from torch.utils.data import Dataset

# Assuming your DataFrame is named df
# df = pd.read_csv("your_dataset.csv") # Replace with your file path

# Tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Dataset Preparation
class TextDataset(Dataset):
    """Torch dataset pairing tokenized texts with integer class labels.

    Args:
        texts: List of raw strings to tokenize.
        labels: Sequence of class ids, one per text. Floats that hold whole
            numbers (e.g. ``2.0``) are accepted and returned as integers.
        text_tokenizer: Optional callable with the Hugging Face tokenizer call
            signature. When omitted, the notebook-level ``tokenizer`` is used.
        max_length: Maximum sequence length passed to the tokenizer.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length, or if a label
            is missing (``None``/NaN) or not a whole number.
    """

    def __init__(self, texts, labels, text_tokenizer=None, max_length=128):
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, got {len(texts)} and {len(labels)}"
            )
        # Fail early with a clear message: a NaN or fractional label would
        # otherwise only surface as an opaque error inside the loss function.
        for position, label in enumerate(labels):
            if label is None or label != label or int(label) != label:
                raise ValueError(f"label at position {position} is not a valid class id: {label!r}")

        encode = tokenizer if text_tokenizer is None else text_tokenizer
        self.encodings = encode(texts, truncation=True, padding=True, max_length=max_length)
        self.labels = labels

    def __getitem__(self, idx):
        """Return the encoded fields of sample ``idx`` plus its label as tensors."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Classification losses expect integer (long) class indices
        item['labels'] = torch.tensor(int(self.labels[idx]), dtype=torch.long)
        return item

    def __len__(self):
        """Return the number of samples."""
        return len(self.labels)



In [None]:
# Assuming df is your DataFrame
# Map the raw labels -1, 0, 1 to the class ids 0, 1, 2.
# The mapping is applied only while raw -1 labels are still present, so labels
# that were already mapped are not shifted again or turned into NaN.
# Assumption: the raw data always contains at least one -1 label.
label_mapping = {-1: 0, 0: 1, 1: 2}
if (df['category'] == -1).any():
    df['category'] = df['category'].map(label_mapping)
if not df['category'].isin([0, 1, 2]).all():
    raise ValueError(
        "df['category'] must contain only the class ids 0, 1, 2 after mapping; "
        "check that the labels were not rescaled or mapped with a different scheme"
    )
df['category'] = df['category'].astype(int)

# Continue with your train-test split and DataLoader preparation


# Splitting data into train and test. The seed makes the split reproducible;
# stratifying keeps the class proportions equal in both parts.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['clean_text'],
    df['category'],
    test_size=0.2,
    random_state=42,
    stratify=df['category'],
)

# Create Dataset
train_dataset = TextDataset(train_texts.tolist(), train_labels.tolist())
test_dataset = TextDataset(test_texts.tolist(), test_labels.tolist())

# DataLoader (training batches are shuffled, test batches keep their order)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# Model
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)

# Optimizer
# transformers.AdamW is deprecated in favour of torch.optim.AdamW.
# weight_decay=0.0 keeps the default of the optimizer that was used here before.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, weight_decay=0.0)

# Move model to GPU if available, otherwise stay on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [None]:
# Training Loop
# Runs 3 passes over train_loader. Labels are passed to the model's forward
# call, and the loss is read from the returned outputs.
model.train()
for epoch in range(3):
    for batch in train_loader:
        # Move the tensors of this batch to the device the model was moved to
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Clear gradients from the previous step, then forward / backward / update
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1} finished")

In [None]:
# Evaluation Loop
# Counts correct predictions over test_loader and reports the fraction correct.
model.eval()
total_eval_accuracy = 0  # running count of correctly classified samples
for batch in test_loader:
    # Move the tensors of this batch to the device the model was moved to
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)

    # No gradients are needed for evaluation
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)

    # Predicted class = index of the largest logit for each sample
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=1)
    total_eval_accuracy += (predictions == labels).sum().item()

# Divide the number of correct predictions by the number of test samples
accuracy = total_eval_accuracy / len(test_dataset)
print(f"Test accuracy: {accuracy}")


RuntimeError: ignored