In [None]:
!pip install transformers
!pip install sentencepiece

Collecting transformers
  Downloading transformers-4.32.1-py3-none-any.whl (7.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.5/7.5 MB[0m [31m30.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m35.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m72.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m84.8 MB/s[0m eta [36m0:00:0

In [None]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import AutoModel, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
import sentencepiece as spm

# Pick the GPU when one is available, otherwise stay on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Read the training and test CSV files into DataFrames
train_data = pd.read_csv('/content/GujaratiTrainingData.csv')
test_data = pd.read_csv('/content/Guj_test_data.csv')

# Fetch the pretrained IndicBERT encoder and place it on the selected device
model_name = "ai4bharat/indic-bert"
model = AutoModel.from_pretrained(model_name).to(device)

# Load the SentencePiece model file used to turn text into token ids
tokenizer = spm.SentencePieceProcessor()
tokenizer.load("/content/spiece.model")

# Tokenize and preprocess Gujarati text
def preprocess_and_tokenize_gujarati(text):
    """Encode one text cell into a list of SentencePiece token ids.

    Args:
        text: A single cell of the ``text`` column. Normally a ``str``; cells
            that pandas parsed as missing (NaN/None) or as another scalar type
            are accepted as well.

    Returns:
        list: Token ids produced by the notebook-level ``tokenizer``. Missing
        values yield an empty list so the row is kept (and later padded)
        instead of aborting the whole ``apply``.
    """
    if isinstance(text, str):
        return tokenizer.encode(text)
    # Empty CSV cells arrive as NaN/None; treat them as "no tokens".
    if pd.isna(text):
        return []
    # Any other scalar (e.g. a purely numeric cell) is tokenized via its string form.
    return tokenizer.encode(str(text))

# Tokenize every row of both splits into a list of token ids
train_data['tokenized_text'] = train_data['text'].apply(preprocess_and_tokenize_gujarati)
test_data['tokenized_text'] = test_data['text'].apply(preprocess_and_tokenize_gujarati)

# NOTE(review): the ids come straight from SentencePiece, so presumably no [CLS]/[SEP]
# markers are added, while the classifier cell pools the hidden state at position 0.
# Confirm whether pooling the first word piece is intended.

# Padding: right-pad each sequence with id 0 up to the longest sequence of its own split
max_length_train = max(train_data['tokenized_text'].apply(len))
max_length_test = max(test_data['tokenized_text'].apply(len))
train_data['padded_tokenized_text'] = train_data['tokenized_text'].apply(lambda x: x + [0] * (max_length_train - len(x)))
test_data['padded_tokenized_text'] = test_data['tokenized_text'].apply(lambda x: x + [0] * (max_length_test - len(x)))

# Convert labels to numerical values
label_mapping = {'HOF': 1, 'NOT': 0}
mapped_labels = train_data['label'].map(label_mapping)
# .map() turns any label outside the mapping into NaN; stop here and name the
# offending values instead of letting NaN reach the integer label tensor.
unmapped_rows = mapped_labels.isna()
if unmapped_rows.any():
    unexpected = sorted(train_data.loc[unmapped_rows, 'label'].astype(str).unique())
    raise ValueError(f"Unexpected label values in training data: {unexpected}")
train_data['label'] = mapped_labels.astype('int64')

# Convert to tensors; the attention mask is 1.0 wherever the id differs from the pad id 0
train_input_ids = torch.tensor(train_data['padded_tokenized_text'].tolist())
train_attention_masks = (train_input_ids != 0).float()
train_labels = torch.tensor(train_data['label'].tolist(), dtype=torch.long)

# Define optimizer and loss function
# NOTE(review): the classifier cell rebinds `optimizer` and `criterion` before any
# training step runs, so this pair is never stepped as the notebook stands.
optimizer = AdamW(model.parameters(), lr=0.0001)
criterion = torch.nn.CrossEntropyLoss()

# Create DataLoader (from here on `train_data` names the TensorDataset, not the DataFrame)
batch_size = 32
train_data = TensorDataset(train_input_ids, train_attention_masks, train_labels)
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)




In [None]:
import torch.nn as nn
# Define classification model
class SimpleClassifier(nn.Module):
    """Feed-forward head: linear layer, ReLU, then a second linear layer.

    Args:
        input_dim: Size of each input feature vector.
        hidden_dim: Width of the hidden layer.
        output_dim: Number of output scores per input row.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return the raw (un-normalized) output scores for ``x``."""
        # Inputs are cast to float so integer tensors are accepted as well.
        hidden = self.fc1(x.float())
        activated = nn.functional.relu(hidden)
        return self.fc2(activated)

class TrainedModel:
    """Inference wrapper around a trained classifier head and its encoder.

    Args:
        model: The classifier head applied to the encoder's position-0 hidden state.
        device: Device the input batches are moved to before the forward pass.
        encoder: Optional encoder called as
            ``encoder(input_ids=..., attention_mask=...)``; element ``[0]`` of its
            result is indexed as ``[:, 0, :]``. When omitted, the notebook-level
            ``model`` is looked up at prediction time, so the existing
            ``TrainedModel(classifier, device)`` construction keeps working.
    """

    def __init__(self, model, device, encoder=None):
        self.model = model
        self.device = device
        self.encoder = encoder

    def predict(self, X, batch_size=32):
        """Predict a class index for every row of ``X``.

        Args:
            X: Pair ``(input_ids, attention_mask)`` of tensors with one row per example.
            batch_size: Number of rows pushed through the encoder at once, so a
                large test set does not have to fit in a single forward pass.

        Returns:
            numpy.ndarray: Predicted class indices, one per input row (empty when
            ``X`` has no rows).
        """
        input_ids, attention_mask = X
        # Fall back to the notebook-level encoder for two-argument construction.
        encoder = self.encoder if self.encoder is not None else model

        # Inference mode for both parts so dropout and similar layers are disabled.
        self.model.eval()
        encoder.eval()

        batch_predictions = []
        with torch.no_grad():
            for start in range(0, len(input_ids), batch_size):
                ids = input_ids[start:start + batch_size].to(self.device)
                mask = attention_mask[start:start + batch_size].to(self.device)
                # Hidden state at sequence position 0 is the sentence feature vector.
                features = encoder(input_ids=ids, attention_mask=mask)[0][:, 0, :]
                scores = self.model(features)
                _, predicted = torch.max(scores, 1)
                batch_predictions.append(predicted.cpu())

        if not batch_predictions:
            return torch.empty(0, dtype=torch.long).numpy()
        return torch.cat(batch_predictions).numpy()


# Classifier head trained on top of the encoder's position-0 hidden state
classifier = SimpleClassifier(input_dim=768, hidden_dim=16, output_dim=2)
classifier = classifier.to(device)
# Only the head's parameters are optimized; the encoder stays frozen.
optimizer = torch.optim.Adam(classifier.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()

# Keep the frozen encoder in inference mode so its dropout layers stay disabled
# while features are extracted.
model.eval()

num_epochs = 10
for epoch in range(num_epochs):
    classifier.train()
    for input_ids_batch, masks_batch, labels_batch in train_loader:
        optimizer.zero_grad()
        # The optimizer never touches the encoder, so run it without autograd:
        # otherwise every backward pass would accumulate gradients on encoder
        # parameters that nothing zeroes or steps.
        with torch.no_grad():
            embeddings_batch = model(input_ids=input_ids_batch.to(device), attention_mask=masks_batch.to(device))[0][:, 0, :]
        outputs = classifier(embeddings_batch)
        loss = criterion(outputs, labels_batch.to(device))
        loss.backward()
        optimizer.step()

# Build the test tensors; id 0 is the padding value, so the mask is 1.0 at every non-zero id
test_inputs = torch.tensor(test_data['padded_tokenized_text'].tolist())
test_masks = test_inputs.ne(0).float()

# Wrap the fitted classifier head for inference
trained_model = TrainedModel(classifier, device)

# Run the wrapper over the test tensors to get the predicted class indices
predicted_labels = trained_model.predict((test_inputs, test_masks))


In [None]:
print(predicted_labels)

# Convert the numeric predicted labels to string labels through the inverted mapping.
# An id missing from label_mapping raises KeyError instead of being silently dropped,
# which would leave fewer labels than test rows.
id_to_label = {label_id: label_name for label_name, label_id in label_mapping.items()}
predicted_labels_string = [id_to_label[int(label_id)] for label_id in predicted_labels]

# Show a short preview only; printing every prediction floods the cell output.
print(predicted_labels_string[:20])

# Class counts: everything that is not 'HOF' is counted as NOT.
hof = predicted_labels_string.count('HOF')
n = len(predicted_labels_string) - hof
print("HOF: ",str(hof),"\nNOT: ",str(n))

[0 1 1 ... 1 0 0]
['NOT', 'HOF', 'HOF', 'NOT', 'NOT', 'NOT', 'NOT', 'HOF', 'NOT', 'NOT', 'NOT', 'NOT', 'NOT', 'HOF', 'HOF', 'HOF', 'HOF', 'NOT', 'HOF', 'HOF', 'HOF', 'NOT', 'HOF', 'NOT', 'HOF', 'HOF', 'HOF', 'NOT', 'HOF', 'HOF', 'HOF', 'NOT', 'NOT', 'NOT', 'HOF', 'HOF', 'NOT', 'NOT', 'HOF', 'NOT', 'HOF', 'NOT', 'NOT', 'NOT', 'NOT', 'HOF', 'NOT', 'NOT', 'NOT', 'NOT', 'NOT', 'NOT', 'HOF', 'NOT', 'NOT', 'NOT', 'NOT', 'HOF', 'HOF', 'HOF', 'HOF', 'NOT', 'NOT', 'HOF', 'HOF', 'NOT', 'HOF', 'NOT', 'NOT', 'HOF', 'NOT', 'HOF', 'HOF', 'NOT', 'NOT', 'HOF', 'NOT', 'HOF', 'NOT', 'NOT', 'NOT', 'NOT', 'HOF', 'HOF', 'NOT', 'HOF', 'NOT', 'NOT', 'NOT', 'NOT', 'NOT', 'NOT', 'NOT', 'NOT', 'NOT', 'NOT', 'NOT', 'HOF', 'NOT', 'NOT', 'NOT', 'HOF', 'HOF', 'HOF', 'HOF', 'HOF', 'NOT', 'NOT', 'HOF', 'HOF', 'HOF', 'NOT', 'NOT', 'NOT', 'HOF', 'HOF', 'HOF', 'HOF', 'NOT', 'HOF', 'NOT', 'NOT', 'NOT', 'NOT', 'HOF', 'NOT', 'HOF', 'NOT', 'HOF', 'HOF', 'NOT', 'NOT', 'NOT', 'HOF', 'NOT', 'HOF', 'HOF', 'HOF', 'NOT', 'NOT', '

In [None]:
# Write the predictions to a CSV file: one row per test tweet with its predicted label
submission_df = pd.DataFrame({'id': test_data['tweet_id'], 'label': predicted_labels_string})
submission_df.to_csv('/content/Task1B_3_submission.csv', index=False)

# Kept as the cell's last expression so the notebook actually renders the preview;
# in the middle of the cell the call produced no output.
submission_df.head()