In [3]:
from transformers import AutoTokenizer,AutoModelForSequenceClassification
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import  AdamW, get_linear_schedule_with_warmup
from torch.utils.data import Dataset, DataLoader
import torch
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
from torch import  nn
import torch.nn.functional as F
import re
from sklearn.metrics import classification_report, accuracy_score
import demoji

In [4]:
df = pd.read_csv("cleaned_PS_train.csv")
le = LabelEncoder()
df['encoded_labels'] = le.fit_transform(df['labels'])
num_classes = len(le.classes_)
val_df = pd.read_csv("cleaned_PS_dev.csv")
val_df['encoded_labels'] = le.transform(val_df['labels'])
train_df=df

tokenizer = AutoTokenizer.from_pretrained("facebook/xlm-roberta-xl")
model = AutoModelForSequenceClassification.from_pretrained("facebook/xlm-roberta-xl")

In [5]:
def extract_embeddings(model, tokenizer, texts, device, max_length=256):
    model.eval()
    embeddings = []
    with torch.no_grad():
        for text in tqdm(texts, desc="Extracting Embeddings"):
            encoding = tokenizer(
                text,
                add_special_tokens=True,
                max_length=max_length,
                padding='max_length',
                truncation=True,
                return_attention_mask=True,
                return_tensors='pt'
            ).to(device)

            # Extract embeddings from the model
            outputs = model(**encoding, output_hidden_states=True)
            hidden_states = outputs.hidden_states[-1]  # Last hidden layer
            embedding = hidden_states.mean(dim=1).squeeze(0)  # Mean pooling
            embeddings.append(embedding.cpu().numpy())

    return torch.tensor(embeddings)
    

In [10]:
from sklearn.svm import SVC


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_embeddings = extract_embeddings(model, tokenizer, train_df['content'].values, device)
val_embeddings = extract_embeddings(model, tokenizer, val_df['content'].values, device)

# Train an SVM model
svm = SVC(kernel='linear', probability=True)
svm.fit(train_embeddings, train_df['encoded_labels'].values)

# Validate the SVM model
val_predictions = svm.predict(val_embeddings)
val_accuracy = accuracy_score(val_df['encoded_labels'].values, val_predictions)
print(f'Validation Accuracy: {val_accuracy:.4f}')
print(classification_report(val_df['encoded_labels'].values, val_predictions, target_names=le.classes_))

# Function for inference with the SVM model
def predict_svm(text, model, tokenizer, svm, device, max_length=256):
    embedding = extract_embeddings(model, tokenizer, [text], device, max_length=max_length)
    prediction = svm.predict(embedding)
    return le.inverse_transform(prediction)

# Example inference
text = "தென்காசி தொகுதி புதிய தமிழகம் கட்சி வேட்பாளர் டாக்டர்"
predicted_label = predict_svm(text, model, tokenizer, svm, device)
print(f"Predicted Label: {predicted_label[0]}")



Extracting Embeddings: 100%|██████████| 3916/3916 [01:09<00:00, 56.04it/s]
  return torch.tensor(embeddings)
Extracting Embeddings: 100%|██████████| 436/436 [00:08<00:00, 53.33it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Validation Accuracy: 0.3463
                   precision    recall  f1-score   support

         Negative       0.00      0.00      0.00        41
          Neutral       0.00      0.00      0.00        64
None of the above       1.00      0.47      0.64        17
      Opinionated       0.33      0.97      0.49       136
         Positive       0.00      0.00      0.00        58
        Sarcastic       0.48      0.14      0.22        79
    Substantiated       0.00      0.00      0.00        41

         accuracy                           0.35       436
        macro avg       0.26      0.23      0.19       436
     weighted avg       0.23      0.35      0.22       436



Extracting Embeddings: 100%|██████████| 1/1 [00:00<00:00, 20.79it/s]

Predicted Label: Opinionated





In [17]:
from transformers import AutoTokenizer, AutoModel
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from torch.utils.data import Dataset, DataLoader
import torch
from tqdm import tqdm
import torch.nn.functional as F

# Load the dataset
df = pd.read_csv("cleaned_PS_train.csv")
le = LabelEncoder()
df['encoded_labels'] = le.fit_transform(df['labels'])
num_classes = len(le.classes_)
val_df = pd.read_csv("cleaned_PS_dev.csv")
val_df['encoded_labels'] = le.transform(val_df['labels'])
train_df=df
# Load tokenizer and model for LaBSE
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/LaBSE", cache_dir='models/LaBSE')
model = AutoModel.from_pretrained("sentence-transformers/LaBSE", cache_dir='models/LaBSE').to('cuda')

In [18]:
def extract_embeddings(model, tokenizer, texts, device, max_length=256):
    model.eval()
    embeddings = []
    with torch.no_grad():
        for text in tqdm(texts, desc="Extracting Embeddings"):
            encoding = tokenizer(
                text,
                add_special_tokens=True,
                max_length=max_length,
                padding='max_length',
                truncation=True,
                return_attention_mask=True,
                return_tensors='pt'
            ).to(device)

            # Extract embeddings from the model
            outputs = model(**encoding, output_hidden_states=True)
            hidden_states = outputs.hidden_states[-1]  # Last hidden layer
            embedding = hidden_states.mean(dim=1).squeeze(0)  # Mean pooling
            embeddings.append(embedding.cpu().numpy())

    return torch.tensor(embeddings)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_embeddings = extract_embeddings(model, tokenizer, train_df['content'].values, device)
val_embeddings = extract_embeddings(model, tokenizer, val_df['content'].values, device)

Extracting Embeddings: 100%|██████████| 4352/4352 [01:07<00:00, 64.58it/s]
Extracting Embeddings: 100%|██████████| 544/544 [00:09<00:00, 58.41it/s]


In [20]:
svm = SVC(kernel='linear', probability=True)
svm.fit(train_embeddings, train_df['encoded_labels'].values)

# Validate the SVM model
val_predictions = svm.predict(val_embeddings)
val_accuracy = accuracy_score(val_df['encoded_labels'].values, val_predictions)
print(f'Validation Accuracy: {val_accuracy:.4f}')
print(classification_report(val_df['encoded_labels'].values, val_predictions, target_names=le.classes_))

# Function for inference with the SVM model
def predict_svm(text, model, tokenizer, svm, device, max_length=256):
    embedding = extract_embeddings(model, tokenizer, [text], device, max_length=max_length)
    prediction = svm.predict(embedding)
    return le.inverse_transform(prediction)

# Example inference
text = "தென்காசி தொகுதி புதிய தமிழகம் கட்சி வேட்பாளர் டாக்டர்"
predicted_label = predict_svm(text, model, tokenizer, svm, device)
print(f"Predicted Label: {predicted_label[0]}")


Validation Accuracy: 0.2996
                   precision    recall  f1-score   support

         Negative       0.10      0.10      0.10        51
          Neutral       0.24      0.20      0.22        84
None of the above       0.84      0.80      0.82        20
      Opinionated       0.33      0.47      0.39       153
         Positive       0.18      0.16      0.17        69
        Sarcastic       0.39      0.35      0.37       115
    Substantiated       0.10      0.04      0.05        52

         accuracy                           0.30       544
        macro avg       0.31      0.30      0.30       544
     weighted avg       0.28      0.30      0.29       544



Extracting Embeddings: 100%|██████████| 1/1 [00:00<00:00, 42.93it/s]

Predicted Label: Negative





In [19]:
svm2 = SVC(kernel='rbf', probability=True)
svm2.fit(train_embeddings, train_df['encoded_labels'].values)

# Validate the SVM2 model
val_predictions = svm2.predict(val_embeddings)
val_accuracy = accuracy_score(val_df['encoded_labels'].values, val_predictions)
print(f'Validation Accuracy: {val_accuracy:.4f}')
print(classification_report(val_df['encoded_labels'].values, val_predictions, target_names=le.classes_))

# Function for inference with the SVM2 model
def predict_svm2(text, model, tokenizer, svm2, device, max_length=256):
    embedding = extract_embeddings(model, tokenizer, [text], device, max_length=max_length)
    prediction = svm2.predict(embedding)
    return le.inverse_transform(prediction)

# Example inference
text = "தென்காசி தொகுதி புதிய தமிழகம் கட்சி வேட்பாளர் டாக்டர்"
predicted_label = predict_svm2(text, model, tokenizer, svm2, device)
print(f"Predicted Label: {predicted_label[0]}")


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Validation Accuracy: 0.3419
                   precision    recall  f1-score   support

         Negative       0.00      0.00      0.00        51
          Neutral       0.00      0.00      0.00        84
None of the above       0.90      0.90      0.90        20
      Opinionated       0.31      0.91      0.46       153
         Positive       0.00      0.00      0.00        69
        Sarcastic       0.42      0.25      0.32       115
    Substantiated       0.00      0.00      0.00        52

         accuracy                           0.34       544
        macro avg       0.23      0.29      0.24       544
     weighted avg       0.21      0.34      0.23       544



Extracting Embeddings: 100%|██████████| 1/1 [00:00<00:00, 49.44it/s]

Predicted Label: Opinionated





In [26]:
test_df = pd.read_csv("cleaned_PS_test.csv")

embedding = extract_embeddings(model, tokenizer, test_df['content'].values, device)

# Predict with the SVM model
predictions = svm.predict(embedding)
test_df['predicted_labels'] = le.inverse_transform(predictions)

test_df[['Id','predicted_labels']].to_csv('submission.csv', index=False)

Extracting Embeddings: 100%|██████████| 544/544 [00:07<00:00, 72.93it/s]


In [25]:
test_df[['Id','predicted_labels']]

Unnamed: 0,Id,predicted_labels
0,PS_01,Opinionated
1,PS_02,Opinionated
2,PS_03,Opinionated
3,PS_04,Opinionated
4,PS_05,Positive
...,...,...
539,PS_540,Negative
540,PS_541,Opinionated
541,PS_542,Opinionated
542,PS_543,Opinionated


In [4]:
from transformers import AutoTokenizer, AutoModel
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
import torch
from tqdm import tqdm

# Load the dataset
df = pd.read_csv("cleaned_PS_train.csv")
le = LabelEncoder()
df['encoded_labels'] = le.fit_transform(df['labels'])
num_classes = len(le.classes_)

train_df, val_df = train_test_split(df, test_size=0.1, random_state=42, stratify=df['encoded_labels'])

# Load tokenizer and model for MURIL
tokenizer = AutoTokenizer.from_pretrained("google/muril-base-cased", cache_dir='models/muril')
model = AutoModel.from_pretrained("google/muril-base-cased", cache_dir='models/muril').to('cuda')

# Function to extract embeddings
def extract_embeddings(model, tokenizer, texts, device, max_length=256):
    model.eval()
    embeddings = []
    with torch.no_grad():
        for text in tqdm(texts, desc="Extracting Embeddings"):
            encoding = tokenizer(
                text,
                add_special_tokens=True,
                max_length=max_length,
                padding='max_length',
                truncation=True,
                return_attention_mask=True,
                return_tensors='pt'
            ).to(device)

            # Extract embeddings
            outputs = model(**encoding, output_hidden_states=True)
            hidden_states = outputs.hidden_states[-1]  # Last hidden layer
            embedding = hidden_states.mean(dim=1).squeeze(0)  # Mean pooling
            embeddings.append(embedding.cpu().numpy())

    return torch.tensor(embeddings)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_embeddings = extract_embeddings(model, tokenizer, train_df['content'].values, device)
val_embeddings = extract_embeddings(model, tokenizer, val_df['content'].values, device)

# Train an SVM model
svm = SVC(kernel='linear', probability=True)
svm.fit(train_embeddings, train_df['encoded_labels'].values)

# Validate the SVM model
val_predictions = svm.predict(val_embeddings)
val_accuracy = accuracy_score(val_df['encoded_labels'].values, val_predictions)
print(f'Validation Accuracy: {val_accuracy:.4f}')
print(classification_report(val_df['encoded_labels'].values, val_predictions, target_names=le.classes_))

tokenizer_config.json:   0%|          | 0.00/206 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/3.16M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/113 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/953M [00:00<?, ?B/s]

Extracting Embeddings: 100%|██████████| 3916/3916 [00:51<00:00, 76.17it/s]
Extracting Embeddings: 100%|██████████| 436/436 [00:06<00:00, 66.46it/s]


Validation Accuracy: 0.3119
                   precision    recall  f1-score   support

         Negative       0.00      0.00      0.00        41
          Neutral       0.00      0.00      0.00        64
None of the above       0.00      0.00      0.00        17
      Opinionated       0.31      1.00      0.48       136
         Positive       0.00      0.00      0.00        58
        Sarcastic       0.00      0.00      0.00        79
    Substantiated       0.00      0.00      0.00        41

         accuracy                           0.31       436
        macro avg       0.04      0.14      0.07       436
     weighted avg       0.10      0.31      0.15       436



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [5]:
# Load tokenizer and model for IndicBERT
tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-bert", cache_dir='models/indic-bert')
model = AutoModel.from_pretrained("ai4bharat/indic-bert", cache_dir='models/indic-bert').to('cuda')

# Function to extract embeddings remains the same
train_embeddings = extract_embeddings(model, tokenizer, train_df['content'].values, device)
val_embeddings = extract_embeddings(model, tokenizer, val_df['content'].values, device)

# Train an SVM model
svm = SVC(kernel='linear', probability=True)
svm.fit(train_embeddings, train_df['encoded_labels'].values)

# Validate the SVM model
val_predictions = svm.predict(val_embeddings)
val_accuracy = accuracy_score(val_df['encoded_labels'].values, val_predictions)
print(f'Validation Accuracy: {val_accuracy:.4f}')
print(classification_report(val_df['encoded_labels'].values, val_predictions, target_names=le.classes_))


Extracting Embeddings: 100%|██████████| 3916/3916 [01:07<00:00, 58.02it/s]
Extracting Embeddings: 100%|██████████| 436/436 [00:09<00:00, 47.98it/s]


Validation Accuracy: 0.3188
                   precision    recall  f1-score   support

         Negative       0.00      0.00      0.00        41
          Neutral       0.26      0.11      0.15        64
None of the above       1.00      0.76      0.87        17
      Opinionated       0.33      0.76      0.46       136
         Positive       0.16      0.05      0.08        58
        Sarcastic       0.22      0.15      0.18        79
    Substantiated       0.00      0.00      0.00        41

         accuracy                           0.32       436
        macro avg       0.28      0.26      0.25       436
     weighted avg       0.24      0.32      0.24       436



In [13]:
from transformers import AutoTokenizer, AutoModel
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
import torch
from tqdm import tqdm
from sentence_transformers import SentenceTransformer

# tokenizer = AutoTokenizer.from_pretrained("nvidia/embed_sentence_transformer_v2", cache_dir='models/nvidia_embed_v2')
# model = AutoModel.from_pretrained("nvidia/embed_sentence_transformer_v2", cache_dir='models/nvidia_embed_v2').to('cuda')
model = SentenceTransformer('sentence-transformers/stsb-xlm-r-multilingual')

# Function to extract embeddings
def extract_embeddings(model, tokenizer, texts, device, max_length=256):
    model.eval()
    embeddings = []
    with torch.no_grad():
        for text in tqdm(texts, desc="Extracting Embeddings"):
            # encoding = tokenizer(
            #     text,
            #     add_special_tokens=True,
            #     max_length=max_length,
            #     padding='max_length',
            #     truncation=True,
            #     return_attention_mask=True,
            #     return_tensors='pt'
            # ).to(device)

            # Extract embeddings from the model
            outputs = model.encode(text)
            hidden_states = outputs.hidden_states[-1]  # Last hidden layer
            embedding = hidden_states.mean(dim=1).squeeze(0)  # Mean pooling
            embeddings.append(embedding.cpu().numpy())

    return torch.tensor(embeddings)

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [16]:
# Function to extract embeddings
def extract_embeddings(model, texts, device):
    model.eval()
    embeddings = []
    for text in tqdm(texts, desc="Extracting Embeddings"):
        # Extract embeddings using the SentenceTransformer model
        embedding = model.encode(text, convert_to_tensor=True, device=device)
        embeddings.append(embedding.cpu().numpy())  # Convert to numpy array for compatibility

    return torch.tensor(embeddings)

train_embeddings = extract_embeddings(model, train_df['content'].values, device)
val_embeddings = extract_embeddings(model, val_df['content'].values, device)

# Train an SVM model
svm = SVC(kernel='linear', probability=True)
svm.fit(train_embeddings, train_df['encoded_labels'].values)

# Validate the SVM model
val_predictions = svm.predict(val_embeddings)
val_accuracy = accuracy_score(val_df['encoded_labels'].values, val_predictions)
print(f'Validation Accuracy: {val_accuracy:.4f}')
print(classification_report(val_df['encoded_labels'].values, val_predictions, target_names=le.classes_))



Extracting Embeddings: 100%|██████████| 4352/4352 [01:06<00:00, 65.45it/s]
Extracting Embeddings: 100%|██████████| 544/544 [00:08<00:00, 64.35it/s]


Validation Accuracy: 0.2904
                   precision    recall  f1-score   support

         Negative       0.16      0.14      0.15        51
          Neutral       0.15      0.12      0.13        84
None of the above       0.90      0.90      0.90        20
      Opinionated       0.34      0.58      0.43       153
         Positive       0.12      0.09      0.10        69
        Sarcastic       0.33      0.23      0.27       115
    Substantiated       0.11      0.06      0.07        52

         accuracy                           0.29       544
        macro avg       0.30      0.30      0.29       544
     weighted avg       0.26      0.29      0.26       544



In [5]:
from transformers import AutoModel

model = AutoModel.from_pretrained('nvidia/NV-Embed-v2', trust_remote_code=True, cache_dir='models')


Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:  17%|#6        | 849M/5.00G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/789M [00:00<?, ?B/s]

: 