In [27]:
import csv
import re
from string import punctuation

import numpy as np
import pandas as pd
import torch
from gensim.models import KeyedVectors
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler

### Reading the data

In [28]:
# Load the train / dev / test splits. QUOTE_NONE is passed so pandas gives
# quote characters no special meaning while parsing.
train_df, val_df, test_def = (
    pd.read_csv(csv_path, quoting=csv.QUOTE_NONE)
    for csv_path in ("train_2024.csv", "dev_2024.csv", "test_2024.csv")
)

# Raw text column of each split, as NumPy arrays.
train_text, val_text, test_text = (
    frame["text"].values for frame in (train_df, val_df, test_def)
)

# Labels are read for the train and dev splits only and wrapped in torch tensors.
y_train, y_val = (
    torch.tensor(frame["label"].values) for frame in (train_df, val_df)
)

In [29]:
# Tokens to discard: NLTK's English stop words, every single punctuation
# character from string.punctuation, and the literal "..." token.
stop_words = set(stopwords.words('english')) | set(punctuation) | {"..."}

### Cleaning and Tokenization

In [30]:
def clean_and_tokenize(df, stopwords):
    """Tokenize, lowercase and stop-word-filter a collection of texts.

    Parameters
    ----------
    df : iterable of str
        The raw texts; each element is passed to ``word_tokenize``.
    stopwords : container of str
        Tokens to drop. Membership is tested on the lowercased token.

    Returns
    -------
    list of list of str
        One list of surviving lowercase tokens per input text, in input order.
    """
    return [
        [
            lowered
            for lowered in (raw_token.lower() for raw_token in word_tokenize(text))
            if lowered not in stopwords
        ]
        for text in df
    ]

In [31]:
# Apply the same cleaning / tokenization to all three splits.
filtered_sentence_train, filtered_sentence_val, filtered_sentence_test = (
    clean_and_tokenize(split_text, stop_words)
    for split_text in (train_text, val_text, test_text)
)

### Loading the pre-trained Wikipedia2Vec model

In [32]:
# Load the pre-trained vectors from the bz2-compressed file; binary=False
# selects the textual word2vec format.
wv_model = KeyedVectors.load_word2vec_format(
    'enwiki_20180420_100d.txt.bz2',
    binary=False,
)

### Generating embeddings

In [33]:
def generate_embeddings(filtered_sentence, model=None):
    """Turn tokenized sentences into fixed-size vectors by averaging word vectors.

    Each sentence is represented by the mean of the vectors of those tokens the
    embedding model knows. A sentence with no known token (including an empty
    sentence) falls back to the mean of all sentence vectors that could be
    computed in this call. If no sentence at all could be embedded, the
    fallback is a zero vector of length ``model.vector_size`` so the result is
    still a regular 2-D array instead of an array of NaN scalars.

    Parameters
    ----------
    filtered_sentence : sequence of sequence of str
        Tokenized sentences, one inner sequence per sentence.
    model : object, optional
        Embedding lookup exposing ``get_vector(token)`` (raising ``KeyError``
        for unknown tokens) and ``vector_size``. Defaults to the module-level
        ``wv_model``.

    Returns
    -------
    numpy.ndarray
        One row per input sentence, in input order.
    """
    if model is None:
        model = wv_model

    sentence_embeddings = []
    missing_embeddings_ids = []
    available_embeddings = []

    for i, current_sentence in enumerate(filtered_sentence):
        word_vectors = []
        for token in current_sentence:
            try:
                word_vectors.append(model.get_vector(token))
            except KeyError:
                # Out-of-vocabulary token: deliberately ignored.
                continue

        if word_vectors:
            embedding_mean = np.mean(word_vectors, axis=0)
            sentence_embeddings.append(embedding_mean)
            available_embeddings.append(embedding_mean)
        else:
            # Placeholder, overwritten with the fallback vector below.
            missing_embeddings_ids.append(i)
            sentence_embeddings.append(None)

    if missing_embeddings_ids:
        # The fallback is the same for every missing sentence: compute it once.
        if available_embeddings:
            fallback_embedding = np.mean(available_embeddings, axis=0)
        else:
            fallback_embedding = np.zeros(model.vector_size)
        for i in missing_embeddings_ids:
            sentence_embeddings[i] = fallback_embedding

    return np.array(sentence_embeddings)

In [34]:
# Build one averaged embedding per sentence for each of the three splits.
sentence_emb_train, sentence_emb_val, sentence_emb_test = (
    generate_embeddings(split_tokens)
    for split_tokens in (
        filtered_sentence_train,
        filtered_sentence_val,
        filtered_sentence_test,
    )
)

### Logistic Regression

In [None]:
# Logistic regression on the averaged sentence embeddings.
# (LogisticRegression is imported in the notebook's top import cell.)
lr = LogisticRegression(max_iter=10000)
lr.fit(sentence_emb_train, y_train)

ypred = lr.predict(sentence_emb_val)
# f1_score takes the ground truth first and the predictions second.
print(f1_score(y_val, ypred))

### SVM

In [39]:
# SVM on standardized sentence embeddings.
# (StandardScaler and svm are imported in the notebook's top import cell.)

# The scaler is fitted on the training split only and then reused unchanged
# for the validation and test splits.
scaler = StandardScaler()
sentence_emb_train_scaled = scaler.fit_transform(sentence_emb_train)
sentence_emb_val_scaled = scaler.transform(sentence_emb_val)
sentence_emb_test_scaled = scaler.transform(sentence_emb_test)

svm_model = svm.SVC(max_iter=10000)
svm_model.fit(sentence_emb_train_scaled, y_train)

ypred = svm_model.predict(sentence_emb_val_scaled)
# f1_score takes the ground truth first and the predictions second.
print(f1_score(y_val, ypred))

# Test-set predictions; `pred` is the name the saving cell writes out.
pred = svm_model.predict(sentence_emb_test_scaled)



### Random Forest Classifier 

In [None]:
# Random forest on the (unscaled) averaged sentence embeddings.
# (RandomForestClassifier is imported in the notebook's top import cell.)

# Fixed seed so that re-running the notebook reproduces the same forest,
# validation score and test predictions.
RF_RANDOM_STATE = 42

rf_model = RandomForestClassifier(n_estimators=100, random_state=RF_RANDOM_STATE)
rf_model.fit(sentence_emb_train, y_train)

ypred = rf_model.predict(sentence_emb_val)
# f1_score takes the ground truth first and the predictions second.
print(f1_score(y_val, ypred))

# Test-set predictions; `pred` is the name the saving cell writes out.
pred = rf_model.predict(sentence_emb_test)

### Saving the results

In [48]:
MODEL_NAME = "w2v_rf"
model_file_name = MODEL_NAME + ".pt"

# NOTE(review): `pred` is assigned by both the SVM cell and the random forest
# cell, so this writes the predictions of whichever of them ran last. Make sure
# MODEL_NAME matches that model before saving.
# Output format: a header line, then one "<row index>,<predicted label>" line
# per prediction. The `with` block closes the file, so no explicit close() is needed.
with open("{}.csv".format(MODEL_NAME), "w") as f:
    f.write("id,label\n")
    f.writelines("{},{}\n".format(i, l) for i, l in enumerate(pred))