Train a simple classifier as a baseline. It can be a traditional classifier (SVM, Random Forest, Naive Bayes, or other), or it can use pre-trained deep-learning models (pre-trained word embeddings, text embeddings, or other models, fine-tuned or not). In fact, two baselines based on transformers are provided. You can run at least one of them and explain in your report which method was used and what accuracy you obtained.

In [17]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score

# Fetch the NLTK data packages this notebook relies on (tagger model,
# stop-word list, tokenizer model, and the WordNet corpus).
for resource in ('averaged_perceptron_tagger', 'stopwords', 'punkt', 'wordnet'):
    nltk.download(resource)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/emilia/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /home/emilia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/emilia/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/emilia/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [18]:
# Load the train, test and gold splits; each file is read as JSON Lines
# (lines=True), i.e. one JSON record per line.
train_data = pd.read_json(
    './train/subtaskA_train_monolingual.jsonl',
    lines=True,
)
test_data = pd.read_json(
    './test/subtaskA_monolingual.jsonl',
    lines=True,
)
gold_data = pd.read_json(
    './gold/subtaskA_monolingual.jsonl',
    lines=True,
)

In [19]:
# Preprocess data
def preprocess(df):
    """Tokenize, filter and lemmatize the ``text`` column of *df*.

    Each entry of ``df['text']`` is lowercased and word-tokenized. Tokens
    that are not purely alphabetic or that are English stop words are
    discarded; the remaining tokens are lemmatized using their POS tag.
    The resulting token list is stored, as its ``str()`` representation,
    in a new ``text_final`` column.

    The frame is modified in place: ``text_final`` is added and the raw
    ``text`` column is dropped.

    Args:
        df: DataFrame with a ``text`` column of strings.

    Returns:
        The same DataFrame object, with ``text_final`` added and ``text``
        removed.
    """
    # Map the first letter of the POS tag to a WordNet POS constant;
    # any other tag falls back to noun.
    tag_map = defaultdict(lambda: wn.NOUN)
    tag_map['J'] = wn.ADJ
    tag_map['V'] = wn.VERB
    tag_map['R'] = wn.ADV

    # Built once instead of once per token/document: the stop-word list is
    # turned into a set for O(1) membership tests, and one lemmatizer is
    # shared across all entries.
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    text_final = []
    for entry in df['text']:
        # Make lowercase, then tokenize words
        tokens = word_tokenize(entry.lower())
        # Remove stop words / non-alphabetic tokens and lemmatize the rest
        processed = [
            lemmatizer.lemmatize(word, tag_map[tag[0]])
            for word, tag in pos_tag(tokens)
            if word.isalpha() and word not in stop_words
        ]
        text_final.append(str(processed))

    # Assign the whole column at once: this is positional, so it is correct
    # for any index (the old per-row df.loc[i, ...] assumed a 0..n-1 index).
    df['text_final'] = text_final
    # The original called df.drop(...) and discarded the result, so the
    # column was never removed; drop it for real.
    df.drop('text', axis=1, inplace=True)
    return df

In [None]:
# Run the preprocessing step on the training split, then on the gold split.
train_preprocessed = preprocess(df=train_data)
gold_preprocessed = preprocess(df=gold_data)

In [ ]:
import pickle

# Save the preprocessed frames to disk. The `with` blocks guarantee the
# files are closed even if pickle.dump raises.
with open('train.p', 'wb') as train_preprocessed_file:
    pickle.dump(train_preprocessed, train_preprocessed_file)

with open('gold.p', 'wb') as gold_preprocessed_file:
    pickle.dump(gold_preprocessed, gold_preprocessed_file)

In [None]:
# Separate the `label` column (target) from the remaining columns
# (features) for the training and gold frames.
y_train = train_preprocessed['label']
X_train = train_preprocessed.drop(columns='label')

y_gold = gold_preprocessed['label']
X_gold = gold_preprocessed.drop(columns='label')

In [ ]:
# Vectorize using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)

# Learn the vocabulary and IDF weights from the training text only, and
# vectorize the `text_final` column (passing the whole DataFrame would
# iterate over its column names instead of its documents).
train_X = vectorizer.fit_transform(X_train['text_final'])

# Reuse the vectorizer fitted on the training data. Refitting on the gold
# data would produce a different vocabulary, so the gold features would not
# line up with what the classifier was trained on, and it would leak
# evaluation data into the feature extraction.
gold_X = vectorizer.transform(X_gold['text_final'])

In [None]:
# Fit the baseline: an SVC with default settings, trained on the TF-IDF
# training features and the training labels (fit returns the estimator).
baseline = svm.SVC().fit(train_X, y_train)

In [ ]:
# Calculate accuracy of the baseline on the gold labels
y_pred = baseline.predict(gold_X)
# accuracy_score expects (y_true, y_pred); the original call had them swapped.
# The accuracy value is the same either way, but the documented order avoids
# mistakes if this call is adapted to a non-symmetric metric.
print(accuracy_score(y_gold, y_pred))

    Train at least two advanced classifiers based on deep learning: for the first method, fine-tune a type of BERT model (though not the version from the baseline in part 1); for the second method, use a recent type of generative LLM (such as Llama or something equivalent). Use part of the training data for validation (or use the dev data for validation) when building your models, and keep the test data aside for the final testing. (Alternatively, you can try prompt-based learning with LLMs for the second method.)