In [1]:
import pandas as pd

# Load the three task-1 splits. Each file is ';'-separated; header=0 combined
# with names= drops the file's own header row and uses the names given here.
# The evaluation split is read with two columns only (no "gold" column).
t1 = pd.read_csv(
    "data/trial/task1.csv",
    sep=";",
    header=0,
    names=["index", "text", "gold"],
)
p1 = pd.read_csv(
    "data/practice/task1.csv",
    sep=";",
    header=0,
    names=["index", "text", "gold"],
)
e1 = pd.read_csv(
    "data/evaluation/task1.csv",
    sep=";",
    header=0,
    names=["index", "text"],
)

In [2]:
# (rows, columns) of the trial, practice and evaluation frames, in that order.
print(*(frame.shape for frame in (t1, p1, e1)))

(8580, 3) (13478, 3) (7386, 2)


In [3]:
# Label distribution of the trial split: non-null cell count per remaining
# column, grouped by the value of "gold".
t1.groupby(by="gold").count()

Unnamed: 0_level_0,index,text
gold,Unnamed: 1_level_1,Unnamed: 2_level_1
0,8011,8011
1,569,569


In [4]:
# Label distribution of the practice split: non-null cell count per remaining
# column, grouped by the value of "gold".
p1.groupby(by="gold").count()

Unnamed: 0_level_0,index,text
gold,Unnamed: 1_level_1,Unnamed: 2_level_1
0,12468,12468
1,1010,1010


# Simple Baselines

In [15]:
# Every text from the trial, practice and evaluation splits (in that order),
# flattened into a single Python list.
all_text = [text for split in (t1, p1, e1) for text in split["text"].tolist()]

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over lower-cased word 1- to 3-grams with English stop words removed.
# The corpus is deliberately NOT passed to the constructor: the constructor's
# first parameter is `input` (a mode string such as "content"), so the
# documents belong in fit() only.
tfidf = TfidfVectorizer(lowercase=True, ngram_range=(1, 3), stop_words="english")

# NOTE(review): vocabulary and IDF weights are fitted on all three splits,
# i.e. including the texts later used for testing (no labels are involved).
# Fit on the practice texts only if a strictly inductive setup is wanted.
tfidf.fit(all_text)

# Practice split -> training features/labels.
X_train = tfidf.transform(p1["text"].tolist())
y_train = p1["gold"].tolist()

# Trial split -> held-out test features/labels.
X_test = tfidf.transform(t1["text"].tolist())
y_test = t1["gold"].tolist()

In [19]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# Multinomial naive Bayes baseline; alpha is the additive smoothing parameter.
clf1 = MultinomialNB(alpha=.1)
clf1.fit(X_train, y_train)

# The score is computed on X_test / y_test (the trial split), so it is
# labelled as test accuracy rather than train accuracy.
print("Test acc: ", clf1.score(X_test, y_test))
preds = clf1.predict(X_test)

# Per-class precision / recall / F1 on the same test predictions.
print(classification_report(y_test, preds))

Train acc:  0.9412587412587412
              precision    recall  f1-score   support

           0       0.95      0.99      0.97      8011
           1       0.62      0.29      0.40       569

    accuracy                           0.94      8580
   macro avg       0.79      0.64      0.68      8580
weighted avg       0.93      0.94      0.93      8580



In [21]:
from sklearn.linear_model import LogisticRegression

# Logistic-regression baseline with library-default hyper-parameters.
clf1 = LogisticRegression()
clf1.fit(X_train, y_train)

# The score is computed on X_test / y_test (the trial split), so it is
# labelled as test accuracy rather than train accuracy.
print("Test acc: ", clf1.score(X_test, y_test))
preds = clf1.predict(X_test)

# Per-class precision / recall / F1 on the same test predictions.
print(classification_report(y_test, preds))

Train acc:  0.9375291375291376
              precision    recall  f1-score   support

           0       0.94      1.00      0.97      8011
           1       0.85      0.07      0.13       569

    accuracy                           0.94      8580
   macro avg       0.89      0.53      0.55      8580
weighted avg       0.93      0.94      0.91      8580



In [23]:
from sklearn.ensemble import RandomForestClassifier

# Random-forest baseline with default hyper-parameters. random_state is fixed
# because the forest is stochastic; without it every run reports different
# numbers.
clf1 = RandomForestClassifier(random_state=42)
clf1.fit(X_train, y_train)

# The score is computed on X_test / y_test (the trial split), so it is
# labelled as test accuracy rather than train accuracy.
print("Test acc: ", clf1.score(X_test, y_test))
preds = clf1.predict(X_test)

# Per-class precision / recall / F1 on the same test predictions.
print(classification_report(y_test, preds))

Train acc:  0.9461538461538461
              precision    recall  f1-score   support

           0       0.95      1.00      0.97      8011
           1       0.86      0.22      0.35       569

    accuracy                           0.95      8580
   macro avg       0.91      0.61      0.66      8580
weighted avg       0.94      0.95      0.93      8580



# Simple Baselines w/ Sentence-BERT Features

In [24]:
from sentence_transformers import SentenceTransformer

# Load a pretrained sentence-embedding model by name; `encoder` is what turns
# the text columns into feature vectors in the following cells.
# NOTE(review): the model name suggests RoBERTa-base tuned on NLI + STS-B with
# mean pooling — confirm against the model card.
encoder = SentenceTransformer('roberta-base-nli-stsb-mean-tokens')

In [25]:
# Embed the practice-split texts (training features).
# A plain list is passed instead of the pandas Series: encode() is documented
# for a string or a list of strings, and integer lookup on a Series is
# label-based, so a Series is only safe when its index happens to be 0..n-1.
X_train = encoder.encode(p1["text"].tolist(), show_progress_bar=True)

Batches: 100%|██████████| 1685/1685 [02:18<00:00, 12.13it/s]


In [34]:
# Embed the trial-split texts (test features).
# A plain list is passed instead of the pandas Series: encode() is documented
# for a string or a list of strings, and integer lookup on a Series is
# label-based, so a Series is only safe when its index happens to be 0..n-1.
X_test = encoder.encode(t1["text"].tolist(), show_progress_bar=True)

Batches: 100%|██████████| 1073/1073 [01:29<00:00, 12.00it/s]


In [35]:
import numpy as np

# Encoder outputs as NumPy arrays; labels are the "gold" columns of the
# practice split (training) and the trial split (testing).
X_train, X_test = np.array(X_train), np.array(X_test)
y_train, y_test = p1["gold"], t1["gold"]

In [36]:
from sklearn.linear_model import LogisticRegression

# Logistic regression on the sentence embeddings, default hyper-parameters.
clf1 = LogisticRegression()
clf1.fit(X_train, y_train)

# The score is computed on X_test / y_test (the trial split), so it is
# labelled as test accuracy rather than train accuracy.
print("Test acc: ", clf1.score(X_test, y_test))
preds = clf1.predict(X_test)

# Per-class precision / recall / F1 on the same test predictions.
print(classification_report(y_test, preds))

Train acc:  0.9300699300699301
              precision    recall  f1-score   support

           0       0.97      0.96      0.96      8011
           1       0.48      0.55      0.51       569

    accuracy                           0.93      8580
   macro avg       0.72      0.75      0.74      8580
weighted avg       0.93      0.93      0.93      8580



In [38]:
from sklearn.ensemble import RandomForestClassifier

# Random forest on the sentence embeddings, using all cores and verbose
# progress logging. random_state is fixed because the forest is stochastic;
# without it every run reports different numbers.
clf1 = RandomForestClassifier(n_jobs=-1, verbose=True, random_state=42)
clf1.fit(X_train, y_train)

# The score is computed on X_test / y_test (the trial split), so it is
# labelled as test accuracy rather than train accuracy.
print("Test acc: ", clf1.score(X_test, y_test))
preds = clf1.predict(X_test)

# Per-class precision / recall / F1 on the same test predictions.
print(classification_report(y_test, preds))

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    3.5s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   10.8s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.1s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.2s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.1s
Train acc:  0.9382284382284383
              precision    recall  f1-score   support

           0       0.94      1.00      0.97      8011
           1       0.90      0.08      0.14       569

    accuracy                           0.94      8580
   macro avg       0.92      0.54      0.56      8580
weighted avg       0.94      0.94      0.91      8580

[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.2s finish

In [39]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost on the sentence embeddings with default hyper-parameters;
# random_state is fixed so repeated runs report the same numbers.
clf1 = AdaBoostClassifier(random_state=42)
clf1.fit(X_train, y_train)

# The score is computed on X_test / y_test (the trial split), so it is
# labelled as test accuracy rather than train accuracy.
print("Test acc: ", clf1.score(X_test, y_test))
preds = clf1.predict(X_test)

# Per-class precision / recall / F1 on the same test predictions.
print(classification_report(y_test, preds))

Train acc:  0.9222610722610722
              precision    recall  f1-score   support

           0       0.96      0.96      0.96      8011
           1       0.41      0.38      0.40       569

    accuracy                           0.92      8580
   macro avg       0.68      0.67      0.68      8580
weighted avg       0.92      0.92      0.92      8580

