In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import StandardScaler
import pickle
import pandas as pd
from sklearn import svm
from sklearn.metrics import accuracy_score, f1_score, r2_score, mean_squared_error

In [None]:
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold
from sklearn.svm import SVC

In [None]:
from sklearn.metrics import make_scorer
# F1 scorer that treats the "machine" label as the positive class; passed to
# GridSearchCV as the `scoring` argument in get_best_c.
f1_scorer = make_scorer(f1_score, pos_label="machine")

In [None]:
import numpy as np

# Data Loading

In [None]:
# All four tables are indexed below by their 'text' and 'class' columns.
# The training split is read from CSV; the evaluation splits are read from pickles.
subsample_train = pd.read_csv("intermediate_data/3k_of_10k_train.csv")
# Evaluation splits; the labels in the trailing comments are the ones used when reporting scores.
gpt2_small_eval = pd.read_pickle("./intermediate_data/1k_subsample_test.pkl")  # "GPT-2 355M"
gpt2_1532m_eval = pd.read_pickle("./intermediate_data/1k_subsample_gpt2_1532m_test.pkl")  # "GPT-2 1532M"
gpt3_eval = pd.read_pickle("./intermediate_data/1k_subsample_gpt3_test.pkl")  # "GPT-3"

# Hyperparameter Search

In [None]:
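# An RBF kernel showed no significant performance difference here, so only the
# linear kernel is searched below. The RBF grid that was tried, kept for reference:
#{'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']},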

In [None]:
def get_best_c(model, c_values=(1, 10, 100, 1000)):
    """
    Grid-search the SVM regularisation parameter C for one sentence encoder.

    The texts in ``subsample_train['text']`` are embedded with ``model.encode``
    and a linear-kernel SVC is cross-validated (stratified 5-fold, a single
    repeat, fixed seed) for every candidate C, scored with ``f1_scorer``.

    Parameters
    ----------
    model
        Object exposing ``encode(list_of_str)``; the notebook passes
        SentenceTransformer instances.
    c_values
        Iterable of candidate C values. Defaults to the grid that used to be
        hard-coded, so existing calls behave as before.

    Returns
    -------
    The C value from ``c_values`` that GridSearchCV reports in ``best_params_``.

    Side effects
    ------------
    Prints GridSearchCV progress (``verbose=10``) and the best parameter dict.
    """
    train_encodings = model.encode(list(subsample_train['text']))
    train_labels = subsample_train['class']

    # list() so that any iterable (tuple, range, generator) is accepted.
    param_grid = [{'C': list(c_values)}]

    svc = SVC(kernel='linear', random_state=0)
    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=0)
    search = GridSearchCV(estimator=svc, param_grid=param_grid, scoring=f1_scorer, cv=cv, verbose=10)

    search.fit(train_encodings, train_labels)
    print(search.best_params_)
    return search.best_params_['C']

# Train and Eval Function

In [None]:
def _print_scores(label, y_true, y_pred):
    """Print accuracy and F1 (positive class 'machine') for one data split."""
    print(f"{label} accuracy: {accuracy_score(y_true, y_pred):.4f}")
    print(f"{label} F1 score: {f1_score(y_true, y_pred, pos_label='machine'):.4f}")


def train_and_eval_model(model, C=100, kernel='linear', gamma=0.001):
    """
    Train an SVM on sentence encodings and report scores on every data split.

    The training texts (``subsample_train``) are embedded with ``model.encode``
    and an ``svm.SVC`` is fitted on them. Accuracy and F1 are then printed for
    the training set and for the three evaluation sets (``gpt2_small_eval``,
    ``gpt2_1532m_eval``, ``gpt3_eval``), each embedded with the same encoder.

    Parameters
    ----------
    model
        Object exposing ``encode(list_of_str)``; the notebook passes
        SentenceTransformer instances.
    C, kernel, gamma
        Forwarded to ``svm.SVC``. Note gamma is unused with linear kernel.

    Returns
    -------
    (features, model): the training-set encodings and the fitted classifier.
    """
    train_labels = subsample_train['class']
    train_encodings = model.encode(list(subsample_train['text']))

    clf = svm.SVC(gamma=gamma, C=C, kernel=kernel, random_state=0, probability=True)
    clf.fit(train_encodings, train_labels)
    _print_scores("Train set", train_labels, clf.predict(train_encodings))

    # Same encode -> predict -> report sequence for each evaluation split,
    # in the order the scores were originally printed.
    eval_sets = (
        ("GPT-2 355M test set", gpt2_small_eval),
        ("GPT-2 1532M test set", gpt2_1532m_eval),
        ("GPT-3 test set", gpt3_eval),
    )
    for label, eval_df in eval_sets:
        eval_encodings = model.encode(list(eval_df['text']))
        _print_scores(label, eval_df['class'], clf.predict(eval_encodings))

    return train_encodings, clf

# RoBERTa

In [None]:
# Text encoder for this section; it is also reused later to build the ensemble features.
roberta_model = SentenceTransformer('all-roberta-large-v1') # 354 million parameter RoBERTa Large

In [None]:
# NOTE(review): the returned C is not stored; the following cells hard-code C=10
# (and "c10" in the saved file name) instead of reusing this result.
get_best_c(roberta_model) # 10 is best of [1, 10, 100, 1000]

In [None]:
# Fit and evaluate the SVM on RoBERTa encodings with C=10. The returned training
# encodings (roberta_features) are reused in the ensemble section.
roberta_features, roberta_clf = train_and_eval_model(roberta_model, C=10)

In [None]:
# Persist the fitted RoBERTa SVM; "c10" in the file name mirrors the C it was trained with.
with open("models/roberta_svm_c10.pkl", "wb") as f:
    pickle.dump(roberta_clf, f)

# MPNet

In [None]:
# Text encoder for this section: the 'all-mpnet-base-v2' Sentence-Transformers checkpoint.
mp_model = SentenceTransformer('all-mpnet-base-v2')

In [None]:
# Cross-validated search for C on the MPNet encodings.
mp_best_c = get_best_c(mp_model) 
# Uncomment to skip the search (presumably the value a previous run selected — confirm):
#mp_best_c = 10 # C: 10

In [None]:
# Fit and evaluate the SVM on MPNet encodings using the C selected by the search.
mp_features, mp_clf = train_and_eval_model(mp_model, C=mp_best_c)

In [None]:
# Persist the fitted MPNet SVM; the selected C is embedded in the file name.
with open(f"models/all-mpnet-base-v2-c{mp_best_c}.pkl", "wb") as f:
    pickle.dump(mp_clf, f)

In [None]:
# Drop the notebook's reference to the MPNet encoder; no later cell shown here uses it.
del mp_model

# MSMarco

In [None]:
# Text encoder for this section: the 'msmarco-bert-base-dot-v5' Sentence-Transformers checkpoint.
msmarco_model = SentenceTransformer('msmarco-bert-base-dot-v5')

In [None]:
# Cross-validated search for C on the MSMarco encodings.
msmarco_best_c = get_best_c(msmarco_model) 

In [None]:
# Fit and evaluate the SVM on MSMarco encodings using the C selected by the search.
msmarco_features, msmarco_clf = train_and_eval_model(msmarco_model, C=msmarco_best_c) # C = 1

In [None]:
# Persist the fitted MSMarco SVM; the selected C is embedded in the file name.
with open(f"models/msmarco-bert-base-dot-v5-c{msmarco_best_c}.pkl", "wb") as f:
    pickle.dump(msmarco_clf, f)

# MPNet Multi QA

In [None]:
# MPNet multi-QA encoder: load it, search C, then fit and evaluate the SVM.
# C is passed by keyword, matching the calls in the other encoder sections.
mp_multi_qa_model = SentenceTransformer('multi-qa-mpnet-base-dot-v1')
mpmulti_best_c = get_best_c(mp_multi_qa_model)
mqa_mp_features, mqa_mp_clf = train_and_eval_model(mp_multi_qa_model, C=mpmulti_best_c)

In [None]:
# Persist the fitted multi-QA MPNet SVM; the selected C is embedded in the file name.
with open(f"models/multi-qa-mpnet-base-dot-v1-c{mpmulti_best_c}.pkl", "wb") as f:
    pickle.dump(mqa_mp_clf, f)

# Mini Model

In [None]:
# MiniLM multi-QA encoder: load it, search C, then fit and evaluate the SVM.
# C is passed by keyword, matching the calls in the other encoder sections.
mini_model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')
mini_best_c = get_best_c(mini_model)
mini_features, mini_clf = train_and_eval_model(mini_model, C=mini_best_c)

In [None]:
# Persist the fitted MiniLM SVM; the selected C is embedded in the file name.
with open(f"models/multi-qa-MiniLM-L6-cos-v1-c{mini_best_c}.pkl", "wb") as f:
    pickle.dump(mini_clf, f)

This is pretty interesting.  The model is very small (80MB), but has a long sequence length.  It works very well here, perhaps because it is less overparameterized and therefore better behaved with the SVM as a classification head.

# Ensemble

Try out an ensemble of neural and statistical features to see if it's interesting.

In [None]:
# Need scaling features for stat, restore from training set
# Need scaling features for stat, restore from training set
# (presumably the scaler fitted on the statistical training features — confirm).
# pickle.load can execute arbitrary code, so only load files from a trusted source.
with open("models/linear_svm_3k_of_10k_scaler.pkl", "rb") as f:
    stat_scaler = pickle.load(f)

In [None]:
# Build the ensemble training matrix: the scaled statistical features are appended
# column-wise (axis=1) to the RoBERTa training encodings.
# Assumes the CSV rows are aligned with subsample_train — TODO confirm.
stat_features = pd.read_csv("features/3k_of_10k_combined_features.csv").to_numpy()
ensemble_features = np.concatenate([roberta_features, stat_scaler.transform(stat_features)], axis=1)

# Sanity check: the ensemble column count should equal the sum of the other two.
print(stat_features.shape)
print(roberta_features.shape)
print(ensemble_features.shape)

In [None]:
# Fit the ensemble SVM on the combined (RoBERTa + statistical) training features.
x = ensemble_features
y = subsample_train['class']
# random_state=0 matches train_and_eval_model: with probability=True the
# probability calibration shuffles data internally, so a fixed seed keeps the
# saved model reproducible. gamma has no effect with the linear kernel.
clf = svm.SVC(gamma=0.001, C=10., kernel='linear', random_state=0, probability=True)
clf.fit(x, y)

In [None]:
# Training-set scores for the ensemble SVM (x is the ensemble training matrix).
train_results = clf.predict(x)
print(f"Train set accuracy: {accuracy_score(subsample_train['class'], train_results):.4f}")
print(f"Train set F1 score: {f1_score(subsample_train['class'], train_results, pos_label='machine'):.4f}")

In [None]:
# GPT-2 355M eval split: RoBERTa encodings plus the precomputed statistical features
# (assumed row-aligned with gpt2_small_eval — TODO confirm).
gpt2_small_encodings = roberta_model.encode(list(gpt2_small_eval['text']))
stat_gpt2_sm_features = pd.read_csv("features/1k_combined_features_test.csv")

In [None]:
# Apply the restored scaler to the test statistical features.
# NOTE(review): a DataFrame is passed here, whereas the training cell passed a
# NumPy array (.to_numpy()); confirm the scaler handles both the same way.
scaled_stat_features_gpt2_355m = stat_scaler.transform(stat_gpt2_sm_features)

In [None]:
# Display one row to inspect the statistical feature columns.
stat_gpt2_sm_features.head(1)

In [None]:
# Same column layout as the training matrix: RoBERTa encodings first, scaled stat features after.
ensemble_test_gpt2_355_features = np.concatenate([gpt2_small_encodings, scaled_stat_features_gpt2_355m], axis=1)

In [None]:
# Ensemble scores on the GPT-2 355M eval split.
test_results = clf.predict(ensemble_test_gpt2_355_features)
print(f"GPT-2 355M test set accuracy: {accuracy_score(gpt2_small_eval['class'], test_results):.4f}")
print(f"GPT-2 355M test set F1 score: {f1_score(gpt2_small_eval['class'], test_results, pos_label='machine'):.4f}")

In [None]:
# GPT-2 1532M eval split: RoBERTa encodings of the texts.
gpt2_1532m_encodings = roberta_model.encode(list(gpt2_1532m_eval['text']))

In [None]:
# Precomputed statistical features for the GPT-2 1532M eval split
# (assumed row-aligned with gpt2_1532m_eval — TODO confirm).
stat_gpt2_1532m_features = pd.read_csv("features/1k_combined_features_gpt2_1532m_test.csv")

In [None]:
# Scale with the restored scaler; it is only transformed here, never re-fitted.
scaled_stat_features_gpt2_1532m = stat_scaler.transform(stat_gpt2_1532m_features)

In [None]:
# Same column layout as the training matrix: RoBERTa encodings first, scaled stat features after.
ensemble_test_gpt2_1532_features = np.concatenate([gpt2_1532m_encodings, scaled_stat_features_gpt2_1532m], axis=1)

In [None]:
# Ensemble scores on the GPT-2 1532M eval split.
test_results = clf.predict(ensemble_test_gpt2_1532_features)
print(f"GPT-2 1532M test set accuracy: {accuracy_score(gpt2_1532m_eval['class'], test_results):.4f}")
print(f"GPT-2 1532M test set F1 score: {f1_score(gpt2_1532m_eval['class'], test_results, pos_label='machine'):.4f}")

In [None]:
# GPT-3 eval split: RoBERTa encodings of the texts.
gpt3_encodings = roberta_model.encode(list(gpt3_eval['text']))

In [None]:
# Precomputed statistical features for the GPT-3 eval split
# (assumed row-aligned with gpt3_eval — TODO confirm).
stat_gpt3_features = pd.read_csv("features/1k_combined_features_gpt3_test.csv")

In [None]:
# Scale the statistical features with the restored scaler, then append them
# column-wise to the RoBERTa encodings (same layout as the training matrix).
scaled_stat_features_gpt3 = stat_scaler.transform(stat_gpt3_features)
ensemble_test_gpt3_features = np.concatenate([gpt3_encodings, scaled_stat_features_gpt3], axis=1)

In [None]:
# Ensemble scores on the GPT-3 eval split.
test_results = clf.predict(ensemble_test_gpt3_features)
print(f"GPT-3 test set accuracy: {accuracy_score(gpt3_eval['class'], test_results):.4f}")
print(f"GPT-3 test set F1 score: {f1_score(gpt3_eval['class'], test_results, pos_label='machine'):.4f}")

In [None]:
# Cache the combined (RoBERTa + scaled statistical) training feature matrix that the
# ensemble SVM was fitted on. This cell previously pickled `stat_gpt2_sm_features`
# (the raw statistical features of the GPT-2 355M test split), which does not match
# the "ensemble_roberta_stat" file name.
with open("features/ensemble_roberta_stat.pkl", "wb") as f:
    pickle.dump(ensemble_features, f)

In [None]:
# Persist the fitted ensemble SVM (clf was last fitted on ensemble_features).
with open("models/ensemble_roberta_stat.pkl", "wb") as f:
    pickle.dump(clf, f)