# Perceptron Model


In [2]:
# necessary import
from sklearn.linear_model import Perceptron
from sklearn.metrics import accuracy_score
import json
import csv
from nltk.tokenize import MWETokenizer, word_tokenize
from gensim.models import Word2Vec
import gensim.downloader
import numpy as np


### Load Data

In [6]:
path_adept = "datasets/adept/train-dev-test-split/"
path_pap = "datasets/pap/train-dev-test-split/"


# adept

def no_label(dictionary):
    return {key: value for key, value in dictionary.items() if key!="label"}


with open(path_adept+"/train.json", "r", encoding="utf-8") as file:
    adept_train = json.load(file)    
with open(path_adept+"/val.json", "r", encoding="utf-8") as file:
    adept_val = json.load(file)   
with open(path_adept+"/test.json", "r", encoding="utf-8") as file:
    adept_test = json.load(file)


# pap with multiclasses

with open(path_pap + "/multiclass/train.csv") as file:
    pap_multi_train = list(csv.DictReader(file))
with open(path_pap + "/multiclass/dev.csv") as file:
    pap_multi_dev = list(csv.DictReader(file))
with open(path_pap + "/multiclass/test.csv") as file:
    pap_multi_test = list(csv.DictReader(file))


# pap with binary labels

with open(path_pap + "/binary/train.csv") as file:
    pap_bin_train = list(csv.DictReader(file))
with open(path_pap + "/binary/dev.csv") as file:
    pap_bin_dev = list(csv.DictReader(file))
with open(path_pap + "/binary/test.csv") as file:
    pap_bin_test = list(csv.DictReader(file))

In [7]:
# data-label splits 

# adept
adept_train_data = [no_label(instance) for instance in adept_train]
adept_train_labels = [instance['label'] for instance in adept_train]
adept_val_data = [no_label(instance) for instance in adept_val]
adept_val_labels = [instance['label'] for instance in adept_val]
adept_test_data = [no_label(instance) for instance in adept_test]
adept_test_labels = [instance['label'] for instance in adept_test]

# pap with multiclasses
pap_multi_train_data = [instance['text'] for instance in pap_multi_train]
pap_multi_train_labels = [instance['label'] for instance in pap_multi_train]
pap_multi_dev_data = [instance['text'] for instance in pap_multi_dev]
pap_multi_dev_labels = [instance['label'] for instance in pap_multi_dev]
pap_multi_test_data = [instance['text'] for instance in pap_multi_test]
pap_multi_test_labels = [instance['label'] for instance in pap_multi_test]

# pap with binary labels
pap_bin_train_data = [instance['text'] for instance in pap_bin_train]
pap_bin_train_labels = [instance['label'] for instance in pap_bin_train]
pap_bin_dev_data = [instance['text'] for instance in pap_bin_dev]
pap_bin_dev_labels = [instance['label'] for instance in pap_bin_dev]
pap_bin_test_data = [instance['text'] for instance in pap_bin_test]
pap_bin_test_labels = [instance['label'] for instance in pap_bin_test]


In [31]:

# general tokenizer
def tokenize(sentences):
    tokenized = [word_tokenize(sentence.lower()) for sentence in sentences]

    return tokenized

# tokenizer for ADEPT data with special token <sep>
def tokenize_with_sep(sentences):
    tokenizer = MWETokenizer()
    tokenizer.add_mwe(('<', 'sep', '>'))
    tokenized_with_sep = [tokenizer.tokenize(sentence) for sentence in sentences]

    return tokenized_with_sep

# train CBOW model with vector size = 100
def train_CBOW(data):
    cbow_model = Word2Vec(min_count=1, vector_size=100, window=5)
    cbow_model.build_vocab(data, progress_per=10000)
    cbow_model.train(data, total_examples=cbow_model.corpus_count, epochs=100, report_delay=1)

    return cbow_model


# get sentence vector by calculate average word embeddings
def get_sent_vec(model, sentence, google_pretrained=False):
    
     # if not use Google pretrained Word2Vec
    if google_pretrained is False:
        sent_len = len(sentence)
        get_vec = [model.wv[word] if word in model.wv else np.zeros(100) for word in sentence]
        sent_vec = np.sum(get_vec, axis=0) / sent_len
    
    # if use Google pretrained Word2Vec
    else:
        sent_len = len(sentence)
        get_vec = [model[word] if word in model else np.zeros(300) for word in sentence]
        sent_vec = np.sum(get_vec, axis=0) / sent_len

    return sent_vec




In [3]:
# Load Google pretrained Word2Vec model
google_news = gensim.downloader.load('word2vec-google-news-300')



## PAP Dataset
### Train set

##### Get embeddings from our CBOW Model

In [29]:

# tokenize
pap_bin_train_tokenized = tokenize(pap_bin_train_data)
# train CBOW for pap data
pap_bin_CBOW = train_CBOW(pap_bin_train_tokenized)
# get sentence vector
pap_bin_train_sentence_vectors = [get_sent_vec(pap_bin_CBOW, sentence) for sentence in pap_bin_train_tokenized]

# tokenize
pap_multi_train_tokenized = tokenize(pap_multi_train_data)
# train CBOW for pap data
pap_multi_CBOW = train_CBOW(pap_multi_train_tokenized)
# get sentence vector
pap_multi_train_sentence_vectors = [get_sent_vec(pap_multi_CBOW, sentence) for sentence in pap_multi_train_tokenized]



#### Get Embeddings from Google Pretrained Word2Vec

In [36]:
pap_bin_train_sentence_vectors_gg =  [get_sent_vec(google_news, sentence, True) for sentence in pap_bin_train_tokenized]
pap_multi_train_sentence_vectors_gg = [get_sent_vec(google_news, sentence, True) for sentence in pap_multi_train_tokenized]


##### Train models

In [39]:
'''
Using our Embeddings from CBOW
'''
# train perceptrons using embeddings from our CBOW model
pap_bin_classifier = Perceptron(tol=1e-3, random_state=0)
pap_bin_classifier.fit(pap_bin_train_sentence_vectors, pap_bin_train_labels)

pap_multi_classifier = Perceptron(tol=1e-3, random_state=0)
pap_multi_classifier.fit(pap_multi_train_sentence_vectors, pap_multi_train_labels)

'''
Using Embeddings from Google pretrained Word2Vec
'''
# train perceptrons using pretrained embedding from Google
pap_bin_classifier_gg = Perceptron(tol=1e-3, random_state=0)
pap_bin_classifier_gg.fit(pap_bin_train_sentence_vectors_gg, pap_bin_train_labels)

pap_multi_classifier_gg = Perceptron(tol=1e-3, random_state=0)
pap_multi_classifier_gg.fit(pap_multi_train_sentence_vectors_gg, pap_multi_train_labels)

# print mean accuracy for train set
print("Mean accuracy of the training set:")
print(f"pap binary model: {pap_bin_classifier.score(pap_bin_train_sentence_vectors, pap_bin_train_labels)}")
print(f"pap multi-class model: {pap_multi_classifier.score(pap_multi_train_sentence_vectors, pap_multi_train_labels)}")
print("=============================")
print(f"pap binary model (Google Word2Vec): {pap_bin_classifier_gg.score(pap_bin_train_sentence_vectors_gg, pap_bin_train_labels)}")
print(f"pap multi-class model (Google Word2Vec): {pap_multi_classifier_gg.score(pap_multi_train_sentence_vectors_gg, pap_multi_train_labels)}")

Mean accuracy of the training set:
pap binary model: 0.7123842592592593
pap multi-class model: 0.33738425925925924
pap binary model (Google Word2Vec): 0.7407407407407407
pap multi-class model (Google Word2Vec): 0.48900462962962965


### Dev set

In [40]:
'''
Using our Embeddings from CBOW
'''
# testing binary model with dev set
pap_bin_dev_tokenized = tokenize(pap_bin_dev_data)
pap_bin_dev_sentence_vectors = [get_sent_vec(pap_bin_CBOW, sentence) for sentence in pap_bin_dev_tokenized]
pap_bin_dev_pred = pap_bin_classifier.predict(pap_bin_dev_sentence_vectors)

# testing multi-class models with dev set
pap_multi_dev_tokenized = tokenize(pap_multi_dev_data)
pap_multi_dev_sentence_vectors = [get_sent_vec(pap_multi_CBOW, sentence) for sentence in pap_multi_dev_tokenized]
pap_multi_dev_pred = pap_multi_classifier.predict(pap_multi_dev_sentence_vectors)

'''
Using Embeddings from Google pretrained Word2Vec
'''
# testing binary model with dev set
pap_bin_dev_sentence_vectors_gg = [get_sent_vec(google_news, sentence, True) for sentence in pap_bin_dev_tokenized]
pap_bin_dev_pred_gg = pap_bin_classifier_gg.predict(pap_bin_dev_sentence_vectors_gg)

# testing multi-class models with dev set
pap_multi_dev_sentence_vectors_gg = [get_sent_vec(google_news, sentence, True) for sentence in pap_multi_dev_tokenized]
pap_multi_dev_pred_gg = pap_multi_classifier_gg.predict(pap_multi_dev_sentence_vectors_gg)

# print accuracy
print("Accuracy of dev sets:")
print(f'pap binary model: {accuracy_score(pap_bin_dev_labels, pap_bin_dev_pred)}')
print(f'pap multi-class model: {accuracy_score(pap_multi_dev_labels, pap_multi_dev_pred)}')
print("=============================")
print(f'pap binary model (Google Word2Vec): {accuracy_score(pap_bin_dev_labels, pap_bin_dev_pred_gg)}')
print(f'pap multi-class model (Google Word2Vec): {accuracy_score(pap_multi_dev_labels, pap_multi_dev_pred_gg)}')

Accuracy of dev sets:
pap binary model: 0.6712962962962963
pap multi-class model: 0.2037037037037037
pap binary model (Google Word2Vec): 0.7175925925925926
pap multi-class model (Google Word2Vec): 0.3287037037037037


### Test set

In [51]:
# ONLY RUN THE FOLLOWING CODE FOR THE FINAL EVALUATION

'''
Outputs:
    pap_bin_test_pred: predictions of binary classifier
    pap_multi_test_pred: predictions of multi-classes classifier
'''

'''
Using our Embeddings from CBOW
'''
# testing binary model with test set
pap_bin_test_tokenized = tokenize(pap_bin_test_data)
pap_bin_test_sentence_vectors = [get_sent_vec(pap_bin_CBOW, sentence) for sentence in pap_bin_test_tokenized]
pap_bin_test_pred = pap_bin_classifier.predict(pap_bin_test_sentence_vectors)

# testing multi-class models with test set
pap_multi_test_tokenized = tokenize(pap_multi_test_data)
pap_multi_test_sentence_vectors = [get_sent_vec(pap_multi_CBOW, sentence) for sentence in pap_multi_test_tokenized]
pap_multi_test_pred = pap_multi_classifier.predict(pap_multi_test_sentence_vectors)

'''
Using Embeddings from Google pretrained Word2Vec
'''
# testing binary model with test set
pap_bin_test_sentence_vectors_gg = [get_sent_vec(google_news, sentence, True) for sentence in pap_bin_test_tokenized]
pap_bin_test_pred_gg = pap_bin_classifier_gg.predict(pap_bin_test_sentence_vectors_gg)

# testing multi-class models with test set
pap_multi_test_sentence_vectors_gg = [get_sent_vec(google_news, sentence, True) for sentence in pap_multi_test_tokenized]
pap_multi_test_pred_gg = pap_multi_classifier_gg.predict(pap_multi_test_sentence_vectors_gg)

# print accuracy
print("Accuracy of test sets:")
print(f'pap binary model: {accuracy_score(pap_bin_test_labels, pap_bin_test_pred)}')
print(f'pap multi-class model: {accuracy_score(pap_multi_test_labels, pap_multi_test_pred)}')
print("=============================")
print(f'pap binary model (Google Word2Vec): {accuracy_score(pap_bin_test_labels, pap_bin_test_pred_gg)}')
print(f'pap multi-class model (Google Word2Vec): {accuracy_score(pap_multi_test_labels, pap_multi_test_pred_gg)}')

Accuracy of test sets:
pap binary model: 0.7222222222222222
pap multi-class model: 0.18518518518518517
pap binary model (Google Word2Vec): 0.7083333333333334
pap multi-class model (Google Word2Vec): 0.38425925925925924


## ADEPT Dataset
### Training set

In [42]:
# using both sentence 1 and 2, separated by <SEP> token
def merge_sentence(data):
    adept_sentences = [item['sentence1'] + " <SEP> " + item['sentence2'] for item in data]

    return adept_sentences

# using only sentence 2
def extract_sentence2(data):
    adept_sentences = [item['sentence2'] for item in data]

    return adept_sentences

##### Get embeddings from our CBOW Model

In [43]:
# get sentence vectors with both sentences

# merge sentence 1, and 2
adept_train_sentences_merged = merge_sentence(adept_train_data)
# tokenize
adept_train_tokenized_merged = tokenize_with_sep(tokenize(adept_train_sentences_merged))
# train CBOW for ADEPT data
adept_CBOW_merged = train_CBOW(adept_train_tokenized_merged)
# get sentence vector
adept_train_sentence_vectors_merged = [get_sent_vec(adept_CBOW_merged, sentence) for sentence in adept_train_tokenized_merged]

In [44]:
# get sentence vectors only with sentence 2

# extract only sentence 2
adept_train_sentences_single = extract_sentence2(adept_train_data)
# tokenize
adept_train_tokenized_single = tokenize(adept_train_sentences_single)
# train CBOW for single sentence ADEPT data
adept_CBOW_single = train_CBOW(adept_train_tokenized_single)
# get sentence vector
adept_train_sentence_vectors_single = [get_sent_vec(adept_CBOW_single, sentence) for sentence in adept_train_tokenized_single]


#### Get Embeddings from Google Pretrained Word2Vec

In [45]:
# get sentence vectors with both sentences
adept_train_sentence_vectors_merged_gg = [get_sent_vec(google_news, sentence, True) for sentence in adept_train_tokenized_merged]

# get sentence vectors only with sentence 2
adept_train_sentence_vectors_single_gg = [get_sent_vec(google_news, sentence, True) for sentence in adept_train_tokenized_single]

##### Train models

In [47]:
'''
Using our Embeddings from CBOW
'''
adept_classifier_merged = Perceptron(tol=1e-3, random_state=0, early_stopping=True)
adept_classifier_merged.fit(adept_train_sentence_vectors_merged, adept_train_labels)

adept_classifier_single = Perceptron(tol=1e-3, random_state=0, early_stopping=True)
adept_classifier_single.fit(adept_train_sentence_vectors_single, adept_train_labels) 

'''
Using Embeddings from Google pretrained Word2Vec
'''
adept_classifier_merged_gg = Perceptron(tol=1e-3, random_state=0, early_stopping=True)
adept_classifier_merged_gg.fit(adept_train_sentence_vectors_merged_gg, adept_train_labels)

adept_classifier_single_gg = Perceptron(tol=1e-3, random_state=0, early_stopping=True)
adept_classifier_single_gg.fit(adept_train_sentence_vectors_single_gg, adept_train_labels) 

# print mean accuracy for train set
print("Mean accuracy of the training set:")
print(f"adept multi-classes model (both sentences): {adept_classifier_merged.score(adept_train_sentence_vectors_merged, adept_train_labels)}")
print(f"adept multi-classes model (only sentence 2): {adept_classifier_single.score(adept_train_sentence_vectors_single, adept_train_labels)}")
print("=============================")
print(f"adept multi-classes model (both sentences, Google Word2Vec): {adept_classifier_merged_gg.score(adept_train_sentence_vectors_merged_gg, adept_train_labels)}")
print(f"adept multi-classes model (only sentence 2, Google Word2Vec): {adept_classifier_single_gg.score(adept_train_sentence_vectors_single_gg, adept_train_labels)}")

Mean accuracy of the training set:
adept multi-classes model (both sentences): 0.6111542041576171
adept multi-classes model (only sentence 2): 0.5504964318957493
adept multi-classes model (both sentences, Google Word2Vec): 0.6694073844244492
adept multi-classes model (only sentence 2, Google Word2Vec): 0.6747595408004964


### Val set

In [48]:
'''
Using our Embeddings from CBOW
'''
# using both sentences
adept_val_sentences_merged = merge_sentence(adept_val_data)
adept_val_tokenized_merged = tokenize_with_sep(tokenize(adept_val_sentences_merged))
adept_val_sentence_vectors_merged = [get_sent_vec(adept_CBOW_merged, sentence) for sentence in adept_val_tokenized_merged]

# using only sentence 2
adept_val_sentences_single = merge_sentence(adept_val_data)
adept_val_tokenized_single = tokenize(adept_val_sentences_single)
adept_val_sentence_vectors_single = [get_sent_vec(adept_CBOW_single, sentence) for sentence in adept_val_tokenized_single]

'''
Using Embeddings from Google pretrained Word2Vec
'''

# using both sentences
adept_val_sentence_vectors_merged_gg = [get_sent_vec(google_news, sentence, True) for sentence in adept_val_tokenized_merged]
# using only sentence 2
adept_val_sentence_vectors_single_gg = [get_sent_vec(google_news, sentence, True) for sentence in adept_val_tokenized_single]

In [49]:
'''
Using our Embeddings from CBOW
'''

# test model with val set
adept_val_pred_merged = adept_classifier_merged.predict(adept_val_sentence_vectors_merged)
adept_val_pred_single = adept_classifier_single.predict(adept_val_sentence_vectors_single) 

'''
Using Embeddings from Google pretrained Word2Vec
'''
adept_val_pred_merged_gg = adept_classifier_merged_gg.predict(adept_val_sentence_vectors_merged_gg)
adept_val_pred_single_gg = adept_classifier_single_gg.predict(adept_val_sentence_vectors_single_gg) 


print("Accuracy of val sets:")
print(f'ADEPT multi-classes model (both sentences): {accuracy_score(adept_val_labels, adept_val_pred_merged)}')
print(f'ADEPT multi-classes model (only sentence 2): {accuracy_score(adept_val_labels, adept_val_pred_single)}')
print("=============================")
print(f'ADEPT multi-classes model (both sentences, Google Word2Vec): {accuracy_score(adept_val_labels, adept_val_pred_merged_gg)}')
print(f'ADEPT multi-classes model (only sentence 2, Google Word2Vec): {accuracy_score(adept_val_labels, adept_val_pred_single_gg)}')

Accuracy of val sets:
ADEPT multi-classes model (both sentences): 0.6058348851644941
ADEPT multi-classes model (only sentence 2): 0.5127250155183116
ADEPT multi-classes model (both sentences, Google Word2Vec): 0.6635630043451273
ADEPT multi-classes model (only sentence 2, Google Word2Vec): 0.6635630043451273


### Test set

In [50]:
# ONLY RUN THE FOLLOWING CODE FOR THE FINAL EVALUATION

'''
Outputs:
    adept_test_pred_merged: predictions of adept multi-classes classifier (with both sentences)
    adept_test_pred_single: predictions of adept multi-classes classifier (with only sentence 2)
    adept_test_pred_merged_gg: predictions of adept multi-classes classifier (with both sentences, using Google Word2Vec)
    adept_test_pred_single_gg: predictions of adept multi-classes classifier (with only sentence 2, using Google Word2Vec)
'''

'''
Using our Embeddings from CBOW
'''
# get document vector for test set
adept_test_sentences_merged = merge_sentence(adept_test_data)
adept_test_tokenized_merged = tokenize_with_sep(tokenize(adept_test_sentences_merged))
adept_test_sentence_vectors_merged = [get_sent_vec(adept_CBOW_merged, sentence) for sentence in adept_test_tokenized_merged]

adept_test_sentences_single = extract_sentence2(adept_test_data)
adept_test_tokenized_single = tokenize(adept_test_sentences_single)
adept_test_sentence_vectors_single = [get_sent_vec(adept_CBOW_single, sentence) for sentence in adept_test_tokenized_single]

# test models with test set
adept_test_pred_merged = adept_classifier_merged.predict(adept_test_sentence_vectors_merged)
adept_test_pred_single = adept_classifier_single.predict(adept_test_sentence_vectors_single)

'''
Using Embeddings from Google pretrained Word2Vec
'''

# get document vector for test set
adept_test_sentence_vectors_merged_gg = [get_sent_vec(google_news, sentence, True) for sentence in adept_test_tokenized_merged]
adept_test_sentence_vectors_single_gg = [get_sent_vec(google_news, sentence, True) for sentence in adept_test_tokenized_single]

# test models with test set
adept_test_pred_merged_gg = adept_classifier_merged_gg.predict(adept_test_sentence_vectors_merged_gg)
adept_test_pred_single_gg = adept_classifier_single_gg.predict(adept_test_sentence_vectors_single_gg)

print("Accuracy of test sets:")
print(f'ADEPT multi-classes model (with both sentences): {accuracy_score(adept_test_labels, adept_test_pred_merged)}')
print(f'ADEPT multi-classes model (only sentence 2): {accuracy_score(adept_test_labels, adept_test_pred_single)}')
print("=============================")
print(f'ADEPT multi-classes model (with both sentences, Google Word2Vec): {accuracy_score(adept_test_labels, adept_test_pred_merged_gg)}')
print(f'ADEPT multi-classes model (only sentence 2, Google Word2Vec): {accuracy_score(adept_test_labels, adept_test_pred_single_gg)}')

Accuracy of test sets:
ADEPT multi-classes model (with both sentences): 0.6240694789081885
ADEPT multi-classes model (only sentence 2): 0.5688585607940446
ADEPT multi-classes model (with both sentences, Google Word2Vec): 0.6792803970223326
ADEPT multi-classes model (only sentence 2, Google Word2Vec): 0.6892059553349876
