The cell below imports the required libraries and defines the boilerplate helper functions.

In [1]:
import pandas as pd
import numpy as np
import nltk
import math
from sklearn.svm import LinearSVC
from sklearn.svm import OneClassSVM
from sklearn.metrics import accuracy_score
from collections import defaultdict


# Load the five raw CSV parts. In this notebook df_part1 is filtered on its
# 'creator' column to obtain original-author samples, and df_part2..df_part4
# are scanned for imitator samples and imitations. df_part5 is loaded here but
# is not used in the code visible in this section.
df_part1 = pd.read_csv('raw/data_part1.csv')
df_part2 = pd.read_csv('raw/data_part2.csv')
df_part3 = pd.read_csv('raw/data_part3.csv')
df_part4 = pd.read_csv('raw/data_part4.csv')
df_part5 = pd.read_csv('raw/data_part5.csv')

#Gets all of an original author's rows
def get_orig_rows(author):
    """Return ``[text, 'same']`` pairs for every df_part1 row by *author*.

    A row belongs to *author* when its 'creator' column equals *author*; the
    text is the value in the row's first column.
    """
    authored = df_part1[df_part1['creator'] == author]
    return [[record.tolist()[0], 'same'] for _, record in authored.iterrows()]

# Placeholder vocabulary entry: make_index adds it to every index it builds and
# embed_bow counts any token missing from the index under it.
UNK = "!!!<UNK>!!!"

#Uses nltk's casual tokenizer since these are tweets, emails, and texts
def preprocess(text):
    """Tokenize *text* and keep only the purely alphabetic tokens.

    Tokenization is delegated to nltk's ``casual_tokenize`` with
    ``preserve_case=False``; tokens for which ``str.isalpha`` is false
    (numbers, punctuation, mixed tokens) are dropped.
    """
    casual_tokens = nltk.tokenize.casual_tokenize(text, preserve_case=False)
    return list(filter(str.isalpha, casual_tokens))

#Creates an index of the unique tokens found in the corpus
def make_index(tokens):
    """Map every distinct token to a dense integer id in first-seen order.

    UNK is always part of the returned vocabulary; it receives the last id
    unless it already occurs in *tokens*.

    Args:
        tokens: iterable of token strings. It is left untouched: appending
            UNK to the caller's list would change the length (and contents)
            of a list that callers go on to reuse as the corpus for
            frequency counts.

    Returns:
        dict mapping token -> integer position.
    """
    index = {}
    for token in tokens:
        # len(index) is evaluated before insertion, so ids run 0, 1, 2, ...
        index.setdefault(token, len(index))
    index.setdefault(UNK, len(index))
    return index

#Embeds a token list with a simple bag of words model
def embed_bow(tokens, index):
    """Return a count vector of length ``len(index)`` for *tokens*.

    Each token increments the slot given by *index*; tokens that are not in
    *index* increment the UNK slot instead.
    """
    counts = np.zeros(len(index))
    for tok in tokens:
        slot = index[tok] if tok in index else index[UNK]
        counts[slot] += 1
    return counts

#creates a reverse index of the tokens in the corpus
def reverse_index(index):
    """Return the tokens of *index* ordered by their integer position.

    Entries sharing a position are ordered by token.
    """
    by_position = sorted((position, token) for token, position in index.items())
    return [token for _, token in by_position]


#For each text in df, embed it using the bag of words model
def one_row_per_embedding_bow(df, index):
    """Build a DataFrame with one bag-of-words row per row of *df*.

    Each output row is the count vector of the row's 'sample' text followed
    by the value in the row's second column, which is stored under
    '!author'. Feature columns are named by token, in index order.
    """
    embedded = [
        [*embed_bow(preprocess(record['sample']), index), record.tolist()[1]]
        for _, record in df.iterrows()
    ]
    bow_df = pd.DataFrame(embedded)
    bow_df.columns = reverse_index(index) + ['!author']
    return bow_df

#Here we embed an array by using log of the relative frequency of a token
#within the message over the relative frequency of the token in the corpus
#NOTE: Not sure what to do with UNKS here, could be important
def embed_bow_freq(tokens, all_tokens, index):
    """Embed *tokens* as log frequency ratios against a reference corpus.

    For every distinct token of the message that is present in *index*, the
    slot ``index[token]`` is set to
    ``log(local_freq / (global_freq + 1))`` where ``local_freq`` is the
    token's share of *tokens* and ``global_freq`` its share of *all_tokens*.
    Tokens missing from *index* are ignored; all other slots stay 0.

    Args:
        tokens: list of tokens of one message.
        all_tokens: list of tokens of the reference corpus.
        index: dict mapping token -> vector position.

    Returns:
        numpy array of length ``len(index)``.

    Raises:
        ZeroDivisionError: if *all_tokens* is empty while at least one
            message token is in *index*.
    """
    from collections import Counter

    a = np.zeros(len(index))
    known_counts = {tok: n for tok, n in Counter(tokens).items() if tok in index}
    if not known_counts:
        return a

    # Count the corpus once instead of calling list.count per token.
    global_counts = Counter(all_tokens)
    n_local = len(tokens)
    n_global = len(all_tokens)
    for token, local_count in known_counts.items():
        local_freq = local_count / n_local
        global_freq = global_counts[token] / n_global
        # The +1 keeps the denominator non-zero for tokens absent from the
        # corpus. NOTE(review): it also dominates global_freq (which is at
        # most 1) -- confirm this smoothing is the intended one.
        a[index[token]] = math.log(local_freq / (global_freq + 1))
    return a

#Embeds each message with relative frequencies
def one_row_per_embedding_bow_freq(df, all_tokens, index):
    """Build a DataFrame with one log-frequency-ratio row per row of *df*.

    Each output row is ``embed_bow_freq`` of the row's 'sample' text (scaled
    against *all_tokens*) followed by the value in the row's second column,
    which is stored under '!author'. Feature columns are named by token, in
    index order.
    """
    embedded = [
        [*embed_bow_freq(preprocess(record['sample']), all_tokens, index),
         record.tolist()[1]]
        for _, record in df.iterrows()
    ]
    bow_freq_df = pd.DataFrame(embedded)
    bow_freq_df.columns = reverse_index(index) + ['!author']
    return bow_freq_df

def P_and_R(df, target, value):
    """Print precision and recall of the '!predicted' column for one class.

    Args:
        df: DataFrame holding a '!predicted' column and the *target* column.
        target: name of the column with the true labels.
        value: the label treated as the positive class.

    Prints the frame length, the actual / predicted / correctly-predicted
    counts, then precision and recall. A ratio whose denominator is zero
    (nothing predicted as *value*, or no row actually *value*) is printed as
    ``nan`` instead of raising ZeroDivisionError.
    """
    predicted_value = df[df['!predicted'] == value]
    actually_value = df[df[target] == value]
    predicted_and_actually_value = actually_value[actually_value['!predicted'] == value]

    n_predicted = len(predicted_value)
    n_actual = len(actually_value)
    n_correct = len(predicted_and_actually_value)

    print("Length of df: " + str(len(df)))
    print("Number of actually " + str(value) + ": " + str(n_actual))
    print("Number of predicted " + str(value) + ": " + str(n_predicted))
    print("Number of predicted that is correct: " + str(n_correct))

    precision = n_correct / n_predicted if n_predicted else float("nan")
    recall_value = n_correct / n_actual if n_actual else float("nan")
    print("Precision: " + str(precision))
    print("Recall: " + str(recall_value))

def recall(df, target, value):
    """Return the recall of the '!predicted' column for class *value*.

    Args:
        df: DataFrame holding a '!predicted' column and the *target* column.
        target: name of the column with the true labels.
        value: the label treated as the positive class.

    Returns:
        Share of rows whose *target* equals *value* that were also predicted
        as *value*; ``nan`` when no row has *target* equal to *value*.
    """
    actually_value = df[df[target] == value]
    predicted_and_actually_value = actually_value[actually_value['!predicted'] == value]
    if len(actually_value) == 0:
        # Recall is undefined without any positive example.
        return float("nan")
    return len(predicted_and_actually_value) / len(actually_value)


This function gets all imitations and identifies the intended author as well as the original imitator.

In [2]:
imitator_count = 0

#Get each imitator's imitations and signify the intended author as well
#as the original imitator.
#A little hacky here, but all tables were checked for validity
def get_imitations(df):
    """Extract the imitation rows of one imitator-group table.

    Rows are read positionally: cell 0 is the natural writing sample (or a
    'participant...' header that starts a new imitator's section), cell 1
    the imitation text, cell 2 the imitation condition and cell 3 the
    message type.

    The running imitator number lives in the module-level
    ``imitator_count``, so numbering continues across successive calls.

    Args:
        df: DataFrame of one imitator group.

    Returns:
        list of ``[text, imitator, intended_author, msg_type]`` lists, one
        per row that carries an imitation.

    Raises:
        ValueError: if an imitation's condition names none of p1/p2/p3.
            Falling through would otherwise reuse the previous row's author
            (or fail with an unbound variable on the first imitation).
    """
    global imitator_count
    # Checked in this order; the first tag found in the condition wins.
    # NOTE(review): this is a plain substring test on the whole condition
    # cell, so 'p1' would also match e.g. 'p10' -- confirm against the data.
    author_tags = (
        ('p1', 'participant 1'),
        ('p2', 'participant 2'),
        ('p3', 'participant 3'),
    )
    rows = []
    for i, row in df.iterrows():
        cells = list(row)
        if 'participant' in cells[0]:
            imitator_count += 1
            continue

        text = cells[1]
        # Rows without an imitation hold a float (NaN) or a single space.
        if isinstance(text, float) or text == ' ':
            continue

        condition = cells[2]
        int_author = next(
            (author for tag, author in author_tags if tag in condition), None
        )
        if int_author is None:
            raise ValueError(
                "row %r: no intended author (p1/p2/p3) in imitation condition %r"
                % (i, condition)
            )

        imitator = 'imitator' + str(imitator_count)
        rows.append([text, imitator, int_author, cells[3]])
    return rows
                
# Gather the imitations group by group. The call order is kept because
# get_imitations numbers imitators with a counter shared across calls.
imitation_rows = []
imitation_rows += get_imitations(df_part2)
imitation_rows += get_imitations(df_part3)
imitation_rows += get_imitations(df_part4)

df_imit = pd.DataFrame(imitation_rows)
df_imit.columns = ['sample', 'imitator', 'intended author', 'type']

In [3]:
df_part2

Unnamed: 0,natural writing sample,imitation,imitation condition,message type
0,participant1,,,email
1,I have had the absolute worst day. You serious...,"Wanda, Can you believe winter is finally here?...",cond9_p1_t10 : Hi Wanda. I can't believe winte...,email
2,I hope to be in nursing school in 5 years. I h...,,,email
3,The books you have written go so much further ...,,,email
4,Today is a very hectic day. The baby has been ...,,,email
5,I have considered the future a lot. My husband...,,,email
6,participant10,,,email
7,"During days like today, it’s important to know...",The only way of finding out why your partner d...,cond3_p1_t1 : Hi Bill. Congratulations on bein...,email
8,Congratulations on the completion of the const...,,,email
9,Thank you for pulling me into your imaginary w...,,,email


This block creates a dataframe for all imitator samples 

In [4]:
imitator_sample_count = 0

def get_imitation_samples(df):
    """Extract the natural writing samples of one imitator-group table.

    A row whose first cell contains 'participant' starts a new imitator and
    bumps the module-level ``imitator_sample_count``; every other row yields
    ``[first cell, 'imitator<N>', fourth cell]`` for the current imitator.
    Numbering continues across successive calls.
    """
    global imitator_sample_count
    collected = []
    for _, record in df.iterrows():
        cells = record.tolist()
        if 'participant' in cells[0]:
            imitator_sample_count += 1
            continue
        label = 'imitator' + str(imitator_sample_count)
        collected.append([cells[0], label, cells[3]])
    return collected

# Gather the imitators' own samples group by group. The call order is kept
# because get_imitation_samples numbers imitators with a shared counter.
imit_sample_rows = []
imit_sample_rows += get_imitation_samples(df_part2)
imit_sample_rows += get_imitation_samples(df_part3)
imit_sample_rows += get_imitation_samples(df_part4)

df_imit_samples = pd.DataFrame(imit_sample_rows)
df_imit_samples.columns = ['sample', 'imitator', 'type']
df_imit

Unnamed: 0,sample,imitator,intended author,type
0,"Wanda, Can you believe winter is finally here?...",imitator1,participant 1,email
1,The only way of finding out why your partner d...,imitator2,participant 1,email
2,I love reading murder mysteries and your lates...,imitator3,participant 3,email
3,there was a little tussle between spring and f...,imitator4,participant 2,email
4,I'd like a barbecue with my family and some fr...,imitator5,participant 3,email
5,Hi Sarah. Congratulations on becoming Presiden...,imitator6,participant 1,email
6,Hi Emily. Congratulations on this great milest...,imitator7,participant 1,email
7,"You know, I would love to have a barbecue with...",imitator8,participant 3,email
8,Hi Kaitlyn. I cannot believe it is already Jun...,imitator9,participant 1,email
9,Hey Cam. I can't wait until summer arrives. It...,imitator10,participant 1,email


More boilerplate functions for imitators samples and for imitations themselves. 

In [5]:
#Get a particular imitator's samples
def get_imitator_sample_rows(df, imitator):
    """Return ``[text, 'different']`` pairs for every row of *df* whose
    'imitator' column equals *imitator*; the text is the row's first cell.
    """
    own_rows = df[df['imitator'] == imitator]
    return [[record.tolist()[0], 'different'] for _, record in own_rows.iterrows()]

#Get an author's samples with a specific message type
def get_orig_rows_with_type(df, author, msgType):
    """Return ``[text, 'different']`` pairs for the rows of *df* whose
    'creator' column equals *author* and whose third cell equals *msgType*;
    the text is the row's first cell.
    """
    authored = (record.tolist() for _, record in df[df['creator'] == author].iterrows())
    return [[cells[0], 'different'] for cells in authored if cells[2] == msgType]

#Get all the imitations from a particular imitator on a particular author
def get_imitation_test_set(df, imitator, intended_author):
    """Return a DataFrame of ``[text, "same"]`` rows for the imitations that
    *imitator* wrote of *intended_author*.

    Rows are read positionally: cell 0 is the text, cell 1 the imitator and
    cell 2 the intended author.
    """
    selected = []
    for _, record in df.iterrows():
        cells = record.tolist()
        if cells[1] != imitator or cells[2] != intended_author:
            continue
        selected.append([cells[0], "same"])
    return pd.DataFrame(selected)

#Get all imitations of a particular author
def get_attacks_on_author(df, intended_author):
    """Return ``[text, 'different']`` pairs for every row of *df* whose third
    cell equals *intended_author*; the text is the row's first cell.
    """
    as_cells = (record.tolist() for _, record in df.iterrows())
    return [[cells[0], 'different'] for cells in as_cells if cells[2] == intended_author]

# Sanity Checks:

13 participants in group 1 of imitators (emails), 11 in group 2 (tweets), and 12 in group 3 (texts)

In [12]:
# Count the 'participant...' header rows, i.e. the imitators, in each group.
print(sum(1 for _, row in df_part2.iterrows() if 'participant' in list(row)[0]))
print(sum(1 for _, row in df_part3.iterrows() if 'participant' in list(row)[0]))
print(sum(1 for _, row in df_part4.iterrows() if 'participant' in list(row)[0]))


13
11
11


5 samples each for group 1 and 50 samples each for groups 2 and 3

In [16]:
# Number of sample rows per group divided by the expected samples per imitator.
# Each call also advances the module-level imitator_sample_count, because
# get_imitation_samples increments it on every 'participant' header row.
print(len(get_imitation_samples(df_part2)) / 5)
print(len(get_imitation_samples(df_part3)) / 50)
print(len(get_imitation_samples(df_part4)) / 50)

13.0
11.0
11.0


1 imitation each for group 1, 10 each for group 2, and 11 each for group 3

In [20]:

# Number of imitation rows per group divided by the expected imitations per
# imitator. Each call also advances the module-level imitator_count, because
# get_imitations increments it on every 'participant' header row.
print(len(get_imitations(df_part2)) / 1)
print(len(get_imitations(df_part3)) / 10)
print(len(get_imitations(df_part4)) / 11)

13.0
11.0
11.0


All information was taken from Lazbin's thesis as well as from looking at the datasets themselves. 

# Baseline

Here, we will create the training set for the baseline, namely the first two authors and their respective imitators. 

In [None]:
# Originals of participants 1 and 2 (labelled 'same' by get_orig_rows) ...
auth1_auth2_rows = get_orig_rows("participant 1") + get_orig_rows("participant 2")
# ... plus the imitations that target them (labelled 'different' by
# get_attacks_on_author), taken from the first 134 rows of df_imit.
# NOTE(review): 134 is a hard-coded positional split of df_imit -- confirm it
# matches the intended train/test boundary.
auth1_auth2_imit_rows = get_attacks_on_author(df_imit[0:134], "participant 1") + get_attacks_on_author(df_imit[0:134], "participant 2")
# sample(frac=1) returns all rows in random order, i.e. shuffles the set.
baseline_df_train = pd.DataFrame(auth1_auth2_rows + auth1_auth2_imit_rows).sample(frac=1)
# The 'author' column holds the 'same' / 'different' label, not an author name.
baseline_df_train.columns = ['sample', 'author']

Now, we create the test set, namely the original messages of author 3 and the attempted imitations of author 3.

In [None]:
# Originals of participant 3 (labelled 'same' by get_orig_rows) ...
auth3_rows = get_orig_rows("participant 3")
# ... plus the imitations that target participant 3 (labelled 'different'),
# taken from df_imit rows 134 onwards.
# NOTE(review): 134 is the same hard-coded positional split used for the
# training set -- confirm it matches the intended boundary.
auth3_imit_rows = get_attacks_on_author(df_imit[134:], "participant 3")
# sample(frac=1) returns all rows in random order, i.e. shuffles the set.
baseline_df_test = pd.DataFrame(auth3_rows + auth3_imit_rows).sample(frac=1)
# The 'author' column holds the 'same' / 'different' label, not an author name.
baseline_df_test.columns = ['sample', 'author']

We then create bag of words embedding for the training and test sets.

In [None]:
# Build the vocabulary from the training texts only: join them into one string,
# tokenize it, and index the distinct tokens.
all_train = " ".join(baseline_df_train['sample'])
all_train_tokens = preprocess(all_train)
train_index = make_index(all_train_tokens)
# Feature column names, in index order.
all_train_cols = reverse_index(train_index)

# Both sets are embedded against the training vocabulary; test tokens that are
# not in it are counted under UNK by embed_bow.
df_train_bow = one_row_per_embedding_bow(baseline_df_train, train_index)
df_test_bow = one_row_per_embedding_bow(baseline_df_test, train_index)

Now we train the SVM (a `LinearSVC`) and evaluate it on the training and test sets.

In [None]:
# Work on copies so that adding '!predicted' leaves the embedding frames intact.
svm_train_bow = df_train_bow.copy()
svm_test_bow = df_test_bow.copy()

# Fit a linear SVM on the token-count columns against the label in '!author'.
baseline_svm = LinearSVC()
baseline_svm.fit(svm_train_bow[all_train_cols], svm_train_bow['!author'])

# Store the predictions next to the true labels for the report below.
svm_train_bow['!predicted'] = baseline_svm.predict(svm_train_bow[all_train_cols])
svm_test_bow['!predicted'] = baseline_svm.predict(svm_test_bow[all_train_cols])


In [None]:
# Report for the training set (authors seen during fitting).
print("Known Authors P/R")
P_and_R(svm_train_bow, "!author", 'different')
print("Rand Index:",
      accuracy_score(svm_train_bow["!author"], svm_train_bow["!predicted"]))
print("Number Correct:",
      accuracy_score(svm_train_bow["!author"], svm_train_bow["!predicted"], normalize=False))

print()

# Report for the test set (author not seen during fitting).
print("Unknown Authors P/R")
P_and_R(svm_test_bow, "!author", 'different')
print("Rand Index:",
      accuracy_score(svm_test_bow["!author"], svm_test_bow["!predicted"]))
print("Number Correct:",
      accuracy_score(svm_test_bow["!author"], svm_test_bow["!predicted"], normalize=False))

# Baseline Pt2

Now we perform the same baseline test but on a different embedding (relative frequency ratios on a log scaling). 

In [None]:
# Embed both sets with log frequency ratios; the training tokens serve as the
# reference corpus and the training vocabulary as the index for both.
freq_train_bow = one_row_per_embedding_bow_freq(baseline_df_train, all_train_tokens, train_index)
freq_test_bow = one_row_per_embedding_bow_freq(baseline_df_test, all_train_tokens, train_index)

In [None]:
# Copies keep the '!predicted' column out of the embedding frames.
freq_svm_train_bow = freq_train_bow.copy()
freq_svm_test_bow = freq_test_bow.copy()

# Linear SVM on the frequency-ratio features against the '!author' label.
freq_baseline_svm = LinearSVC()
freq_baseline_svm.fit(
    freq_svm_train_bow[all_train_cols],
    freq_svm_train_bow['!author'],
)

freq_svm_train_bow['!predicted'] = freq_baseline_svm.predict(
    freq_svm_train_bow[all_train_cols]
)
freq_svm_test_bow['!predicted'] = freq_baseline_svm.predict(
    freq_svm_test_bow[all_train_cols]
)

In [None]:
# Report for the training set. Precision/recall are taken for the 'different'
# class, like every other report in this notebook, so the numbers are
# comparable across embeddings (this call previously used 'same').
print("Known Authors P/R")
P_and_R(freq_svm_train_bow, "!author", 'different')
print("Rand Index: " + str(accuracy_score(freq_svm_train_bow["!author"], freq_svm_train_bow["!predicted"])))
print("Number Correct: " + str(accuracy_score(freq_svm_train_bow["!author"], freq_svm_train_bow["!predicted"], normalize=False)))
print()

# Report for the test set (author not seen during fitting).
print("Unknown Authors P/R")
P_and_R(freq_svm_test_bow, "!author", 'different')
print("Rand Index: " + str(accuracy_score(freq_svm_test_bow["!author"], freq_svm_test_bow["!predicted"])))
print("Number Correct: " + str(accuracy_score(freq_svm_test_bow["!author"], freq_svm_test_bow["!predicted"], normalize=False)))

# Comparison Using SVM

Here, we will first preprocess each author and imitator's reference sets and store them into a dictionary. 

In [None]:
# Reference token lists, keyed by the value in each row's second cell.
author_dict = defaultdict(list)
imitator_dict = defaultdict(list)

# Cell 0 is the text; cell 1 is the key (author for df_part1, imitator for
# df_imit_samples).
for _, record in df_part1.iterrows():
    cells = record.tolist()
    author_dict[cells[1]].extend(preprocess(cells[0]))

for _, record in df_imit_samples.iterrows():
    cells = record.tolist()
    imitator_dict[cells[1]].extend(preprocess(cells[0]))



Now, we go through each imitation and train one SVM whose features are scaled against the original author's reference set and another SVM whose features are scaled against the imitator's reference set. We then record the decision-function results of those two SVMs and compare them.

In [None]:
df_imit_comp = df_imit.copy()
svm1_distance = []
svm2_distance = []

svm1_class = []
svm2_class = []

for i,row in df_imit_comp.iterrows():
    # r = [sample text, imitator, intended author, message type]
    r = list(row)
    svm1 = LinearSVC()
    svm2 = LinearSVC()
    msg_tokens = preprocess(r[0])

    #Here, we grab the imitator samples and the author samples
    #for this particular imitation. Author samples are labelled 'same',
    #imitator samples 'different'.
    messages = get_orig_rows(r[2]) + \
    get_imitator_sample_rows(df_imit_samples, r[1])

    #Create a (shuffled) dataframe for these messages
    df_messages = pd.DataFrame(messages).sample(frac=1)
    df_messages.columns = ['sample', '!author']

    #Preprocess all the messages
    all_msg = " ".join(df_messages['sample'])
    all_msg_tokens = preprocess(all_msg)
    msg_index = make_index(all_msg_tokens)
    #Feature columns in index order. The frames must be indexed with a list:
    #a dict is not accepted as a DataFrame indexer by pandas >= 2.0.
    msg_cols = reverse_index(msg_index)

    #Use the frequency embedding but log-scale it against:
    #Original author reference set for SVM1
    #imitator reference set for SVM2
    df_train_svm1 = one_row_per_embedding_bow_freq(df_messages, author_dict[r[2]], msg_index)
    df_train_svm2 = one_row_per_embedding_bow_freq(df_messages, imitator_dict[r[1]], msg_index)

    #Fit a linear SVM on each embedding
    svm1.fit(df_train_svm1[msg_cols], df_train_svm1["!author"])
    svm2.fit(df_train_svm2[msg_cols], df_train_svm2["!author"])

    #Embed the imitation once per reference set; reshape to a single-row
    #matrix as expected by decision_function / predict
    msg_vec_svm1 = embed_bow_freq(msg_tokens, author_dict[r[2]], msg_index).reshape(1, -1)
    msg_vec_svm2 = embed_bow_freq(msg_tokens, imitator_dict[r[1]], msg_index).reshape(1, -1)

    #Record the decision function value and the predicted class of each SVM
    svm1_distance.append(svm1.decision_function(msg_vec_svm1))
    svm2_distance.append(svm2.decision_function(msg_vec_svm2))

    svm1_class.append(svm1.predict(msg_vec_svm1))
    svm2_class.append(svm2.predict(msg_vec_svm2))

In [None]:
# Attach the per-imitation results. Columns are added in this order, which
# later positional reads of the rows depend on.
df_imit_comp["SVM1_dist"] = svm1_distance
df_imit_comp["SVM2_dist"] = svm2_distance
# Each stored distance is a one-element array; compare the scalar inside.
svm_diff = [dist1[0] - dist2[0] for dist1, dist2 in zip(svm1_distance, svm2_distance)]
df_imit_comp["Distance"] = svm_diff

df_imit_comp["svm1_pred"] = [predicted[0] for predicted in svm1_class]
df_imit_comp["svm2_pred"] = [predicted[0] for predicted in svm2_class]
df_imit_comp

In [None]:
# Row counts per imitator and per intended author.
imitations_per_imitator = defaultdict(int)
imitators_per_author = defaultdict(int)

# Counts of rows with a negative SVM1-minus-SVM2 distance.
imit_closer_to_author = defaultdict(int)
auth_farther_from_imit = defaultdict(int)

for i in df_imit_comp["imitator"]:
    imitations_per_imitator[i] += 1

for i in df_imit_comp["intended author"]:
    imitators_per_author[i] += 1

for i,r in df_imit_comp.iterrows():
    imit = r["imitator"]
    auth = r["intended author"]

    imit_dist = r["SVM2_dist"]
    imit_diff = r["Distance"]

    # Only imitators with more than one imitation contribute to the counts.
    # NOTE(review): the denominators below still include the rows of
    # single-imitation imitators -- confirm that is intended.
    if imitations_per_imitator[imit] > 1 and imit_diff < 0:
        imit_closer_to_author[imit] += 1
        auth_farther_from_imit[auth] += 1

auth_hit_rate = {
    key: auth_farther_from_imit[key] / imitators_per_author[key]
    for key in auth_farther_from_imit
}
imit_hit_rate = {
    key: imit_closer_to_author[key] / imitations_per_imitator[key]
    for key in imit_closer_to_author
}

# Ten best imitators: highest hit rate first, ties broken by name.
print(sorted(imit_hit_rate.items(), key = lambda p: (-p[1], p[0]))[0:10])
print()
print(auth_hit_rate)

# GloVe / word2vec

In [None]:
# Load spaCy's "en_core_web_lg" pipeline; the per-token vectors it provides are
# averaged per message in one_row_per_embedding_cbow.
import spacy
nlp = spacy.load("en_core_web_lg")

In [None]:
def one_row_per_embedding_cbow(df):
    """Build a DataFrame with one averaged word-vector row per row of *df*.

    Each 'sample' text is run through the module-level spaCy pipeline
    ``nlp`` and represented by the mean of its token vectors. The value in
    the row's second column is appended and stored under '!author'.

    Args:
        df: DataFrame with a 'sample' column and the label in its second
            column.

    Returns:
        DataFrame whose feature columns are labelled 0..d-1, where d is the
        vector width of the loaded model (300 for the model this notebook
        loads, per the previous hard-coded labels), plus '!author'.
    """
    rows = []
    for i, row in df.iterrows():
        doc = nlp(row['sample'])
        if len(doc):
            word_vec_avg = np.mean(np.array([t.vector for t in doc]), axis=0)
        else:
            # Nothing to average: np.mean over no vectors yields a scalar nan
            # that cannot be turned into a row. Doc.vector is expected to be
            # an all-zero vector of the model's width for an empty Doc --
            # confirm for the installed spaCy version.
            word_vec_avg = doc.vector
        rows.append(list(word_vec_avg) + [list(row)[1]])
    cbow_df = pd.DataFrame(rows)
    # Derive the number of feature columns from the data instead of assuming
    # a 300-dimensional model; the last column is the label.
    n_dims = cbow_df.shape[1] - 1
    cbow_df.columns = list(range(n_dims)) + ["!author"]
    return cbow_df

In [None]:
# Embed the baseline train and test sets with averaged spaCy word vectors.
train_cbow = one_row_per_embedding_cbow(baseline_df_train)
test_cbow = one_row_per_embedding_cbow(baseline_df_test)


In [None]:
# Labels of the 300 vector columns produced by one_row_per_embedding_cbow.
CBOW_COLS = list(range(300))

# Linear SVM on the averaged word vectors against the '!author' label.
baseline_svm_cbow = LinearSVC()
baseline_svm_cbow.fit(
    train_cbow[CBOW_COLS],
    train_cbow["!author"],
)

train_cbow['!predicted'] = baseline_svm_cbow.predict(train_cbow[CBOW_COLS])
test_cbow['!predicted'] = baseline_svm_cbow.predict(test_cbow[CBOW_COLS])

In [None]:
# Report for the training set (authors seen during fitting).
print("Known Authors P/R")
P_and_R(train_cbow, "!author", 'different')
print("Rand Index:",
      accuracy_score(train_cbow["!author"], train_cbow["!predicted"]))
print("Number Correct:",
      accuracy_score(train_cbow["!author"], train_cbow["!predicted"], normalize=False))
print()

# Report for the test set (author not seen during fitting).
print("Unknown Authors P/R")
P_and_R(test_cbow, "!author", 'different')
print("Rand Index:",
      accuracy_score(test_cbow["!author"], test_cbow["!predicted"]))
print("Number Correct:",
      accuracy_score(test_cbow["!author"], test_cbow["!predicted"], normalize=False))

In [None]:
test_cbow