# Measuring memorization: n-gram overlap between generated text and the training set used to train the generation models

In [1]:
import pandas as pd
import numpy as np

In [2]:
def make_unique_ngram_dict(generations, ngram_sizes=(2, 3, 4, 5)):
    """Collect the distinct word n-grams occurring in a collection of texts.

    Every item of ``generations`` is converted with ``str()`` and split on
    single space characters. Runs of ``n`` consecutive tokens are joined with
    ``'_'`` to form the n-gram strings. N-grams never span two items, and an
    item with fewer than ``n`` tokens contributes no n-grams of that size.

    Args:
        generations: Iterable of texts (non-string items are stringified).
        ngram_sizes: Iterable of n-gram lengths to extract. Defaults to
            ``(2, 3, 4, 5)``.

    Returns:
        Dict mapping ``"<n>-grams"`` to the set of unique n-gram strings, with
        keys in the order given by ``ngram_sizes``.

    Raises:
        ValueError: If any requested n-gram size is smaller than 1.
    """
    # Materialise once so a one-shot iterable can be reused for every text.
    sizes = tuple(ngram_sizes)
    if any(n < 1 for n in sizes):
        raise ValueError("n-gram sizes must be positive integers")

    ngram_sets = {"{}-grams".format(n): set() for n in sizes}

    for gen in generations:
        tokens = str(gen).split(' ')
        for n in sizes:
            # range() is empty when the text has fewer than n tokens.
            ngram_sets["{}-grams".format(n)].update(
                '_'.join(tokens[start:start + n])
                for start in range(len(tokens) - n + 1)
            )

    return ngram_sets

In [3]:
def calc_overlap_percentage(train_ngram_dict, test_ngram__dict):
    """Measure how many of the test n-grams also occur in the training n-grams.

    Both arguments map an n-gram label (e.g. ``"2-grams"``) to a set of
    n-grams and must have exactly the same labels.

    Args:
        train_ngram_dict: Label -> set of n-grams from the training texts.
        test_ngram__dict: Label -> set of n-grams from the texts under test.

    Returns:
        A ``(results_dict, overlap_dict)`` tuple:

        * ``results_dict`` maps ``"<label>_overlap_perc"`` to
          ``len(shared) / len(test set)``. This is a fraction in ``[0, 1]``,
          not a value scaled to 100. It is ``0.0`` when the test set is empty.
        * ``overlap_dict`` maps ``"<label>_overlap"`` to the set of n-grams
          present in both inputs.

    Raises:
        ValueError: If the two dicts do not have the same labels.
    """
    # Compare the key sets up front: zipping the two dicts would silently
    # ignore surplus labels and would depend on insertion order.
    if set(train_ngram_dict) != set(test_ngram__dict):
        raise ValueError("Sorry, the train and test n-gram keys don't match")

    overlap_dict = {}
    results_dict = {}
    for label, train_ngrams in train_ngram_dict.items():
        test_ngrams = test_ngram__dict[label]
        shared = train_ngrams.intersection(test_ngrams)

        overlap_dict[str(label) + "_overlap"] = shared
        # An empty test set (e.g. all texts shorter than n tokens) has no
        # overlap by definition; avoid dividing by zero.
        results_dict[str(label) + "_overlap_perc"] = (
            len(shared) / len(test_ngrams) if test_ngrams else 0.0
        )

    return results_dict, overlap_dict

In [4]:
# Read the "reviewText" column of the negative and positive sample files into
# plain lists; the intermediate DataFrames are never bound to a name, so there
# is nothing to delete afterwards.
neg_predict_train = pd.read_csv(
    "../data/2_data_remove_duplicates_25_neg_sampled.csv", encoding='utf-8'
)["reviewText"].tolist()

pos_predict_train = pd.read_csv(
    "../data/2_data_remove_duplicates_25_pos_sampled.csv", encoding='utf-8'
)["reviewText"].tolist()

In [5]:
# Single list of training texts: positives first, then negatives.
tot_predict_train = [*pos_predict_train, *neg_predict_train]

In [6]:
# Unique 2- to 5-grams of the training texts; reused as the reference for
# every overlap computation below.
train_ngram_dict = make_unique_ngram_dict(tot_predict_train)

## GPT-2: training/testing n-gram overlap

In [7]:
# Read the "reviewText" column of the GPT-2 negative and positive prediction
# files into plain lists, without keeping the DataFrames around.
neg_predict_test = pd.read_csv(
    "../data/GPT2_Neg_Pred.csv", encoding='utf-8'
)["reviewText"].tolist()

pos_predict_test = pd.read_csv(
    "../data/GPT2_Pos_Pred.csv", encoding='utf-8'
)["reviewText"].tolist()

In [8]:
# Single list of GPT-2 texts: positives first, then negatives.
tot_predict_test = [*pos_predict_test, *neg_predict_test]

In [9]:
# Unique 2- to 5-grams of the GPT-2 texts loaded above.
test_ngram__dict = make_unique_ngram_dict(tot_predict_test)

In [10]:
# results: share of the test n-grams that also occur in the training n-grams;
# overlaps: the shared n-grams themselves.
results, overlaps = calc_overlap_percentage(train_ngram_dict, test_ngram__dict)

In [11]:
# Display the overlap ratios for the GPT-2 texts.
results

{'2-grams_overlap_perc': 0.268855685783305,
 '3-grams_overlap_perc': 0.11655416876044057,
 '4-grams_overlap_perc': 0.03213572058360914,
 '5-grams_overlap_perc': 0.00671748429753585}

## T5, single prompt at the encoder: training/testing n-gram overlap

In [13]:
# Read the "reviewText" column of the T5 (prompt at encoder) negative and
# positive prediction files into plain lists, without keeping the DataFrames.
neg_predict_test = pd.read_csv(
    "../data/T5_enc_Neg_Pred.csv", encoding='utf-8'
)["reviewText"].tolist()

pos_predict_test = pd.read_csv(
    "../data/T5_enc_Pos_Pred.csv", encoding='utf-8'
)["reviewText"].tolist()

In [14]:
# Single list of T5 (prompt at encoder) texts: positives first, then negatives.
tot_predict_test = [*pos_predict_test, *neg_predict_test]

In [15]:
# Unique 2- to 5-grams of the T5 (prompt at encoder) texts loaded above.
test_ngram__dict = make_unique_ngram_dict(tot_predict_test)

In [16]:
# results: share of the test n-grams that also occur in the training n-grams;
# overlaps: the shared n-grams themselves.
results, overlaps = calc_overlap_percentage(train_ngram_dict, test_ngram__dict)

In [17]:
# Display the overlap ratios for the T5 (prompt at encoder) texts.
results

{'2-grams_overlap_perc': 0.43236069513441777,
 '3-grams_overlap_perc': 0.21972744295720925,
 '4-grams_overlap_perc': 0.08567938749366938,
 '5-grams_overlap_perc': 0.027607683297303988}

## T5, single prompt at the decoder: training/testing n-gram overlap

In [18]:
# Read the "reviewText" column of the T5 (prompt at decoder) negative and
# positive prediction files into plain lists, without keeping the DataFrames.
neg_predict_test = pd.read_csv(
    "../data/T5_dec_Neg_Pred.csv", encoding='utf-8'
)["reviewText"].tolist()

pos_predict_test = pd.read_csv(
    "../data/T5_dec_Pos_Pred.csv", encoding='utf-8'
)["reviewText"].tolist()

In [19]:
# Single list of T5 (prompt at decoder) texts: positives first, then negatives.
tot_predict_test = [*pos_predict_test, *neg_predict_test]

In [20]:
# Unique 2- to 5-grams of the T5 (prompt at decoder) texts loaded above.
test_ngram__dict = make_unique_ngram_dict(tot_predict_test)

In [21]:
# results: share of the test n-grams that also occur in the training n-grams;
# overlaps: the shared n-grams themselves.
results, overlaps = calc_overlap_percentage(train_ngram_dict, test_ngram__dict)

In [22]:
# Display the overlap ratios for the T5 (prompt at decoder) texts.
results

{'2-grams_overlap_perc': 0.34907514606499557,
 '3-grams_overlap_perc': 0.17381974248927037,
 '4-grams_overlap_perc': 0.06631647804293686,
 '5-grams_overlap_perc': 0.020387812040577647}

## T5, prompt at the encoder and decoder: training/testing n-gram overlap

In [23]:
# Read the "reviewText" column of the T5 (prompt at encoder and decoder)
# negative and positive prediction files into plain lists, without keeping
# the DataFrames.
neg_predict_test = pd.read_csv(
    "../data/T5_enc_dec_Neg_Pred.csv", encoding='utf-8'
)["reviewText"].tolist()

pos_predict_test = pd.read_csv(
    "../data/T5_enc_dec_Pos_Pred.csv", encoding='utf-8'
)["reviewText"].tolist()

In [24]:
# Single list of T5 (prompt at encoder and decoder) texts: positives first,
# then negatives.
tot_predict_test = [*pos_predict_test, *neg_predict_test]

In [25]:
# Unique 2- to 5-grams of the T5 (prompt at encoder and decoder) texts loaded
# above.
test_ngram__dict = make_unique_ngram_dict(tot_predict_test)

In [26]:
# results: share of the test n-grams that also occur in the training n-grams;
# overlaps: the shared n-grams themselves.
results, overlaps = calc_overlap_percentage(train_ngram_dict, test_ngram__dict)

In [27]:
# Display the overlap ratios for the T5 (prompt at encoder and decoder) texts.
results

{'2-grams_overlap_perc': 0.45852478347497255,
 '3-grams_overlap_perc': 0.24712267438177096,
 '4-grams_overlap_perc': 0.10049643925312282,
 '5-grams_overlap_perc': 0.03402215662000444}

In [None]:
# Inspect the 2-grams shared with the training texts. `overlaps` holds the
# result of the most recent calc_overlap_percentage assignment (the T5
# encoder+decoder run if the cells are executed in order).
overlaps["2-grams_overlap"]

In [None]:
# Inspect the 3-grams shared with the training texts, from the most recent
# calc_overlap_percentage assignment.
overlaps["3-grams_overlap"]

In [None]:
# Inspect the 4-grams shared with the training texts, from the most recent
# calc_overlap_percentage assignment.
overlaps["4-grams_overlap"]

In [None]:
# Inspect the 5-grams shared with the training texts, from the most recent
# calc_overlap_percentage assignment.
overlaps["5-grams_overlap"]

## T5, steer prompt at the encoder and decoder: training/testing n-gram overlap

In [23]:
# Read the "reviewText" column of the T5 (steer prompt at encoder and decoder)
# negative and positive prediction files into plain lists, without keeping
# the DataFrames.
neg_predict_test = pd.read_csv(
    "../data/T5_steer_both_Neg_Pred.csv", encoding='utf-8'
)["reviewText"].tolist()

pos_predict_test = pd.read_csv(
    "../data/T5_steer_both_Pos_Pred.csv", encoding='utf-8'
)["reviewText"].tolist()

In [24]:
# Single list of T5 (steer prompt) texts: positives first, then negatives.
tot_predict_test = [*pos_predict_test, *neg_predict_test]

In [25]:
# Unique 2- to 5-grams of the T5 (steer prompt) texts loaded above.
test_ngram__dict = make_unique_ngram_dict(tot_predict_test)

In [26]:
# calc_overlap_percentage returns (ratios, shared n-gram sets). Display only
# the ratios, as the other sections do; echoing the whole tuple would dump
# every shared n-gram into the cell output.
calc_overlap_percentage(train_ngram_dict, test_ngram__dict)[0]

{'2-grams_overlap_perc': 0.2567053364269142,
 '3-grams_overlap_perc': 0.11202739943712389,
 '4-grams_overlap_perc': 0.03623420158013811,
 '5-grams_overlap_perc': 0.009896305485312595}