In [None]:
import pandas as pd

In [None]:
# Input locations used by the cells below.
# train_fn is read into `df` to compute per-user statistics and clusters;
# vocab_fn supplies the word list from which random user tags are drawn.
# test_fn and val_fn are defined here but not read by any cell in this notebook.
train_fn = "data/new_pol/ampol_train.tsv"
test_fn = "data/new_pol/ampol_test.tsv"
val_fn = "data/new_pol/ampol_val.tsv"
vocab_fn = 'data/bert_vocab.txt'

In [None]:
# Load the tab-separated training data; missing cells become empty strings so
# the string methods used in later cells never see NaN.
df = pd.read_csv(train_fn, sep='\t').fillna('')

## Compute user clusters

In [None]:
from collections import defaultdict

# Per-user tallies of each edit operation token found in `edit_string`.
userInsertions = defaultdict(int)
userDeletions = defaultdict(int)
userSkips = defaultdict(int)
userReplaces = defaultdict(int)

# Every user seen in the data, mapped to an (initially empty) list.
users = defaultdict(list)

# Route each recognised operation token to the counter that tracks it;
# any other token is ignored.
op_counters = {
    'SKIP': userSkips,
    'DELETE': userDeletions,
    'INSERT': userInsertions,
    'REPLACE': userReplaces,
}

for _, record in df.iterrows():
    # Normalise the user name: drop trailing whitespace, then trailing backslashes.
    user = record['user'].rstrip().rstrip('\\')
    users[user] = []

    for op in record['edit_string'].split(' '):
        counter = op_counters.get(op)
        # Compare against None explicitly: an empty defaultdict is falsy.
        if counter is not None:
            counter[user] += 1

In [None]:
# Build one feature tuple per user:
#   (deleteRate, insertRate, replaceRate, skipRate)
# The first three are shares of the user's non-SKIP edits; skipRate is the share
# of SKIP among all of the user's counted edits.
userTuples = {}
for user in users:
    deletions = userDeletions[user]
    insertions = userInsertions[user]
    replaces = userReplaces[user]
    skips = userSkips[user]

    totalNonSkip = insertions + deletions + replaces
    totalEdits = totalNonSkip + skips

    # What percent of interesting edits are delete/insert/replace.
    # A user with only SKIP edits (or an empty edit string) has no non-skip
    # edits; use 0.0 for all three shares instead of dividing by zero.
    if totalNonSkip > 0:
        deleteRate = float(deletions) / float(totalNonSkip)
        insertRate = float(insertions) / float(totalNonSkip)
        replaceRate = float(replaces) / float(totalNonSkip)
    else:
        deleteRate = insertRate = replaceRate = 0.0

    # What is the total rate of skips (0.0 when the user has no counted edits).
    skipRate = float(skips) / float(totalEdits) if totalEdits > 0 else 0.0

    userTuples[user] = (deleteRate, insertRate, replaceRate, skipRate)

In [None]:
import numpy as np
# Freeze an ordering of the users so that row i of X belongs to userOrder[i].
userOrder = [user for user in userTuples]
# One row per user: (deleteRate, insertRate, replaceRate, skipRate).
# `np.float` was only an alias of the builtin `float` and has been removed from
# NumPy (1.24+); passing `float` gives the same float64 array on every version.
X = np.array([userTuples[user] for user in userOrder], dtype=float)

In [None]:
from sklearn.cluster import Birch

# Cluster the users on the rows of X (one 4-value rate vector per user).
# n_clusters=16 lines up with the 16 entries of `num2words` and the
# `range(16)` report loop elsewhere in this notebook.
# NOTE(review): threshold=0.01 is presumably hand-tuned — confirm.
model = Birch(threshold=0.01, n_clusters=16)
# fit the model
model.fit(X)
# assign a cluster to each example
yhat = model.predict(X)

In [None]:
# One row per user, paired with the cluster id assigned to that user.
wiki_ucl = pd.DataFrame(list(zip(userOrder,yhat)),columns=['user','cluster'])
# Print the number of users in each of the 16 clusters.
# len() counts rows; DataFrame.size counts cells (rows x 2 columns here) and
# would therefore report twice the real cluster size.
for i in range(16):
    print(len(wiki_ucl[wiki_ucl['cluster'] == i]))

In [None]:
# Lookup table: user name -> cluster id, built from the two columns of wiki_ucl.
userClusters = dict(zip(wiki_ucl['user'].tolist(), wiki_ucl['cluster'].tolist()))
userClusters

In [None]:
# Group the feature rows of X by the cluster label assigned to each row.
xClusters = defaultdict(list)
for label, features in zip(yhat, X):
    xClusters[label].append(features)

In [None]:
# Per-cluster mean feature vector, stored as a 1x4 array.
# (Despite the name, each entry is the sum divided by the member count.)
cSums = {}
for label, members in xClusters.items():
    running_total = np.zeros([1,4])
    for vector in members:
        running_total += vector
    cSums[label] = running_total / len(members)

In [None]:
cSums

## Compute user tags

In [None]:
# Read the vocabulary file, one entry per line (newlines are kept here and
# stripped by the filtering cell). The encoding is given explicitly so that
# non-ASCII entries decode the same way regardless of the platform's default
# locale encoding.
with open(vocab_fn, 'r', encoding='utf-8') as vfiler:
    vocaball = vfiler.readlines()

In [None]:
# Keep only vocabulary entries that are purely alphabetic and ASCII-only,
# preserving their order in the file.
stripped_entries = (entry.strip() for entry in vocaball)
vocab = [word for word in stripped_entries if word.isalpha() and word.isascii()]

In [None]:
vocab

In [None]:
# Reset every user's list to empty, using the same name normalisation as the
# counting cell (trailing whitespace, then trailing backslashes, removed).
for raw_name in df['user']:
    users[raw_name.rstrip().rstrip('\\')] = []

In [None]:
len(users)

In [None]:
import random

# Seed the module-level RNG so that this shuffle and the random.choice calls
# that assign user tags give the same result on every Restart & Run All;
# without a seed each kernel run hands every user different tags.
random.seed(0)
random.shuffle(vocab)

In [None]:
# Append three entries to each user's list: two words drawn at random from the
# vocabulary, followed by the literal marker 'user'.
for user in users:
    users[user].extend([random.choice(vocab), random.choice(vocab), 'user'])

## Code to add line_id index to tsv files if they are absent

In [None]:
# Source TSV without a line_id column, and the destination for the copy that
# has one. The following cells also shuffle the rows and replace underscores in
# article titles with spaces.
tsv_file_in_no_lids = 'data/new_pol/ampol_train.tsv'
tsv_out_w_lids = "data/new_pol/ampol_train_wlids.tsv"

# Missing cells become empty strings.
df = pd.read_csv(tsv_file_in_no_lids, sep='\t').fillna('')

In [None]:
# Shuffle the rows: frac=1 draws every row, in random order. No random_state is
# passed, so the resulting order differs from run to run.
df = df.sample(frac=1)

In [None]:
df

In [None]:
# Number the rows 0..n-1 in their current order, unless the file already
# came with a line_id column.
if 'line_id' not in df.columns:
    df['line_id'] = range(len(df))

In [None]:
# Replace every underscore in the article titles with a space.
df['article_title'] = [title.replace('_', ' ') for title in df['article_title']]

In [None]:
# Make line_id the index so it is written as the first column of the TSV.
# Note: after this cell `line_id` is no longer a regular column, so running the
# cell a second time without reloading `df` raises a KeyError.
df = df.set_index('line_id')
df.to_csv(tsv_out_w_lids, sep='\t')

In [None]:
df

## Code to add annotator model outputs to tsv file

In [None]:
# Paths for attaching the annotator model's predictions to the data table.
t5_generated = "data/new_pol/ampol_annotator_output_train.txt" # annotator edit string outputs generated by t5, one per line
# NOTE(review): the JSONL cell in this notebook currently writes its ids to
# 'data/new_pol/ampol_train_generator_LIDS.txt' — confirm this path matches the
# run that produced the t5 output above.
line_id_file = 'data/new_pol/ampol_train_LIDS.txt'  # output by jsonl code below
# NOTE(review): a later cell indexes this table by 'line_id'; confirm the file
# contains that column (the line-id cell writes 'ampol_train_wlids.tsv').
all_data_file = "data/new_pol/ampol_train.tsv"
outfile = 'data/new_pol/ampol_annotator_temp_train.txt'   # intermediate two-column TSV (prediction, line_id)
final_outfile = 'data/new_pol/ampol_generator_input_train.tsv'   # joined table written at the end of this section

In [None]:
# Pair the i-th annotator prediction with the i-th line id and write the pairs
# to `outfile` as a two-column TSV (edit_string_predicted, line_id).

with open(t5_generated, 'r', encoding='utf-8') as t5f:
    predictions = [line.strip() for line in t5f]

with open(line_id_file, 'r', encoding='utf-8') as lidf:
    line_ids = [line.strip() for line in lidf]

# The two files must describe the same rows. Check this before pairing so that
# a mismatch (including an empty file) is reported clearly instead of surfacing
# as an IndexError/NameError, and so the check still runs under `python -O`.
if len(predictions) != len(line_ids):
    raise ValueError(
        "line count mismatch: %d predictions in %s vs %d line ids in %s"
        % (len(predictions), t5_generated, len(line_ids), line_id_file)
    )

newlines = [pred + '\t' + lid + '\n' for pred, lid in zip(predictions, line_ids)]

with open(outfile, 'w', encoding='utf-8') as outf:
    outf.write("edit_string_predicted\tline_id\n")
    outf.writelines(newlines)

In [None]:
# Load the (edit_string_predicted, line_id) pairs from `outfile` and the full
# data table from `all_data_file`; both are tab-separated.
df_edit_preds = pd.read_csv(outfile, sep='\t')
df_all_data = pd.read_csv(all_data_file, sep='\t')

In [None]:
df_all_data

In [None]:
# Index both tables by line_id so that the join in the next cell aligns rows on it.
# Both frames must contain a 'line_id' column, otherwise set_index raises KeyError.
df_edit_preds = df_edit_preds.set_index('line_id')
df_all_data = df_all_data.set_index('line_id')

In [None]:
# Attach the data columns to each prediction row, matching on the line_id index.
# Every prediction row is kept (left join).
df_all = df_edit_preds.join(df_all_data, how='left')

In [None]:
df_all

In [None]:
# Write the joined table (line_id index first) as a tab-separated file.
df_all.to_csv(final_outfile, sep='\t')

## Produce jsonlines file

In [None]:
# Json files for t5

import pandas as pd

# Input table (must contain a line_id column) and the two files the export
# loop writes: the JSON-lines examples and, in the same order, their line ids.
tsv_file_in = 'data/new_pol/ampol_train_wlids.tsv'
jsonl_file_out = 'data/new_pol/ampol_train_generator.json'
lid_file_out = 'data/new_pol/ampol_train_generator_LIDS.txt' # remember line ids in separate file

# EDIT THESE BOOLEANS to append tags, etc
# ---------------------------------------

# ANNOTATE MODEL INPUT
# When set, the export loop replaces the target text with the row's edit_string.
edit_outputs = False           # expects data field edit_string, computed by Levenshtein notebook
# Same, but SKIP tokens are dropped; the target becomes "SKIP" if nothing remains.
edit_outputs_no_skip = False   # ignore skips

# GENERATOR MODEL INPUT
# NOTE: append_edit_string is not read by the export loop in this notebook.
append_edit_string = False     # expects data field edit_string, computed by Levenshtein notebook (ground truth)
# Appends ', metadata: ' plus the non-SKIP tokens of edit_string to the input text.
append_edit_summary = False    # ignore skips
# Appends ', metadata: ' plus edit_string_predicted to the input text.
append_pred_edit_string = True # expects data field edit_string_predicted, output from annotator model

# PERSONALIZATION
# Prefix the input text with the user's tags from `users`.
add_user_tags = True
# Append ', metadata: user cluster <word>' using `userClusters` and `num2words`.
add_user_cluster = True

# ---------------------------------------

# Load the table, blank out missing cells, and index it by line_id so the
# export loop can use the index value as the line id.
df = pd.read_csv(tsv_file_in, sep='\t')
df = df.fillna('')
df = df.set_index('line_id')

In [None]:
# Spelled-out names for the integers 0 through 15, keyed by the integer.
# Used to render a user's cluster id as a word in the model input.
num2words = dict(enumerate([
    'Zero', 'One', 'Two', 'Three', 'Four', 'Five', 'Six', 'Seven',
    'Eight', 'Nine', 'Ten', 'Eleven', 'Twelve', 'Thirteen', 'Fourteen',
    'Fifteen',
]))

In [None]:
import json

# Counters for rows that are left out of the export.
empty_refs_count = 0   # target text is blank
same_count = 0         # input and target are identical after stripping
user_not_count = 0     # tagged input does not contain the 'user' marker token
unknownCount = 0       # user has no entry in userClusters

# Output in JSONL format: one example per line in jsonl_file_out, and the
# matching line id on the same line number of lid_file_out.
# The handles are deliberately not called `outfile`, so the path variable of
# that name used by the annotator-merge cells is not overwritten.
with open(jsonl_file_out, 'w', encoding='utf-8') as jsonl_out, open(lid_file_out, 'w', encoding='utf-8') as lid_out:
    for line_id, row in df.iterrows():
        # Normalise the user name exactly as the cells that build `users` and
        # `userClusters` do; otherwise names with trailing backslashes (or
        # leading whitespace) would not match their keys.
        user = row['user'].rstrip().rstrip('\\')
        in_text = row['parent text']
        out_text = row['revision text']

        if append_pred_edit_string:
            in_text = in_text + ', metadata: ' + row['edit_string_predicted']
        if append_edit_summary:
            # Each kept token is preceded by a single space.
            kept = [item for item in row['edit_string'].split(' ') if item != 'SKIP']
            in_text = in_text + ', metadata: ' + ''.join(' ' + item for item in kept)

        # Skip rows with nothing to learn from.
        if in_text.strip() == '':
            continue

        if out_text.strip() == '':
            empty_refs_count += 1
            continue

        if in_text.strip() == out_text.strip():
            same_count += 1
            continue

        if add_user_tags:
            # .get() avoids inserting unseen users into the `users` defaultdict.
            user_tags_str = ' '.join(users.get(user, []))
            in_text = user_tags_str + ' ' + in_text

        if add_user_cluster:
            # A user without a cluster cannot be annotated; count the row and
            # skip it instead of aborting the export with a KeyError.
            if user not in userClusters:
                unknownCount += 1
                continue
            cluster = userClusters[user]
            in_text = in_text + ', metadata: user cluster ' + num2words[cluster]

        # instead of en_out as target string, en_out is edit string
        if edit_outputs_no_skip:
            out_text = ' '.join(item for item in row['edit_string'].split(' ') if item != 'SKIP')
            if out_text.strip() == '':
                out_text = "SKIP"

        if edit_outputs:
            # Splitting on ' ' and re-joining with ' ' reproduces the string.
            out_text = row['edit_string']

        # Rows whose tagged input lacks the 'user' marker token are dropped.
        if add_user_tags and 'user' not in in_text.split(' '):
            user_not_count += 1
            continue

        # format should be e.g.:
        # "translation": { "en_in": "this is prev text", "en_out": "this is post text"} }
        # json.dumps quotes and escapes each value (", \, newlines, control
        # characters) so every line is valid JSON; ensure_ascii=False keeps
        # non-ASCII characters as-is rather than \u-escaping them.
        line = (
            '{"translation": { "en_in": ' + json.dumps(in_text, ensure_ascii=False)
            + ', "en_out": ' + json.dumps(out_text, ensure_ascii=False) + '} }\n'
        )

        jsonl_out.write(line)
        lid_out.write(str(line_id) + '\n')