In [1]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import numpy as np
import pandas as pd
from tqdm.auto import tqdm



In [2]:
# Prefer GPU index 4, but only when the host really exposes that many devices;
# otherwise fall back to the first GPU, and to the CPU when CUDA is unavailable.
if torch.cuda.is_available():
    DEVICE = 'cuda:4' if torch.cuda.device_count() > 4 else 'cuda:0'
else:
    DEVICE = 'cpu'

In [3]:
def choose_from_top_k_top_n(probs, k=50, p=0.8):
    """Sample one index from ``probs`` using combined top-k / nucleus filtering.

    The ``k`` most probable entries are ranked from most to least likely, the
    shortest prefix whose cumulative probability reaches ``p`` is kept, the
    kept probabilities are renormalised, and one index is drawn from them with
    ``np.random.choice``.

    Args:
        probs: 1-D array of probabilities, indexed by candidate id.
        k: Maximum number of candidates to consider. Clamped to
            ``[1, len(probs)]`` so a small distribution no longer makes
            ``np.argpartition`` raise.
        p: Cumulative-probability threshold at which the ranked candidate
            list is cut off.

    Returns:
        The sampled index into ``probs`` as a built-in ``int``.

    Raises:
        ValueError: If ``probs`` is empty.
    """
    probs = np.asarray(probs)
    if probs.size == 0:
        raise ValueError('probs must contain at least one probability')

    top_k = max(1, min(int(k), probs.shape[0]))
    # argpartition finds the top-k in linear time but leaves them unordered.
    candidates = np.argpartition(probs, -top_k)[-top_k:]
    candidates = candidates[np.argsort(-probs[candidates], kind='stable')]
    # float64 keeps the renormalised weights within np.random.choice's tolerance.
    ranked = probs[candidates].astype(np.float64)

    # First position where the running total reaches p; that entry is kept too.
    cutoff = int(np.searchsorted(np.cumsum(ranked), p, side='left')) + 1
    candidates, ranked = candidates[:cutoff], ranked[:cutoff]

    # Drawing a scalar (no `size`) avoids calling int() on a 1-element array.
    return int(np.random.choice(candidates, p=ranked / ranked.sum()))

In [4]:
def load_models(model_name):
    """Build the GPT-2 medium tokenizer/model and load fine-tuned weights.

    Args:
        model_name: Path to a state-dict checkpoint that is passed to
            ``torch.load`` and then to ``model.load_state_dict``.

    Returns:
        A ``(tokenizer, model)`` tuple. The model is left on the CPU; callers
        move it to their device of choice.
    """
    print ('Loading Trained GPT-2 Model')
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
    model = GPT2LMHeadModel.from_pretrained('gpt2-medium')
    # Map tensors to the CPU while unpickling so a checkpoint saved from a
    # specific GPU can still be loaded on hosts that lack that device.
    # NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
    state_dict = torch.load(model_name, map_location='cpu')
    model.load_state_dict(state_dict)
    # This notebook only samples from the model, so keep dropout disabled.
    model.eval()
    return tokenizer, model

In [5]:
def generate(tokenizer, model, sentences, label, max_len=128):
    """Sample and print ``sentences`` continuations of the prompt ``label``.

    Each continuation is decoded one token at a time with
    ``choose_from_top_k_top_n`` and stops after ``max_len`` new tokens or as
    soon as an ``<|endoftext|>`` token is drawn. The decoded text (prompt
    included) is printed; nothing is returned.

    Args:
        tokenizer: Tokenizer providing ``encode`` and ``decode``.
        model: Language model; indexing its output with ``[0]`` must yield the
            logits when it is called without labels.
        sentences: Number of samples to generate.
        label: Prompt text every sample starts from.
        max_len: Maximum number of tokens to append per sample.
    """
    # Run on whatever device the model lives on. The original mixed a
    # hard-coded 'cpu' with an undefined `device` name and raised NameError.
    device = next(model.parameters()).device
    eos_ids = tokenizer.encode('<|endoftext|>')
    prompt_ids = tokenizer.encode(label)

    with torch.no_grad():
        for _ in range(sentences):
            cur_ids = torch.tensor(prompt_ids, device=device).unsqueeze(0)
            for _step in range(max_len):
                # Only the logits are needed, so no labels (and no loss) here.
                logits = model(cur_ids)[0]
                softmax_logits = torch.softmax(logits[0, -1], dim=0)

                next_token_id = choose_from_top_k_top_n(softmax_logits.cpu().numpy())
                next_ids = torch.tensor([[next_token_id]], dtype=torch.long, device=device)
                cur_ids = torch.cat([cur_ids, next_ids], dim=1)

                if next_token_id in eos_ids:
                    break

            # The text is printed whether or not the end-of-text token was reached.
            print (tokenizer.decode(cur_ids.squeeze(0).tolist()))

In [6]:
# Read the three prepared, tab-separated paraphrase splits in one pass.
trainbt, devbt, testbt = (
    pd.read_csv(split_path, sep='\t')
    for split_path in (
        '../data/paraphrase/train_prep_paraphrase.txt',
        '../data/paraphrase/valid_prep_paraphrase.txt',
        '../data/paraphrase/test_prep_paraphrase.txt',
    )
)

In [7]:
# Preview the first rows of the training split to check the loaded columns.
trainbt.head()

Unnamed: 0,source,translate,backtranslate,bt_preds,similarity
0,take your admin powers and stick them up your ...,Бери свои административные способности и засун...,Take your administrative skills and shove them...,1,0.654044
1,you just made a fool of yourself .,Ты только что выставил себя дураком.,You just made a fool of yourself.,1,0.999817
2,this animal killer is paying his share .,Этот зверь-убийца платит свою долю.,This killer animal is paying its share.,1,0.9032
3,so you voted for the loser ?,"Значит, вы проголосовали за неудачника?",So you voted for the loser?,1,0.999822
4,", but we would not expect a bias lib troll lik...",", но мы бы не ожидали, что такой пристрастный ...",but we wouldn't expect a biased troll like you...,1,0.425269


In [147]:
# load paraphraser model
# Load the tokenizer and the fine-tuned paraphraser weights, then move the
# model to the device selected in DEVICE.
tokenizer, model = load_models('../model/full-finetune-paraphraser.pth')
model = model.to(DEVICE)

Loading Trained GPT-2 Model


In [46]:
# Sample one continuation for every training row whose bt_preds flag equals 1.
train_outs = []
trainbt_fil = trainbt[trainbt['bt_preds'] == 1]
# Encode the end-of-text marker once instead of on every decoding step.
eos_ids = tokenizer.encode('<|endoftext|>')
# Iterating through tqdm keeps the bar in step with the loop automatically.
pb = tqdm(trainbt_fil['source'], total=len(trainbt_fil))

with torch.no_grad():
    for source in pb:
        cur_ids = torch.tensor(tokenizer.encode(f'paraphrase: {source} >>> ')).unsqueeze(0).to(DEVICE)
        for _ in range(128):
            # Only logits are needed for sampling; omitting `labels` skips the
            # loss computation whose result was never used.
            logits = model(cur_ids)[0]
            softmax_logits = torch.softmax(logits[0, -1], dim=0)

            next_token_id = choose_from_top_k_top_n(softmax_logits.cpu().numpy())
            next_ids = torch.tensor([[next_token_id]], dtype=torch.long, device=DEVICE)
            cur_ids = torch.cat([cur_ids, next_ids], dim=1)

            if next_token_id in eos_ids:
                break

        # squeeze(0) drops only the batch axis, so the result is always a list.
        train_outs.append(tokenizer.decode(cur_ids.squeeze(0).tolist()))

  0%|          | 0/47045 [00:00<?, ?it/s]

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceed

In [165]:
# Scratch shape check: stack the logits of the last two positions. This reads
# `logits` left in the kernel namespace by a previously executed generation cell.
out_logits = logits[0, -1][None]
torch.cat((out_logits, logits[0, -2][None]), dim=0).shape

torch.Size([2, 50257])

In [47]:
# Show a small sample only; echoing every generated string floods the cell output.
train_outs[:10]

['take your admin powers and stick them up your ass you fool. >>> _______________________________________________________________ Sent from my cell >>> _______________________________________________________________ <|endoftext|>',
 "you just made a fool of yourself. >>> 」\n\n「 Ah, yeah, just kidding. 」\n\n「 I don't know why you're so angry. 」\n\n <|endoftext|>",
 'this animal killer is paying his share. >>> <|endoftext|>',
 'so you voted for the loser? >>> 국귀야원하에 국귀야원하에 국귀야원하에 국귀야원하에 국귀야원하에 국귀야원하에 국귀야원하에 �',
 ', but we would not expect a bias lib troll like you shore to understand what actually is corrupt >>> 개국해 정어했스테 과한국의 아택요 과한국 그력한 그능한 그력한 국업하는 걸업하는 걸업하는 걸업하는 ',
 'gee kathleen, sounds like you described every church sex abusers, meth heads, burglars, thieves fornicators. >>>  <|endoftext|>',
 "trump is a disgusting human being. >>> _____ He's a horrible human being. <|endoftext|>",
 'registering right there with screwloose palin on the imbecile meter. >>> _________________________

In [148]:
# Sample one continuation for every validation row whose bt_preds flag equals 1.
dev_outs = []
devbt_fil = devbt[devbt['bt_preds'] == 1]
# Encode the end-of-text marker once instead of on every decoding step.
eos_ids = tokenizer.encode('<|endoftext|>')
# Iterating through tqdm keeps the bar in step with the loop automatically.
pb = tqdm(devbt_fil['source'], total=len(devbt_fil))

with torch.no_grad():
    for source in pb:
        cur_ids = torch.tensor(tokenizer.encode(f'paraphrase: {source} >>> ')).unsqueeze(0).to(DEVICE)
        for _ in range(128):
            # Only logits are needed for sampling; omitting `labels` skips the
            # loss computation whose result was never used.
            logits = model(cur_ids)[0]
            softmax_logits = torch.softmax(logits[0, -1], dim=0)

            next_token_id = choose_from_top_k_top_n(softmax_logits.cpu().numpy())
            next_ids = torch.tensor([[next_token_id]], dtype=torch.long, device=DEVICE)
            cur_ids = torch.cat([cur_ids, next_ids], dim=1)

            if next_token_id in eos_ids:
                break

        # squeeze(0) drops only the batch axis, so the result is always a list.
        dev_outs.append(tokenizer.decode(cur_ids.squeeze(0).tolist()))

  0%|          | 0/2294 [00:00<?, ?it/s]

In [149]:
# Show a small sample only; echoing every generated string floods the cell output.
dev_outs[:10]

['paraphrase: i think youre just pathetic whiner. >>> ive just got a little bit of an asshole. <|endoftext|>',
 'paraphrase: now that you know what i meant you must feel foolish asking how low can i sink. >>> _______________________________________________________________________ <|endoftext|>',
 "paraphrase: ah, there is nothing like a swift kick in the groin from the representativesk pfeffer. >>> 一, there's nothing like a violent kick in the groin from the representatives of the pimper. <|endoftext|>",
 'paraphrase: gays should stop deluding themselves into thinking that the average canadian really cares about gay rights. >>> _________ should stop wasting their lives fooling themselves that the average Canadian really cares about gays. <|endoftext|>',
 'paraphrase: facts are not kind to white bigots and the lies they tell themselves about themselves. >>> ices are not kind to the white-hatted racists and lies they tell themselves. <|endoftext|>',
 'paraphrase: fire those football play

In [52]:
# Sample one continuation for every test row whose bt_preds flag equals 1.
test_outs = []
testbt_fil = testbt[testbt['bt_preds'] == 1]
# Encode the end-of-text marker once instead of on every decoding step.
eos_ids = tokenizer.encode('<|endoftext|>')
# Iterating through tqdm keeps the bar in step with the loop automatically.
pb = tqdm(testbt_fil['source'], total=len(testbt_fil))

with torch.no_grad():
    for source in pb:
        cur_ids = torch.tensor(tokenizer.encode(f'paraphrase: {source} >>> ')).unsqueeze(0).to(DEVICE)
        for _ in range(128):
            # Only logits are needed for sampling; omitting `labels` skips the
            # loss computation whose result was never used.
            logits = model(cur_ids)[0]
            softmax_logits = torch.softmax(logits[0, -1], dim=0)

            next_token_id = choose_from_top_k_top_n(softmax_logits.cpu().numpy())
            next_ids = torch.tensor([[next_token_id]], dtype=torch.long, device=DEVICE)
            cur_ids = torch.cat([cur_ids, next_ids], dim=1)

            if next_token_id in eos_ids:
                break

        # squeeze(0) drops only the batch axis, so the result is always a list.
        test_outs.append(tokenizer.decode(cur_ids.squeeze(0).tolist()))

  0%|          | 0/3576 [00:00<?, ?it/s]

In [53]:
# Show a small sample only; echoing every generated string floods the cell output.
test_outs[:10]

['paraphrase: gleasuck my fuckin gawdam dick you faggotsming >>> <|endoftext|>',
 'paraphrase: how braindead do you have to be to not understand that sentence. >>> <|endoftext|>',
 'paraphrase: still the elitist, we know better than you dumb alaskans, arrogant, liberal, nytwannabe rag. >>> _______, we know better than you, arrogant, liberal, naysayabe. <|endoftext|>',
 'paraphrase: : shit, now variety is saying >>> 그못하나 현신것이 있습니다. <|endoftext|>',
 'paraphrase: gay sex traces my lips with his finger. >>> <|endoftext|>',
 'paraphrase: why are you all so gay. >>> 고 고 고 고 고 고 고 고 고 고 고 고 고 고 고 고 고 고 고 고 고 고 고 고 고 고 고 고 고 고 고 고 ',
 "paraphrase:, fuck that asshole cunt, this school is a bag of wank, fuck every damn teacher >>> _____, fuck that asshole, that's a bag of wank, it's getting a hell of a lot harder every day. <|endoftext|>",
 'paraphrase: the garbage middle east religions ( judaism, christianity, islam ) are the ones that condone slavery, condemn sex, and have violent gods. >>> <|

In [56]:
# Wrap each list of generated strings in a single-column frame. The original
# cell referenced an undefined name (`train_outslumn`) and passed a misspelled
# keyword (`cos=`), so it could not run.
train_gen_pair = pd.DataFrame(train_outs, columns=['generated'])
dev_gen_pair = pd.DataFrame(dev_outs, columns=['generated'])
test_gen_pair = pd.DataFrame(test_outs, columns=['generated'])

In [58]:
# Persist each split's frame as a tab-separated file with a header row.
for frame, out_path in (
    (train_gen_pair, '../data/paraphrase/train_gen_pair.txt'),
    (dev_gen_pair, '../data/paraphrase/valid_gen_pair.txt'),
    (test_gen_pair, '../data/paraphrase/test_gen_pair.txt'),
):
    frame.to_csv(out_path, sep='\t', index=False, header=True)

In [79]:
# Show the first ten rows as a raw array so the full strings are visible
# instead of pandas' abbreviated cell previews.
train_gen_pair.head(10).values

array([['take your admin powers and stick them up your ass you fool. >>> _______________________________________________________________ Sent from my cell >>> _______________________________________________________________ <|endoftext|>'],
       ["you just made a fool of yourself. >>> 」\n\n「 Ah, yeah, just kidding. 」\n\n「 I don't know why you're so angry. 」\n\n <|endoftext|>"],
       ['this animal killer is paying his share. >>> <|endoftext|>'],
       ['so you voted for the loser? >>> 국귀야원하에 국귀야원하에 국귀야원하에 국귀야원하에 국귀야원하에 국귀야원하에 국귀야원하에 �'],
       [', but we would not expect a bias lib troll like you shore to understand what actually is corrupt >>> 개국해 정어했스테 과한국의 아택요 과한국 그력한 그능한 그력한 국업하는 걸업하는 걸업하는 걸업하는 '],
       ['gee kathleen, sounds like you described every church sex abusers, meth heads, burglars, thieves fornicators. >>>  <|endoftext|>'],
       ["trump is a disgusting human being. >>> _____ He's a horrible human being. <|endoftext|>"],
       ['registering right there with screwl

In [63]:
# Scratch check: splitting on ' >>> ' separates the source sentence from the
# generated continuation.
s = 'so you voted for the loser? >>> 국귀야원하에 국귀야원하에 국귀야원하에 국귀야원하에 국귀야원하에 국귀야원하에 국귀야원하에 �'
s.split(' >>> ')

['so you voted for the loser?',
 '국귀야원하에 국귀야원하에 국귀야원하에 국귀야원하에 국귀야원하에 국귀야원하에 국귀야원하에 �']

In [115]:
import re

# clean
def rm_ip_address(text):
    """Delete every dotted quad of 1-3 digit groups (IPv4-looking) from ``text``."""
    dotted_quad = re.compile(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}')
    return dotted_quad.sub('', text)

def rm_link(text):
    """Delete http(s) URLs and ``www.``-prefixed tokens from ``text``."""
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub('', text)

def rm_emoji(text):
    """Delete emoji and pictographic symbols from ``text``.

    The previous character class contained the range U+24C2-U+1F251, which
    spans most of the Basic Multilingual Plane and therefore also deleted CJK,
    Hangul, kana and other ordinary letters. It is replaced by the narrow
    blocks it overlapped: the circled M, miscellaneous symbols, and the
    enclosed alphanumeric / ideographic supplements.

    Args:
        text: Input string.

    Returns:
        ``text`` with every run of matched characters removed.
    """
    emojis = re.compile(
        '['
        '\U0001F600-\U0001F64F'  # emoticons
        '\U0001F300-\U0001F5FF'  # symbols & pictographs
        '\U0001F680-\U0001F6FF'  # transport & map symbols
        '\U0001F1E0-\U0001F1FF'  # flags (iOS)
        '\U00002702-\U000027B0'  # dingbats
        '\U00002600-\U000026FF'  # miscellaneous symbols
        '\U000024C2'             # circled latin capital letter M
        '\U0001F170-\U0001F251'  # enclosed alphanumeric / ideographic supplement
        ']+',
        flags=re.UNICODE
    )
    return emojis.sub(r'', text)

def rm_nonascii(text):
    """Delete every character outside the 7-bit ASCII range from ``text``."""
    non_ascii = re.compile(r'[^\x00-\x7f]')
    return non_ascii.sub(r'', text)

def rm_inappropriate_sym(text):
    """Replace each ``:``, ``%``, ``=``, ``~``, ``_`` or newline with one space."""
    unwanted = re.compile(r'[\:\%\=\~\_\n]')
    return unwanted.sub(' ', text)

def rm_money(text):
    """Delete dollar amounts such as ``$10k``, ``$1,000.50`` or ``$ 5``.

    A match is a ``$``, an optional whitespace character, and then one of:
    digits followed by a single letter, digit groups joined by ``,``/``.``
    that are followed by whitespace, or a plain run of digits.

    The letter suffix is now ``[A-Za-z]``. The former ``[A-z]`` also covered
    the punctuation between ``Z`` and ``a`` (``[``, ``\\``, ``]``, ``^``, ``_``
    and the backtick), so it swallowed such a character after the amount.

    Args:
        text: Input string.

    Returns:
        ``text`` with every matched amount removed.
    """
    return re.sub(r'\$\s?((?:\d+[A-Za-z])|((?:\d+[\,\.])+\d+(?=\s))|((?:\d+)))', r'', text)

def space_between_sym(text):
    """Surround every listed ASCII punctuation character with single spaces."""
    punctuation = re.compile(r'([\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\\\]\^\_\`\{\|\}\~])')
    return punctuation.sub(r' \1 ', text)

def rm_additional_space(text):
    """Collapse every run of space characters in ``text`` into a single space."""
    space_run = re.compile(r' +')
    return space_run.sub(' ', text)
    
def rm_email(text):
    """Delete every whitespace-delimited token that contains an ``@`` followed by text."""
    at_token = re.compile(r'(?:(\S+)?\@\S+)')
    return at_token.sub(r'', text)

def rm_middle_dot(text):
    """Delete each ``.`` that sits directly between two word characters."""
    inner_dot = re.compile(r'(?<=\w)\.(?=\w+)')
    return inner_dot.sub('', text)

def rm_middle_spaces(text):
    """Delete each whitespace character that sits directly between two word characters."""
    inner_space = re.compile(r'(?<=\w)\s(?=\w+)')
    return inner_space.sub('', text)

def clean_pipeline(text):
    """Run ``text`` through the cleaning helpers, in order, and return the result.

    Order: symbol replacement, IP removal, link removal, emoji removal,
    non-ASCII removal, e-mail removal, inner-dot removal, space collapsing.
    """
    # space_between_sym is not part of the chain; its call was commented out
    # in the original pipeline between rm_middle_dot and rm_additional_space.
    steps = (
        rm_inappropriate_sym,
        rm_ip_address,
        rm_link,
        rm_emoji,
        rm_nonascii,
        rm_email,
        rm_middle_dot,
        rm_additional_space,
    )
    cleaned = text
    for step in steps:
        cleaned = step(cleaned)
    return cleaned

In [133]:
def clean_generated(sentence):
    """Split one generated string into its source text and cleaned generation.

    The expected layout is ``[paraphrase:]<source>>>><generation>[<|endoftext|>...]``.

    Args:
        sentence: Raw decoded model output.

    Returns:
        ``pd.Series`` indexed ``['ori', 'gen']``. ``ori`` is the text before
        the first ``>>>`` (not stripped); ``gen`` is the segment between the
        first and second ``>>>`` after ``clean_pipeline``, or ``''`` when the
        string contains no ``>>>``.
    """
    bos = 'paraphrase:'
    eos = '<|endoftext|>'
    # Strip the 'paraphrase:' prefix only when it really leads the string. The
    # previous `bos in sentence` test cut the first len(bos) characters even
    # when the marker appeared later in the text.
    if sentence.startswith(bos):
        sentence = sentence[len(bos):]
    # Drop the end-of-text marker and everything after it.
    sentence = sentence.split(eos, 1)[0]
    # Split on the '>>>' separator: source first, generation second.
    parts = sentence.split('>>>')
    ori = parts[0]
    # Explicit length check instead of a bare `except:` that hid every error.
    gen = clean_pipeline(parts[1]) if len(parts) > 1 else ''

    return pd.Series([ori, gen], index=['ori', 'gen'])

In [134]:
# Register tqdm's `progress_apply` on pandas objects, then split every raw
# generation into its 'ori' / 'gen' parts, one split at a time.
tqdm.pandas()
train_gen_pair_ = train_gen_pair['generated'].progress_apply(clean_generated)
dev_gen_pair_ = dev_gen_pair['generated'].progress_apply(clean_generated)
test_gen_pair_ = test_gen_pair['generated'].progress_apply(clean_generated)

  0%|          | 0/47045 [00:00<?, ?it/s]

  0%|          | 0/2294 [00:00<?, ?it/s]

  0%|          | 0/3576 [00:00<?, ?it/s]

In [140]:
# Attach the parsed 'ori' / 'gen' columns to the raw generations. Selecting
# only 'generated' on the left keeps the cell idempotent: re-running it no
# longer appends a second copy of 'ori' and 'gen' to the overwritten frame.
train_gen_pair = pd.concat([train_gen_pair[['generated']], train_gen_pair_], axis=1)
dev_gen_pair = pd.concat([dev_gen_pair[['generated']], dev_gen_pair_], axis=1)
test_gen_pair = pd.concat([test_gen_pair[['generated']], test_gen_pair_], axis=1)

In [141]:
# Inspect the training frame with the raw, source and cleaned-generation columns.
train_gen_pair

Unnamed: 0,generated,ori,gen
0,take your admin powers and stick them up your ...,take your admin powers and stick them up your ...,Sent from my cell
1,you just made a fool of yourself. >>> 」\n\n「 A...,you just made a fool of yourself.,"Ah, yeah, just kidding. I don't know why you'..."
2,this animal killer is paying his share. >>> <|...,this animal killer is paying his share.,
3,so you voted for the loser? >>> 국귀야원하에 국귀야원하에 ...,so you voted for the loser?,
4,", but we would not expect a bias lib troll lik...",", but we would not expect a bias lib troll lik...",
...,...,...,...
47040,hillarys nasty lady shits are the proof! >>> <...,hillarys nasty lady shits are the proof!,
47041,"are you gonna eat your hat, or dance in the ra...","are you gonna eat your hat, or dance in the ra...",",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"
47042,those white supremacists dont think the civil ...,those white supremacists dont think the civil ...,They think the civil war is over.
47043,so that means you do think the us military wer...,so that means you do think the us military wer...,I can't stop talking It's impossible for me t...


In [144]:
# Inspect the validation frame with the raw, source and cleaned-generation columns.
dev_gen_pair

Unnamed: 0,generated,ori,gen
0,paraphrase: i think youre just pathetic whiner...,i think youre just pathetic whiner.,ive got to be sad about you.
1,paraphrase: now that you know what i meant you...,now that you know what i meant you must feel ...,
2,"paraphrase: ah, there is nothing like a swift ...","ah, there is nothing like a swift kick in the...",
3,paraphrase: gays should stop deluding themselv...,gays should stop deluding themselves into thi...,
4,paraphrase: facts are not kind to white bigots...,facts are not kind to white bigots and the li...,
...,...,...,...
2289,paraphrase: armed and willing to kill or injur...,armed and willing to kill or injure anyone wh...,Who will protect him if he tries to move out?
2290,paraphrase: damned if you do and damned if you...,damned if you do and damned if you dont.,"if you do it, and if you don't, well, damn."
2291,paraphrase: it might not kill them but it coul...,it might not kill them but it could turn them...,
2292,"paraphrase: except the beleaguered white, chri...","except the beleaguered white, christian male,...","except the white, Christian, Man, does that m..."


In [145]:
# Inspect the test frame with the raw, source and cleaned-generation columns.
test_gen_pair

Unnamed: 0,generated,ori,gen
0,paraphrase: gleasuck my fuckin gawdam dick you...,gleasuck my fuckin gawdam dick you faggotsming,
1,paraphrase: how braindead do you have to be to...,how braindead do you have to be to not unders...,
2,"paraphrase: still the elitist, we know better ...","still the elitist, we know better than you du...",", we know better than you, arrogant, liberal,..."
3,"paraphrase: : shit, now variety is saying >>> ...",": shit, now variety is saying",.
4,paraphrase: gay sex traces my lips with his fi...,gay sex traces my lips with his finger.,
...,...,...,...
3571,paraphrase: is your workplace that idiotic it ...,is your workplace that idiotic it plays the s...,is your workplace that sits in the middle of ...
3572,paraphrase: and since well never see trumps ta...,"and since well never see trumps tax returns, ...",
3573,"paraphrase: what a stupid post, contempt of co...","what a stupid post, contempt of court for the...","What a stupid mistake, contempt for the head ..."
3574,paraphrase: flag burning is a great way for je...,flag burning is a great way for jerks to self...,Flag burning is a wonderful way for a man to ...


In [146]:
# Write the frames (now including 'ori' and 'gen') back to the same
# tab-separated files, replacing what an earlier cell saved there.
for frame, out_path in (
    (train_gen_pair, '../data/paraphrase/train_gen_pair.txt'),
    (dev_gen_pair, '../data/paraphrase/valid_gen_pair.txt'),
    (test_gen_pair, '../data/paraphrase/test_gen_pair.txt'),
):
    frame.to_csv(out_path, sep='\t', index=False, header=True)

##### Generation samples using the fully fine-tuned model

In [None]:
# Load the fully fine-tuned paraphraser checkpoint and move the model to DEVICE.
tokenizer, model = load_models('../model/full-finetune-paraphraser.pth')
model = model.to(DEVICE)

In [None]:
# Sample one continuation for every validation row whose bt_preds flag equals 1.
dev_outs = []
devbt_fil = devbt[devbt['bt_preds'] == 1]
# Encode the end-of-text marker once instead of on every decoding step.
eos_ids = tokenizer.encode('<|endoftext|>')
# Iterating through tqdm keeps the bar in step with the loop automatically.
pb = tqdm(devbt_fil['source'], total=len(devbt_fil))

with torch.no_grad():
    for source in pb:
        cur_ids = torch.tensor(tokenizer.encode(f'paraphrase: {source} >>> ')).unsqueeze(0).to(DEVICE)
        for _ in range(128):
            # Only logits are needed for sampling; omitting `labels` skips the
            # loss computation whose result was never used.
            logits = model(cur_ids)[0]
            softmax_logits = torch.softmax(logits[0, -1], dim=0)

            next_token_id = choose_from_top_k_top_n(softmax_logits.cpu().numpy())
            next_ids = torch.tensor([[next_token_id]], dtype=torch.long, device=DEVICE)
            cur_ids = torch.cat([cur_ids, next_ids], dim=1)

            if next_token_id in eos_ids:
                break

        # squeeze(0) drops only the batch axis, so the result is always a list.
        dev_outs.append(tokenizer.decode(cur_ids.squeeze(0).tolist()))

In [150]:
# Show a small sample only; echoing every generated string floods the cell output.
dev_outs[:10]

['paraphrase: i think youre just pathetic whiner. >>> ive just got a little bit of an asshole. <|endoftext|>',
 'paraphrase: now that you know what i meant you must feel foolish asking how low can i sink. >>> _______________________________________________________________________ <|endoftext|>',
 "paraphrase: ah, there is nothing like a swift kick in the groin from the representativesk pfeffer. >>> 一, there's nothing like a violent kick in the groin from the representatives of the pimper. <|endoftext|>",
 'paraphrase: gays should stop deluding themselves into thinking that the average canadian really cares about gay rights. >>> _________ should stop wasting their lives fooling themselves that the average Canadian really cares about gays. <|endoftext|>',
 'paraphrase: facts are not kind to white bigots and the lies they tell themselves about themselves. >>> ices are not kind to the white-hatted racists and lies they tell themselves. <|endoftext|>',
 'paraphrase: fire those football play

In [152]:
# Build the validation frame for the fully fine-tuned model: raw generations,
# their parsed 'ori' / 'gen' parts, and a tab-separated copy on disk.
dev_fft_gen_pair = pd.DataFrame({'generated': dev_outs})
dev_fft_gen_pair_ = dev_fft_gen_pair['generated'].progress_apply(clean_generated)
dev_fft_gen_pair = pd.concat([dev_fft_gen_pair, dev_fft_gen_pair_], axis=1)
dev_fft_gen_pair.to_csv(
    '../data/paraphrase/valid_fft_gen_pair.txt',
    sep='\t',
    index=False,
    header=True,
)

  0%|          | 0/2294 [00:00<?, ?it/s]

In [153]:
# Preview the first rows of the fully fine-tuned validation pairs.
dev_fft_gen_pair.head()

Unnamed: 0,generated,ori,gen
0,paraphrase: i think youre just pathetic whiner...,i think youre just pathetic whiner.,ive just got a little bit of an asshole.
1,paraphrase: now that you know what i meant you...,now that you know what i meant you must feel ...,
2,"paraphrase: ah, there is nothing like a swift ...","ah, there is nothing like a swift kick in the...",", there's nothing like a violent kick in the ..."
3,paraphrase: gays should stop deluding themselv...,gays should stop deluding themselves into thi...,should stop wasting their lives fooling thems...
4,paraphrase: facts are not kind to white bigots...,facts are not kind to white bigots and the li...,ices are not kind to the white-hatted racists...


##### Clean generated rephrasing data

In [178]:
# Read the three tab-separated rephrase files in one pass.
train_rephrase, dev_rephrase, test_rephrase = (
    pd.read_csv(split_path, sep='\t')
    for split_path in (
        '../data/paraphrase/train_gen_pair_rephrase.txt',
        '../data/paraphrase/valid_gen_pair_rephrase.txt',
        '../data/paraphrase/test_gen_pair_rephrase.txt',
    )
)

In [179]:
# Preview the first rows of the loaded rephrase generations.
train_rephrase.head()

Unnamed: 0,generated
0,paraphrase: Sent from my cell >>>
1,"paraphrase: Ah, yeah, just kidding. I don't k..."
2,paraphrase: >>>
3,paraphrase: >>>
4,paraphrase: >>>


In [182]:
# Parse every rephrase generation into 'ori' / 'gen'. Each name is rebound from
# the raw frame to the parsed frame, so this cell expects freshly loaded inputs.
train_rephrase = train_rephrase['generated'].progress_apply(clean_generated)
dev_rephrase = dev_rephrase['generated'].progress_apply(clean_generated)
test_rephrase = test_rephrase['generated'].progress_apply(clean_generated)

  0%|          | 0/47045 [00:00<?, ?it/s]

  0%|          | 0/2294 [00:00<?, ?it/s]

  0%|          | 0/3576 [00:00<?, ?it/s]

In [184]:
# Rename the second-pass generation column to 'gen.2' so it can sit next to
# the first-pass 'gen' column.
train_rephrase = train_rephrase.rename(columns={'gen': 'gen.2'})
dev_rephrase = dev_rephrase.rename(columns={'gen': 'gen.2'})
test_rephrase = test_rephrase.rename(columns={'gen': 'gen.2'})

In [187]:
# Append the 'gen.2' column of each rephrase frame to the matching pair frame.
train_gen_pair_ = pd.concat([train_gen_pair, train_rephrase.loc[:, ['gen.2']]], axis=1)
dev_gen_pair_ = pd.concat([dev_gen_pair, dev_rephrase.loc[:, ['gen.2']]], axis=1)
test_gen_pair_ = pd.concat([test_gen_pair, test_rephrase.loc[:, ['gen.2']]], axis=1)

In [190]:
# Save the frames that include 'gen.2' to the same tab-separated files,
# replacing the versions written by earlier cells.
for frame, out_path in (
    (train_gen_pair_, '../data/paraphrase/train_gen_pair.txt'),
    (dev_gen_pair_, '../data/paraphrase/valid_gen_pair.txt'),
    (test_gen_pair_, '../data/paraphrase/test_gen_pair.txt'),
):
    frame.to_csv(out_path, sep='\t', index=False, header=True)