In [1]:
import pickle
import re

import numpy as np
import pandas as pd
import torch
import tqdm
from nltk import word_tokenize, pos_tag, ne_chunk
from nltk.tokenize import word_tokenize

Using OpenNMT required both significant preprocessing in Python to put the files into the right format for the software, and use of the command line to start actual training. The former was done here, and the latter is also recorded here for posterity and examination.

# prepare text files to use with OpenNMT

In [3]:
# Load the corpus frame; the cells below use its 'Original', 'Target 0' and 'Dataset' columns.
# NOTE(review): the named-entity section further down loads 'data/rule_based_corrected_df.pkl'
# (without the 'opennmt/' folder) — confirm which location is the current one.
all_df = pd.read_pickle('data/opennmt/rule_based_corrected_df.pkl')
all_df.head()

Unnamed: 0,Original,Target 0,Target 1,Target 2,Target 3,Category,Dataset
0,I mean that you have to really be her friend.,And I mean Really be her friend.,Just be her BFF 4 real.,you have to be her friend.,"You have to actually be her friend, for real.",Family_Relationships,test
1,Are you posing a rhetorical question?,Sounds like a rhetorical question :),Do you really want an answer?,That sounds more like a rhetorical question th...,Are you asking me a rhetorical question?,Family_Relationships,test
2,Men pretend to love in order to have intercour...,"Men play at love to get sex, women play at sex...","Men fake love to get laid, women fake orgasms ...","Guys PRETEND to love so they can get laid, wom...",Dudes just act like they love a chick to get b...,Family_Relationships,test
3,I do not intend to be mean.,I don't want to be mean.,I wasn't trying to be a jerk.,I'm not tryin to be mean...,I didn't want to be mean,Family_Relationships,test
4,I would estimate an average of 45% initially b...,On average I'd say about 45% at first but than...,"It's a little less than 50/50 at the start, bu...",Prolly 45% at the start but when you get to no...,"I guess it'd be around 45% to start with, but ...",Family_Relationships,test


In [18]:
# Row masks for the two splits used during training ('tune' serves as the validation set).
is_train = all_df['Dataset'] == 'train'
is_tune = all_df['Dataset'] == 'tune'

# 'Original' is the source side, 'Target 0' the target side.
train_src = all_df.loc[is_train, 'Original']
train_tgt = all_df.loc[is_train, 'Target 0']
val_src = all_df.loc[is_tune, 'Original']
val_tgt = all_df.loc[is_tune, 'Target 0']

# Source and target must stay line-aligned.
assert len(train_src) == len(train_tgt)
assert len(val_src) == len(val_tgt)

In [5]:
def series_to_txt(series, filename):
    """Tokenize every item of ``series`` and write it to ``filename``, one item per line.

    Each item is split with ``word_tokenize`` and its tokens are re-joined with
    single spaces. The file is created or overwritten and encoded as UTF-8.
    """
    with open(filename, 'w', encoding='utf-8') as out_file:
        out_file.writelines(
            ' '.join(word_tokenize(sentence)) + '\n' for sentence in series
        )

In [27]:
# Write the tune and train splits as tokenized text files, one sentence per line.
for split, path in (
    (val_src, 'data/src-val.txt'),
    (val_tgt, 'data/tgt-val.txt'),
    (train_src, 'data/src-train.txt'),
    (train_tgt, 'data/tgt-train.txt'),
):
    series_to_txt(split, path)

In [6]:
# Test split: same source/target columns as the train and tune splits.
is_test = all_df['Dataset'] == 'test'
test_src = all_df.loc[is_test, 'Original']
test_tgt = all_df.loc[is_test, 'Target 0']

# Source and target must stay line-aligned.
assert len(test_src) == len(test_tgt)

for split, path in ((test_src, 'data/src-test.txt'), (test_tgt, 'data/tgt-test.txt')):
    series_to_txt(split, path)

# OpenNMT use

This is a record of the various command-line invocations used to train the OpenNMT models in this thesis.

Training took an extremely long time because CUDA could not be used to access the GPU for training locally.

In [28]:
# Ask PyTorch whether a CUDA device is usable on this machine (the recorded output is False).
torch.cuda.is_available()

False

The commands below were to train the most basic model, using the GloVe embeddings but no special bells or whistles.

The second version, similar to the above but just with copy attention turned on.

In [None]:
# preprocess.py -train_src data/YahooCorpus/src-train.txt -train_tgt data/YahooCorpus/tgt-train.txt -valid_src data/YahooCorpus/src-val.txt -valid_tgt data/YahooCorpus/tgt-val.txt -save_data data/model_pretrained_embed_with_copy/data -dynamic_dict
# python3 train.py -save_model data/model_pretrained_embed_with_copy/ -data data/model_pretrained_embed_with_copy/data -word_vec_size 300 -pre_word_vecs_enc "data/glove_pretrained/embeddings.enc.pt" -pre_word_vecs_dec "data/glove_pretrained/embeddings.dec.pt" -copy_attn

The third version, also with copy attention, and with all named entities removed (see below) from the training dataset.

In [None]:
# OpenNMT-py/preprocess.py -train_src make-it-sound-less-formal/data/src-train-no-ent.txt -train_tgt make-it-sound-less-formal/data/tgt-train-no-ent.txt -valid_src make-it-sound-less-formal/data/src-val-no-ent.txt -valid_tgt make-it-sound-less-formal/data/tgt-val-no-ent.txt -save_data OpenNMT-py/data/model_pretrained_no_ent_with_copy/data -dynamic_dict
# python3 train.py -save_model data/model_pretrained_no_ent_with_copy/model -data data/model_pretrained_no_ent_with_copy/data -word_vec_size 300 -pre_word_vecs_enc "data/glove_pretrained/embeddings.enc.pt" -pre_word_vecs_dec "data/glove_pretrained/embeddings.dec.pt" -copy_attn

A single reversed model (as in, informal to formal instead of formal to informal) trained to use on the Acrolinx dataset.

In [None]:
# informal to formal
# OpenNMT-py/preprocess.py -train_src make-it-sound-less-formal/data/OpenNMT\ files/tgt-train-no-ent.txt -train_tgt make-it-sound-less-formal/data/OpenNMT\ files/src-train-no-ent.txt -valid_src make-it-sound-less-formal/data/OpenNMT\ files/tgt-val-no-ent.txt -valid_tgt make-it-sound-less-formal/data/OpenNMT\ files/src-val-no-ent.txt -save_data make-it-sound-less-formal/data/OpenNMT\ files/informal-to-formal-data -dynamic_dict
# python3 OpenNMT-py/train.py -save_model informal-to-formal -data make-it-sound-less-formal/data/OpenNMT\ files/informal-to-formal-data -word_vec_size 300 -pre_word_vecs_enc "embeddings.enc.pt" -pre_word_vecs_dec "embeddings.dec.pt" -copy_attn -world_size 1 -gpu_ranks 0

# Collect the results from the OpenNMT files into pandas dataframes

This takes all the results - from each of the three resulting models - and puts them all together in order to better compare how each one did.

In [4]:
% cd /home/rebekah/Documents/OpenNMT-py/data/YahooCorpus

/home/rebekah/Documents/OpenNMT-py/data/YahooCorpus


In [7]:
# Read predictions, sources and targets for the test set, one sentence per line with
# trailing whitespace removed.
# The files are opened explicitly as UTF-8 (the encoding series_to_txt writes in this notebook)
# rather than the platform default, and `with` closes each handle even if reading fails.
with open('pred-test.txt', encoding='utf-8') as pred_file:
    pred = [line.rstrip() for line in pred_file]

with open('src-test.txt', encoding='utf-8') as source_file:
    source = [line.rstrip() for line in source_file]

with open('tgt-test.txt', encoding='utf-8') as target_file:
    target = [line.rstrip() for line in target_file]

In [11]:
# One row per test sentence: source text, target text and the model's prediction.
v1_results = pd.DataFrame(
    data={'Source': source, 'Target': target, 'Prediction': pred},
)

In [13]:
% cd /home/rebekah/Documents/make-it-sound-less-formal/data/

/home/rebekah/Documents/make-it-sound-less-formal/data


In [16]:
# Export the v1 results table to an Excel workbook.
# The engine is selected on the ExcelWriter itself: `to_excel` ignores its own `engine`
# argument when it is handed an existing writer. The context manager saves and closes the
# workbook on exit; `writer.save()` is no longer available in pandas 2.x.
with pd.ExcelWriter('version_1_test.xlsx', engine='xlsxwriter') as writer:
    v1_results.to_excel(writer)

In [2]:
# Load the pickled results table and preview its first rows.
# NOTE(review): 'data/v1_results.pkl' is only written by the `to_pickle` cell at the end of this
# section, so presumably it comes from an earlier run — confirm the intended execution order.
results_df = pd.read_pickle('data/v1_results.pkl')
results_df.head()

Unnamed: 0,Source,Target,Prediction,source_lf,target_lf,pred_lf,src_tgt_diff,src_pred_diff,Copy Prediction
0,I mean that you have to really be her friend .,And I mean Really be her friend .,I mean you have to really be her friend .,-0.270857,-0.27162,-0.275536,0.000763,0.00468,I mean you have to really be her friend .
1,Are you posing a rhetorical question ?,Sounds like a rhetorical question : ),What kind of question is that ?,-0.142981,-0.178207,-0.207015,0.035226,0.064035,What are you asking a question ?
2,Men pretend to love in order to have intercour...,"Men play at love to get sex , women play at se...","Men play love to have sex , women play for sex...",-0.195864,-0.237748,-0.216562,0.041885,0.020698,"Men pretend to love in order to have sex , wom..."
3,I do not intend to be mean .,I do n't want to be mean .,I do n't mean to be mean .,-0.204783,-0.242434,-0.219974,0.037651,0.015191,I do n't mean to be mean .
4,I would estimate an average of 45 % initially ...,On average I 'd say about 45 % at first but th...,"I would say a 15 % of 45 % , then once you get...",-0.126478,-0.218232,-0.182693,0.091754,0.056215,"45 % of them , but once you get to know the pe..."


In [6]:
# Read the predictions in copy-pred-test.txt, one per line with trailing whitespace removed.
# Opened explicitly as UTF-8 instead of the platform default; `with` closes the handle even
# if reading fails.
with open('data/copy-pred-test.txt', encoding='utf-8') as copy_pred_file:
    copy_pred = [line.rstrip() for line in copy_pred_file]

In [8]:
# Attach the lines read from copy-pred-test.txt as a column, aligned by position with the rows.
results_df['Copy Prediction'] = copy_pred

In [3]:
# Read the predictions in pred-test-no-ent.txt, one per line with trailing whitespace removed.
# Opened explicitly as UTF-8 instead of the platform default; `with` closes the handle even
# if reading fails.
with open('data/pred-test-no-ent.txt', encoding='utf-8') as copy_no_ent_pred_file:
    copy_no_ent_pred = [line.rstrip() for line in copy_no_ent_pred_file]

In [5]:
# Attach the lines read from pred-test-no-ent.txt as a column, aligned by position with the rows.
results_df['No Entity Copy Prediction'] = copy_no_ent_pred

In [6]:
# Preview the first rows to check that the prediction columns are present.
results_df.head()

Unnamed: 0,Source,Target,Prediction,source_lf,target_lf,pred_lf,src_tgt_diff,src_pred_diff,Copy Prediction,No Entity Copy Prediction
0,I mean that you have to really be her friend .,And I mean Really be her friend .,I mean you have to really be her friend .,-0.270857,-0.27162,-0.275536,0.000763,0.00468,I mean you have to really be her friend .,I mean you have to be her friend .
1,Are you posing a rhetorical question ?,Sounds like a rhetorical question : ),What kind of question is that ?,-0.142981,-0.178207,-0.207015,0.035226,0.064035,What are you asking a question ?,What kind of question is that ?
2,Men pretend to love in order to have intercour...,"Men play at love to get sex , women play at se...","Men play love to have sex , women play for sex...",-0.195864,-0.237748,-0.216562,0.041885,0.020698,"Men pretend to love in order to have sex , wom...","Men play to love in order to have sex , women ..."
3,I do not intend to be mean .,I do n't want to be mean .,I do n't mean to be mean .,-0.204783,-0.242434,-0.219974,0.037651,0.015191,I do n't mean to be mean .,I do n't mean to be mean .
4,I would estimate an average of 45 % initially ...,On average I 'd say about 45 % at first but th...,"I would say a 15 % of 45 % , then once you get...",-0.126478,-0.218232,-0.182693,0.091754,0.056215,"45 % of them , but once you get to know the pe...",45 % of 45 % and then after you know the perso...


In [7]:
# Persist the combined table; this overwrites the same pickle that was loaded at the start of
# this section.
results_df.to_pickle('data/v1_results.pkl')

# Remove named entities from the dataset for the last model type

As mentioned above, one attempt to improve the NMT model results was to remove all named entities (each one is replaced with the placeholder `ENT`). This was in the hope of having the model better adapt to out-of-domain data.

In [2]:
# Reload the corpus frame so the entity replacement below starts from the stored data.
# NOTE(review): the first section loads this pickle from 'data/opennmt/rule_based_corrected_df.pkl'
# — confirm which location is the current one.
all_df = pd.read_pickle('data/rule_based_corrected_df.pkl')
all_df.head()

Unnamed: 0,Original,Target 0,Target 1,Target 2,Target 3,Category,Dataset
0,I mean that you have to really be her friend.,And I mean Really be her friend.,Just be her BFF 4 real.,you have to be her friend.,"You have to actually be her friend, for real.",Family_Relationships,test
1,Are you posing a rhetorical question?,Sounds like a rhetorical question :),Do you really want an answer?,That sounds more like a rhetorical question th...,Are you asking me a rhetorical question?,Family_Relationships,test
2,Men pretend to love in order to have intercour...,"Men play at love to get sex, women play at sex...","Men fake love to get laid, women fake orgasms ...","Guys PRETEND to love so they can get laid, wom...",Dudes just act like they love a chick to get b...,Family_Relationships,test
3,I do not intend to be mean.,I don't want to be mean.,I wasn't trying to be a jerk.,I'm not tryin to be mean...,I didn't want to be mean,Family_Relationships,test
4,I would estimate an average of 45% initially b...,On average I'd say about 45% at first but than...,"It's a little less than 50/50 at the start, bu...",Prolly 45% at the start but when you get to no...,"I guess it'd be around 45% to start with, but ...",Family_Relationships,test


In [99]:
def ents(sent):
    """Return the named-entity strings found in ``sent``, in order of appearance.

    The sentence is tokenized, POS-tagged and chunked with the NLTK helpers
    imported at the top of the notebook (``word_tokenize``, ``pos_tag``,
    ``ne_chunk``). Those names are used directly because the ``nltk`` module
    itself is never imported, so ``nltk.<name>`` raises NameError on a fresh kernel.

    Args:
        sent: The raw sentence text.

    Returns:
        A list with one string per entity chunk: the chunk's tokens joined by
        single spaces. Empty when no chunk is found.
    """
    tagged_tokens = pos_tag(word_tokenize(sent))
    entities = []
    for chunk in ne_chunk(tagged_tokens):
        # Only items exposing `label` are treated as entity chunks; the rest are plain tokens.
        if hasattr(chunk, 'label'):
            entities.append(' '.join(c[0] for c in chunk))
    return entities

def replace_ents(sent, entities=None):
    """Replace every named entity in ``sent`` with the placeholder ``ENT``.

    Args:
        sent: The raw sentence text.
        entities: Optional iterable of entity strings to mask. When omitted,
            the entities are detected with ``ents(sent)``.

    Returns:
        ``sent`` with each whole occurrence of an entity replaced by ``'ENT'``.
    """
    if entities is None:
        entities = ents(sent)
    # Longest entity first (ties broken alphabetically for determinism), so a short entity
    # cannot break up a longer one that contains it.
    for ent in sorted(set(entities), key=lambda e: (-len(e), e)):
        if not ent:
            continue
        # Match the entity only when it is not embedded in a larger word; a plain
        # str.replace would also rewrite e.g. "Mentos" when the entity is "Men".
        pattern = r'(?<!\w)' + re.escape(ent) + r'(?!\w)'
        sent = re.sub(pattern, 'ENT', sent)
    return sent

In [108]:
# Register `progress_apply` on pandas objects; a fresh kernel has no such method otherwise.
tqdm.tqdm.pandas()
# Assign the masked text back to the column so that the "-no-ent" files written further down
# are built from it; without the assignment the result is only displayed and then lost.
all_df['Original'] = all_df['Original'].progress_apply(replace_ents)
all_df['Original']

100%|██████████| 111266/111266 [07:59<00:00, 232.28it/s]


0             I mean that you have to really be her friend.
1                     Are you posing a rhetorical question?
2         Men pretend to love in order to have intercour...
3                               I do not intend to be mean.
4         I would estimate an average of 45% initially b...
5         Because some women send subtle messages to men...
6         Let us purchase coffee and converse and procee...
7             Also, i dislike it when my father is unhappy.
8                    Ask him if you should go see a doctor.
9             You can post more questioins on ENT! answers.
10        He probably has many things to worry about rig...
11        But I do not believe that he will be unfaithfu...
12                             Will I always feel that way?
13        However, he may enjoy all of the waiting, drea...
14                         Also, I would like to try again.
15                 Some men shave, it depends on the woman.
16        Well, if you are really attrac

In [109]:
# Register `progress_apply` on pandas objects; a fresh kernel has no such method otherwise.
tqdm.tqdm.pandas()
# Assign the masked text back to the column so that the "-no-ent" files written further down
# are built from it; without the assignment the result is only displayed and then lost.
all_df['Target 0'] = all_df['Target 0'].progress_apply(replace_ents)
all_df['Target 0']

100%|██████████| 111266/111266 [07:41<00:00, 241.14it/s]


0                          And I mean Really be her friend.
1                         ENT like a rhetorical question :)
2         ENT play at love to get sex, women play at sex...
3                                  I don't want to be mean.
4         On average I'd say about 45% at first but than...
5         Because some women send men tiny messages with...
6         let's get coffee and chat and take it from there!
7                        I also hate seeing my dad unhappy.
8                                  Ask him to go see a doc.
9                               Post more questions on ENT!
10        Besides don't you think he has enough to worry...
11        But I don't htink that by not having sex with ...
12        Anyway my question is... Will I always feel th...
13        But he might enjoy all the waiting, dreaming &...
14                                 And I want to try again.
15        Not all the time, sometimes men shave them, it...
16        Well if you really like this g

In [110]:
# Row masks for the two splits used during training ('tune' serves as the validation set).
is_train = all_df['Dataset'] == 'train'
is_tune = all_df['Dataset'] == 'tune'

# 'Original' is the source side, 'Target 0' the target side.
train_src = all_df.loc[is_train, 'Original']
train_tgt = all_df.loc[is_train, 'Target 0']
val_src = all_df.loc[is_tune, 'Original']
val_tgt = all_df.loc[is_tune, 'Target 0']

# Source and target must stay line-aligned.
assert len(train_src) == len(train_tgt)
assert len(val_src) == len(val_tgt)

def series_to_txt(series, filename):
    """Tokenize every item of ``series`` and write it to ``filename``, one item per line.

    Each item is split with ``word_tokenize`` and its tokens are re-joined with
    single spaces. The file is created or overwritten and encoded as UTF-8.
    """
    with open(filename, 'w', encoding='utf-8') as out_file:
        out_file.writelines(
            ' '.join(word_tokenize(sentence)) + '\n' for sentence in series
        )
            
# Write the tune and train splits to the "-no-ent" text files, one tokenized sentence per line.
for split, path in (
    (val_src, 'data/src-val-no-ent.txt'),
    (val_tgt, 'data/tgt-val-no-ent.txt'),
    (train_src, 'data/src-train-no-ent.txt'),
    (train_tgt, 'data/tgt-train-no-ent.txt'),
):
    series_to_txt(split, path)