In [1]:
import pandas as pd
import re
from nltk import word_tokenize
import nltk
import numpy as np
import random

This notebook reduces the shallower surface markers of informality in the Grammarly/Yahoo dataset, such as slang, chatspeak abbreviations, misspellings, and erratic capitalization or punctuation. We do not want our models to learn these surface characteristics in place of the lexical and syntactic markers of formality.

# import data

In [2]:
# Load the two pickled DataFrames from the local data/ directory.
# NOTE(review): unpickling can execute arbitrary code — only load pickle files from a trusted source.
informal_to_formal_df = pd.read_pickle('data/informal_to_formal_df.pkl')
formal_to_informal_df = pd.read_pickle('data/formal_to_informal_df.pkl')

In [3]:
# Preview the first rows whose 'Dataset' column is 'train'.
is_train_row = formal_to_informal_df['Dataset'] == 'train'
formal_to_informal_df[is_train_row].head()

Unnamed: 0,Original,Target 0,Target 1,Target 2,Target 3,Category,Dataset
3266,I prefer to let the guy ask me.\n,"Sure, it's ok, but I always have let the guy a...",,,,Family_Relationships,train
3267,I suffer through verbal abuse from my wife.\n,"Hmmm, I'm a guy suffering from verbal abuse fr...",,,,Family_Relationships,train
3268,You will have more friends than you want.\n,You will have more friends that you want... ;)\n,,,,Family_Relationships,train
3269,It's nice that you get to see pictures of who ...,"It's nice, you get to see pictures of who you ...",,,,Family_Relationships,train
3270,I need to know what to do.\n,I NEED TO KNOW WHAT 2 DO\n,,,,Family_Relationships,train


# implement rule-based approach

The rules in this section were chosen by manually inspecting the dataset and by drawing on the paper that introduced it, in order to decide which surface features of the sentences should be normalized consistently.

In [4]:
# Short aliases for the two columns compared in the spot-check below:
# 'Target 0' is bound to `informal`, 'Original' to `formal`.
informal = formal_to_informal_df['Target 0']
formal = formal_to_informal_df['Original']

In [4]:
# Ordered (pattern, replacement, flags) rewrite rules used by normalize().
# re.I = ignore case when matching; 0 = case-sensitive.
_NORMALIZE_RULES = (
    # first-person pronoun and its contractions
    (r"\bim\b", "I'm", re.I),
    (r"\bi'm\b", "I'm", 0),
    (r"\bive\b", "I've", re.I),
    (r"\bi've\b", "I've", 0),
    # NOTE: this also rewrites the adjective "ill" ("I am ill" -> "I am I'll").
    (r"\bill\b", "I'll", re.I),
    (r"\bi'll\b", "I'll", 0),
    (r"\bi\b", "I", 0),
    (r"\bId\b", "I'd", 0),
    # second-person shorthand
    (r"\br\b", "are", re.I),
    (r"\bu\b", "you", re.I),
    (r"\bur\b", "your", re.I),
    (r"\burself\b", "yourself", re.I),
    (r"\byoure\b", "you're", re.I),
    (r"\buve\b", "you've", re.I),
    (r"\bu've\b", "you've", re.I),
    (r"\bull\b", "you'll", re.I),
    (r"\bu'll\b", "you'll", re.I),
    # missing apostrophes, slang spellings and abbreviations
    (r"\balot\b", "a lot", re.I),
    (r"\bcant\b", "can't", re.I),
    (r"\bisnt\b", "isn't", re.I),
    (r"\bdont\b", "don't", re.I),
    (r"\bthats\b", "that's", re.I),
    (r"\btheyll\b", "they'll", re.I),
    (r"\btheyre\b", "they're", re.I),
    (r"\bhes\b", "he's", re.I),
    (r"\bshes\b", "she's", re.I),
    (r"\bshud\b", "should", re.I),
    (r"\bshld\b", "should", re.I),
    (r"\bcya\b", "see you", re.I),
    (r"\bluv\b", "love", re.I),
    (r"\bbtw\b", "by the way", re.I),
    (r"\bb/c ", "because ", re.I),
    (r"\bw/ ", "with ", re.I),
    (r"\bw/o ", "without ", re.I),
    (r"\bhavent\b", "haven't", re.I),
    (r"\btuff\b", "tough", re.I),
    (r"\bgurl\b", "girl", re.I),
    (r"\bdoesnt\b", "doesn't", re.I),
    (r"\blets\b", "let's", re.I),
    (r"\btheres\b", "there's", re.I),
    (r"\bwhats\b", "what's", re.I),
    (r"\bwhut\b", "what", re.I),
    (r"\bcos\b", "because", re.I),
    (r"\bcuz\b", "because", re.I),
    (r"\bbcuz\b", "because", re.I),
    (r"\bteh\b", "the", re.I),
    (r"\bkewl\b", "cool", re.I),
    (r"\bdat\b", "that", re.I),
    (r"\bda\b", "the", re.I),
    (r"\brite\b", "right", re.I),
    (r"\bwont\b", "won't", re.I),
    (r"\bryt\b", "right", re.I),
    (r"\bcud\b", "could", re.I),
    (r"\bcood\b", "could", re.I),
    (r"\byur\b", "your", re.I),
    (r"\bnd\b", "and", re.I),
    (r"\bbout\b", "about", re.I),
    (r"\bwat\b", "what", re.I),
    (r"\bwht\b", "what", re.I),
    (r"\bnite\b", "night", re.I),
    (r"\bwanna\b", "want to", re.I),
    (r"\bjuz\b", "just", re.I),
    (r"\bb4\b", "before", re.I),
    (r"\bdats\b", "that's", re.I),
    (r"\bdey\b", "they", re.I),
    (r"\bdeyr\b", "they're", re.I),
    (r"\bcoz\b", "because", re.I),
)


def normalize(sentence):
    """Rewrite "chatspeak" tokens in `sentence` into their standard spelling.

    Every rule in `_NORMALIZE_RULES` is applied in order with `re.sub`.
    Afterwards a lowercase ' id ' surrounded by spaces becomes ' ID ' when the
    sentence contains ' id is ', and " I'd " otherwise.

    Most rules are case-insensitive but substitute a lowercase expansion, so a
    capitalised first word ("Dont", "U") would come back lowercase. If the
    input started with an uppercase letter, the first letter of the result is
    therefore uppercased again.

    Parameters
    ----------
    sentence : str
        The text to normalize.

    Returns
    -------
    str
        The normalized text.
    """
    started_upper = sentence[:1].isupper()

    for pattern, replacement, flags in _NORMALIZE_RULES:
        sentence = re.sub(pattern, replacement, sentence, flags=flags)

    # "id" is ambiguous: treat it as the noun "ID" only in the phrase "id is".
    if ' id is ' in sentence:
        sentence = sentence.replace(' id ', " ID ")
    else:
        sentence = sentence.replace(' id ', " I'd ")

    # Restore a leading capital that a lowercase expansion may have removed.
    if started_upper and sentence[:1].islower():
        sentence = sentence[0].upper() + sentence[1:]
    return sentence

def capital(sentence):
    """Fix the capitalization of `sentence`.

    A sentence that is identical to its own upper-cased form (e.g. written
    entirely in capitals) is reduced to sentence case with `str.capitalize`.
    Any other sentence only has its first character upper-cased; the rest is
    left as written.

    Capitalizing words that NLTK tags as NNP/NNPS was tried as well, but had
    no visible effect because NLTK mostly tags only proper nouns that are
    ALREADY capitalized, so that step is not performed here.
    """
    all_upper = sentence.upper() == sentence
    if all_upper:
        return sentence.capitalize()
    first_char, remainder = sentence[:1], sentence[1:]
    return first_char.upper() + remainder

def punctuate(sentence):
    """Clean up excessive, missing or badly spaced punctuation in `sentence`.

    Steps, in order:
      1. A run of '?' / '!' is reduced to its last mark.
      2. Two or more consecutive periods become an ellipsis ('... ').
      3. Whitespace runs are collapsed to one space and the ends are stripped.
      4. A space is inserted after a comma or period that is directly followed
         by a character other than whitespace or a period. Separators sitting
         between two digits ("3.5", "1,000") are left alone.
      5. '. com' created by step 4 is re-joined into '.com'. The word boundary
         keeps a sentence break followed by e.g. "come" from being glued.
      6. A space before a comma or period (when one also follows) is removed.

    Parameters
    ----------
    sentence : str
        The text to clean.

    Returns
    -------
    str
        The cleaned text.
    """
    sentence = re.sub(r'[?!]+(?=[?!])', '', sentence)  # reduce extra punctuation marks
    sentence = re.sub(r'\.{2,}', '... ', sentence)  # turn more than 1 period into 3
    sentence = re.sub(r'\s+', ' ', sentence).strip()  # remove extra whitespace
    # Comma/period with no following space -> add one, except between digits.
    sentence = re.sub(r'(?<=[.,])(?=[^\s.])(?!(?<=\d[.,])\d)', ' ', sentence)
    sentence = re.sub(r'\. com\b', '.com', sentence)  # fix URLs changed by previous command
    sentence = re.sub(r' , ', ', ', sentence)
    sentence = re.sub(r' \. ', '. ', sentence)
    return sentence

def repeat(sentence):
    """Collapse a run of three or more identical letters into a single letter.

    Example: "sooooo" -> "so", "realllllllllllllly" -> "realy".

    Not perfect: a stretched double letter ends up as a single one, as in the
    second example, but it removes the elongation itself.

    The replacement must be the raw string r"\\1" (a backreference to the
    captured letter). A plain "\\1" literal is the control character chr(1).

    Parameters
    ----------
    sentence : str
        The text to clean.

    Returns
    -------
    str
        The text with elongated letter runs collapsed.
    """
    # {2,} consumes the whole run at once instead of chunks of three letters.
    sentence = re.sub(r'([a-zA-Z])\1{2,}', r'\1', sentence)
    return sentence

def correct(sentence):
    """Apply all rule-based clean-up steps to one cell value.

    Anything that is not a string (e.g. the float NaN of an empty cell, or
    None) is returned unchanged, so this function can be passed directly to
    `Series.apply` on columns that contain missing values.

    Strings are passed through capital(), normalize(), repeat() and
    punctuate(), in that order.

    Parameters
    ----------
    sentence : object
        The cell value; only `str` values are processed.

    Returns
    -------
    object
        The cleaned string, or the input itself if it was not a string.
    """
    # make sure input actually is a sentence
    if not isinstance(sentence, str):
        return sentence
    sentence = capital(sentence)
    sentence = normalize(sentence)
    sentence = repeat(sentence)
    sentence = punctuate(sentence)
    return sentence

In [93]:
# Spot-check ten random rows: raw informal text, its corrected form, and the formal reference.
for _ in range(10):
    # randrange excludes the upper bound; randint(1, len(...)) could return
    # len(informal) (one past the last row) and could never return row 0.
    n = random.randrange(len(informal))
    # Positional lookup, so the sample does not depend on the index labels.
    raw_informal = informal.iloc[n]
    print(raw_informal + correct(raw_informal) + '\n' + formal.iloc[n])

girls gotta do what they gotta do.
Girls gotta do what they gotta do.
Girls must do what they must.

I go for a guys personality.
I go for a guys personality.
I go for men's personality.

none, but i think their both corny
None, but I think their both corny
None, but it is my belief that they are both trite.

I don't know but he can take me over anytime!
I don't know but he can take me over anytime!
I do not know, however he is welcome to take me over at any time.

Ah, the old hump & dump.
Ah, the old hump & dump.
THE FAMILIAR SCENARIO OF HAVING SEX AND LEAVING THEM.

ask her why she sez makes her feel giddy and alive.
Ask her why she sez makes her feel giddy and alive.
If you ask her why, she will say that it allows her to feel happy and alive.

i just want to ask for opinions that would best fit my question.
I just want to ask for opinions that would best fit my question.
I merely would like to ask for opinions that best match my question.

THE ONLY REASON YOU LIKE IT IS BECAUSE YOU 

In [6]:
# Work on a copy so that formal_to_informal_df itself is not modified by the corrections.
corrected_df = formal_to_informal_df.copy()

In [7]:
# Run the rule-based clean-up over every informal rewrite column.
for target_col in ('Target 0', 'Target 1', 'Target 2', 'Target 3'):
    corrected_df[target_col] = corrected_df[target_col].apply(correct)

In [8]:
def strip_whitespace(sentence):
    """Collapse whitespace runs to one space and strip both ends.

    Values that are not strings (e.g. NaN or None from an empty cell) are
    returned unchanged instead of raising inside `re.sub`.

    Parameters
    ----------
    sentence : object
        The cell value; only `str` values are processed.

    Returns
    -------
    object
        The whitespace-normalized string, or the input itself if it was not
        a string.
    """
    if not isinstance(sentence, str):
        return sentence
    return re.sub(r'\s+', ' ', sentence).strip()

# Normalize whitespace in the 'Original' column too (the raw rows end in "\n").
corrected_df['Original'] = corrected_df['Original'].apply(strip_whitespace)

In [9]:
# Show only the first rows instead of dumping the whole frame into the notebook output.
corrected_df.head(10)

Unnamed: 0,Original,Target 0,Target 1,Target 2,Target 3,Category,Dataset
0,I mean that you have to really be her friend.,And I mean Really be her friend.,Just be her BFF 4 real.,you have to be her friend.,"You have to actually be her friend, for real.",Family_Relationships,test
1,Are you posing a rhetorical question?,Sounds like a rhetorical question :),Do you really want an answer?,That sounds more like a rhetorical question th...,Are you asking me a rhetorical question?,Family_Relationships,test
2,Men pretend to love in order to have intercour...,"Men play at love to get sex, women play at sex...","Men fake love to get laid, women fake orgasms ...","Guys PRETEND to love so they can get laid, wom...",Dudes just act like they love a chick to get b...,Family_Relationships,test
3,I do not intend to be mean.,I don't want to be mean.,I wasn't trying to be a jerk.,I'm not tryin to be mean...,I didn't want to be mean,Family_Relationships,test
4,I would estimate an average of 45% initially b...,On average I'd say about 45% at first but than...,"It's a little less than 50/50 at the start, bu...",Prolly 45% at the start but when you get to no...,"I guess it'd be around 45% to start with, but ...",Family_Relationships,test
5,Because some women send subtle messages to men...,Because some women send men tiny messages with...,Some ladies just hint at what they want and ex...,Women send men mixed messages sometimes,Chicks like to beat around the bush instead of...,Family_Relationships,test
6,Let us purchase coffee and converse and procee...,let's get coffee and chat and take it from there!,"How about we grab some Starbucks and talk, and...",Up fr coffee and chat then see what happens?,"Let's grab some coffee, talk, and go from there",Family_Relationships,test
7,"Also, i dislike it when my father is unhappy.",I also hate seeing my dad unhappy.,I hate seeing my daddy sad :(,Hate it when my dad is bummed out.,"When my dad is not happy, it tears me up.",Family_Relationships,test
8,Ask him if you should go see a doctor.,Ask him to go see a doc.,Ask him if you should go see a dr.,Ask him if you should see a doc or sumthin',Just ask him if he thinks you should go to the...,Family_Relationships,test
9,You can post more questioins on Yahoo! answers.,Post more questions on Yahoo Answers!,Post more ?s on Yahoo Answers,"Hey, go hit up Yahoo answers with a bunch of Qs",You can use yahoo answers to ask many more que...,Family_Relationships,test


In [10]:
# Persist the corrected frame as a pickle under data/.
corrected_df.to_pickle('data/rule_based_corrected_df.pkl')