In [13]:
import os
import pandas as pd
import numpy as np
import settings
# Show full cell contents when rendering frames. None is the supported "no limit"
# value; the old -1 sentinel was deprecated in pandas 1.0 and newer releases reject it.
pd.set_option('display.max_colwidth', None)
import json
import re

In [2]:
# Load the training data from train.csv under settings.DATA_DIR.
df = pd.read_csv(os.path.join(settings.DATA_DIR, 'train.csv'))

In [3]:
# Peek at 10 randomly sampled comments with their target value
# (no random_state is passed, so the rows differ between runs).
df[['comment_text', 'target']].sample(10)

Unnamed: 0,comment_text,target
938535,Donald Duck? ;-),0.166667
221209,"Most of these surveys are actually quite silly and superficial.\n\nA better survey would require respondents to detail the principles and the chained logic of their reported opinions, vs. just skimming them for emotions.\n\nHow many of these Catholic respondents could work their opinions down to a set of underpinning and stable (across circumstances) moral theological principles?\n\nCan they provide solid descriptions of their ""opinions"" in terms of \n-the law of love?\n- justice (can they give an operational definition of justice, without waving their arms?)\n- intention?\n- the duties for which our freedoms are given?\n- the object of a moral act?\n- the circumstances of the action?\n- protection of the common good?\n- last resort?",1.0
1699737,"Defending a SJW narrative on grounds of free speech seems very ironic in today's supercharged politically correct environment where it's become commonplace to advocate for (further) limiting free speech. When reading about this particular controversy the other day I thought of Mahatma Gandhi, who said ""It is unwise to be too sure of one's own wisdom."" Perhaps it's a mantra that should be adopted by all who consider the pursuit of objectivity worthwhile, and particularly so in academia.",0.0
1323332,"Sat listening in the LIO and noticed it wasn't until the end they called on the Mat-Su. By then, most people had to go home.",0.0
1768188,Mahalo,0.0
1049327,Replays clearly showed Rinne being bumped on the winning goal and unable to recover in time. HNIC announcers were all in agreement. Why wasn't it goaltender interference? The ending left a sour taste in my mouth and I'm sure the mouths of many others.,0.0
118262,...although the Legislature could spend it however it wants\n\nThis is what worries me the most,0.0
1327631,Good anniversary for it...,0.0
26602,"I would expect someone named ""defendfreespeech"" to have a better understanding of the first amendment.",0.0
1557683,Equifax should provide a minimum of 10 years of free identity monitoring. And should pay for any expenses any victim has due to their willful neglect. As is hiring a company similar to Lifelock where a company expert will work to clean up any financial issues at no cost to the victim.,0.0


In [4]:
def remove_space(text: str, spaces: list, only_clean: bool = True):
    """
    Collapse runs of whitespace into single spaces and trim both ends.

    :param text: text to clean
    :param spaces: strings that are turned into a plain space first; only
        used when ``only_clean`` is False
    :param only_clean: if True only normalise whitespace; if False also
        replace every entry of ``spaces`` with ' ' beforehand
    :return: cleaned text
    """
    if not only_clean:
        for space in spaces:
            text = text.replace(space, ' ')

    # Raw string: '\s' inside a normal literal is an invalid escape sequence
    # and triggers a warning on current Python versions.
    return re.sub(r'\s+', ' ', text.strip())


def replace_words(text: str, mapping: dict):
    """
    Substitute every key of ``mapping`` found in the text with its value.

    Replacements are plain substring replacements applied one after another
    in the iteration order of ``mapping``, so a later entry can act on the
    result of an earlier one.

    :param text: text to clean
    :param mapping: dict of ``substring -> replacement``
    :return: cleaned text
    """
    for source, target in mapping.items():
        if source not in text:
            continue
        text = text.replace(source, target)

    return text

def clean_number(text: str):
    """
    Normalise how numbers are written inside the text.

    The substitutions run in this order:

    1. put a space between digits and a directly following ASCII letter
       (``5kg`` -> ``5 kg``);
    2. re-attach ordinal suffixes, but only when a space follows the suffix
       (``21 st `` -> ``21st ``);
    3. drop a comma that sits between two digit groups (``1,000`` -> ``1000``);
    4. drop any remaining comma that directly follows digits;
    5. replace an ``e`` that sits directly between digits with a space.

    :param text: text to clean
    :return: cleaned text
    """
    # All patterns and replacement templates are raw strings: '\g' and '\d'
    # in a normal literal are invalid escape sequences that warn on current Python.
    text = re.sub(r'(\d+)([a-zA-Z])', r'\g<1> \g<2>', text)
    text = re.sub(r'(\d+) (th|st|nd|rd) ', r'\g<1>\g<2> ', text)
    text = re.sub(r'(\d+),(\d+)', r'\g<1>\g<2>', text)
    text = re.sub(r'(\d+),', r'\g<1>', text)
    text = re.sub(r'(\d+)(e)(\d+)', r'\g<1> \g<3>', text)

    return text


def spacing_punctuation(text: str, punctuation: str):
    """
    Surround each listed symbol with a space on both sides.

    The symbols are handled one at a time, in the order they appear in
    ``punctuation``.

    :param text: text to clean
    :param punctuation: string whose characters are the symbols to pad
    :return: cleaned text
    """
    for symbol in punctuation:
        if symbol not in text:
            continue
        padded = f' {symbol} '
        text = text.replace(symbol, padded)

    return text

In [5]:
def fixing_with_regex(text):
    """
    Additional regex-based fixing of words.

    In order, the function: repairs question words that have a stray character
    glued to them, splits mis-connected question words, collapses repeated
    "i" before "ng", turns hyphens and periods into spaces, removes control
    characters, squeezes whitespace, expands common English contractions
    (only when a space follows them) and finally splits a fixed list of
    words that are glued to a following "how".

    :param text: text to clean
    :return: cleaned text
    """

    # These MUST be raw strings: in a normal literal '\b' is a backspace
    # character (\x08), not a regex word boundary, which made the substitution
    # below a silent no-op.
    # NOTE(review): the how/which/where/will patterns carry no leading \b, so they
    # also split longer words that end in them ("goodwill" -> "good will",
    # "show" -> "s how"). Add a leading \b if only standalone words should match.
    mis_connect_list = [r'\b(W|w)hat\b', r'\b(W|w)hy\b', r'(H|h)ow\b', r'(W|w)hich\b', r'(W|w)here\b', r'(W|w)ill\b']
    mis_connect_re = re.compile('(%s)' % '|'.join(mis_connect_list))

    # Question words with one extra non-space character attached on either side.
    text = re.sub(r" (W|w)hat+(s)*[A|a]*(p)+ ", " WhatsApp ", text)
    text = re.sub(r" (W|w)hat\S ", " What ", text)
    text = re.sub(r" \S(W|w)hat ", " What ", text)
    text = re.sub(r" (W|w)hy\S ", " Why ", text)
    text = re.sub(r" \S(W|w)hy ", " Why ", text)
    text = re.sub(r" (H|h)ow\S ", " How ", text)
    text = re.sub(r" \S(H|h)ow ", " How ", text)
    text = re.sub(r" (W|w)hich\S ", " Which ", text)
    text = re.sub(r" \S(W|w)hich ", " Which ", text)
    text = re.sub(r" (W|w)here\S ", " Where ", text)
    text = re.sub(r" \S(W|w)here ", " Where ", text)
    # Pad every match with spaces; the extra whitespace is squeezed further down.
    text = mis_connect_re.sub(r" \1 ", text)
    text = text.replace("What sApp", ' WhatsApp ')

    # Clean repeated letters.
    text = re.sub(r"(I|i)(I|i)+ng", "ing", text)
    # Every run of hyphens or periods becomes a single space.
    text = re.sub(r"(-+|\.+)", " ", text)

    # Strip C0/C1 control characters and the soft hyphen.
    text = re.sub(r'[\x00-\x1f\x7f-\x9f\xad]', '', text)
    text = re.sub(r'(\d+)(e)(\d+)', r'\g<1> \g<3>', text)  # same substitution as the last step of clean_number
    text = re.sub(r"(-+|\.+)\s?", "  ", text)
    text = re.sub(r"\s\s+", " ", text)
    text = re.sub(r'ᴵ+', '', text)

    # Contractions: each pattern requires a trailing space, so a contraction at the
    # very end of the text or directly before punctuation is left as it is.
    text = re.sub(r"(W|w)on(\'|\’)t ", "will not ", text)
    text = re.sub(r"(C|c)an(\'|\’)t ", "can not ", text)
    text = re.sub(r"(Y|y)(\'|\’)all ", "you all ", text)
    text = re.sub(r"(Y|y)a(\'|\’)ll ", "you all ", text)

    text = re.sub(r"(I|i)(\'|\’)m ", "i am ", text)
    text = re.sub(r"(A|a)in(\'|\’)t ", "is not ", text)
    text = re.sub(r"n(\'|\’)t ", " not ", text)
    text = re.sub(r"(\'|\’)re ", " are ", text)
    text = re.sub(r"(\'|\’)s ", " is ", text)
    text = re.sub(r"(\'|\’)d ", " would ", text)
    text = re.sub(r"(\'|\’)ll ", " will ", text)
    text = re.sub(r"(\'|\’)t ", " not ", text)
    text = re.sub(r"(\'|\’)ve ", " have ", text)

    # Split "<word>how" for a fixed list of leading words (e.g. "somehow" -> "some how").
    text = re.sub(
        r'(by|been|and|are|for|it|TV|already|justhow|some|had|is|will|would|should|shall|must|can|his|here|there|them|these|their|has|have|the|be|that|not|was|he|just|they|who)(how)',
        r'\g<1> \g<2>', text)

    return text


In [9]:
def load_preprocessing_data(path: str = 'mapping_dict.json'):
    """
    Loads dict with various mappings and strings for cleaning.

    The JSON file must contain the keys ``spaces``, ``punctuation`` and the six
    replacement groups listed below. The groups are merged into one dict in
    the listed order, so for a key present in several groups the value from
    the last group wins.

    :param path: location of the JSON file; defaults to ``mapping_dict.json``
        in the current working directory
    :return: dict with the keys ``spaces``, ``punctuation`` and
        ``words_to_replace``
    :raises KeyError: if one of the expected keys is missing from the file
    """
    # Decode explicitly as UTF-8 (the JSON default) instead of relying on the
    # platform's locale encoding.
    with open(path, 'r', encoding='utf-8') as f:
        raw = json.load(f)

    # combine several dicts into one; later groups override earlier ones
    replace_groups = (
        'contraction_mapping',
        'mispell_dict',
        'special_punc_mappings',
        'rare_words_mapping',
        'bad_case_words',
        'mis_spell_mapping',
    )
    replace_dict = {}
    for group in replace_groups:
        replace_dict.update(raw[group])

    return {'spaces': raw['spaces'],
            'punctuation': raw['punctuation'],
            'words_to_replace': replace_dict}

In [16]:
import swifter



In [26]:
def preprocess(text: str):
    """
    Run the complete cleaning pipeline on a single text.

    Reads its configuration from the module-level ``mapping_dict``
    (keys ``spaces``, ``words_to_replace`` and ``punctuation``), which must
    exist before this function is called.

    :param text: text to clean.
    :return: cleaned text
    """
    spaces = mapping_dict['spaces']

    # Normalise odd space characters and number formats before the regex fixes.
    text = clean_number(remove_space(text, spaces, only_clean=False))

    # Regex repairs first, then the dictionary-based word replacements.
    text = replace_words(fixing_with_regex(text), mapping_dict['words_to_replace'])

    text = spacing_punctuation(text, mapping_dict['punctuation'])

    # Slashes, hyphens and apostrophes are turned into plain spaces.
    text = text.translate(str.maketrans("/-'", "   "))

    # Second number pass, then squeeze the whitespace introduced above.
    return remove_space(clean_number(text), spaces)




In [27]:
# Build the module-level mapping_dict that preprocess() reads.
mapping_dict = load_preprocessing_data()

In [28]:
# Sanity check: list the top-level keys of the loaded mapping.
mapping_dict.keys()

dict_keys(['spaces', 'punctuation', 'words_to_replace'])

In [29]:
from pandarallel import pandarallel

In [30]:
def text_clean_wrapper_swifter(df):
    """
    Clean the ``comment_text`` column with ``preprocess`` via swifter.

    The input frame is left untouched; a new frame with the cleaned column
    (cast to ``str``) in the same position is returned.

    :param df: DataFrame with a ``comment_text`` column
    :return: new DataFrame whose ``comment_text`` holds the cleaned text
    """
    cleaned = df["comment_text"].swifter.apply(preprocess).astype(str)
    # assign() returns a copy, so the caller's raw text is not overwritten.
    return df.assign(comment_text=cleaned)

In [36]:
# Start the pandarallel workers; this registers parallel_apply on pandas objects.
pandarallel.initialize()


def text_clean_wrapper(df):
    """
    Clean the ``comment_text`` column with ``preprocess`` via pandarallel.

    The input frame is left untouched; a new frame with the cleaned column
    (cast to ``str``) in the same position is returned.

    :param df: DataFrame with a ``comment_text`` column
    :return: new DataFrame whose ``comment_text`` holds the cleaned text
    """
    cleaned = df["comment_text"].parallel_apply(preprocess).astype(str)
    # assign() returns a copy, so the caller's raw text is not overwritten.
    return df.assign(comment_text=cleaned)

New pandarallel memory created - Size: 2000 MB
Pandarallel will run on 24 workers


In [37]:
# Run the parallel cleaning over the training comments.
df_cleaned = text_clean_wrapper(df)

In [38]:
# Read train.csv again so that df holds the raw, uncleaned comment_text.
df = pd.read_csv(os.path.join(settings.DATA_DIR, 'train.csv'))

In [39]:
# Show the first rows of the freshly loaded frame.
df.head()

Unnamed: 0,id,target,comment_text,severe_toxicity,obscene,identity_attack,insult,threat,asian,atheist,...,article_id,rating,funny,wow,sad,likes,disagree,sexual_explicit,identity_annotator_count,toxicity_annotator_count
0,59848,0.0,"This is so cool. It's like, 'would you want your mother to read this??' Really great idea, well done!",0.0,0.0,0.0,0.0,0.0,,,...,2006,rejected,0,0,0,0,0,0.0,0,4
1,59849,0.0,"Thank you!! This would make my life a lot less anxiety-inducing. Keep it up, and don't let anyone get in your way!",0.0,0.0,0.0,0.0,0.0,,,...,2006,rejected,0,0,0,0,0,0.0,0,4
2,59852,0.0,This is such an urgent design problem; kudos to you for taking it on. Very impressive!,0.0,0.0,0.0,0.0,0.0,,,...,2006,rejected,0,0,0,0,0,0.0,0,4
3,59855,0.0,Is this something I'll be able to install on my site? When will you be releasing it?,0.0,0.0,0.0,0.0,0.0,,,...,2006,rejected,0,0,0,0,0,0.0,0,4
4,59856,0.893617,haha you guys are a bunch of losers.,0.021277,0.0,0.021277,0.87234,0.0,0.0,0.0,...,2006,rejected,0,0,0,1,0,0.0,4,47


In [40]:
# Add the cleaned text as a 'cleaned' column next to the raw comment_text.
df['cleaned'] = df_cleaned.comment_text

In [44]:
# Compare 10 randomly sampled raw comments with their cleaned versions.
df[['comment_text', 'cleaned']].sample(10)

Unnamed: 0,comment_text,cleaned
210037,"Well we're on the subject of Democrat party and the KKK. Will the new African-American history Museum on the mall in DC reflect the Democrat party's role with slavery, segregation, The Civil Rights Act and Congress, and above all the organization and implementing the KKK?","Well we are on the subject of Democrat party and the KKK Will the new African American history Museum on the mall in DC reflect the Democrat party is role with slavery , segregation , The Civil Rights Act and Congress , and above all the organization and implementing the KKK ?"
1320413,"Bubbles, \nGo ahead and keep watching those polls there. It 'll give you something to focus on. In the meantime, in the real world, looks like good old Loretta Lynch was using an alias e-mail after her very improper meeting with Bill Clinton last summer. Alias? to hide something? Yep. An investigation is almost assured. Mueller has all the 'tools' in place that are necessary to go after Lynch, Hillary and Susan Rice, possibly former President Obama and others? And being there is absolutely nothing on Trump, he may as well. This is going to be big. I do believe their was 'obstruction of justice' last summer. Loretta Lynch used her power to squash the Clinton investigation. I do believe Comey will be brought back to testify as will people involved with Hillary's disastrous campaign. Yee-haw!","Bubbles , Go ahead and keep watching those polls there It will give you something to focus on In the meantime , in the real world , looks like good old Loretta Lynch was using an alias e mail after her very improper meeting with Bill Clinton last summer Alias ? to hide something ? Yep An investigation is almost assured Mueller has all the tools in place that are necessary to go after Lynch , Hillary and Susan Rice , possibly former President Obama and others ? And being there is absolutely nothing on Trump , he may as well This is going to be big I do believe their was obstruction of justice last summer Loretta Lynch used her power to squash the Clinton investigation I do believe Comey will be brought back to testify as will people involved with Hillary is disastrous campaign Yee haw !"
1799002,Might be Micro boyfriends of the McDonald's female workers on the graveyard shift. Grill all of them and find out who was NOT working that night.,Might be Micro boyfriends of the McDonald is female workers on the graveyard shift Grill all of them and find out who was NOT working that night
1223531,Maybe they like that the dogs are aggressive in their protection of them. It could be an ego thing -- see how important I am to my dogs?!,Maybe they like that the dogs are aggressive in their protection of them It could be an ego thing see how important I am to my dogs ? !
1405115,Poor argument.\nOne has absolutely nothing to do with the other.,Poor argument One has absolutely nothing to do with the other
988105,why? may be he is family doctor. more money in the economy does not increase his sales volume.\n\nAnd why higher sales volume means higher profit? profit is volume times price minus cost.,why ? may be he is family doctor more money in the economy does not increase his sales volume And why higher sales volume means higher profit ? profit is volume times price minus cost
28574,"You wrote, ""this is by far a more dangerous world on many levels."" But on so many more levels this is by far, a safer and more prosperous world where people at least in ""1st world countries"" enjoy the benefits of modern medicine that allows them to live twice as long as their ancestors. Yeah, it might be fun to go back in time for a few days, but I can guarantee, you would end up missing the modern day society that we live in, like a newborn misses her mother's teat.","You wrote , "" this is by far a more dangerous world on many levels "" But on so many more levels this is by far , a safer and more prosperous world where people at least in "" 1st world countries "" enjoy the benefits of modern medicine that allows them to live twice as long as their ancestors Yeah , it might be fun to go back in time for a few days , but I can guarantee , you would end up missing the modern day society that we live in , like a newborn misses her mother is teat"
1010502,"Yeah, that's right. Stop screaming and throwing fits every time Trump lies. How many lies does Trump need to tell before you all (the opposition) just shut up and accept them?","Yeah , that is right Stop screaming and throwing fits every time Trump lies How many lies does Trump need to tell before you all ( the opposition ) just shut up and accept them ?"
121947,"I personally believe that the first thing that should be dealt with is plastic sacs at grocery stores. I'm surprised at this day and age that stores like Fred Meyer, Walmart and Carrs/Safeway don't charge for plastic sacs. Or I'm more surprised that public policy hasn't made them do that yet. I'm from Homer, and I can tell you that the most common thing to find on the coastline there is plastic sacs from the local Safeway (I used to pick up trash at the harbor during the summer season).","I personally believe that the first thing that should be dealt with is plastic sacs at grocery stores i am surprised at this day and age that stores like Fred Meyer , Walmart and Carrs Safeway do not charge for plastic sacs Or i am more surprised that public policy has not made them do that yet i am from Homer , and I can tell you that the most common thing to find on the coastline there is plastic sacs from the local Safeway ( I used to pick up trash at the harbor during the summer season )"
180959,"Yup, ""Pot Got More Votes Than Walker"". Just like ""Pot Got More Votes Than Hickel"", except that the latter bumper sticker appeared awfully quickly and the former bumper sticker never appeared at all, further evidence of just how spineless we've become.","Yup , "" Pot Got More Votes Than Walker "" Just like "" Pot Got More Votes Than Hickel "" , except that the latter bumper sticker appeared awfully quickly and the former bumper sticker never appeared at all , further evidence of just how spineless we have become"


In [45]:
# Write the cleaned training frame to train_clean.csv under settings.DATA_DIR,
# with a header row and without the index.
df_cleaned.to_csv(os.path.join(settings.DATA_DIR, 'train_clean.csv'), header=True, index=False)

In [46]:
# Load the test data from test.csv under settings.DATA_DIR.
df_test = pd.read_csv(os.path.join(settings.DATA_DIR, 'test.csv'))

In [47]:
# Show the first rows of the test frame before cleaning.
df_test.head()

Unnamed: 0,id,comment_text
0,7000000,Jeff Sessions is another one of Trump's Orwellian choices. He believes and has believed his entire career the exact opposite of what the position requires.
1,7000001,"I actually inspected the infrastructure on Grand Chief Stewart Philip's home Penticton First Nation in both 2010 and 2013. Exactly Zero projects that had been identified in previous inspection reports had been funded by the federal government, and the entire band was housed in ATCO trailers. Clearly the Harper Conservatives had already reduced the cash his band was sent to zero."
2,7000002,"No it won't . That's just wishful thinking on democrats fault . For the 100 th time , Walker cited the cost of drug users treatment as being lost with Obamacare . I laugh every time I hear a liberal claim republicans want to hurt people , and that's why they dumped Obamacare."
3,7000003,"Instead of wringing our hands and nibbling the periphery of the issue, how about we face the actual issue head on? I would support a city ordinance against loitering, and applaud city councilors who champion a real and permanent solution.\n\nThe details could be determined, but would include a limit to persons sitting, standing, lying, smoking, conversing over a certain amount of time, perhaps 10 minutes. Exceptions, of course, would be for shopping, dining, attending a licensed and approved event, etc.\n\nIt is noble to provide some services for the truly needy in our community, but that, in my estimation, is a separate issue. I do not wish for my city to provide for nor encourage idle and harassing behaviors in our city core.\n\nEnough is enough!"
4,7000004,"how many of you commenters have garbage piled high in your yard, bald tires, dead batteries, rotten pallets, car parts, blah blah blah. this town is a pigpen. drive around and look for yourself, its pathetic."


In [48]:
# Apply the same parallel cleaning to the test comments.
df_test_clean = text_clean_wrapper(df_test)

In [49]:
# Inspect the first cleaned test rows.
df_test_clean.head()

Unnamed: 0,id,comment_text
0,7000000,Jeff Sessions is another one of Trump is Orwellian choices He believes and has believed his entire career the exact opposite of what the position requires
1,7000001,"I actually inspected the infrastructure on Grand Chief Stewart Philip is home Penticton First Nation in both 2010 and 2013 Exactly Zero projects that had been identified in previous inspection reports had been funded by the federal government , and the entire band was housed in ATCO trailers Clearly the Harper Conservatives had already reduced the cash his band was sent to zero"
2,7000002,"No it will not That is just wishful thinking on democrats fault For the 100th time , Walker cited the cost of drug users treatment as being lost with Obamacare I laugh every time I hear a liberal claim republicans want to hurt people , and that is why they dumped Obamacare"
3,7000003,"Instead of wringing our hands and nibbling the periphery of the issue , how about we face the actual issue head on ? I would support a city ordinance against loitering , and applaud city councilors who champion a real and permanent solution The details could be determined , but would include a limit to persons sitting , standing , lying , smoking , conversing over a certain amount of time , perhaps 10 minutes Exceptions , of course , would be for shopping , dining , attending a licensed and approved event , etc It is noble to provide some services for the truly needy in our community , but that , in my estimation , is a separate issue I do not wish for my city to provide for nor encourage idle and harassing behaviors in our city core Enough is enough !"
4,7000004,"how many of you commenters have garbage piled high in your yard , bald tires , dead batteries , rotten pallets , car parts , blah blah blah this town is a pigpen drive around and look for yourself , its pathetic"


In [50]:
# Write the cleaned test frame to test_clean.csv under settings.DATA_DIR,
# with a header row and without the index.
df_test_clean.to_csv(os.path.join(settings.DATA_DIR, 'test_clean.csv'), header=True, index=False)