In [13]:
import os
import pandas as pd
import numpy as np
import settings
# Show full comment text in DataFrame output. None disables truncation;
# the legacy -1 sentinel was deprecated in pandas 1.0.
pd.set_option('display.max_colwidth', None)
import json
import re

In [2]:
# Load the raw training set from <settings.DATA_DIR>/train.csv.
df = pd.read_csv(os.path.join(settings.DATA_DIR, 'train.csv'))

In [3]:
# Look at ten random comments next to their `target` score.
# No random_state is passed, so the sampled rows differ on every run.
df[['comment_text', 'target']].sample(10)

Unnamed: 0,comment_text,target
938535,Donald Duck? ;-),0.166667
221209,"Most of these surveys are actually quite silly and superficial.\n\nA better survey would require respondents to detail the principles and the chained logic of their reported opinions, vs. just skimming them for emotions.\n\nHow many of these Catholic respondents could work their opinions down to a set of underpinning and stable (across circumstances) moral theological principles?\n\nCan they provide solid descriptions of their ""opinions"" in terms of \n-the law of love?\n- justice (can they give an operational definition of justice, without waving their arms?)\n- intention?\n- the duties for which our freedoms are given?\n- the object of a moral act?\n- the circumstances of the action?\n- protection of the common good?\n- last resort?",1.0
1699737,"Defending a SJW narrative on grounds of free speech seems very ironic in today's supercharged politically correct environment where it's become commonplace to advocate for (further) limiting free speech. When reading about this particular controversy the other day I thought of Mahatma Gandhi, who said ""It is unwise to be too sure of one's own wisdom."" Perhaps it's a mantra that should be adopted by all who consider the pursuit of objectivity worthwhile, and particularly so in academia.",0.0
1323332,"Sat listening in the LIO and noticed it wasn't until the end they called on the Mat-Su. By then, most people had to go home.",0.0
1768188,Mahalo,0.0
1049327,Replays clearly showed Rinne being bumped on the winning goal and unable to recover in time. HNIC announcers were all in agreement. Why wasn't it goaltender interference? The ending left a sour taste in my mouth and I'm sure the mouths of many others.,0.0
118262,...although the Legislature could spend it however it wants\n\nThis is what worries me the most,0.0
1327631,Good anniversary for it...,0.0
26602,"I would expect someone named ""defendfreespeech"" to have a better understanding of the first amendment.",0.0
1557683,Equifax should provide a minimum of 10 years of free identity monitoring. And should pay for any expenses any victim has due to their willful neglect. As is hiring a company similar to Lifelock where a company expert will work to clean up any financial issues at no cost to the victim.,0.0


In [4]:
def remove_space(text: str, spaces: list, only_clean: bool = True):
    """
    Collapse runs of whitespace into single spaces and trim both ends.

    :param text: text to clean
    :param spaces: substrings (e.g. unusual space characters) to turn into a
        plain space first; only used when ``only_clean`` is False
    :param only_clean: if True, only trim and collapse whitespace; if False,
        additionally replace every entry of ``spaces`` with ' ' beforehand
    :return: cleaned text
    """
    if not only_clean:
        for space in spaces:
            text = text.replace(space, ' ')

    text = text.strip()
    # Raw string: '\s' inside a normal literal is an invalid escape sequence.
    text = re.sub(r'\s+', ' ', text)

    return text


def replace_words(text: str, mapping: dict):
    """
    Substitute every key of ``mapping`` that occurs in ``text`` with its value.

    Keys are applied one after another, in the dict's iteration order, as
    plain substring replacements. A later key can therefore match text that
    an earlier replacement produced.

    :param text: text to clean
    :param mapping: dict of {substring to find: replacement string}
    :return: text with all substitutions applied
    """
    result = text
    for needle in mapping:
        if needle not in result:
            continue
        result = result.replace(needle, mapping[needle])
    return result

def clean_number(text: str):
    """
    Normalises how numbers are written.

    The rules run in this order:

    1. insert a space between a digit run and a directly following letter
       ("3apples" -> "3 apples");
    2. re-attach ordinal suffixes that rule 1 split off ("4 th " -> "4th ");
    3. drop a comma between two digit runs ("1,000" -> "1000");
    4. drop a comma that directly follows a digit run;
    5. replace an "e" between two digit runs with a space.

    :param text: text to clean
    :return: cleaned text
    """
    # Raw strings throughout: '\g' and '\d' are not valid escapes in normal literals.
    text = re.sub(r'(\d+)([a-zA-Z])', r'\g<1> \g<2>', text)
    text = re.sub(r'(\d+) (th|st|nd|rd) ', r'\g<1>\g<2> ', text)
    text = re.sub(r'(\d+),(\d+)', r'\g<1>\g<2>', text)
    text = re.sub(r'(\d+),', r'\g<1>', text)
    text = re.sub(r'(\d+)(e)(\d+)', r'\g<1> \g<3>', text)

    return text


def spacing_punctuation(text: str, punctuation: str):
    """
    Surround each listed symbol with one space on either side.

    Symbols are handled one at a time in the order they appear in
    ``punctuation``; every occurrence in the text is padded.

    :param text: text to clean
    :param punctuation: string whose characters are the symbols to pad
    :return: text with the symbols padded by spaces
    """
    spaced = text
    for symbol in punctuation:
        # Replacing an absent symbol leaves the string unchanged.
        spaced = spaced.replace(symbol, ' ' + symbol + ' ')
    return spaced

In [5]:
def fixing_with_regex(text):
    """
    Additional fixing of words.

    Applies a fixed sequence of substitutions:

    1. normalise WhatsApp spellings and strip a single stray character glued
       to what/why/how/which/where when the token is surrounded by spaces;
    2. pad question words (and "will") with spaces so that text glued to
       them is split off;
    3. collapse repeated "i" before "ng", turn runs of '-' or '.' into a
       space, remove control characters and collapse whitespace;
    4. expand common English contractions (only when followed by a space);
    5. split "<word>how" for a fixed list of leading words.

    :param text: text to clean
    :return: cleaned text
    """

    # These must be raw strings: in a normal literal '\b' is the backspace
    # character (\x08), not the regex word boundary, so the patterns would
    # never match ordinary text.
    # Only "what" and "why" are anchored on both sides. The other four are
    # anchored on the right only, so they are also split off the end of a
    # longer word (e.g. "knowhow" -> "know how").
    mis_connect_list = [r'\b(W|w)hat\b', r'\b(W|w)hy\b', r'(H|h)ow\b', r'(W|w)hich\b', r'(W|w)here\b', r'(W|w)ill\b']
    mis_connect_re = re.compile('(%s)' % '|'.join(mis_connect_list))

    text = re.sub(r" (W|w)hat+(s)*[A|a]*(p)+ ", " WhatsApp ", text)
    text = re.sub(r" (W|w)hat\S ", " What ", text)
    text = re.sub(r" \S(W|w)hat ", " What ", text)
    text = re.sub(r" (W|w)hy\S ", " Why ", text)
    text = re.sub(r" \S(W|w)hy ", " Why ", text)
    text = re.sub(r" (H|h)ow\S ", " How ", text)
    text = re.sub(r" \S(H|h)ow ", " How ", text)
    text = re.sub(r" (W|w)hich\S ", " Which ", text)
    text = re.sub(r" \S(W|w)hich ", " Which ", text)
    text = re.sub(r" (W|w)here\S ", " Where ", text)
    text = re.sub(r" \S(W|w)here ", " Where ", text)
    # Group 1 is the whole matched word; surround it with spaces.
    text = mis_connect_re.sub(r" \1 ", text)
    text = text.replace("What sApp", ' WhatsApp ')

    # Clean repeated letters.
    text = re.sub(r"(I|i)(I|i)+ng", "ing", text)
    # Every run of hyphens or dots becomes a single space.
    text = re.sub(r"(-+|\.+)", " ", text)

    # Drop control characters (this includes tab and newline) and the soft hyphen.
    text = re.sub(r'[\x00-\x1f\x7f-\x9f\xad]', '', text)
    # Same rule as the last one in clean_number().
    text = re.sub(r'(\d+)(e)(\d+)', r'\g<1> \g<3>', text)
    text = re.sub(r"\s\s+", " ", text)
    text = re.sub(r'ᴵ+', '', text)

    # Contractions: the specific forms must run before the generic "n't" rule.
    text = re.sub(r"(W|w)on(\'|\’)t ", "will not ", text)
    text = re.sub(r"(C|c)an(\'|\’)t ", "can not ", text)
    text = re.sub(r"(Y|y)(\'|\’)all ", "you all ", text)
    text = re.sub(r"(Y|y)a(\'|\’)ll ", "you all ", text)

    text = re.sub(r"(I|i)(\'|\’)m ", "i am ", text)
    text = re.sub(r"(A|a)in(\'|\’)t ", "is not ", text)
    text = re.sub(r"n(\'|\’)t ", " not ", text)
    text = re.sub(r"(\'|\’)re ", " are ", text)
    #text = re.sub(r"(\'|\’)s ", " is ", text)
    text = re.sub(r"(\'|\’)d ", " would ", text)
    text = re.sub(r"(\'|\’)ll ", " will ", text)
    text = re.sub(r"(\'|\’)t ", " not ", text)
    text = re.sub(r"(\'|\’)ve ", " have ", text)

    # Split "<word>how" for the listed leading words, e.g. "somehow" -> "some how".
    text = re.sub(
        r'(by|been|and|are|for|it|TV|already|justhow|some|had|is|will|would|should|shall|must|can|his|here|there|them|these|their|has|have|the|be|that|not|was|he|just|they|who)(how)',
        r'\g<1> \g<2>', text)

    return text


In [9]:
def load_preprocessing_data(path: str = 'mapping_dict.json'):
    """
    Loads dict with various mappings and strings for cleaning.

    :param path: JSON file holding the raw mappings; defaults to
        'mapping_dict.json' in the current working directory
    :return: dict with three keys: 'spaces' and 'punctuation' (taken from the
        file unchanged) and 'words_to_replace' (all word-level mappings
        merged into one dict)
    """
    # The mappings contain non-ASCII keys (e.g. typographic apostrophes), so
    # do not depend on the platform's default encoding.
    with open(path, 'r', encoding='utf-8') as f:
        raw_mappings = json.load(f)

    # Merge the word-level sections into one dict. When a key appears in
    # several sections, the section listed later wins.
    replacement_sections = ('contraction_mapping',
                            'mispell_dict',
                            'special_punc_mappings',
                            'rare_words_mapping',
                            'bad_case_words',
                            'mis_spell_mapping')
    replace_dict = {}
    for section in replacement_sections:
        replace_dict.update(raw_mappings[section])

    return {'spaces': raw_mappings['spaces'],
            'punctuation': raw_mappings['punctuation'],
            'words_to_replace': replace_dict}

In [16]:
import swifter



In [26]:
def preprocess(text: str):
    """
    Run the complete cleaning pipeline on a single comment.

    Reads the module-level ``mapping_dict`` (keys 'spaces',
    'words_to_replace' and 'punctuation'), so that name must be defined
    before this function is called.

    :param text: text to clean.
    :return: cleaned text
    """
    # Normalise spaces, then numbers, then apply the regex fixes.
    cleaned = remove_space(text, mapping_dict['spaces'], only_clean=False)
    cleaned = fixing_with_regex(clean_number(cleaned))

    # Dictionary-based replacements, then pad punctuation with spaces.
    cleaned = replace_words(cleaned, mapping_dict['words_to_replace'])
    cleaned = spacing_punctuation(cleaned, mapping_dict['punctuation'])

    # Any slash, hyphen or apostrophe still present becomes a space.
    cleaned = cleaned.translate(str.maketrans("/-'", '   '))

    # Second number pass, then a final trim and whitespace collapse.
    return remove_space(clean_number(cleaned), mapping_dict['spaces'])




In [27]:
# Module-level lookup tables that preprocess() reads; this cell has to run
# before preprocess() is called.
mapping_dict = load_preprocessing_data()

In [28]:
# Check which sections the combined mapping exposes.
mapping_dict.keys()

dict_keys(['spaces', 'punctuation', 'words_to_replace'])

In [59]:
# Re-read the raw JSON to inspect its individual sections.
# Explicit UTF-8: the mappings contain non-ASCII keys.
with open('mapping_dict.json', 'r', encoding='utf-8') as f:
    raw_map = json.load(f)

In [60]:
# List the sections present in the raw JSON file.
raw_map.keys()

dict_keys(['contraction_mapping', 'mispell_dict', 'special_punc_mappings', 'spaces', 'rare_words_mapping', 'bad_case_words', 'punctuation', 'mis_spell_mapping'])

In [78]:
# Inspect one raw section; its entries are merged into 'words_to_replace'.
raw_map['bad_case_words']

{'nationalpost': 'national post',
 'businessinsider': 'business insider',
 'jewprofits': 'jew profits',
 'QMAS': 'Quality Migrant Admission Scheme',
 'casterating': 'castrating',
 'Kashmiristan': 'Kashmir',
 'CareOnGo': 'India first and largest Online distributor of medicines',
 'Setya Novanto': 'a former Indonesian politician',
 'TestoUltra': 'male sexual enhancement supplement',
 'rammayana': 'ramayana',
 'Badaganadu': 'Brahmin community that mainly reside in Karnataka',
 'bitcjes': 'bitches',
 'mastubrate': 'masturbate',
 'Français': 'France',
 'Adsresses': 'address',
 'flemmings': 'flemming',
 'intermate': 'inter mating',
 'feminisam': 'feminism',
 'cuckholdry': 'cuckold',
 'Niggor': 'black hip-hop and electronic artist',
 'narcsissist': 'narcissist',
 'Genderfluid': 'Gender fluid',
 ' Im ': ' I am ',
 ' dont ': ' do not ',
 'Qoura': 'Quora',
 'ethethnicitesnicites': 'ethnicity',
 'Namit Bathla': 'Content Writer',
 'What sApp': 'WhatsApp',
 'Führer': 'Fuhrer',
 'covfefe': 'coverage

In [58]:
# Inspect the merged replacement table that replace_words() receives.
mapping_dict['words_to_replace']

{"Trump's": 'trump is',
 "'cause": 'because',
 ',cause': 'because',
 ';cause': 'because',
 "ain't": 'am not',
 'ain,t': 'am not',
 'ain;t': 'am not',
 'ain´t': 'am not',
 'ain’t': 'am not',
 "aren't": 'are not',
 'aren,t': 'are not',
 'aren;t': 'are not',
 'aren´t': 'are not',
 'aren’t': 'are not',
 "can't": 'cannot',
 "can't've": 'cannot have',
 'can,t': 'cannot',
 'can,t,ve': 'cannot have',
 'can;t': 'cannot',
 'can;t;ve': 'cannot have',
 'can´t': 'cannot',
 'can´t´ve': 'cannot have',
 'can’t': 'cannot',
 'can’t’ve': 'cannot have',
 "could've": 'could have',
 'could,ve': 'could have',
 'could;ve': 'could have',
 "couldn't": 'could not',
 "couldn't've": 'could not have',
 'couldn,t': 'could not',
 'couldn,t,ve': 'could not have',
 'couldn;t': 'could not',
 'couldn;t;ve': 'could not have',
 'couldn´t': 'could not',
 'couldn´t´ve': 'could not have',
 'couldn’t': 'could not',
 'couldn’t’ve': 'could not have',
 'could´ve': 'could have',
 'could’ve': 'could have',
 "didn't": 'did not',
 'd

In [29]:
from pandarallel import pandarallel

In [30]:
def text_clean_wrapper_swifter(df):
    """
    Clean the ``comment_text`` column with ``preprocess`` using swifter.

    The input frame is not modified; a new frame is returned, so the raw
    text stays available to the caller.

    :param df: DataFrame with a ``comment_text`` column
    :return: new DataFrame, identical to ``df`` except that ``comment_text``
        holds the cleaned text as str
    """
    cleaned_text = df["comment_text"].swifter.apply(preprocess).astype(str)
    # assign() returns a new frame instead of overwriting the caller's column.
    return df.assign(comment_text=cleaned_text)

In [36]:
# Set up pandarallel before any parallel_apply call below.
pandarallel.initialize()
def text_clean_wrapper(df):
    """
    Clean the ``comment_text`` column with ``preprocess`` using pandarallel.

    The input frame is not modified; a new frame is returned, so the raw
    text stays available to the caller without re-reading the source file.

    :param df: DataFrame with a ``comment_text`` column
    :return: new DataFrame, identical to ``df`` except that ``comment_text``
        holds the cleaned text as str
    """
    cleaned_text = df["comment_text"].parallel_apply(preprocess).astype(str)
    # assign() returns a new frame instead of overwriting the caller's column.
    return df.assign(comment_text=cleaned_text)

New pandarallel memory created - Size: 2000 MB
Pandarallel will run on 24 workers


In [37]:
# Clean every training comment in parallel; df_cleaned carries the cleaned
# text in its `comment_text` column.
df_cleaned = text_clean_wrapper(df)

In [38]:
# Re-read the raw CSV so the original comment text is available for a
# side-by-side comparison with the cleaned version.
df = pd.read_csv(os.path.join(settings.DATA_DIR, 'train.csv'))

In [39]:
# Confirm that `comment_text` holds the raw text again.
df.head()

Unnamed: 0,id,target,comment_text,severe_toxicity,obscene,identity_attack,insult,threat,asian,atheist,...,article_id,rating,funny,wow,sad,likes,disagree,sexual_explicit,identity_annotator_count,toxicity_annotator_count
0,59848,0.0,"This is so cool. It's like, 'would you want your mother to read this??' Really great idea, well done!",0.0,0.0,0.0,0.0,0.0,,,...,2006,rejected,0,0,0,0,0,0.0,0,4
1,59849,0.0,"Thank you!! This would make my life a lot less anxiety-inducing. Keep it up, and don't let anyone get in your way!",0.0,0.0,0.0,0.0,0.0,,,...,2006,rejected,0,0,0,0,0,0.0,0,4
2,59852,0.0,This is such an urgent design problem; kudos to you for taking it on. Very impressive!,0.0,0.0,0.0,0.0,0.0,,,...,2006,rejected,0,0,0,0,0,0.0,0,4
3,59855,0.0,Is this something I'll be able to install on my site? When will you be releasing it?,0.0,0.0,0.0,0.0,0.0,,,...,2006,rejected,0,0,0,0,0,0.0,0,4
4,59856,0.893617,haha you guys are a bunch of losers.,0.021277,0.0,0.021277,0.87234,0.0,0.0,0.0,...,2006,rejected,0,0,0,1,0,0.0,4,47


In [40]:
# Put the cleaned text next to the raw text. Rows are aligned on the index;
# both frames were read from the same CSV.
df['cleaned'] = df_cleaned.comment_text

In [56]:
# Compare raw and cleaned text for ten random comments with target >= 0.5.
df[df['target']>=0.5][['comment_text', 'cleaned']].sample(10)

Unnamed: 0,comment_text,cleaned
165615,There was no foul language... obviously my post did not meet the standards of the left-wing agenda... shame on you Civil Beat!,There was no foul language obviously my post did not meet the standards of the left wing agenda shame on you Civil Beat !
1028364,"The problem with the alt-right is freedom of speech for them only means the right to spew their hatred, not the right for anyone to call them out. They are the biggest whiners and hypocrits out there.","The problem with the alt right is freedom of speech for them only means the right to spew their hatred , not the right for anyone to call them out They are the biggest whiners and hypocrits out there"
387721,"Alaska is so poor thar it now taxes children?\nAlaska is so poor it robs youn people of their college funds?\nAlaska i so poor it needs to take incme from the poor?\nWalker, you do undersan that those on welfare depend on the Fund?","Alaska is so poor thar it now taxes children ? Alaska is so poor it robs youn people of their college funds ? Alaska i so poor it needs to take incme from the poor ? Walker , you do undersan that those on welfare depend on the Fund ?"
644976,You can't fix Libtard stupid. It's a permanent mental disorder.,You can not fix Libtard stupid It is a permanent mental disorder
728661,"If that part of the electorate believe Trump should be President of the United States, how can one not have contempt for their judgement. Trump is an intellectual and moral cretin.","If that part of the electorate believe Trump should be President of the United States , how can one not have contempt for their judgement Trump is an intellectual and moral cretin"
489292,"That was my reaction too. He may be a Semi Fascist now, but with all his "" violent"" threats against certain groups we may see some actions follow those fear-producing threats. I also do not believe that our constitution and form of government fascist-proofs our future. The GOP is already part fascist and Trump will be its puppet. The American people are too naïve and vulnerable to propaganda as well as delusional.","That was my reaction too He may be a Semi Fascist now , but with all his "" violent "" threats against certain groups we may see some actions follow those fear producing threats I also do not believe that our constitution and form of government fascist proofs our future The GOP is already part fascist and Trump will be its puppet The American people are too na ï ve and vulnerable to propaganda as well as delusional"
270925,WTF,WTF
1335131,"I wonder aloud why minorities are so over represented on CNN. Charles is a far left leaning black bisexual. That is precisely the opinion he brings to both his written op eds and his commentary on CNN. Don is a black homosexual, who is always giggling about how weird he thinks the POTUS is. And before you paint me with the homophobic brush, I think Anderson Cooper is a terrific journalist, and someone who might be fun to have a beer with.","I wonder aloud why minorities are so over represented on CNN Charles is a far left leaning black bisexual That is precisely the opinion he brings to both his written op eds and his commentary on CNN Don is a black homosexual , who is always giggling about how weird he thinks the POTUS is And before you paint me with the homophobic brush , I think Anderson Cooper is a terrific journalist , and someone who might be fun to have a beer with"
709898,Trump has certainly got the journalists/academics/financiers/ Hollywood/leftists/ extreme rightists all running around in circles chasing their own tail. Time for the masses to wake and recognize the stupidity and power manipulation and self interest of those six groups .,Trump has certainly got the journalists academics financiers Hollywood leftists extreme rightists all running around in circles chasing their own tail Time for the masses to wake and recognize the stupidity and power manipulation and self interest of those six groups
1156211,How does this liar get away with calling people idiot?,How does this liar get away with calling people idiot ?


In [45]:
# Write the cleaned training frame to <settings.DATA_DIR>/train_clean.csv,
# with a header row and without the index column.
df_cleaned.to_csv(os.path.join(settings.DATA_DIR, 'train_clean.csv'), header=True, index=False)

In [46]:
# Load the raw test set from <settings.DATA_DIR>/test.csv.
df_test = pd.read_csv(os.path.join(settings.DATA_DIR, 'test.csv'))

In [47]:
# First rows of the raw test set.
df_test.head()

Unnamed: 0,id,comment_text
0,7000000,Jeff Sessions is another one of Trump's Orwellian choices. He believes and has believed his entire career the exact opposite of what the position requires.
1,7000001,"I actually inspected the infrastructure on Grand Chief Stewart Philip's home Penticton First Nation in both 2010 and 2013. Exactly Zero projects that had been identified in previous inspection reports had been funded by the federal government, and the entire band was housed in ATCO trailers. Clearly the Harper Conservatives had already reduced the cash his band was sent to zero."
2,7000002,"No it won't . That's just wishful thinking on democrats fault . For the 100 th time , Walker cited the cost of drug users treatment as being lost with Obamacare . I laugh every time I hear a liberal claim republicans want to hurt people , and that's why they dumped Obamacare."
3,7000003,"Instead of wringing our hands and nibbling the periphery of the issue, how about we face the actual issue head on? I would support a city ordinance against loitering, and applaud city councilors who champion a real and permanent solution.\n\nThe details could be determined, but would include a limit to persons sitting, standing, lying, smoking, conversing over a certain amount of time, perhaps 10 minutes. Exceptions, of course, would be for shopping, dining, attending a licensed and approved event, etc.\n\nIt is noble to provide some services for the truly needy in our community, but that, in my estimation, is a separate issue. I do not wish for my city to provide for nor encourage idle and harassing behaviors in our city core.\n\nEnough is enough!"
4,7000004,"how many of you commenters have garbage piled high in your yard, bald tires, dead batteries, rotten pallets, car parts, blah blah blah. this town is a pigpen. drive around and look for yourself, its pathetic."


In [48]:
# Apply the same cleaning to the test comments; df_test_clean carries the
# cleaned text in its `comment_text` column.
df_test_clean = text_clean_wrapper(df_test)

In [57]:
# Spot-check ten random cleaned test comments (unseeded sample).
df_test_clean.sample(10)

Unnamed: 0,id,comment_text
12504,7012504,Correction to my comment below 30 hours PER WEEK of telephone solicitations of donations is NOW expected of Congressman far more time than they spend on Congressional business
13546,7013546,"Simple answer No Just another money making scam under the guise of "" for your own safety "" Look up how the these things have been abused all around the country They create more accidents also This has been documented as well"
73004,7073004,"Al Gore Ice Free by 2014 http : content usatoday com communities ondeadline post 2009 12 gore new study sees nearly ice free arctic summer ice cap as early as 2014 1 # VbDxe9 JVhBc Alaska Dispatch 11 4 2014 Ice Free by 2020 http : barentsobserver com en arctic 2014 11 expert predicts ice free arctic 2020 un releases climate report 04 11 Peter Wadhams Ice Free 2016 http : www atlasobscura com articles climate scientist claims the arctic could be icefree this summer And here is a collection of Ice Free Predictions by "" Scientists "" https : stevengoddard wordpress com ice free arctic forecasts Yawn , same old song and dance"
45167,7045167,"Stealing raw assets from Canada to sell to foreign countries and also have an American company profit doing that is the opposite of "" economics "" it is market fraud manipulation and nearing treason"
33113,7033113,""" Allowed "" ?"
71133,7071133,"Good job , Buff n Blue Your kids really know a lot"
76298,7076298,"No , she said "" her hourly tips fell by off by $ 2 hr or more "" ; the story is not clear on how much her hourly wage went up , as it only specifies the rate in 2024 However , it is clearly implied that the increase in hourly wage offset at least part of the reduction in tips"
29684,7029684,"Why not address toxic agri business that contributes tremendously to climate change ? Why not provide for what you remove from sustainable use by Native People so casually in this pretense of concern for the environment ? Sure , under ESA rich Americans can still go and head hunt two elephants a year , even with the big fund raising beasts supposedly on the brink of extinction The Baka who were evicted from their lands and even killed for conservation of elephants , who now live in dire poverty alongside the roads , have a thing or two to teach us about conservation A premature listing in the ESA will destroy the sustainable subsistence based arts and crafts products market of people without providing for an alternative to self reliance The public has been conditioned to think of all hunting as murder by these wealthy "" non profits "" whose food comes in styrofoam and plastic , will end co management by Tribes , who have been successful for countless centuries in conservation of environment"
86844,7086844,"Trump will get the wall built , as it has proven extremely effective on the California border He will get some payment from Mexico , most likely a 1 5 % tax on net imports , which over 3 6 years will pay for it I for one want LEGAL , VETTED immigrants who have gone through the system , to immigrate to the US My ancestors did and I am better for it Every illegal immigrant that jumps the line and comes into the country is breaking the law that has been on the books forever and no president after Eisenhower has enforced This wall will not only keep out illegals , it will help cut down on the drug trade I personally know a person from Eastern Europe who flew to Mexico and paid a mule to cross the border Nice person but he has taken a job that a high school graduate should have gotten"
72610,7072610,"As soon as I find out which candidate leaked this , I will launch a complaint with the PIPEDA Privacy Commissioner against him or her personally , the Conservative Party , and the NFA , Which according to the self addressed envelope I received , is located in Edmonton This Red Tory is not happy with the social conservatism of the winning candidate , the management of the party is electoral process , and especially not with the NFA solicitation I just received which is in the form of a survey I personally support the current firearm regulatory regime and I do not appreciate the NFA having my name and my party affiliation in their records nor their use of this personal information I am close to quitting the Conservative Party and joining with other Red Tories to form a new party , or joining the Liberals"


In [50]:
# Write the cleaned test frame to <settings.DATA_DIR>/test_clean.csv,
# with a header row and without the index column.
df_test_clean.to_csv(os.path.join(settings.DATA_DIR, 'test_clean.csv'), header=True, index=False)