In [1]:
import pandas as pd
import re

pd.plotting.register_matplotlib_converters()
%matplotlib inline


%load_ext autoreload
%autoreload 2
from help_tool import help_tool


pd.set_option('display.max_colwidth', None)

Data has no duplicates and no missing values, there's 159571 observations and 8 features

In [2]:
train = help_tool.csv_download(
    r'Archive\train.csv')

help_tool.first_look(train)

Dataset has 159571 observations and 8 features
Columns with all empty values []
Dataset has 0 duplicates


Unnamed: 0,dtypes,"Null values, Count","Null values, %"
id,object,0,0.0
comment_text,object,0,0.0
toxic,int64,0,0.0
severe_toxic,int64,0,0.0
obscene,int64,0,0.0
threat,int64,0,0.0
insult,int64,0,0.0
identity_hate,int64,0,0.0


In [3]:
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,"Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27",0,0,0,0,0,0
1,000103f0d9cfb60f,"D'aww! He matches this background colour I'm seemingly stuck with. Thanks. (talk) 21:51, January 11, 2016 (UTC)",0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. He seems to care more about the formatting than the actual info.",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on improvement - I wondered if the section statistics should be later on, or a subsection of """"types of accidents"""" -I think the references may need tidying so that they are all in the exact same format ie date format etc. I can do that later on, if no-one else does first - if you have any preferences for formatting style on references or want to do it yourself please let me know.\n\nThere appears to be a backlog on articles for review so I guess there may be a delay until a reviewer turns up. It's listed in the relevant form eg Wikipedia:Good_article_nominations#Transport """,0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember what page that's on?",0,0,0,0,0,0


# Cleaning Data

In [4]:
train['comment_text_original'] = train['comment_text']

# Talk
Extracting the part that starts with '(talk' and ends with ')'

In [5]:
train['extracted_text'] = train['comment_text'].str.extract(r"(\(talk.*?\))")
train[train['extracted_text'].notnull()]['extracted_text'].value_counts().head(10)

extracted_text
(talk)                       3099
(talk • contribs)             501
(talk · contribs)             154
(talk | contribs)             101
(talk to me)                   62
(talk · contribs)              58
(talk|contribs)                54
(talk · contribs · email)      48
(talk - contribs)              38
(talk • contrib)               34
Name: count, dtype: int64

In [6]:
train['comment_text'] = train['comment_text'].str.replace(
    r"\(talk.*?\)", "", regex=True)

# UTC
UTC is another highly used string.

In [7]:
train['comment_text'].str.extract(r"(\(UTC.*?\))").drop_duplicates()

Unnamed: 0,0
0,
1,(UTC)
4598,(UTC+5:30)
19501,(UTC+1)
23640,(UTC−5)
24048,(UTC/GMT)
43415,(UTC+01)
61267,(UTC+0)
67088,(UTC−4)
86619,(UTC )


In [8]:
train['comment_text'] = train['comment_text'].str.replace(
    r"\(UTC.*?\)", "", regex=True)

## Extract Information from URLs
In some cases, the domain or specific keywords within the URL might provide context (e.g., links to specific news sites or social media platforms)

In [9]:
train['comment_text'] = train['comment_text'].apply(
    help_tool.replace_url_with_domain)


train[train['id'] == 'fbf8672ea3b4ddf7']

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,comment_text_original,extracted_text
159315,fbf8672ea3b4ddf7,nysun - public interest crap..,1,0,0,0,0,0,http://www.nysun.com/article/23698 - public interest crap..,


## Time and API adress

In [10]:
combined_pattern = (
    r'\b(?:\d{1,2}, \d{1,2} [A-Za-z]{3,10} \d{4}|'  # "31, 19 July 2006"
    r'\d{1,2} \d{1,2} [A-Za-z]{3,10} \d{4}|'  # "31 19 July 2006"
    r'\d{1,2} [A-Za-z]{3,10}, \d{4}|'  # "31 July, 2006"
    r'\d{1,2} [A-Za-z]{3,10} \d{4}|'  # "31 July 2006"
    r'\d{1,2}, [A-Za-z]{3,10} \d{4}, \d{1,2}:\d{2}|'  # "31, July 2006, 18:47"
    r'\d{1,2} [A-Za-z]{3,10} \d{4}, \d{1,2}:\d{2}|'  # "31 July 2006, 18:47"
    r'\d{1,2} [A-Za-z]{3,10} \d{4} \d{1,2}:\d{2}|'  # "31 July 2006 18:47"
    r'\d{1,2}:\d{2}, [A-Za-z]{3,10} \d{1,2}, \d{4}|'  # "18:47, July 31, 2006"
    r'\d{1,2}:\d{2}, \d{1,2} [A-Za-z]{3,10} \d{4}|'  # "18:47, 31 July 2006"
    r'\d{1,2}:\d{2} \d{1,2} [A-Za-z]{3,10} \d{4}|'  # "18:47 31 July 2006"
    r'\d{1,2} [A-Za-z]{3,10} \d{1,2}:\d{2}|'  # "26 July 17:03 UTC"
    r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b'  # IP address
    r')\b'
)


# Remove the date pattern from the 'comment_text' column
train['comment_text'] = train['comment_text'].str.replace(
    combined_pattern, '', regex=True).str.strip()

train[train['id'] == '000103f0d9cfb60f']

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,comment_text_original,extracted_text
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm seemingly stuck with. Thanks.,0,0,0,0,0,0,"D'aww! He matches this background colour I'm seemingly stuck with. Thanks. (talk) 21:51, January 11, 2016 (UTC)",(talk)


# NOTs 
Updating the contractions so they don't get interpretted as the different word.

In [11]:
def expand_contractions(text):
    contractions_dict = {
        "can't": "can not",
        "won't": "will not",
        "isn't": "is not",
        "aren't": "are not",
        "wasn't": "was not",
        "weren't": "were not",
        "hasn't": "has not",
        "haven't": "have not",
        "hadn't": "had not",
        "doesn't": "does not",
        "don't": "do not",
        "didn't": "did not",
        "won't": "will not",
        "wouldn't": "would not",
        "shouldn't": "should not",
        "mightn't": "might not",
        "mustn't": "must not",
        "couldn't": "could not"
    }

    # Regular expression for finding contractions
    contractions_re = re.compile(
        r'\b(' + '|'.join(re.escape(key) for key in contractions_dict.keys()) + r')\b')

    # Replace contractions using the dictionary
    expanded_text = contractions_re.sub(
        lambda match: contractions_dict[match.group(0)], text)

    return expanded_text


train['comment_text'] = train['comment_text'].apply(
    help_tool.expand_contractions)

## Wikipedia shortcuts and images

In [12]:
train['comment_text'] = train['comment_text'].replace(
    {'Wikipedia:': '', 'WP:': '', 'Category:': '', 'disambiguation': '', 'otheruses': ''
     })

In [13]:
train = train[~ train['comment_text'].str.contains("Navbox")]

In [14]:
train['comment_text'] = train['comment_text'].apply(
    help_tool.replace_filenames)

## Text normalization

In [15]:
train['comment_text'] = train['comment_text'].apply(help_tool.clean_text)
train[train['id'] == 'fbf8672ea3b4ddf7']

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,comment_text_original,extracted_text
159315,fbf8672ea3b4ddf7,nysun public interest crap,1,0,0,0,0,0,http://www.nysun.com/article/23698 - public interest crap..,


## Remove Stop Words
Stop words (e.g., "and", "the", "is") are common words that may not carry much information.

In [16]:
train['comment_text'] = train['comment_text'].apply(help_tool.remove_stopwords)

## English leters
Removing all non engligh words

In [17]:
train['comment_text'] = train['comment_text'].apply(help_tool.remove_non_ascii)

In [18]:
train = train[train['comment_text'] != '']

# Dublicated values
After cleaning, 1707 duplicates are left.

In [19]:
train[train['comment_text'].duplicated()].shape

(1707, 10)

In [20]:
train = train[~train['comment_text'].duplicated()]

# Word Count
Checing the average word count in comments, this will be used in model training. We'll take word count of 100 as it's long enought to cover 75% of cases, but small enought to run.

In [21]:
train['word_count'] = train['comment_text'].apply(lambda x: len(x.split(" ")))

In [22]:
word_df = train[['toxic', 'severe_toxic', 'obscene', 'threat',
                 'insult', 'identity_hate',
                 'word_count']]

In [23]:
word_df[(word_df['toxic'] == 0) & (word_df['severe_toxic'] == 0) & (word_df['obscene'] == 0) & (
    word_df['threat'] == 0) & (word_df['insult'] == 0) & (word_df['identity_hate'] == 0)]['word_count'].describe()

count    141544.000000
mean         34.601219
std          50.650285
min           1.000000
25%           9.000000
50%          19.000000
75%          39.000000
max        1250.000000
Name: word_count, dtype: float64

In [24]:
word_df[(word_df['toxic'] == 1)]['word_count'].describe()

count    15102.000000
mean        27.753211
std         67.215564
min          1.000000
25%          6.000000
50%         12.000000
75%         24.000000
max       1250.000000
Name: word_count, dtype: float64

Word count in toxic and non toxic comments seam similar, meaning model will not interpret long comments as toxic or the other way arround.

In [25]:
train[train['word_count'] > 100][['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].sum() / train[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].sum() * 100

toxic            3.972984
severe_toxic     9.148666
obscene          4.068163
threat           3.870968
insult           3.536068
identity_hate    4.542177
dtype: float64

However, severe_toxic has 10% cases longer than 100 words. This means, that we are moving the cases from a minority class. 

# Label Distirbution
89% of cases do not belong to any class.

In [26]:
train[
    (train['toxic'] == 0)
    &
    (train['severe_toxic'] == 0)
    &
    (train['obscene'] == 0)
    &
    (train['threat'] == 0)
    &
    (train['insult'] == 0)
    &
    (train['identity_hate'] == 0)
].shape[0] / train.shape[0] * 100

89.82814205569517

10% are toxic, 5% obscene, insults. The rest of classes are minirity (~1%) among minority and should not give good results in model.

In [31]:
train[['toxic', 'severe_toxic', 'obscene', 'threat',
       'insult', 'identity_hate']].sum() / train.shape[0] * 100

toxic            9.584190
severe_toxic     0.998908
obscene          5.288376
threat           0.295103
insult           4.935522
identity_hate    0.880233
dtype: float64

In [40]:
train[['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat',
       'insult', 'identity_hate']].to_csv(r'Archive\clean_data.csv')

# Conclusion
Training data was cleaned by removing common Wikipedia words, timestamps, non Englist words. Word count max length was chosen as 100 words, as 75% of cases are below this threshold. However, severe_toxic is a minority class and had 10% of cases longer than 100 words, which could result in poor model performance.