In [2]:
import pandas as pd

# import helper functions
%run -i helper_functions.py

# for standardization
from sklearn.preprocessing import MinMaxScaler

In [3]:
# load the raw test comments and preview them; the printed shape shows two columns (id, comment_text)
test = pd.read_csv("Data/test.csv")
print(test.shape)
test.head()

(153164, 2)


Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.


In [4]:
# load the test labels (id plus six label columns) and preview them
test_labels = pd.read_csv("Data/test_labels.csv")
print(test_labels.shape)
test_labels.head()

(153164, 7)


Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,-1,-1,-1,-1,-1,-1
1,0000247867823ef7,-1,-1,-1,-1,-1,-1
2,00013b17ad220c46,-1,-1,-1,-1,-1,-1
3,00017563c3f7919a,-1,-1,-1,-1,-1,-1
4,00017695ad8997eb,-1,-1,-1,-1,-1,-1


In [5]:
# join text data and labels on the shared 'id' key instead of by row position,
# so a reordered or partially missing labels file cannot silently attach the
# wrong labels to a comment; validate= raises if an id is duplicated on either side
test_labeled = test.merge(test_labels, on='id', how='inner', validate='one_to_one')

A label value of -1 indicates that the row was not used for scoring, so we filter out those rows first.

In [6]:
# keep only the rows in which none of the six labels carries the -1 placeholder
masking = (
    test_labeled[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']] != -1
).all(axis=1)
test_labeled = test_labeled.loc[masking].reset_index(drop=True)

In [7]:
# check how many rows survived the filtering and preview the result
print(test_labeled.shape)
test_labeled.head()

(63978, 8)


Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0001ea8717f6de06,Thank you for understanding. I think very high...,0,0,0,0,0,0
1,000247e83dcc1211,:Dear god this site is horrible.,0,0,0,0,0,0
2,0002f87b16116a7f,"""::: Somebody will invariably try to add Relig...",0,0,0,0,0,0
3,0003e1cccfd5a40a,""" \n\n It says it right there that it IS a typ...",0,0,0,0,0,0
4,00059ace3e3e9a53,""" \n\n == Before adding a new product to the l...",0,0,0,0,0,0


In [9]:
# persist the labeled test set; index=False keeps the row index out of the CSV
test_labeled.to_csv('Data/test_labeled.csv', index=False)

## Data Cleaning

In [108]:
# data_cleaning is presumably defined in helper_functions.py (loaded via %run) — confirm;
# judging by the preview that follows, it adds 'noise_removed_text' and 'clean_text' columns
test_cleaned = data_cleaning(test_labeled)

removing noise
further cleaning the text


In [109]:
# preview the cleaned frame
test_cleaned.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,noise_removed_text,clean_text
0,0001ea8717f6de06,Thank you for understanding. I think very high...,0,0,0,0,0,0,Thank you for understanding. I think very high...,thank you for understanding. i think very high...
1,000247e83dcc1211,:Dear god this site is horrible.,0,0,0,0,0,0,:Dear god this site is horrible.,:dear god this site is horrible.
2,0002f87b16116a7f,"""::: Somebody will invariably try to add Relig...",0,0,0,0,0,0,"""::: Somebody will invariably try to add Relig...",""": somebody will invariably try to add religio..."
3,0003e1cccfd5a40a,""" \n\n It says it right there that it IS a typ...",0,0,0,0,0,0,""" It says it right there that it IS a type. Th...",""" it says it right there that it is a type. th..."
4,00059ace3e3e9a53,""" \n\n == Before adding a new product to the l...",0,0,0,0,0,0,""" == Before adding a new product to the list, ...",""" = before adding a new product to the list, m..."


## Data Preprocessing

In [114]:
# data_preprocessing is presumably defined in helper_functions.py (loaded via %run) — confirm;
# judging by the preview that follows, it adds tokenized, POS-tagged and lemmatized columns
test_processed = data_preprocessing(test_cleaned)

tokenizing the text
tagging the text
lemmatizing the text


In [115]:
# preview the preprocessed frame
test_processed.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,noise_removed_text,clean_text,text_tokenized,text_tokenized_nopunc,POS_tagging,POS_tagging_flat,lemmatization
0,0001ea8717f6de06,Thank you for understanding. I think very high...,0,0,0,0,0,0,Thank you for understanding. I think very high...,thank you for understanding. i think very high...,"[thank, you, for, understanding, ., i, think, ...","[thank, you, for, understanding, i, think, ver...","[[(thank, NN), (you, PRP), (for, IN), (underst...","[(thank, NN), (you, PRP), (for, IN), (understa...","[thank, understanding, i, think, very, highly,..."
1,000247e83dcc1211,:Dear god this site is horrible.,0,0,0,0,0,0,:Dear god this site is horrible.,:dear god this site is horrible.,"[:d, ear, god, this, site, is, horrible, .]","[dear, god, this, site, is, horrible]","[[(:d, JJ), (ear, NN), (god, NN), (this, DT), ...","[(:d, JJ), (ear, NN), (god, NN), (this, DT), (...","[:d, ear, god, site, is, horrible]"
2,0002f87b16116a7f,"""::: Somebody will invariably try to add Relig...",0,0,0,0,0,0,"""::: Somebody will invariably try to add Relig...",""": somebody will invariably try to add religio...","["", :, somebody, will, invariably, try, to, ad...","[somebody, will, invariably, try, to, add, rel...","[[("", NN), (:, :), (somebody, NN), (will, MD),...","[("", NN), (:, :), (somebody, NN), (will, MD), ...","[somebody, invariably, try, add, religion, rea..."
3,0003e1cccfd5a40a,""" \n\n It says it right there that it IS a typ...",0,0,0,0,0,0,""" It says it right there that it IS a type. Th...",""" it says it right there that it is a type. th...","["", it, says, it, right, there, that, it, is, ...","[it, says, it, right, there, that, it, is, a, ...","[[("", IN), (it, PRP), (says, VBZ), (it, PRP), ...","[("", IN), (it, PRP), (says, VBZ), (it, PRP), (...","[say, right, is, type, type, institution, is, ..."
4,00059ace3e3e9a53,""" \n\n == Before adding a new product to the l...",0,0,0,0,0,0,""" == Before adding a new product to the list, ...",""" = before adding a new product to the list, m...","["", =, before, adding, a, new, product, to, th...","[before, adding, a, new, product, to, the, lis...","[[("", JJ), (=, NN), (before, IN), (adding, VBG...","[("", JJ), (=, NN), (before, IN), (adding, VBG)...","[adding, new, product, list, make, sure, is, r..."


## Create New Features

### Text Features

In [122]:
# add_features is presumably defined in helper_functions.py (loaded via %run) — confirm;
# the preview that follows shows added columns such as num_bad_* and bad_*_vs_length
test_featured = add_features(test_processed)

creating sentence features
creating word features
creating exclamation mark features
creating unique and repeated word features
creating word tag features
creating uppercase features
creating text analysis features
creating bad word features


In [123]:
# preview the frame with the engineered features
test_featured.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,noise_removed_text,clean_text,...,num_bad_severe_toxic,bad_severe_toxic_vs_length,num_bad_obscene,bad_obscene_vs_length,num_bad_threat,bad_threat_vs_length,num_bad_insult,bad_insult_vs_length,num_bad_identity_hate,bad_identity_hate_vs_length
0,0001ea8717f6de06,Thank you for understanding. I think very high...,0,0,0,0,0,0,Thank you for understanding. I think very high...,thank you for understanding. i think very high...,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
1,000247e83dcc1211,:Dear god this site is horrible.,0,0,0,0,0,0,:Dear god this site is horrible.,:dear god this site is horrible.,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
2,0002f87b16116a7f,"""::: Somebody will invariably try to add Relig...",0,0,0,0,0,0,"""::: Somebody will invariably try to add Relig...",""": somebody will invariably try to add religio...",...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
3,0003e1cccfd5a40a,""" \n\n It says it right there that it IS a typ...",0,0,0,0,0,0,""" It says it right there that it IS a type. Th...",""" it says it right there that it is a type. th...",...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
4,00059ace3e3e9a53,""" \n\n == Before adding a new product to the l...",0,0,0,0,0,0,""" == Before adding a new product to the list, ...",""" = before adding a new product to the list, m...",...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0


In [134]:
# read only the training-set columns that are min-max scaled
featured_train = pd.read_csv(
    "Data/featured_train.csv",
    usecols=['min_length', 'num_unique_words', 'num_uppercase'],
)

In [None]:
# minmax scaling of the training 'min_length' column only
# NOTE(review): this cell has no execution count, `scaler` is re-created in the
# following cell, and `min_length_scaled` is not referenced again in the visible
# part of this notebook — it looks like a leftover experiment; confirm and delete.
scaler = MinMaxScaler()
min_length_scaled = scaler.fit_transform(featured_train[['min_length']])

In [136]:
# minmax scaling - fit on train
# The scaler is fitted on the training features only, so the test set is
# transformed with the training min/max rather than with its own.
scaler = MinMaxScaler()
scaler.fit(featured_train[['min_length', 'num_unique_words', 'num_uppercase']])

In [138]:
# minmax scaling - apply the train-fitted scaler to the matching test columns
test_featured[['min_length_scaled', 'num_unique_words_scaled', 'num_uppercase_scaled']] = scaler.transform(
    test_featured[['min_length', 'num_unique_words', 'num_uppercase']]
)

In [140]:
# the six target labels, followed by the text features kept for modelling
columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
selected_features = columns + [
    'min_length_scaled',
    'num_words_vs_length',
    'exclamation_marks_vs_length',
    'num_unique_words_scaled',
    'verbs_vs_length',
    'num_uppercase_scaled',
    'uppercase_vs_length',
    'sentiment',
] + [
    # one bad-word ratio per label, in the same order as `columns`
    'bad_' + label + '_vs_length' for label in columns
]

In [141]:
# keep just the label columns and the selected text features
test_featured_text = test_featured.loc[:, selected_features]

In [142]:
# preview the selected labels and text features
test_featured_text.head()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate,min_length_scaled,num_words_vs_length,exclamation_marks_vs_length,num_unique_words_scaled,verbs_vs_length,num_uppercase_scaled,uppercase_vs_length,sentiment,bad_toxic_vs_length,bad_severe_toxic_vs_length,bad_obscene_vs_length,bad_threat_vs_length,bad_insult_vs_length,bad_identity_hate_vs_length
0,0,0,0,0,0,0,0.00289,0.166667,0.0,0.032911,0.03125,0.000403,0.020833,0.3612,0.0,0.0,0.0,0.0,0.0,0.0
1,0,0,0,0,0,0,0.005058,0.1875,0.0,0.01519,0.03125,0.000202,0.03125,0.0516,0.0,0.0,0.0,0.0,0.0,0.0
2,0,0,0,0,0,0,0.000723,0.149554,0.002232,0.118987,0.03125,0.002821,0.03125,-0.7393,0.0,0.0,0.0,0.0,0.0,0.0
3,0,0,0,0,0,0,0.007225,0.169661,0.0,0.070886,0.02994,0.008061,0.07984,0.0258,0.0,0.0,0.0,0.0,0.0,0.0
4,0,0,0,0,0,0,0.003613,0.167665,0.0,0.060759,0.035928,0.000806,0.011976,0.7184,0.0,0.0,0.0,0.0,0.0,0.0


### Embedding Features

In [126]:
# get_glove_embedding is presumably defined in helper_functions.py (loaded via %run) — confirm;
# note it is fed test_processed, i.e. the frame before the engineered text features were added
glove_embedding = get_glove_embedding(test_processed)

building glove model
generating embedding vectors


In [129]:
# labels of the embedding columns to keep (still strings at this point)
index = [
    '29', '34', '46', '47', '53', '54', '65', '72', '82',
    '86', '87', '93', '95', '96', '98', '100', '103', '105',
    '114', '132', '135', '139', '143', '156', '157', '170', '198',
]

In [131]:
# convert the column labels to integers so they can select embedding columns
index = [int(label) for label in index]

In [132]:
# expand the embedding vectors into one column per dimension and keep the selected ones
test_featured_embedding = pd.DataFrame(glove_embedding.tolist()).loc[:, index]

In [133]:
# preview the selected embedding columns
test_featured_embedding.head()

Unnamed: 0,29,34,46,47,53,54,65,72,82,86,...,105,114,132,135,139,143,156,157,170,198
0,-0.052982,-0.169101,0.10421,-0.231872,-0.147303,0.085074,0.279414,0.045263,0.216464,-0.070892,...,-0.12319,0.040738,-0.010083,-0.158238,-0.204194,0.116565,0.098296,-0.20616,0.046779,-0.036748
1,0.032172,0.046765,0.194164,0.00534,-0.175857,0.152049,0.067513,0.09368,0.433347,-0.13959,...,-0.004935,-0.013467,0.00355,-0.117174,-0.075475,0.043945,-0.148617,-0.35074,0.113815,-0.070907
2,0.000621,-0.132685,0.15657,-0.122877,-0.041197,-0.045822,0.151609,-0.012794,0.058626,0.027053,...,-0.114241,-0.12508,0.045087,-0.119656,-0.274707,0.102546,0.100782,-0.107205,-0.05078,-0.098343
3,0.079728,0.022776,0.088192,-0.00438,0.004026,-0.220622,0.163259,0.237079,0.203803,-0.036898,...,-0.241482,-0.023304,-0.005002,-0.281848,-0.157011,0.184323,0.067625,-0.066242,0.023737,-0.111269
4,0.104646,-0.068364,-0.012882,-0.10212,0.041755,-0.031094,0.149613,0.158524,0.13738,0.068439,...,-0.1922,0.055657,0.024936,-0.181419,-0.165707,0.194511,0.143557,-0.059017,0.136676,-0.00453


In [143]:
# combine the text features and the embedding features column-wise
selected_test = pd.concat([test_featured_text, test_featured_embedding], axis=1)
# concat aligns on the row index (outer join); if the two frames were not
# row-aligned it would silently add NaN-padded rows, so check the row count
assert len(selected_test) == len(test_featured_text) == len(test_featured_embedding), \
    "text and embedding features are not row-aligned"

In [145]:
# 'none' is 1 minus the row-wise maximum of the six label columns; with 0/1 labels
# (rows containing -1 were filtered out earlier) it is 1 only when no label is set
selected_test['none'] = 1 - selected_test[columns].max(axis=1)

In [146]:
# preview the final feature frame
selected_test.head()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate,min_length_scaled,num_words_vs_length,exclamation_marks_vs_length,num_unique_words_scaled,...,114,132,135,139,143,156,157,170,198,none
0,0,0,0,0,0,0,0.00289,0.166667,0.0,0.032911,...,0.040738,-0.010083,-0.158238,-0.204194,0.116565,0.098296,-0.20616,0.046779,-0.036748,1
1,0,0,0,0,0,0,0.005058,0.1875,0.0,0.01519,...,-0.013467,0.00355,-0.117174,-0.075475,0.043945,-0.148617,-0.35074,0.113815,-0.070907,1
2,0,0,0,0,0,0,0.000723,0.149554,0.002232,0.118987,...,-0.12508,0.045087,-0.119656,-0.274707,0.102546,0.100782,-0.107205,-0.05078,-0.098343,1
3,0,0,0,0,0,0,0.007225,0.169661,0.0,0.070886,...,-0.023304,-0.005002,-0.281848,-0.157011,0.184323,0.067625,-0.066242,0.023737,-0.111269,1
4,0,0,0,0,0,0,0.003613,0.167665,0.0,0.060759,...,0.055657,0.024936,-0.181419,-0.165707,0.194511,0.143557,-0.059017,0.136676,-0.00453,1


In [147]:
# persist the final test features; index=False keeps the row index out of the CSV
selected_test.to_csv('Data/selected_test.csv', index=False)