In [1]:
import pandas as pd
from packages.text.textutilities import *
from packages.text.skeleton import Skeleton
import packages.classification.classifier as clfs
import packages.augmentation.embedding as embeddings
# Setting this option to None switches off pandas' chained-assignment check.
pd.set_option('mode.chained_assignment', None)

# Label names used by the notebook; they match the label columns of the data.
categories = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]
# Seed value passed to Skeleton together with the label list.
random_state = 20

In [2]:
"""
Build a skeleton framework to streamline the workflow
"""
# Create the workflow helper from the label list and the seed, then run the
# listed steps against the training CSV.
skeleton = Skeleton(categories, random_state)
skeleton.build(
    [
        TextCleaner(),
        Sampler(),
        # Trimmer(threshold=12)  -- step currently left disabled
    ],
    df_path='./Data/train.csv',
)

[Step 1/2   Cleaning data...]
[Step 2/2   Sampling data...]
Sample 1: 
category highschool of the dead if you disagree with my creation of the above category you should take it to wp cfd not remove it from articles personally i believe it aids in navigation and that four pages main charlist chaplist eplist is sufficient for a category but you are of course welcome to nominate it for deletion anyway talk
Sample 2: 
these are your views but are they the consensus
Sample 3: 
i am moving this article back to nancy cartwright actor and making nancy cartwright a redirect to that page i would rather nancy cartwright be a disambiguation page but to minimize the possibility of further disruption and to give this discussion a more appropriate home i think that a good compromise for now that way no matter what happens with nancy cartwright we can be sure that nancy cartwright actor will always work correctly and the discussion can stay on talk nancy cartwright where it belongs as for the value of

In [3]:
# Per the recorded output, info() returns a newline-separated per-category
# sample-count summary. Printing it renders one category per line instead of
# showing the escaped string repr with literal "\n" sequences.
print(skeleton.info())

toxic
severe_toxic
obscene
threat
insult
identity_hate


'toxic: 15294 samples\nsevere_toxic: 1595 samples\nobscene: 8449 samples\nthreat: 478 samples\ninsult: 7877 samples\nidentity_hate: 1405 samples\n'

In [4]:
"""
Get <n_category> random samples from each category
"""
# Split the data per category key, requesting n_category=3000 rows for each.
samples = skeleton.split_by_keys(n_category=3000)

# Cell output: the subset stored under the 'toxic' key.
samples['toxic']


Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
152717,918747ca1ebc96c8,shut the hell up,1,0,0,0,0,0
123059,9249d906a8e5a8ad,and then we would have to believe that the eng...,1,0,0,0,0,0
1878,0512f33cf8807fa2,aaron swartz stop fucking reverting my goddamn...,1,0,1,1,1,1
38875,67bf11045c1f1453,up yours you authoritarian little hitlers,1,0,0,0,0,0
79005,d36d6b135a4e2541,you know what acroterion you can go fuck yours...,1,1,1,0,1,1
...,...,...,...,...,...,...,...,...
149086,56d57654c05bf031,more rubbish this page contains 0000001% of th...,1,0,1,0,1,0
112100,57bea1d837f085fa,hey fucking pig hey fucking pig of a bully; i ...,1,1,1,0,1,0
49132,835ae075e5d44a3b,ricky81682 is a paid mole scumbag there is no ...,1,0,1,0,1,0
52002,8b3212f40c51abaa,im the bane of your existence hey gaashoo i am...,1,0,0,0,0,0


In [None]:
# NOTE(review): this constant is not referenced by any code visible in this
# cell; the disabled experiment below hard-codes its own n_estimators values.
BOOSTING_CLASSIFIER_COUNT = 100

# Classifier comparison run -- currently disabled:
#
# skeleton.classify([
#     clfs.AdaBoostSVM(n_estimators=2),
#     clfs.AdaBoostNaiveBayes(n_estimators=10),
#     clfs.AdaBoostNaiveBayes(n_estimators=20),
#     clfs.AdaBoostDecisionTree(n_estimators=20),
#     clfs.DecisionTree(),
#     clfs.NaiveBayes(),
#     clfs.AveragingEstimator(),
#     clfs.SVM(),
# ])
#
# skeleton.save_progress("progress.txt")

# Load the embedding-augmentation helper from the saved model file.
word2vec_model = embeddings.EmbeddingAugmentation(
    load_path="./word2vec/word2vec.model",
)

# Run the augmentation for the 'toxic' key of `samples`.
# NOTE(review): the meaning of the positional arguments 6000 and 20 is not
# visible here -- confirm against EmbeddingAugmentation.populate.
word2vec_model.populate("toxic", samples, 6000, 20)

Loading Custom Model!
"word 'swartz' not in vocabulary"
"word 'live;' not in vocabulary"
"word 'conformist' not in vocabulary"
"word 'music75139103133' not in vocabulary"
"word 'pwspyware' not in vocabulary"
"word 'scherer' not in vocabulary"
"word 'crud' not in vocabulary"
"word 'subjectiv' not in vocabulary"
"word 'wasn' not in vocabulary"
"word 'diaper' not in vocabulary"
"word 'helpdesk' not in vocabulary"
"word 'assh*le' not in vocabulary"
"word 'shits' not in vocabulary"
"word '' not in vocabulary"
"word 'vma' not in vocabulary"
"word '' not in vocabulary"
"word '' not in vocabulary"
"word '' not in vocabulary"
"word 'rr2' not in vocabulary"
"word 'rr3' not in vocabulary"
"word '~slurp~' not in vocabulary"
"word 'scummy' not in vocabulary"
"word 'aren' not in vocabulary"
"word 'evildoing' not in vocabulary"
"word 'nutjob' not in vocabulary"
"word 'burp' not in vocabulary"
"word 'fabolous' not in vocabulary"
"word 'wwwyoutubecom' not in vocabulary"
"word '' not in vocabulary"
"wor

In [6]:

# Add one synthetic row to the 'toxic' subset.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# with a one-row frame is the supported replacement. As with append, columns
# missing from the new row are filled with NaN, so the remaining label columns
# are upcast to float.
df = pd.concat(
    [
        samples['toxic'],
        pd.DataFrame([{'id': 0, 'comment_text': "adsasdads", 'toxic': 1}]),
    ],
    ignore_index=True,
)
df

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,918747ca1ebc96c8,shut the hell up,1,0.0,0.0,0.0,0.0,0.0
1,9249d906a8e5a8ad,and then we would have to believe that the eng...,1,0.0,0.0,0.0,0.0,0.0
2,0512f33cf8807fa2,aaron swartz stop fucking reverting my goddamn...,1,0.0,1.0,1.0,1.0,1.0
3,67bf11045c1f1453,up yours you authoritarian little hitlers,1,0.0,0.0,0.0,0.0,0.0
4,d36d6b135a4e2541,you know what acroterion you can go fuck yours...,1,1.0,1.0,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...
2996,57bea1d837f085fa,hey fucking pig hey fucking pig of a bully; i ...,1,1.0,1.0,0.0,1.0,0.0
2997,835ae075e5d44a3b,ricky81682 is a paid mole scumbag there is no ...,1,0.0,1.0,0.0,1.0,0.0
2998,8b3212f40c51abaa,im the bane of your existence hey gaashoo i am...,1,0.0,0.0,0.0,0.0,0.0
2999,03dd90664e855020,listen up you narrow eyed teacher living on mi...,1,0.0,0.0,0.0,0.0,0.0
