## Toxic: PCA

In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA

from sklearn.utils import shuffle

In [20]:
df = pd.read_pickle('../data/toxictrain.pkl')
print(df.shape)
# df.head()

(159571, 24)


In [21]:
df.columns

Index(['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat',
       'insult', 'identity_hate', 'category', 'rating', 'clean',
       'comment_text_s', 'comment_text_f', 'token_clean', 'sent_token',
       'polarity_sentence', 'polarity_comment', 'polarity_comment_s',
       'word_count', 'char_count', 'char_count_s', 'polarity_min',
       'polarity_max', 'polarity_mean'],
      dtype='object')

### MiniBatchSparsePCA

In [22]:
# https://github.com/lambdaofgod/stackexchange/blob/master/stackoverflow/SPCA%20Word%20Clusters.ipynb
import numpy as np
from sklearn.decomposition import MiniBatchSparsePCA
from sklearn.feature_extraction.text import TfidfVectorizer

In [23]:
def token_topics(data, n_components):
    class_list = ['clean','toxic', 'severe_toxic', 'obscene', 'threat','insult', 'identity_hate']
    ttd = {}
    for item in class_list:
        listy = []
        data = shuffle(data)[-50000:]
        X = data[data[item]==1]['comment_text_s']

        vectorizer = TfidfVectorizer(min_df=5, token_pattern='[a-zA-Z]+', stop_words='english')

        X_train = vectorizer.fit_transform(X)
        print(X_train)

        print('dictionary has ', len(vectorizer.vocabulary_), 'entries')

        X_train_dense = X_train.toarray()
        print(X_train_dense)

        spca = MiniBatchSparsePCA(n_components=n_components, alpha=0.04,
                              batch_size=30, n_iter=100, random_state=0)

        %time X_train_reduced = spca.fit_transform(X_train_dense)

        for i in range(n_components):
            print('component', i, 'has', sum(spca.components_[i, :] != 0), 'nonzero entries')   

        component_idxs = [np.where(spca.components_[i, :])[0] for i in range(n_components)]   

        idx_to_words = dict((n, word) for (word, n) in vectorizer.vocabulary_.items())    

        for i in range(n_components):
            aa = 'Tokens for {} component'.format(i)
            bb = [idx_to_words[i] for i in component_idxs[i]]
            listy.append([aa,bb])
        ttd[item] = listy
    return ttd

In [24]:
ttd = token_topics(df, 1)
ttd

  (0, 6080)	0.142770259127701
  (0, 8853)	0.17686398441996673
  (0, 11925)	0.2336805530734526
  (0, 12692)	0.20241984607098862
  (0, 14414)	0.2794118422146501
  (0, 9287)	0.08984143954334384
  (0, 14413)	0.19138404778445917
  (0, 13116)	0.11617564948177102
  (0, 3249)	0.13987570803561095
  (0, 6802)	0.17716301288730446
  (0, 6043)	0.13372043927760596
  (0, 10671)	0.3049568778151184
  (0, 8118)	0.39678088301898545
  (0, 7810)	0.11179871510076109
  (0, 5655)	0.1427072708628111
  (0, 12437)	0.3409993104222091
  (0, 14527)	0.18387413836736374
  (0, 13037)	0.20498662113934968
  (0, 4752)	0.22359261125877639
  (0, 12209)	0.18688448359576085
  (0, 13158)	0.15181382055060252
  (0, 10170)	0.17978888946668986
  (0, 10759)	0.17876357235990828
  (1, 14514)	0.5124712720040744
  (1, 2089)	0.430065697684712
  :	:
  (44863, 9288)	0.35698860111421327
  (44863, 12517)	0.38886238323629513
  (44863, 2853)	0.4098289740002403
  (44863, 13952)	0.4811243995740653
  (44864, 6080)	0.12931608808198142
  (44864, 

CPU times: user 779 ms, sys: 11.5 ms, total: 791 ms
Wall time: 1.41 s
component 0 has 57 nonzero entries
  (0, 370)	0.6960075219051419
  (0, 462)	0.7180344904330594
  (1, 526)	0.14464174401918295
  (1, 1102)	0.5703642486085897
  (1, 463)	0.2067512371001267
  (1, 523)	0.21797982399341576
  (1, 486)	0.15813781280308667
  (1, 930)	0.32888281188019736
  (1, 1028)	0.132014818180574
  (1, 544)	0.25725952595059737
  (1, 295)	0.18763840094876325
  (1, 1047)	0.20019683134334465
  (1, 645)	0.1794560180148403
  (1, 629)	0.11424181341240633
  (1, 37)	0.20816331329888313
  (1, 299)	0.12485003018081743
  (1, 619)	0.18681604744228666
  (1, 937)	0.16564602293254807
  (1, 639)	0.16564602293254807
  (1, 971)	0.17750345787811247
  (1, 77)	0.21441373849299156
  (2, 534)	0.2351306930071478
  (2, 447)	0.202249585631961
  (2, 849)	0.6562607152809702
  (2, 1143)	0.30920466591846574
  :	:
  (2487, 864)	0.3316698716399014
  (2488, 1102)	0.3988708410972975
  (2488, 478)	0.23857698354776838
  (2488, 614)	0.302324

{'clean': [['Tokens for 0 component',
   ['ability',
    'able',
    'absolutely',
    'abuse',
    'academic',
    'accept',
    'acceptable',
    'accepted',
    'access',
    'accordance',
    'according',
    'account',
    'accounts',
    'accurate',
    'accusation',
    'accusations',
    'accuse',
    'accused',
    'accusing',
    'acknowledged',
    'act',
    'acting',
    'action',
    'actions',
    'active',
    'actual',
    'actually',
    'ad',
    'add',
    'added',
    'adding',
    'addition',
    'additional',
    'additionally',
    'additions',
    'address',
    'addressed',
    'addresses',
    'admin',
    'administrative',
    'administrator',
    'administrators',
    'admins',
    'adminship',
    'admit',
    'advance',
    'advertisement',
    'advertising',
    'advice',
    'advise',
    'afd',
    'afraid',
    'age',
    'agenda',
    'agf',
    'ago',
    'agree',
    'agreed',
    'agreement',
    'ah',
    'ahead',
    'air',
    'aka',
    'al',


In [6]:
ttd = token_topics(df, 1)
ttd

dictionary has  14658 entries
CPU times: user 2min 9s, sys: 1min 22s, total: 3min 32s
Wall time: 3min 25s
component 0 has 1887 nonzero entries
dictionary has  2326 entries
CPU times: user 2.57 s, sys: 154 ms, total: 2.72 s
Wall time: 2.21 s
component 0 has 1039 nonzero entries
dictionary has  191 entries
CPU times: user 847 ms, sys: 18.1 ms, total: 866 ms
Wall time: 1.01 s
component 0 has 151 nonzero entries
dictionary has  1336 entries
CPU times: user 1.35 s, sys: 41.5 ms, total: 1.39 s
Wall time: 1.31 s
component 0 has 746 nonzero entries
dictionary has  54 entries
CPU times: user 922 ms, sys: 13.1 ms, total: 935 ms
Wall time: 1.16 s
component 0 has 54 nonzero entries
dictionary has  1275 entries
CPU times: user 1.29 s, sys: 24.1 ms, total: 1.31 s
Wall time: 1.25 s
component 0 has 567 nonzero entries
dictionary has  242 entries
CPU times: user 987 ms, sys: 20.4 ms, total: 1.01 s
Wall time: 1.15 s
component 0 has 237 nonzero entries


{'clean': [['Tokens for 0 component',
   ['ability',
    'able',
    'absolutely',
    'abuse',
    'abusive',
    'academic',
    'accept',
    'acceptable',
    'accepted',
    'access',
    'accidentally',
    'accordance',
    'according',
    'account',
    'accounts',
    'accuracy',
    'accurate',
    'accusation',
    'accusations',
    'accuse',
    'accused',
    'accusing',
    'act',
    'acting',
    'action',
    'actions',
    'active',
    'activity',
    'actual',
    'actually',
    'ad',
    'adam',
    'add',
    'added',
    'adding',
    'addition',
    'additional',
    'additionally',
    'additions',
    'address',
    'addressed',
    'addresses',
    'adds',
    'admin',
    'administrator',
    'administrators',
    'admins',
    'admit',
    'advance',
    'advertising',
    'advice',
    'advise',
    'afd',
    'afraid',
    'age',
    'agenda',
    'ago',
    'agree',
    'agreed',
    'agreement',
    'ah',
    'ahead',
    'air',
    'aiv',
    'al',


In [12]:
ttd = pd.DataFrame(ttd)
ttd

Unnamed: 0,clean,identity_hate,insult,obscene,severe_toxic,threat,toxic
0,"[Tokens for 0 component, [ability, able, absol...","[Tokens for 0 component, [account, actually, a...","[Tokens for 0 component, [account, act, acting...","[Tokens for 0 component, [able, abusive, accou...","[Tokens for 0 component, [account, add, admin,...","[Tokens for 0 component, [ass, asshole, big, b...","[Tokens for 0 component, [able, absolutely, ab..."


In [13]:
df_pca = df[df['rating']>0]
X = df_pca['comment_text']

vectorizer = TfidfVectorizer(min_df=5, token_pattern='[a-zA-Z]+', stop_words='english')

X_train = vectorizer.fit_transform(X)

print('dictionary has ', len(vectorizer.vocabulary_), 'entries')

X_train_dense = X_train.toarray()

dictionary has  6673 entries


In [14]:
n_components = 6

spca = MiniBatchSparsePCA(n_components=n_components, alpha=0.04,
                          batch_size=30, n_iter=100, random_state=0)

%time X_train_reduced = spca.fit_transform(X_train_dense)

CPU times: user 19.7 s, sys: 2.17 s, total: 21.9 s
Wall time: 14.9 s


In [15]:
for i in range(n_components):
    print('component', i, 'has', sum(spca.components_[i, :] != 0), 'nonzero entries')

component 0 has 524 nonzero entries
component 1 has 48 nonzero entries
component 2 has 298 nonzero entries
component 3 has 165 nonzero entries
component 4 has 96 nonzero entries
component 5 has 1293 nonzero entries


In [16]:
component_idxs = [np.where(spca.components_[i, :])[0] for i in range(n_components)]

In [17]:
idx_to_words = dict((n, word) for (word, n) in vectorizer.vocabulary_.items())

In [18]:
for i in range(n_components):
    print('Tokens for', i, 'component:')
    print([idx_to_words[i] for i in component_idxs[i]])
    print()

Tokens for 0 component:

Tokens for 1 component:
['article', 'ask', 'asshole', 'big', 'black', 'block', 'blocked', 'crap', 'don', 'dont', 'dumb', 'edit', 'editing', 'edits', 'face', 'fact', 'fag', 'fuck', 'fucking', 'good', 'idiot', 'im', 'leave', 'look', 'loser', 'love', 'make', 'mean', 'moron', 'nazi', 'pathetic', 'people', 'piece', 'really', 'retard', 'right', 's', 'sex', 'shit', 'stop', 't', 'tell', 'user', 'want', 'wikipedia', 'won', 'work', 'ya']

Tokens for 2 component:
['actually', 'ad', 'advice', 'agree', 'ahead', 'allow', 'america', 'american', 'anus', 'anyways', 'asian', 'ass', 'attention', 'away', 'b', 'ban', 'bastard', 'bastards', 'belongs', 'best', 'better', 'bias', 'bitch', 'blocking', 'bother', 'boy', 'boys', 'brain', 'bring', 'brown', 'burn', 'butt', 'c', 'came', 'censor', 'censorship', 'christian', 'ck', 'cking', 'clown', 'clue', 'cocksucking', 'come', 'completely', 'computer', 'consider', 'correctly', 'cunt', 'cunts', 'd', 'dad', 'dare', 'day', 'dead', 'death', 'deep