In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
%pylab inline
# %matplotlib inline
import seaborn as sns

import sklearn
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.preprocessing import Normalizer
from sklearn import metrics
from sklearn.cluster import KMeans, MiniBatchKMeans

from sklearn.utils import shuffle

import warnings
# Suppress every warning for the rest of the session. This keeps cell output
# short, but it also hides deprecation notices from pandas / scikit-learn.
warnings.filterwarnings("ignore")

Populating the interactive namespace from numpy and matplotlib


In [2]:
# Load the pickled training frame from the data directory (path is relative
# to the notebook's working directory) and report its shape.
# NOTE(review): unpickling can execute arbitrary code; only load this file
# if it comes from a trusted source.
df = pd.read_pickle('../data/toxictrain.pkl')
print(df.shape)
# df.head()

(159571, 16)


In [4]:
# DOWNSAMPLING: to prevent kernel crashing.
# Keep every row with rating > 0 and a fixed-size random subset of the
# rating == 0 rows.
N_NON_TOXIC = 32000  # rows kept with rating == 0 (16225 rows had rating > 0)
SEED = 42            # fixed seed so the same subset is drawn on every run

df_t = df[df['rating'] > 0]
df_nt = df[df['rating'] == 0]
# Seeded shuffle: without random_state the retained subset (and therefore
# every downstream result) changes between runs.
df_nt = shuffle(df_nt, random_state=SEED)
df_nt = df_nt[-N_NON_TOXIC:]
print(df_t.shape, df_nt.shape)

# `df` is rebound to the downsampled frame. concat keeps the original index
# labels, so the index of `df` is no longer a contiguous 0..n-1 range.
df = pd.concat([df_t, df_nt])
df_0 = df[df['rating'] == 0]
df_1 = df[df['rating'] > 0]
print(df.shape)

(16225, 16) (32000, 16)
(48225, 16)


In [5]:
# Narrow the frame to the comment text, the six label columns and the rating.
selected_columns = [
    'comment_text',
    'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate',
    'rating',
]
df1 = df.loc[:, selected_columns]

# Inputs are the raw comments; the target is the `toxic` column.
X, y = df1['comment_text'], df1['toxic']
print(X.shape, y.shape)

(48225,) (48225,)


In [None]:
# df_1 = df1[df1['rating']>0]
# df_0 = df1[df1['rating']==0]
# print(df_1.shape, df_0.shape)

In [6]:
# Bag-of-words counts with English stop words removed. min_df is the minimum
# document frequency; at 1, a term seen in a single document is kept.
vectorizer = CountVectorizer(
    stop_words='english',
    min_df=1,
)
# Learn the vocabulary from X and build the document-term matrix in one call.
dtm = vectorizer.fit_transform(X)
# pd.DataFrame(dtm.toarray(), index=X, columns=vectorizer.get_feature_names()).head(10)
dtm.shape

(48225, 86631)

In [None]:
# vectorizer.get_feature_names()

In [7]:
# Fit LSA: project the document-term matrix onto 2 latent components.
# algorithm='randomized' is the solver suited to large matrices ('arpack' is
# the alternative). The randomized solver is stochastic, so random_state is
# pinned to make the projection reproducible between runs.
lsa = TruncatedSVD(n_components=2, algorithm='randomized', random_state=42)
dtm_lsa = lsa.fit_transform(dtm)
# Rescale every document vector to unit length (Normalizer's default norm is
# L2), so dot products between rows are cosine similarities.
dtm_lsa = Normalizer(copy=False).fit_transform(dtm_lsa)

In [8]:
# Share of the variance explained by each fitted LSA component.
lsa.explained_variance_ratio_

array([ 0.06109373,  0.04799747])

In [9]:
# pd.DataFrame(lsa.components_.round(5),index = ["component_1","component_2"],columns = vectorizer.get_feature_names())

In [10]:
# array(vectorizer.get_feature_names())

In [11]:
# pd.DataFrame(dtm_lsa.round(5), index = X, columns = ["component_1","component_2" ])

In [12]:
# Pair each comment with its two (rounded) LSA coordinates.
# X keeps the index labels of the filtered/concatenated frame, so `X[i]` is a
# label lookup that can return a different comment than row i of dtm_lsa (or
# raise KeyError). dtm_lsa rows follow X's positional order, so the two are
# paired positionally here. Only a short preview is shown instead of dumping
# one entry per document.
PREVIEW_ROWS = 10
[
    [text, coords[0].round(5), coords[1].round(5)]
    for text, coords in zip(X.iloc[:PREVIEW_ROWS], dtm_lsa[:PREVIEW_ROWS])
]

  1.0,
  0.0030400000000000002],
 ['I am going to murder ZimZalaBim ST47 for being evil homosexual jews.',
  0.87209000000000003,
  -0.48934],
 ["Shut up you asswipe, we don't care. I'll decapitate your mother and shit down her(his?) neck. Go back to Jewland you fuck. \n\nhttp://en.wikipedia.org/wiki/The_Real_Stephen_Hawkinghttp://en.wikipedia.org/wiki/The_Real_Stephen_Hawkinghttp://en.wikipedia.org/wiki/The_Real_Stephen_Hawkinghttp://en.wikipedia.org/wiki/The_Real_Stephen_Hawkinghttp://en.wikipedia.org/wiki/The_Real_Stephen_Hawkinghttp://en.wikipedia.org/wiki/The_Real_Stephen_Hawkinghttp://en.wikipedia.org/wiki/The_Real_Stephen_Hawkinghttp://en.wikipedia.org/wiki/The_Real_Stephen_Hawkinghttp://en.wikipedia.org/wiki/The_Real_Stephen_Hawkinghttp://en.wikipedia.org/wiki/The_Real_Stephen_Hawkinghttp://en.wikipedia.org/wiki/The_Real_Stephen_Hawkinghttp://en.wikipedia.org/wiki/The_Real_Stephen_Hawkinghttp://en.wikipedia.org/wiki/The_Real_Stephen_Hawkinghttp://en.wikipedia.org/wiki/The_Real_

In [13]:
# Split the normalised LSA projection into its two coordinate columns.
# Column slicing replaces a Python-level loop over every row, and np.c_ is
# used explicitly instead of the bare `c_` that only exists when
# `%pylab inline` has populated the namespace.
xs = dtm_lsa[:, 0].round(5)
ys = dtm_lsa[:, 1].round(5)
np.c_[xs, ys]

array([[ 1.     ,  0.00304],
       [ 0.87209, -0.48934],
       [ 0.99821, -0.05974],
       ..., 
       [ 0.99995, -0.00969],
       [ 0.98415, -0.17734],
       [ 0.99998, -0.00638]])

In [None]:
# plt.figure()
# ax = plt.gca()
# ax.quiver(0,0,xs,ys,angles='xy',scale_units='xy',scale=1, linewidth = .01) # collection of arrows
# # ax.quiver(0,0,xs,ys,scale=1, linewidth = .01)
# ax.set_xlim([-1,1])
# ax.set_ylim([-1,1])
# xlabel('First principal component')
# ylabel('Second principal component')
# title('Plot of points against LSA components')
# plt.draw()
# plt.show()

In [None]:
# Compute document similarity using LSA components.
# The rows of dtm_lsa were L2-normalised, so their dot products are cosine
# similarities. Only the first N_DOCS documents are compared: the full
# all-pairs product would be a dense (n_docs x n_docs) array that does not fit
# in memory at this corpus size, and it could not be labelled with 7 row and
# column labels anyway.
N_DOCS = 7
lsa_head = dtm_lsa[:N_DOCS]
similarity = np.dot(lsa_head, lsa_head.T)
doc_labels = range(lsa_head.shape[0])
pd.DataFrame(similarity.round(6), index=doc_labels, columns=doc_labels).head(10)

In [None]:
from sklearn.decomposition import NMF

# Factorise the document-term matrix into 2 non-negative components.
# random_state is pinned so that initialisation, and therefore the fitted
# factors, are the same on every run.
nmf_model = NMF(n_components=2, random_state=42)

dtm_nmf = nmf_model.fit_transform(dtm)
# Rescale each document's component weights to unit length.
dtm_nmf = Normalizer(copy=False).fit_transform(dtm_nmf)

In [None]:
# Dimensions of the document-term matrix: (documents, vocabulary terms).
dtm.shape

In [None]:
# Normalised per-document NMF weights, rounded to 2 decimals for display.
dtm_nmf.round(2)

In [None]:
# First five columns of the fitted NMF component matrix (one row per component).
nmf_model.components_[:,:5]

In [None]:
# Term weights 370..379 of the first document as rebuilt from the NMF factors.
# Only the required row and columns are multiplied; forming the full
# (n_docs x n_terms) product just to slice ten values out of it would
# allocate a dense array far larger than available memory.
# NOTE(review): dtm_nmf was row-normalised after fitting, so this is a
# rescaled reconstruction rather than an approximation of the raw counts.
np.dot(dtm_nmf[0], nmf_model.components_[:, 370:380]).round(2)

In [None]:
# Print the document-term matrix returned by the vectorizer.
print(dtm)

In [None]:
# Refit LSA with 5 latent components (this rebinds `lsa` and `dtm_lsa` from
# the earlier 2-component fit). algorithm='randomized' is the solver suited to
# large matrices; it is stochastic, so random_state is pinned to make the
# projection reproducible between runs.
lsa = TruncatedSVD(n_components=5, algorithm='randomized', random_state=42)
dtm_lsa = lsa.fit_transform(dtm)
# Rescale every document vector to unit length (Normalizer's default norm is L2).
dtm_lsa = Normalizer(copy=False).fit_transform(dtm_lsa)

In [None]:
# Share of the variance explained by each component of the most recent LSA fit.
lsa.explained_variance_ratio_

In [None]:
# Dimensions of the document-term matrix: (documents, vocabulary terms).
dtm.shape

In [None]:
# Show the normalised LSA projection as a numpy matrix.
# NOTE(review): the bare `numpy` name is only in scope because `%pylab inline`
# populated the namespace; the file otherwise imports numpy as `np`.
numpy.asmatrix(dtm_lsa)

In [None]:
# Wrap the sparse document-term matrix in a DataFrame with sparse columns.
# from_spmatrix is the pandas constructor meant for scipy sparse input; the
# plain DataFrame constructor does not build a numeric documents-by-terms
# table from a sparse matrix. The isinstance guard keeps the cell safe to
# re-run once `dtm` has already been converted.
# NOTE(review): after this cell `dtm` is a DataFrame, no longer the sparse
# matrix that the earlier cells were run against.
if not isinstance(dtm, pd.DataFrame):
    dtm = pd.DataFrame.sparse.from_spmatrix(dtm)

In [None]:
# Preview the top-left 10 x 10 corner of the document-term frame.
dtm.iloc[:10,:10]