### Load packages

In [1]:
%matplotlib inline
from __future__ import division

import math
import sys, codecs
from numbers import Number

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
from pandas import DataFrame
from six.moves import cPickle as pickle

import sklearn
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.pipeline import Pipeline
try:
    from sklearn.model_selection import train_test_split  # scikit-learn >= 0.18
except ImportError:
    from sklearn.cross_validation import train_test_split  # older scikit-learn

### Load GloVe vectors

In [8]:
def build_word_vector_matrix(vector_file, n_words=None):
    '''Load word vectors from a GloVe-format text file.

    Every non-blank line of ``vector_file`` is split on whitespace: the first
    field is taken as the word and the remaining fields are parsed as the
    float components of its vector.

    Args:
        vector_file: Path of the UTF-8 encoded vector file to read.
        n_words: Maximum number of words to load. ``None`` (the default) or
            any value below 1 loads the whole file.

    Returns:
        A tuple ``(vectors, labels)`` where ``vectors`` is a numpy array with
        one row per loaded word and ``labels`` is the list of words in the
        same order.

    Raises:
        ValueError: If a vector component cannot be parsed as a float.
    '''
    # Non-positive limits never stopped the original loop, so keep treating
    # them (and None) as "no limit".
    limit = n_words if n_words is not None and n_words > 0 else None

    numpy_arrays = []
    labels_array = []
    with codecs.open(vector_file, 'r', 'utf-8') as f:
        for r in f:
            sr = r.split()
            if not sr:
                # Blank line (e.g. stray newline in the file): nothing to load.
                continue
            labels_array.append(sr[0])
            numpy_arrays.append(np.array([float(i) for i in sr[1:]]))

            if limit is not None and len(labels_array) >= limit:
                break

    return np.array(numpy_arrays), labels_array

In [39]:
# Load the first n_words entries of the GloVe vector file.
input_vector_file = 'glove.6B.50d.txt'
n_words = 100000
# NOTE: despite its name, df is the numpy array returned by
# build_word_vector_matrix, not a pandas DataFrame.
df, labels_array  = build_word_vector_matrix(input_vector_file, n_words)
df.shape  # (words loaded, vector components)

(100000, 50)

### Load Kaggle Data

In [13]:
# Read the Kaggle training CSV and separate the comment text from its label.
data_filename = './data/train.csv'
data_df = pd.read_csv(data_filename)
corpus = data_df['Comment']
labels = data_df['Insult']

# Hold out 33% of the rows for testing. train_test_split comes from the import
# cell (sklearn.cross_validation was removed in scikit-learn 0.20), and the
# fixed random_state keeps the split identical across kernel restarts.
train_corpus, test_corpus, train_labels, test_labels = train_test_split(
    corpus, labels, test_size=0.33, random_state=0)
train_corpus.shape

(2644,)

### CountVectorize the Kaggle Data

In [14]:
from sklearn.feature_extraction.text import CountVectorizer  # also imported in the first cell
# Fit the vectorizer on the training comments only and build their count matrix.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(train_corpus)
X_train_counts.shape  # (training comments, vocabulary terms)

(2644, 13176)

In [15]:
len(count_vect.vocabulary_)

13176

### Intersection of Kaggle and GloVe?

In [22]:
type(count_vect.vocabulary_.keys())

list

In [23]:
type(labels_array)

list

In [24]:
def intersect(a, b):
    """Return the items found in both ``a`` and ``b`` as a list.

    Duplicates are dropped and the order of the result is unspecified,
    because the comparison is done on sets.
    """
    common = set(a).intersection(set(b))
    return list(common)

In [40]:
# Terms the CountVectorizer learned from the Kaggle training comments.
kaggle_vocab = count_vect.vocabulary_.keys()
# Terms that appear both in the Kaggle vocabulary and among the loaded GloVe labels.
intersection = intersect(kaggle_vocab, labels_array)

In [41]:
len(intersection)

9823

In [32]:
# Size of the Kaggle/GloVe vocabulary intersection for different n_words:
# At 100000 GloVe words, intersection is 9823
# At 200000 GloVe words, intersection is 10319
# At 400000 GloVe words, intersection is 10653
# Kaggle vocabulary size minus the largest intersection above, i.e. the
# Kaggle terms that have no GloVe vector even with 400000 words loaded.
13176 - 10653

2523

### Cluster the GloVe word vectors

In [None]:
# The cluster count is a fixed fraction of the number of word vectors requested (n_words).
reduction_factor  = 0.1
clusters_to_make  = int( n_words * reduction_factor ) # The number of clusters to make (truncated to an int)

In [42]:
# Cluster the word vectors. random_state pins the k-means++ seeding so the
# cluster assignments are reproducible across runs.
# NOTE(review): n_init=10 runs ten complete fits with clusters_to_make
# centroids each; the recorded run of this cell ended in KeyboardInterrupt.
# Consider fewer clusters / a smaller n_init, or sklearn.cluster.MiniBatchKMeans,
# if the fit does not finish in reasonable time.
kmeans_model = KMeans(init='k-means++', n_clusters=clusters_to_make, n_init=10,
                      random_state=0)
kmeans_model.fit(df)

cluster_labels = kmeans_model.labels_    # cluster index assigned to each row of df
cluster_inertia = kmeans_model.inertia_  # inertia_ reported by the fitted model

KeyboardInterrupt: 

### Save work up to now

In [None]:
from six.moves import cPickle as pickle
pickle_file = 'kmeansData.pickle'
try:
  # Build the payload before opening the file so that a missing variable
  # cannot truncate a previously saved pickle.
  save = {
    'kmeans_model': kmeans_model,
    'df': df,
    'labels_array': labels_array,
    'count_vect': count_vect,
    'X_train_counts': X_train_counts,
    }
  # The with-block closes the file even when pickle.dump raises.
  with open(pickle_file, 'wb') as f:
    pickle.dump(save, f, pickle.HIGHEST_PROTOCOL)
except Exception as e:
  # Single formatted string: prints the same text on Python 2 and 3
  # (print with several arguments shows a tuple on Python 2).
  print('Unable to save data to %s: %s' % (pickle_file, e))
  raise

### [Start from here] Load work up to now

In [None]:
from six.moves import cPickle as pickle
# NOTE: pickle.load can execute arbitrary code stored in the file; only load a
# kmeansData.pickle that was written by the save cell of this notebook.
# The pickle is written in binary mode with HIGHEST_PROTOCOL, so it must be
# read in binary mode as well ('rb', not 'r').
with open('kmeansData.pickle', 'rb') as f:
  save = pickle.load(f)

# Restore the notebook globals from the saved dictionary.
kmeans_model = save['kmeans_model']
df = save['df']
labels_array = save['labels_array']
count_vect = save['count_vect']
X_train_counts = save['X_train_counts']
del save  # hint to help gc free up memory