## Document Classification by K-Means

### Imports

In [None]:
import sys
sys.path.append("..")
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.cluster import KMeans, DBSCAN
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from mpl_toolkits import mplot3d

from glove.glovevectorizer import GloveVectorizer

# Defining data

In [None]:
#-------------------------------------------------------------------------------#
# Load the Reuters R8 train/test splits: tab-separated, no header row,
# two columns per row.
train = pd.read_csv('../Data/train/Reuters/r8-train-all-terms.txt', header=None, sep='\t')
test = pd.read_csv('../Data/test/Reuters/r8-test-all-terms.txt', header=None, sep='\t')
train.columns = ['label', 'content']
test.columns = ['label', 'content']

# Size features, computed on the training split only.
# 'lenght'      : number of characters in the document.
# 'lenght_mean' : document length divided by the mean document length.
train['lenght'] = train['content'].str.len()
lenght_mn = train['lenght'].mean()
train['lenght_mean'] = train['lenght'] / lenght_mn

# 'words_num'     : number of whitespace-separated tokens.
# 'words_len_med' : characters per token (the character count includes the
#                   whitespace between tokens).
train['words_num'] = train['content'].str.split().str.len()
train['words_len_med'] = train['lenght'] / train['words_num']

# Min-max scale each feature into '<feature>_norm'. The iteration order keeps
# the resulting column order: words_num_norm, words_len_med_norm, lenght_norm.
for _feature in ('words_num', 'words_len_med', 'lenght'):
    _low, _high = train[_feature].min(), train[_feature].max()
    train[_feature + '_norm'] = (train[_feature] - _low) / (_high - _low)

#-------------------------------------------------------------------------------#

train.head()


# Vectorizer

In [None]:
# Embed every training document with the project's GloveVectorizer, then
# cluster the resulting matrix into 8 groups (presumably one per R8 category
# -- confirm against the dataset).
vectorizer = GloveVectorizer()
Xtrain = vectorizer.fit_transform(train.content)

# init='random' with no random_state: cluster assignments can differ
# between runs.
model_1 = KMeans(n_clusters=8, init='random')
model_1.fit(Xtrain)

# One row per cluster centre, in the vectorizer's feature space.
centroids = model_1.cluster_centers_

# K-Means Graph - 2 Features (Normalized)

In [None]:
# Two min-max-scaled features per document:
# column 0 = characters per token, column 1 = document length.
zipped_data = np.column_stack((train.words_len_med_norm, train.lenght_norm))

model_2 = KMeans(n_clusters=8, init='random')
model_2.fit(zipped_data)

# Scatter the documents, coloured by the cluster index K-Means assigned.
plt.figure(figsize=(8, 6))
plt.scatter(
    x=zipped_data[:, 0],
    y=zipped_data[:, 1],
    c=model_2.labels_.astype(float),
)
plt.show()

# K-Means Graph - 2 Features (Non Normalized)

In [None]:
# Same two features as the normalized plot, but on their raw scales:
# column 0 = characters per token, column 1 = document length in characters.
zipped_data = np.column_stack((train.words_len_med, train.lenght))

model_3 = KMeans(n_clusters=8, init='random')
model_3.fit(zipped_data)

# Scatter the documents, coloured by the cluster index K-Means assigned.
plt.figure(figsize=(8, 6))
plt.scatter(
    x=zipped_data[:, 0],
    y=zipped_data[:, 1],
    c=model_3.labels_.astype(float),
)
plt.show()

# Testing Dataset's Novel Features with K-Means

In [None]:
# Stack the three min-max-scaled features into an (n_samples, 3) matrix:
# column 0 = document length, column 1 = token count, column 2 = characters
# per token.
zipped_data = np.column_stack(
    (train.lenght_norm, train.words_num_norm, train.words_len_med_norm)
)

model_4_fit = KMeans(n_clusters=8, init='random').fit(zipped_data)

# 2-D projection onto the first two features, coloured by cluster.
# The third feature is deliberately NOT passed positionally: the third
# positional parameter of plt.scatter is `s` (marker area in points**2), so
# values in [0, 1] would shrink the markers until they are nearly invisible.
plt.figure(figsize=(8, 6))
plt.scatter(zipped_data[:, 0], zipped_data[:, 1], c=model_4_fit.labels_.astype(float))
plt.xlabel('lenght_norm')
plt.ylabel('words_num_norm')
plt.show()

# 3D Plotting

In [None]:
# 3-D scatter of the three normalized features held in `zipped_data`,
# coloured by the cluster labels of `model_4_fit`.
plt.figure(figsize=(8, 6))
ax = plt.axes(projection='3d')
ax.scatter3D(zipped_data[:,0],zipped_data[:,1], zipped_data[:,2], c=model_4_fit.labels_.astype(float))

# Hide the tick labels on all three axes. Use xaxis/yaxis/zaxis: the
# w_xaxis/w_yaxis/w_zaxis aliases are deprecated and absent from newer
# Matplotlib releases.
ax.xaxis.set_ticklabels([])
ax.yaxis.set_ticklabels([])
ax.zaxis.set_ticklabels([])

ax.set_xlabel('lenght_norm')
ax.set_ylabel('words_num_norm')
ax.set_zlabel('words_len_med_norm')
ax.set_title('K-Means Normalized')

# Elbow Test

In [None]:
# Elbow test on the three hand-crafted, min-max-scaled features: fit K-Means
# for k = 1..9 and record the inertia reported by each fit.
sse = []
list_k = list(range(1, 10))
zipped_data = np.array(list(zip(train.lenght_norm,train.words_num_norm, train.words_len_med_norm)))
# The GloVe matrix is not needed here: this loop clusters `zipped_data` only,
# so the vectorizer is not refitted in this cell.
for k in list_k:
    km = KMeans(n_clusters=k)
    km.fit(zipped_data)
    sse.append(km.inertia_)
# Plot sse against k
plt.figure(figsize=(6, 6))
plt.plot(list_k, sse, '-o')
plt.xlabel(r'Number of clusters *k*')
plt.ylabel('Sum of squared distance');

In [None]:
# Elbow test on the GloVe document vectors: fit K-Means for k = 1..9 and
# record the inertia reported by each fit.
list_k = list(range(1, 10))
sse = []

# Fresh vectorizer, so this cell does not depend on earlier fits.
vectorizer = GloveVectorizer()
Xtrain = vectorizer.fit_transform(train.content)

for k in list_k:
    km = KMeans(n_clusters=k).fit(Xtrain)
    sse.append(km.inertia_)

# Inertia versus number of clusters.
plt.figure(figsize=(6, 6))
plt.plot(list_k, sse, '-o')
plt.xlabel(r'Number of clusters *k*')
plt.ylabel('Sum of squared distance');