# Exercise 2, Section 4

In [1]:
# Section 4

import io, nltk, sys, math, collections
from collections import Counter, defaultdict
from sklearn.metrics.cluster import v_measure_score
from sklearn.cluster import KMeans
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from scipy.linalg import svd
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csc_matrix
from scipy.sparse.linalg import svds, eigs

In [2]:
stop_words = [line.rstrip('\n') for line in open('Stop_words2.txt', encoding="utf8")]

In [3]:
lines = [line.rstrip('\n') for line in open('Data/HAM-Train.txt', encoding="utf8")]
lines_test = [line.rstrip('\n') for line in open('Data/HAM-Test.txt', encoding="utf8")]
totalDocuments_train = len(lines)
totalDocuments_test = len(lines_test)

true_labels = []

words = []

puredFile = open('puredInputSectionFour.txt', 'w', encoding="utf8")
for i, line in enumerate(lines):
    delimiter = line.find('@@@@@@@@@@')+10
    true_labels.append(line[:delimiter-10])
    line = line[delimiter:]
    tokens = nltk.word_tokenize(line)
    for token in tokens:
        if token in stop_words:
            continue
        else:
            puredFile.write(token+" ")
            if not token in words:
                words.append(token)
    puredFile.write("\n")
    
for i, line in enumerate(lines_test):
    delimiter = line.find('@@@@@@@@@@')+10
    true_labels.append(line[:delimiter-10])
    line = line[delimiter:]
    tokens = nltk.word_tokenize(line)
    for token in tokens:
        if token in stop_words:
            continue
        else:
            puredFile.write(token+" ")
            if not token in words:
                words.append(token)
    puredFile.write("\n")
    
puredFile.close()

In [4]:
all_documents = [line.rstrip('\n') for line in open('puredInputSectionFour.txt', encoding="utf8")]

In [5]:
vec = CountVectorizer()
X = vec.fit_transform(all_documents)
matrix = csc_matrix(X.T, dtype=float)

In [6]:
u, s, representation = svds(matrix, k=300)

In [7]:
representation = representation.transpose()

In [8]:
representation_train = representation[:7740]

In [9]:
representation_test = representation[7740:8600]

In [10]:
kmeans = KMeans(n_clusters=5, random_state=0).fit(representation_train)

In [12]:
cluster = collections.defaultdict(lambda: 0)
doc_type_counter = collections.defaultdict(lambda: 0)
lines = [line.rstrip('\n') for line in open('Data/HAM-Train.txt', encoding="utf8")]
for i,line in enumerate(lines):
    delimiter = line.find('@@@@@@@@@@')
    cluster[kmeans.labels_[i], line[:delimiter]] += 1
    doc_type_counter[line[:delimiter]] += 1

In [13]:
doc_types = ['اقتصاد', 'سیاسی', 'اجتماعی', 'ادب و هنر', 'ورزش']

# Calculating NMI

In [15]:
total_documents = 7740


# calculate class entropy
y = 0
for doc_type in doc_types: 
    x = doc_type_counter[doc_type]/total_documents
    y += (x*math.log2(x))
class_entropy = y
    
# calculating cluster entropy
z = 0
for i in range(5):
    x = Counter(kmeans.labels_)[i]/total_documents
    z += (x*math.log2(x))
cluster_entropy = z

# calculating mutual information
k = 0
for i in range(5):
    l = 0
    for doc_type in doc_types:
        x = cluster[(i), doc_type] / Counter(kmeans.labels_)[i]
        if (x!=0):
            l += (x*math.log2(x))
    k += (Counter(kmeans.labels_)[i]/total_documents)*l
mutual_information = y-k

# calculate NMI
n = (2 * mutual_information) / (class_entropy + cluster_entropy)
print("Our NMI is: "+str(n))

Our Class entropy is: -2.1980033260728837
Our Cluster entropy is: -0.5300210972654978
Our Mutual Information is: -0.05517303088563352
Our NMI is: 0.04044907399921024


# Calculating Accuracy

In [16]:
clusters_label = []
for i in range(5):
    max_val = 0 
    for doc_type in doc_types:
        if(max_val < cluster[(i), doc_type]):
            max_val = cluster[(i), doc_type]
            type = doc_type
    clusters_label.append(type)

x = 0
for i in range(5):
    x += cluster[(i), clusters_label[i]]
print(x/total_documents)

0.28488372093023256


# Calculating F-Measure

In [17]:
# calculating precision & recall
class_precision = []
class_recall = []
for i in range(5):
    precision = cluster[(i), clusters_label[i]]/Counter(kmeans.labels_)[i]
    recall = cluster[(i), clusters_label[i]]/doc_type_counter[clusters_label[i]]
    class_precision.append(precision)
    class_recall.append(recall)

precision = 0
recall = 0
for i in range(5):
    precision += (Counter(kmeans.labels_)[i]/total_documents)*class_precision[i]
    recall += (Counter(kmeans.labels_)[i]/total_documents)*class_recall[i]
    
f_measure = (2*precision*recall)/(precision+recall)
print(f_measure)

0.4250422681867488


# Calculating V-Measure

In [18]:
x = {}
for i, item in enumerate(clusters_label):
    x[item] = i
true_labels_code = []
for i, item in enumerate(true_labels[:]):
    try:
        true_labels_code.append(x[true_labels[i]])
    except:
        true_labels_code.append(5)
# Calculating V-Measure
print("Our V-Measure is: "+str(v_measure_score(true_labels_code[:7740], kmeans.labels_)))

Our V-Measure is: 0.040449073999209564


# Going to Test dataset

In [19]:
true_labels_test = []
doc_type_counter_test = collections.defaultdict(lambda: 0)
puredFile = open('puredInputSectionThree_test.txt', 'w', encoding="utf8")
for i, line in enumerate(lines_test[:]):
    delimiter = line.find('@@@@@@@@@@')+10
    doc_type_counter_test[line[:delimiter-10]] += 1
    true_labels_test.append(line[:delimiter-10])
    line = line[delimiter:]
    tokens = nltk.word_tokenize(line)
    for token in tokens:
        if token in stop_words:
            continue
        else:
            puredFile.write(token+" ")
    puredFile.write("\n")
puredFile.close()

In [20]:
lines = [line.rstrip('\n') for line in open('puredInputSectionThree_test.txt', encoding="utf8")]

In [34]:
representation_test = representation[7740:8600]

(860, 300)


In [39]:
kmeans_test_counter = collections.defaultdict(lambda: 0)
test_clustering = collections.defaultdict(lambda: 0)
predicted_labels_test = []

for i, item in enumerate(representation_test):
    x = kmeans.predict(item.reshape(1, -1))[0]
    kmeans_test_counter[x] += 1
    test_clustering[x, true_labels_test[i]] +=1
    predicted_labels_test.append(x)

# Calculating NMI


In [42]:
# calculate class entropy
y = 0
for doc_type in doc_types:
    x = doc_type_counter_test[doc_type]/totalDocuments_test
    y += (x*math.log2(x))
class_entropy = y


# calculating cluster entropy
z = 0
for i in range(5):
    x = kmeans_test_counter[i]/totalDocuments_test
    try:
        z += (x*math.log2(x))
    except:
        continue
cluster_entropy = z
    
# calculating mutual information
k = 0
for i in range(5):
    l = 0
    for doc_type in doc_types:
        try:
            x = test_clustering[i, doc_type] / kmeans_test_counter[i]
        except: 
            continue
        if (x!=0):
            l += (x*math.log2(x))
    k += (kmeans_test_counter[i]/totalDocuments_test)*l
mutual_information = y-k

# calculate NMI
n = (2 * mutual_information) / (class_entropy + cluster_entropy)
print("Our NMI is: "+str(n))

Our Class entropy is: -2.2116490392885453
Our Cluster entropy is: -0.45530339024128674
Our Mutual Information is: -0.06275360282543785
Our NMI is: 0.047060159101901144


# Calculating Accuracy

In [44]:
clusters_label = []
for i in range(5):
    max_val = 0 
    for doc_type in doc_types:
        if(max_val < cluster[(i), doc_type]):
            max_val = cluster[(i), doc_type]
            type = doc_type
    clusters_label.append(type)

x = 0
for i in range(5):
    x += test_clustering[(i), clusters_label[i]]
print(x/totalDocuments_test)

0.27674418604651163


# Calculating F-Measure 

In [49]:
# calculating precision & recall
class_precision_test = []
class_recall_test = []
for i in range(5):
    try:
        precision = test_clustering[(i), clusters_label[i]]/kmeans_test_counter[i]
    except:
        precision = 0
    recall = test_clustering[(i), clusters_label[i]]/doc_type_counter_test[clusters_label[i]]
    class_precision.append(precision)
    class_recall.append(recall)

precision = 0
recall = 0
for i in range(5):
    precision += (kmeans_test_counter[i]/totalDocuments_test)*class_precision[i]
    recall += (kmeans_test_counter[i]/totalDocuments_test)*class_recall[i]

f_measure = (2*precision*recall)/(precision+recall)
print(f_measure)

0.4219679202863865


# Calculating V-Measure


In [50]:
x = {}
for i, item in enumerate(clusters_label):
    x[item] = i

true_labels_code_test = []
for i, item in enumerate(true_labels_test[:]):
    try:
        true_labels_code_test.append(x[true_labels_test[i]])
    except:
        true_labels_code_test.append(5)
        
print("Our V-Measure is: "+str(v_measure_score(true_labels_code_test, predicted_labels_test)))

Our V-Measure is: 0.04706015910190115
