In [2]:
# Section 1

import io, nltk, sys, gensim, math, collections, numpy
from gensim.models.word2vec import LineSentence
from gensim.models import Word2Vec
from collections import Counter, defaultdict
from sklearn.metrics.cluster import v_measure_score
from sklearn.cluster import KMeans



# Reading train file and remove stop words

In [3]:
# We preproceed our input and put it in puredInputSectionOne

stop_words = [line.rstrip('\n') for line in open('Stop_words2.txt', encoding="utf8")]

In [4]:
lines = [line.rstrip('\n') for line in open('Data/HAM-Train.txt', encoding="utf8")]

In [5]:
true_labels = []

puredFile = open('puredInputSectionOne.txt', 'w', encoding="utf8")
for i, line in enumerate(lines):
    delimiter = line.find('@@@@@@@@@@')+10
    
    true_labels.append(line[:delimiter-10])
    
    line = line[delimiter:]
    tokens = nltk.word_tokenize(line)
    for token in tokens:
        if token in stop_words:
            continue
        else:
            puredFile.write(token+" ")
    puredFile.write("\n")
puredFile.close()

In [6]:
lines = [line.rstrip('\n') for line in open('puredInputSectionOne.txt', encoding="utf8")]
sentences = LineSentence('puredInputSectionOne.txt')

In [7]:
model = Word2Vec(sentences, size=300, window=5, min_count=1, workers=4)

In [8]:
document_vector = []

for i,line in enumerate(lines[:]):
    tokens = nltk.word_tokenize(line)
    sum_vector = model.wv[tokens[0]]
    counter = 1
    for token in tokens[1:]:
        counter = counter + 1
        sum_vector = sum_vector + model.wv[token]
    document_vector.append(sum_vector/counter)

In [9]:
kmeans = KMeans(n_clusters=5, random_state=0).fit(document_vector)

In [13]:
cluster = collections.defaultdict(lambda: 0)
doc_type_counter = collections.defaultdict(lambda: 0)
lines = [line.rstrip('\n') for line in open('Data/HAM-Train.txt', encoding="utf8")]
for i,line in enumerate(lines):
    delimiter = line.find('@@@@@@@@@@')
    cluster[kmeans.labels_[i], line[:delimiter]] += 1
    doc_type_counter[line[:delimiter]] += 1

In [14]:
doc_types = ['اقتصاد', 'سیاسی', 'اجتماعی', 'ادب و هنر', 'ورزش']

# Calculating NMI

In [16]:
total_documents = 7740

# calculate class entropy
y = 0
for doc_type in doc_types: 
    x = doc_type_counter[doc_type]/total_documents
    y += (x*math.log2(x))
class_entropy = y
    
# calculating cluster entropy
z = 0
for i in range(5):
    x = Counter(kmeans.labels_)[i]/total_documents
    z += (x*math.log2(x))
cluster_entropy = z

# calculating mutual information
k = 0
for i in range(5):
    l = 0
    for doc_type in doc_types:
        x = cluster[(i), doc_type] / Counter(kmeans.labels_)[i]
        if (x!=0):
            l += (x*math.log2(x))
    k += (Counter(kmeans.labels_)[i]/total_documents)*l
mutual_information = y-k

# calculate NMI
n = (2 * mutual_information) / (class_entropy + cluster_entropy)
print("Our NMI is: "+str(n))

Our Class entropy is: -2.1980033260728837
Our Cluster entropy is: -2.0663232476449664
Our Mutual Information is: -1.1959163121702119
Our NMI is: 0.5608933985220335


# Calculating Accuracy

In [17]:
clusters_label = []
for i in range(5):
    max_val = 0 
    for doc_type in doc_types:
        if(max_val < cluster[(i), doc_type]):
            max_val = cluster[(i), doc_type]
            type = doc_type
    clusters_label.append(type)

x = 0
for i in range(5):
    x += cluster[(i), clusters_label[i]]
print(x/total_documents)

0.7277777777777777


# Calculating F-Measure

In [18]:
# calculating precision & recall
class_precision = []
class_recall = []
for i in range(5):
    precision = cluster[(i), clusters_label[i]]/Counter(kmeans.labels_)[i]
    recall = cluster[(i), clusters_label[i]]/doc_type_counter[clusters_label[i]]
    class_precision.append(precision)
    class_recall.append(recall)

precision = 0
recall = 0
for i in range(5):
    precision += (Counter(kmeans.labels_)[i]/total_documents)*class_precision[i]
    recall += (Counter(kmeans.labels_)[i]/total_documents)*class_recall[i]

f_measure = (2*precision*recall)/(precision+recall)
print(f_measure)

0.736731024254575


# Calculating V-Measure

In [26]:
x = {}
for i, item in enumerate(clusters_label):
    x[item] = i

true_labels_code = []
for i, item in enumerate(true_labels[:]):
    try:
        true_labels_code.append(x[true_labels[i]])
    except:
        true_labels_code.append(5)
        
# Calculating V-Measure
print("Our V-Measure is: "+str(v_measure_score(true_labels_code, kmeans.labels_)))

# Going to Test dataset

In [29]:
# Going to test dataset
lines_test = [line.rstrip('\n') for line in open('Data/HAM-Test.txt', encoding="utf8")]

In [30]:
true_labels_test = []
doc_type_counter_test = collections.defaultdict(lambda: 0)
puredFile = open('puredInputSectionOne_test.txt', 'w', encoding="utf8")
for i, line in enumerate(lines_test):
    delimiter = line.find('@@@@@@@@@@')+10
    doc_type_counter_test[line[:delimiter-10]] += 1
    true_labels_test.append(line[:delimiter-10])
    line = line[delimiter:]
    tokens = nltk.word_tokenize(line)
    for token in tokens:
        if token in stop_words:
            continue
        else:
            puredFile.write(token+" ")
    puredFile.write("\n")
puredFile.close()

In [31]:
lines = [line.rstrip('\n') for line in open('puredInputSectionOne_test.txt', encoding="utf8")]

# Creating document vector for test documents

In [32]:
document_vector_test = []

for i,line in enumerate(lines[:]):
    tokens = nltk.word_tokenize(line)
    sum_vector = [0] * 300
    counter = 1
    for token in tokens[1:]:
        counter = counter + 1
        try:
            sum_vector = sum_vector + model.wv[token]
        except:
            continue
    document_vector_test.append(sum_vector/counter)

kmeans_test_counter = collections.defaultdict(lambda: 0)
test_clustering = collections.defaultdict(lambda: 0)
predicted_labels_test = []

for i, item in enumerate(document_vector_test[:]):
    x = kmeans.predict(document_vector_test[i].reshape(1, -1))[0]
    kmeans_test_counter[x] += 1
    test_clustering[x, true_labels_test[i]] +=1
    predicted_labels_test.append(x)

# Calculating NMI

In [34]:
total_documents_test = len(document_vector_test)

# calculate class entropy
y = 0
for doc_type in doc_types:
    x = doc_type_counter_test[doc_type]/total_documents_test
    y += (x*math.log2(x))
class_entropy = y

# calculating cluster entropy
z = 0
for i in range(5):
    x = kmeans_test_counter[i]/total_documents_test
    z += (x*math.log2(x))
cluster_entropy = z
    
# calculating mutual information
k = 0
for i in range(5):
    l = 0
    for doc_type in doc_types:
        x = test_clustering[i, doc_type] / kmeans_test_counter[i]
        if (x!=0):
            l += (x*math.log2(x))
    k += (kmeans_test_counter[i]/total_documents_test)*l
mutual_information = y-k

# calculate NMI
n = (2 * mutual_information) / (class_entropy + cluster_entropy)
print("Our NMI is: "+str(n))

Our Class entropy is: -2.2116490392885453
Our Cluster entropy is: -2.053538325758056
Our Mutual Information is: -1.2401656563786483
Our NMI is: 0.581529274208144


# Calculating Accuracy

In [40]:
clusters_label = []
for i in range(5):
    max_val = 0 
    for doc_type in doc_types:
        if(max_val < cluster[(i), doc_type]):
            max_val = cluster[(i), doc_type]
            type = doc_type
    clusters_label.append(type)
#print(clusters_label)

x = 0
for i in range(5):
#     print(test_clustering[(i), clusters_label[i]])
    x += test_clustering[(i), clusters_label[i]]
print(x/total_documents_test)

# Calculating F-Measure 

In [44]:
# calculating precision & recall
class_precision_test = []
class_recall_test = []
for i in range(5):
    precision = test_clustering[(i), clusters_label[i]]/kmeans_test_counter[i]
    recall = test_clustering[(i), clusters_label[i]]/doc_type_counter_test[clusters_label[i]]
    class_precision.append(precision)
    class_recall.append(recall)

precision = 0
recall = 0
for i in range(5):
    precision += (kmeans_test_counter[i]/total_documents_test)*class_precision[i]
    recall += (kmeans_test_counter[i]/total_documents_test)*class_recall[i]

f_measure = (2*precision*recall)/(precision+recall)
print(f_measure)

0.7485153758376375


# Calculating V-Measure

In [45]:
x = {}
for i, item in enumerate(clusters_label):
    x[item] = i

true_labels_code_test = []
for i, item in enumerate(true_labels_test[:]):
    try:
        true_labels_code_test.append(x[true_labels_test[i]])
    except:
        true_labels_code_test.append(5)

# Calculating V-Measure
print("Our V-Measure is: "+str(v_measure_score(true_labels_code_test, predicted_labels_test)))