In [2]:
# Section 1

import io, nltk, sys, gensim, math, collections, numpy
from gensim.models.word2vec import LineSentence
from gensim.models import Word2Vec
from collections import Counter, defaultdict
from sklearn.metrics.cluster import v_measure_score
from sklearn.cluster import KMeans



# Reading train file and remove stop words

In [3]:
# We preproceed our input and put it in puredInputSectionOne

stop_words = [line.rstrip('\n') for line in open('Stop_words2.txt', encoding="utf8")]

In [4]:
lines = [line.rstrip('\n') for line in open('Data/HAM-Train.txt', encoding="utf8")]

In [5]:
true_labels = []

puredFile = open('puredInputSectionOne.txt', 'w', encoding="utf8")
for i, line in enumerate(lines):
    delimiter = line.find('@@@@@@@@@@')+10
    
    true_labels.append(line[:delimiter-10])
    
    line = line[delimiter:]
    tokens = nltk.word_tokenize(line)
    for token in tokens:
        if token in stop_words:
            continue
        else:
            puredFile.write(token+" ")
    puredFile.write("\n")
puredFile.close()

In [6]:
lines = [line.rstrip('\n') for line in open('puredInputSectionOne.txt', encoding="utf8")]
sentences = LineSentence('puredInputSectionOne.txt')

In [7]:
model = Word2Vec(sentences, size=300, window=5, min_count=1, workers=4)

In [8]:
# document_vector = collections.defaultdict(lambda: 0)
document_vector = []

for i,line in enumerate(lines[:]):
    tokens = nltk.word_tokenize(line)
    sum_vector = model.wv[tokens[0]]
    counter = 1
    for token in tokens[1:]:
        counter = counter + 1
        sum_vector = sum_vector + model.wv[token]
#     print(counter)
#     print(sum_vector)
#     print(sum_vector/counter)
    document_vector.append(sum_vector/counter)

In [9]:
kmeans = KMeans(n_clusters=5, random_state=0).fit(document_vector)

In [10]:
kmeans.cluster_centers_.shape

(5, 300)

In [11]:
kmeans.labels_[0]
print(Counter(kmeans.labels_))
print(Counter(kmeans.labels_)[0])

Counter({1: 2649, 2: 1827, 3: 1582, 4: 1522, 0: 160})
160


In [12]:
kmeans.labels_[0]

2

In [13]:
cluster = collections.defaultdict(lambda: 0)
doc_type_counter = collections.defaultdict(lambda: 0)
lines = [line.rstrip('\n') for line in open('Data/HAM-Train.txt', encoding="utf8")]
for i,line in enumerate(lines):
    delimiter = line.find('@@@@@@@@@@')
    cluster[kmeans.labels_[i], line[:delimiter]] += 1
    doc_type_counter[line[:delimiter]] += 1

In [14]:
doc_types = ['اقتصاد', 'سیاسی', 'اجتماعی', 'ادب و هنر', 'ورزش']

In [15]:
for i in range(5):
    print("Cluster "+str(i))
    for doc_type in doc_types:
        print("   "+doc_type+"   "+str(cluster[(i), doc_type]))

Cluster 0
   اقتصاد   160
   سیاسی   0
   اجتماعی   0
   ادب و هنر   0
   ورزش   0
Cluster 1
   اقتصاد   106
   سیاسی   1611
   اجتماعی   874
   ادب و هنر   14
   ورزش   44
Cluster 2
   اقتصاد   1628
   سیاسی   84
   اجتماعی   97
   ادب و هنر   14
   ورزش   4
Cluster 3
   اقتصاد   1
   سیاسی   0
   اجتماعی   3
   ادب و هنر   7
   ورزش   1571
Cluster 4
   اقتصاد   94
   سیاسی   205
   اجتماعی   663
   ادب و هنر   407
   ورزش   153


# Calculating NMI

In [16]:
# Calculating NMI

total_documents = 7740


# calculate class entropy
y = 0
for doc_type in doc_types: 
    x = doc_type_counter[doc_type]/total_documents
#     print(x)
#     print(x*math.log2(x))
    y += (x*math.log2(x))
class_entropy = y
print("Our Class entropy is: "+str(y))
    
    
# calculating cluster entropy
z = 0
for i in range(5):
    x = Counter(kmeans.labels_)[i]/total_documents
    z += (x*math.log2(x))
cluster_entropy = z
print("Our Cluster entropy is: "+str(z))


# calculating mutual information
k = 0
for i in range(5):
    l = 0
    for doc_type in doc_types:
        x = cluster[(i), doc_type] / Counter(kmeans.labels_)[i]
        if (x!=0):
            l += (x*math.log2(x))
#         print(i)
#         print(doc_type)
#         print(cluster[(i), doc_type])
#         print(Counter(kmeans.labels_)[i])
    k += (Counter(kmeans.labels_)[i]/total_documents)*l
mutual_information = y-k
# print(k)
print("Our Mutual Information is: "+str(mutual_information))


# calculate NMI
n = (2 * mutual_information) / (class_entropy + cluster_entropy)
print("Our NMI is: "+str(n))

Our Class entropy is: -2.1980033260728837
Our Cluster entropy is: -2.0663232476449664
Our Mutual Information is: -1.1959163121702119
Our NMI is: 0.5608933985220335


# Calculating Accuracy

In [17]:
# Calculating Accuracy

clusters_label = []
for i in range(5):
    max_val = 0 
    for doc_type in doc_types:
        if(max_val < cluster[(i), doc_type]):
            max_val = cluster[(i), doc_type]
            type = doc_type
    clusters_label.append(type)
#print(clusters_label)

x = 0
for i in range(5):
    x += cluster[(i), clusters_label[i]]
print(x/total_documents)


0.7277777777777777


# Calculating F-Measure

In [18]:
# Calculating F-Measure


# calculating precision & recall
class_precision = []
class_recall = []
for i in range(5):
    precision = cluster[(i), clusters_label[i]]/Counter(kmeans.labels_)[i]
    recall = cluster[(i), clusters_label[i]]/doc_type_counter[clusters_label[i]]
    class_precision.append(precision)
    class_recall.append(recall)

precision = 0
recall = 0
for i in range(5):
    precision += (Counter(kmeans.labels_)[i]/total_documents)*class_precision[i]
    recall += (Counter(kmeans.labels_)[i]/total_documents)*class_recall[i]

# print(precision)
# print(recall)

f_measure = (2*precision*recall)/(precision+recall)
print(f_measure)

0.736731024254575


In [19]:
print(class_precision)

[1.0, 0.608154020385051, 0.8910782703886152, 0.993046776232617, 0.435611038107753]


In [20]:
print(class_recall)

[0.08044243338360986, 0.8478947368421053, 0.8185017596782302, 0.886568848758465, 0.40500916310323765]


In [21]:
predicted_labels = []
for i in range(20):
#     print(kmeans.labels_[i])
    predicted_labels.append(kmeans.labels_[i])
    

In [22]:
print(predicted_labels)

[2, 1, 4, 1, 4, 1, 2, 3, 4, 2, 1, 3, 1, 1, 4, 2, 2, 3, 2, 1]


In [23]:
print(true_labels[:20])

['اقتصاد', 'سیاسی', 'ادب و هنر', 'سیاسی', 'اجتماعی', 'اجتماعی', 'اقتصاد', 'ورزش', 'ادب و هنر', 'اقتصاد', 'سیاسی', 'ورزش', 'سیاسی', 'ادب و هنر', 'اجتماعی', 'اقتصاد', 'اقتصاد', 'ورزش', 'اقتصاد', 'اجتماعی']


In [24]:
x = {}
for i, item in enumerate(clusters_label):
#     print(i)
#     print(item)
    x[item] = i

In [25]:
print(x)

{'اقتصاد': 2, 'سیاسی': 1, 'ورزش': 3, 'اجتماعی': 4}


In [26]:
true_labels_code = []
for i, item in enumerate(true_labels[:]):
    try:
        true_labels_code.append(x[true_labels[i]])
    except:
        true_labels_code.append(5)

In [27]:
print(len(true_labels_code))

7740


# Calculating V-Measure

In [28]:
# Calculating V-Measure
print("Our V-Measure is: "+str(v_measure_score(true_labels_code, kmeans.labels_)))

Our V-Measure is: 0.5608933985220332


# Section 1-2: Going to Test dataset

In [29]:
# Going to test dataset

lines_test = [line.rstrip('\n') for line in open('Data/HAM-Test.txt', encoding="utf8")]


In [30]:
true_labels_test = []
doc_type_counter_test = collections.defaultdict(lambda: 0)
puredFile = open('puredInputSectionOne_test.txt', 'w', encoding="utf8")
for i, line in enumerate(lines_test):
    delimiter = line.find('@@@@@@@@@@')+10
    doc_type_counter_test[line[:delimiter-10]] += 1
    true_labels_test.append(line[:delimiter-10])
    line = line[delimiter:]
    tokens = nltk.word_tokenize(line)
    for token in tokens:
        if token in stop_words:
            continue
        else:
            puredFile.write(token+" ")
    puredFile.write("\n")
puredFile.close()

In [31]:
lines = [line.rstrip('\n') for line in open('puredInputSectionOne_test.txt', encoding="utf8")]

# Creating document vector for test documents

In [32]:
# Creating document vector for test documents
document_vector_test = []

for i,line in enumerate(lines[:]):
    tokens = nltk.word_tokenize(line)
    sum_vector = [0] * 300
    counter = 1
    for token in tokens[1:]:
        counter = counter + 1
        try:
            sum_vector = sum_vector + model.wv[token]
        except:
            continue
    document_vector_test.append(sum_vector/counter)

In [33]:
kmeans_test_counter = collections.defaultdict(lambda: 0)
test_clustering = collections.defaultdict(lambda: 0)
predicted_labels_test = []

for i, item in enumerate(document_vector_test[:]):
    x = kmeans.predict(document_vector_test[i].reshape(1, -1))[0]
    kmeans_test_counter[x] += 1
    test_clustering[x, true_labels_test[i]] +=1
    predicted_labels_test.append(x)
    
# print(test_clustering)

# Calculating NMI

In [34]:
# Calculating NMI

total_documents_test = len(document_vector_test)

# calculate class entropy
y = 0
for doc_type in doc_types:
    x = doc_type_counter_test[doc_type]/total_documents_test
#     try:
    y += (x*math.log2(x))
#     except:
#         continue
class_entropy = y
print("Our Class entropy is: "+str(y))


# calculating cluster entropy
z = 0
for i in range(5):
    x = kmeans_test_counter[i]/total_documents_test
    z += (x*math.log2(x))
cluster_entropy = z
print("Our Cluster entropy is: "+str(z))
    
# calculating mutual information
k = 0
for i in range(5):
    l = 0
    for doc_type in doc_types:
        x = test_clustering[i, doc_type] / kmeans_test_counter[i]
        if (x!=0):
            l += (x*math.log2(x))
    k += (kmeans_test_counter[i]/total_documents_test)*l
mutual_information = y-k
# print("K is: "+str(k))
print("Our Mutual Information is: "+str(mutual_information))

# calculate NMI
n = (2 * mutual_information) / (class_entropy + cluster_entropy)
print("Our NMI is: "+str(n))

Our Class entropy is: -2.2116490392885453
Our Cluster entropy is: -2.053538325758056
Our Mutual Information is: -1.2401656563786483
Our NMI is: 0.581529274208144


# Calculating Accuracy

In [40]:
# Calculating Accuracy

clusters_label = []
for i in range(5):
    max_val = 0 
    for doc_type in doc_types:
        if(max_val < cluster[(i), doc_type]):
            max_val = cluster[(i), doc_type]
            type = doc_type
    clusters_label.append(type)
#print(clusters_label)

In [41]:
print(test_clustering)

defaultdict(<function <lambda> at 0x0000020723E77E18>, {(4, 'ادب و هنر'): 53, (4, 'ورزش'): 21, (1, 'سیاسی'): 172, (1, 'اجتماعی'): 92, (2, 'اقتصاد'): 178, (3, 'ورزش'): 202, (2, 'اجتماعی'): 9, (4, 'اجتماعی'): 62, (1, 'ورزش'): 4, (2, 'سیاسی'): 14, (1, 'اقتصاد'): 11, (4, 'سیاسی'): 14, (0, 'اقتصاد'): 13, (4, 'اقتصاد'): 9, (3, 'ادب و هنر'): 2, (2, 'ورزش'): 1, (2, 'ادب و هنر'): 2, (1, 'ادب و هنر'): 1, (0, 'سیاسی'): 0, (0, 'اجتماعی'): 0, (0, 'ادب و هنر'): 0, (0, 'ورزش'): 0, (3, 'اقتصاد'): 0, (3, 'سیاسی'): 0, (3, 'اجتماعی'): 0})


In [42]:
# Confusion matrix

for i in range(5):
    print("Cluster "+str(i))
    for doc_type in doc_types:
        print("   "+doc_type+"   "+str(test_clustering[(i), doc_type]))

Cluster 0
   اقتصاد   13
   سیاسی   0
   اجتماعی   0
   ادب و هنر   0
   ورزش   0
Cluster 1
   اقتصاد   11
   سیاسی   172
   اجتماعی   92
   ادب و هنر   1
   ورزش   4
Cluster 2
   اقتصاد   178
   سیاسی   14
   اجتماعی   9
   ادب و هنر   2
   ورزش   1
Cluster 3
   اقتصاد   0
   سیاسی   0
   اجتماعی   0
   ادب و هنر   2
   ورزش   202
Cluster 4
   اقتصاد   9
   سیاسی   14
   اجتماعی   62
   ادب و هنر   53
   ورزش   21


# Calculating Accuracy

In [43]:
x = 0
for i in range(5):
#     print(test_clustering[(i), clusters_label[i]])
    x += test_clustering[(i), clusters_label[i]]
print(x/total_documents_test)

0.7290697674418605


# Calculating F-Measure 

In [44]:
# Calculating F-Measure


# calculating precision & recall
class_precision_test = []
class_recall_test = []
for i in range(5):
    precision = test_clustering[(i), clusters_label[i]]/kmeans_test_counter[i]
    recall = test_clustering[(i), clusters_label[i]]/doc_type_counter_test[clusters_label[i]]
    class_precision.append(precision)
    class_recall.append(recall)

precision = 0
recall = 0
for i in range(5):
    precision += (kmeans_test_counter[i]/total_documents_test)*class_precision[i]
    recall += (kmeans_test_counter[i]/total_documents_test)*class_recall[i]

# print(precision)
# print(recall)

f_measure = (2*precision*recall)/(precision+recall)
print(f_measure)

0.7485153758376375


# Calculating V-Measure

In [45]:
x = {}
for i, item in enumerate(clusters_label):
#     print(i)
#     print(item)
    x[item] = i

true_labels_code_test = []
for i, item in enumerate(true_labels_test[:]):
    try:
        true_labels_code_test.append(x[true_labels_test[i]])
    except:
        true_labels_code_test.append(5)

In [46]:
# Calculating V-Measure
print("Our V-Measure is: "+str(v_measure_score(true_labels_code_test, predicted_labels_test)))

Our V-Measure is: 0.5815292742081438
