# Exercise 2, Section 3

In [1]:
# Section 3
import io, nltk, sys, gensim, math, collections
from gensim.models.word2vec import LineSentence
from gensim.models import doc2vec
from collections import Counter, defaultdict
from sklearn.metrics.cluster import v_measure_score
from sklearn.cluster import KMeans
from gensim.models.doc2vec import Doc2Vec, TaggedDocument




In [2]:
stop_words = [line.rstrip('\n') for line in open('Stop_words2.txt', encoding="utf8")]

In [3]:
lines = [line.rstrip('\n') for line in open('Data/HAM-Train.txt', encoding="utf8")]

true_labels = []

puredFile = open('puredInputSectionThree.txt', 'w', encoding="utf8")
for i, line in enumerate(lines):
    delimiter = line.find('@@@@@@@@@@')+10
    true_labels.append(line[:delimiter-10])
    line = line[delimiter:]
    tokens = nltk.word_tokenize(line)
    for token in tokens:
        if token in stop_words:
            continue
        else:
            puredFile.write(token+" ")
    puredFile.write("\n")
puredFile.close()

In [4]:
documents = [line.rstrip('\n') for line in open('puredInputSectionThree.txt', encoding="utf8")]
totalDocuments = len(lines)

In [5]:
documents_tagged = [TaggedDocument(doc, [i]) for i, doc in enumerate(documents)]
model_doc2vec = doc2vec.Doc2Vec(documents_tagged, vector_size=300, window=5, min_count=1, workers=4, dm=0)
model_doc2vec.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)

In [6]:
docs_vector = []
for doc in documents:
    docs_vector.append(model_doc2vec.infer_vector(doc))

In [7]:
kmeans = KMeans(n_clusters=5, random_state=0).fit(docs_vector)

Counter({1: 1796, 3: 1666, 4: 1450, 2: 1448, 0: 1380})


In [8]:
cluster = collections.defaultdict(lambda: 0)
doc_type_counter = collections.defaultdict(lambda: 0)
lines = [line.rstrip('\n') for line in open('Data/HAM-Train.txt', encoding="utf8")]
for i,line in enumerate(lines):
    delimiter = line.find('@@@@@@@@@@')
    cluster[kmeans.labels_[i], line[:delimiter]] += 1
    doc_type_counter[line[:delimiter]] += 1

In [9]:
doc_types = ['اقتصاد', 'سیاسی', 'اجتماعی', 'ادب و هنر', 'ورزش']

# Calculating NMI

In [11]:
# Calculating NMI

total_documents = 7740


# calculate class entropy
y = 0
for doc_type in doc_types: 
    x = doc_type_counter[doc_type]/total_documents
#     print(x)
#     print(x*math.log2(x))
    y += (x*math.log2(x))
class_entropy = y
    
    
# calculating cluster entropy
z = 0
for i in range(5):
    x = Counter(kmeans.labels_)[i]/total_documents
    z += (x*math.log2(x))
cluster_entropy = z


# calculating mutual information
k = 0
for i in range(5):
    l = 0
    for doc_type in doc_types:
        x = cluster[(i), doc_type] / Counter(kmeans.labels_)[i]
        if (x!=0):
            l += (x*math.log2(x))
#         print(i)
#         print(doc_type)
#         print(cluster[(i), doc_type])
#         print(Counter(kmeans.labels_)[i])
    k += (Counter(kmeans.labels_)[i]/total_documents)*l
mutual_information = y-k


# calculate NMI
n = (2 * mutual_information) / (class_entropy + cluster_entropy)
print("Our NMI is: "+str(n))

Our Class entropy is: -2.1980033260728837
Our Cluster entropy is: -2.3146202239365077
Our Mutual Information is: -0.3990818392306732
Our NMI is: 0.17687353478880047


# Calculating Accuracy

In [12]:
# Calculating Accuracy

clusters_label = []
for i in range(5):
    max_val = 0 
    for doc_type in doc_types:
        if(max_val < cluster[(i), doc_type]):
            max_val = cluster[(i), doc_type]
            type = doc_type
    clusters_label.append(type)

x = 0
for i in range(5):
    x += cluster[(i), clusters_label[i]]
print(x/total_documents)

0.5142118863049095


# Calculating F-Measure


In [13]:
# Calculating F-Measure


# calculating precision & recall
class_precision = []
class_recall = []
for i in range(5):
    precision = cluster[(i), clusters_label[i]]/Counter(kmeans.labels_)[i]
    recall = cluster[(i), clusters_label[i]]/doc_type_counter[clusters_label[i]]
    class_precision.append(precision)
    class_recall.append(recall)

precision = 0
recall = 0
for i in range(5):
    precision += (Counter(kmeans.labels_)[i]/total_documents)*class_precision[i]
    recall += (Counter(kmeans.labels_)[i]/total_documents)*class_recall[i]


f_measure = (2*precision*recall)/(precision+recall)
print(f_measure)

0.46902304102897546


# Calculating V-Measure

In [14]:
x = {}
for i, item in enumerate(clusters_label):
    x[item] = i
true_labels_code = []
for i, item in enumerate(true_labels[:]):
    try:
        true_labels_code.append(x[true_labels[i]])
    except:
        true_labels_code.append(5)

print("Our V-Measure is: "+str(v_measure_score(true_labels_code, kmeans.labels_)))

# Going to Test dataset

In [17]:
lines_test = [line.rstrip('\n') for line in open('Data/HAM-Test.txt', encoding="utf8")]

In [18]:
true_labels_test = []
doc_type_counter_test = collections.defaultdict(lambda: 0)
puredFile = open('puredInputSectionThree_test.txt', 'w', encoding="utf8")
for i, line in enumerate(lines_test[:]):
    delimiter = line.find('@@@@@@@@@@')+10
    doc_type_counter_test[line[:delimiter-10]] += 1
    true_labels_test.append(line[:delimiter-10])
    line = line[delimiter:]
    tokens = nltk.word_tokenize(line)
    for token in tokens:
        if token in stop_words:
            continue
        else:
            puredFile.write(token+" ")
    puredFile.write("\n")
puredFile.close()

In [19]:
lines = [line.rstrip('\n') for line in open('puredInputSectionThree_test.txt', encoding="utf8")]

In [20]:
# Creating document vector for test documents
document_vector_test = []

for i, line in enumerate(lines):
    x = model_doc2vec.infer_vector(line)
    document_vector_test.append(x)

In [21]:
print(len(document_vector_test))


860


In [22]:
kmeans_test_counter = collections.defaultdict(lambda: 0)
test_clustering = collections.defaultdict(lambda: 0)
predicted_labels_test = []

for i, item in enumerate(document_vector_test[:]):
    x = kmeans.predict(document_vector_test[i].reshape(1, -1))[0]
    kmeans_test_counter[x] += 1
    test_clustering[x, true_labels_test[i]] +=1
    predicted_labels_test.append(x)

# Calculating NMI

In [24]:
total_documents_test = len(document_vector_test)

# calculate class entropy
y = 0
for doc_type in doc_types:
    x = doc_type_counter_test[doc_type]/total_documents_test
    y += (x*math.log2(x))
class_entropy = y

# calculating cluster entropy
z = 0
for i in range(5):
    x = kmeans_test_counter[i]/total_documents_test
    try:
        z += (x*math.log2(x))
    except:
        continue
cluster_entropy = z
    
# calculating mutual information
k = 0
for i in range(5):
    l = 0
    for doc_type in doc_types:
        try:
            x = test_clustering[i, doc_type] / kmeans_test_counter[i]
        except: 
            continue
        if (x!=0):
            l += (x*math.log2(x))
    k += (kmeans_test_counter[i]/total_documents_test)*l
mutual_information = y-k

# calculate NMI
n = (2 * mutual_information) / (class_entropy + cluster_entropy)
print("Our NMI is: "+str(n))

Our Class entropy is: -2.2116490392885453
Our Cluster entropy is: -2.3158108086436404
Our Mutual Information is: -0.4244797174580748
Our NMI is: 0.18751341004247504


# Calculating Accuracy

In [25]:
# Calculating Accuracy

clusters_label = []
for i in range(5):
    max_val = 0 
    for doc_type in doc_types:
        if(max_val < cluster[(i), doc_type]):
            max_val = cluster[(i), doc_type]
            type = doc_type
    clusters_label.append(type)

x = 0
for i in range(5):
    x += test_clustering[(i), clusters_label[i]]
print(x/total_documents_test)

0.5069767441860465


# Calculating F-Measure 

In [26]:

# calculating precision & recall
class_precision_test = []
class_recall_test = []
for i in range(5):
    precision = test_clustering[(i), clusters_label[i]]/kmeans_test_counter[i]
    recall = test_clustering[(i), clusters_label[i]]/doc_type_counter_test[clusters_label[i]]
    class_precision.append(precision)
    class_recall.append(recall)

precision = 0
recall = 0
for i in range(5):
    precision += (kmeans_test_counter[i]/total_documents_test)*class_precision[i]
    recall += (kmeans_test_counter[i]/total_documents_test)*class_recall[i]

f_measure = (2*precision*recall)/(precision+recall)
print(f_measure)

0.4726177308612159


# Calculating V-Measure

In [27]:
x = {}
for i, item in enumerate(clusters_label):
#     print(i)
#     print(item)
    x[item] = i

true_labels_code_test = []
for i, item in enumerate(true_labels_test[:]):
    try:
        true_labels_code_test.append(x[true_labels_test[i]])
    except:
        true_labels_code_test.append(5)
# Calculating V-Measure
print("Our V-Measure is: "+str(v_measure_score(true_labels_code_test, predicted_labels_test)))