In [1]:
import os
import numpy as np
import matplotlib.pyplot as plt
import random

In [2]:
def load_data(path):
    
    with open(path) as f:
        tags=[]
        observations=[]
        tag_temp=[]
        obs_temp=[]
        for line in f.readlines():
            if line=='\n':
                tags.append(tag_temp)
                observations.append(obs_temp)
                tag_temp=[]
                obs_temp=[]
            else:
                tag_obs_lst = line.split(' ')
                if len(tag_obs_lst)==1:
                    obs_temp.append(tag_obs_lst[0].split('\n')[0])
                else:
                    # print(tag_obs_lst)
                    tag_temp.append(tag_obs_lst[1].split('\n')[0])
                    obs_temp.append(tag_obs_lst[0])
    return tags,observations

def tag_lib(tags):
    alltag=[]
    for tag in tags:
       for t in tag:
           alltag.append(t)
    #print(alltag)
    possible_tags=list(set(alltag))
    return possible_tags
def obs_lib(observations):
    all_obs=[]
    for observation in observations:
        for obs in observation:
            all_obs.append(obs)
    possible_obs=list(set(all_obs))
    return possible_obs

def map(lib):
    dictionary={}
    for i in range(len(lib)):
        dictionary[lib[i]]=i
    return dictionary

def label(data,dictionary):
    label=[]
    for d in data:
        temp=[]
        for ele in d:
            temp.append(dictionary[ele])
        label.append(temp)
    return label


import math
def modify(matrix):
  doclen=matrix.shape[1]
  modified_matrix=np.zeros(shape=(matrix.shape[0],math.ceil(matrix.shape[1]/10)))
  for i in range(0,doclen,10):
    col=i%10
    modified_matrix[:,col]=np.sum(matrix[:,:i],axis=1)
    matrix=matrix[:,i:]
  return modified_matrix

In [3]:
def calc_assignments(vocab, tags):
    distances = np.empty((len(vocab), len(tags)))
    for i, t in enumerate(tags):
        distances[:, i] = np.linalg.norm(vocab-t, axis=1) 
    assignments = distances.argmin(axis=1)  
    min_distances = distances.min(axis=1)
    return assignments, min_distances


def calc_centroids(vocab, tags, assignments):
    new_centroids = tags.copy()
    subbed = np.empty(vocab.shape)  
    for i, t in enumerate(tags):
        mask = (assignments == i)  
        subbed[mask, :] = t
        words_in_cluster = len(vocab[mask])
        if words_in_cluster:
            new_centroids[i] = vocab[assignments == i].mean(axis=0)
        
    return new_centroids, subbed


def run_kmeans(vocab, tags,epochs):  
    subbed = vocab
    old_tags = 0
    iteration = 0
    for i in range(epochs):
        old_tags = tags
        print(f'Iteration {iteration}')
        assignments, min_distances = calc_assignments(vocab, tags)
        error = min_distances.sum()
        tags, subbed = calc_centroids(vocab, tags, assignments)
        print(f'New centroids:\n{tags}\n')
        iteration += 1
    return tags, assignments

In [4]:
"""
vectorize words using count vector
"""
import numpy as np
tags,observations=load_data('./EN/train')
obslib=obs_lib(observations)
# print(obslib)
taglib=tag_lib(tags)
# print(taglib)
tag_dict=map(taglib)
obs_dict=map(obslib)
# print(obs_dict)

words_matrix=np.zeros(shape=(len(obslib),len(observations)))
for i in range(len(observations)):
    for w in observations[i]:
        words_matrix[obs_dict[w],i]=words_matrix[obs_dict[w],i]+1

modified_words_matrix=modify(words_matrix)

tagsvec=np.zeros(shape=(len(tags),modified_words_matrix.shape[1]))
assignmentslst=[]
for j in range(0,modified_words_matrix.shape[0],2000):
  print('---------------------------------------'+str(j))
  tags,assignments=run_kmeans(modified_words_matrix[:j,:],tagsvec,20)
  modified_words_matrix=modified_words_matrix[j:,:]
  assignmentslst.append(assignments)
  

---------------------------------------0
Iteration 0
New centroids:
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]

Iteration 1
New centroids:
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]

Iteration 2
New centroids:
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]

Iteration 3
New centroids:
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]

Iteration 4
New centroids:
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]

Iteration 5
New centroids:
[[0. 0. 0. .