# Assignment 1 - Clustering Documents

In [1]:
#importing libraries and documents from NLTK
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import nltk
import collections

nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("brown")

[nltk_data] Downloading package stopwords to /Users/admin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/admin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package brown to /Users/admin/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.


True

In [17]:
from nltk.corpus import brown as corpus
    
docs=[corpus.words(fileid) for fileid in corpus.fileids()]

print("num of docs:", len(docs))

num of docs: 500


In [20]:
for n,item in enumerate(corpus.words(corpus.fileids()[0])):
    print(item, end=" ")



In [25]:
# English stopwords defined by the NLTK package.
en_stop = nltk.corpus.stopwords.words('english')

# Ignore noises that might affect our result.
en_stop = ["``","''","/",",.",".,",";","--",":",")","(",'"','&',"'",'),',',"','-','.,','.,"','.-',"?",">","<"]                  \
         +["0","1","2","3","4","5","6","7","8","9"]                                                      \
         +["said","say","u","v","mln","ct","net","dlrs","tonne","pct","shr","nil","company","lt","share","year","billion","price"]          \
         +en_stop

In [26]:
from nltk.corpus import wordnet as wn

def preprocess_word(word, stopwordset):
    
    word=word.lower()
    
    if word in [",","."]:
        return None
    
    if word in stopwordset:
        return None
    
    lemma = wn.morphy(word)
    if lemma is None:
        return word

    elif lemma in stopwordset: 
        return None
    else:
        return lemma
    

def preprocess_document(document):
    document=[preprocess_word(w, en_stop) for w in document]
    document=[w for w in document if w is not None]
    return document

def preprocess_documents(documents):
    return [preprocess_document(document) for document in documents]

In [32]:
pre_docs=preprocess_documents(docs)
pre_docs=[" ".join(doc) for doc in pre_docs]
print(pre_docs[0])



In [33]:
vectorizer = TfidfVectorizer(max_features=200, token_pattern=u'(?u)\\b\\w+\\b' )
tf_idf = vectorizer.fit_transform(pre_docs)

In [59]:
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim = cosine_similarity(tf_idf, tf_idf)
print(cosine_sim)

[[1.         0.63350389 0.47099225 ... 0.3553741  0.46419732 0.5014999 ]
 [0.63350389 1.         0.49839295 ... 0.30142647 0.32996559 0.35159734]
 [0.47099225 0.49839295 1.         ... 0.24180153 0.235491   0.27458611]
 ...
 [0.3553741  0.30142647 0.24180153 ... 1.         0.40172073 0.48187358]
 [0.46419732 0.32996559 0.235491   ... 0.40172073 1.         0.52333239]
 [0.5014999  0.35159734 0.27458611 ... 0.48187358 0.52333239 1.        ]]


In [60]:
# K-means setting
num_clusters = 10
km = KMeans(n_clusters=num_clusters, random_state = 0)

# fit
clusters = km.fit_predict(tf_idf)

In [61]:
for doc, cls in zip(preprocess_documents(docs)[0], clusters):
    print(cls,doc)

1 fulton
1 county
1 grand
1 jury
4 friday
1 investigation
4 atlanta's
1 recent
8 primary
0 election
2 produce
2 evidence
2 irregularity
8 take
2 place
9 jury
9 term-end
9 presentment
4 city
8 executive
9 committee
9 over-all
9 charge
4 election
8 deserve
8 praise
8 thanks
8 city
2 atlanta
9 manner
0 election
2 conduct
8 september-october
4 term
7 jury
7 charge
8 fulton
2 superior
2 court
2 judge
8 durwood
8 pye
8 investigate
6 report
1 possible
8 irregularity
4 hard-fought
8 primary
4 mayor-nominate
8 ivan
1 allen
0 jr.
4 relative
2 handful
8 report
0 receive
2 jury
4 consider
1 widespread
1 interest
8 election
1 number
8 voter
8 size
8 city
0 jury
8 find
2 many
8 georgia's
8 registration
2 election
2 laws
2 outmode
8 inadequate
2 often
2 ambiguous
8 recommend
4 fulton
2 legislator
4 act
2 laws
4 study
8 revise
2 end
4 modernize
2 improve
8 grand
8 jury
7 comment
0 number
0 topic
0 among
0 atlanta
0 fulton
0 county
6 purchasing
0 department
0 well
7 operate
7 follow
2 generally
0 accep