Doc2Vec for Document Clustering

Dataset - http://mlg.ucd.ie/datasets/bbc.html

Citation - D. Greene and P. Cunningham. "Practical Solutions to the Problem of Diagonal Dominance in Kernel Document Clustering", Proc. ICML 2006.

Consists of 2225 documents from the BBC news website corresponding to stories in five topical areas from 2004-2005.
Class Labels: 5 (business, entertainment, politics, sport, tech)
- 510 business
- 386 entertainment
- 417 politics
- 511 sport
- 401 tech

In [1]:
from pathlib import Path

# Shared corpus state filled in by buildDocument():
#   documents         - cleaned text of each document
#   documentReference - one [category, path] record per document
#   documentCount     - number of documents loaded so far
documents, documentReference = [], []
documentCount = 0

def dataCleanup(text):
    """Lower-case *text*, replace newlines with spaces and pad punctuation.

    Each of the characters . " , ( ) ! ? ; : is surrounded by one space on
    either side so that it ends up as its own whitespace-delimited token.
    """
    # Single translation table: punctuation -> ' <char> ', newline -> ' '.
    table = {ord(mark): ' ' + mark + ' ' for mark in '.",()!?;:'}
    table[ord('\n')] = ' '

    return text.lower().translate(table)

def buildDocument(folder, category):
    """Load every file below *folder* into the module-level corpus lists.

    For each regular file found recursively under *folder*, the cleaned text
    is appended to ``documents``, a ``[category, path]`` record is appended to
    ``documentReference`` and ``documentCount`` is incremented by one.

    Args:
        folder: Directory to scan (searched recursively).
        category: Label stored alongside every document read from *folder*.
    """
    global documentCount

    # Sorted so that document indices are the same on every run; the order in
    # which glob yields paths is not guaranteed.
    for doc in sorted(Path(folder).glob('**/*')):
        # '**/*' also matches sub-directories, which cannot be read as text.
        if not doc.is_file():
            continue

        # NOTE(review): the platform default encoding is used, as before -
        # confirm the corpus encoding and pass it explicitly if required.
        with open(doc, "r") as handle:
            cleanData = dataCleanup(handle.read())

        documents.append(cleanData)

        # Keep a record of category and filename
        documentReference.append([category, doc])
        documentCount = documentCount + 1

# Load the five BBC categories, in this order, into the shared corpus lists.
for folder, category in (
    ("bbc-fulltext/business", "business"),
    ("bbc-fulltext/entertainment", "entertainment"),
    ("bbc-fulltext/politics", "politics"),
    ("bbc-fulltext/sport", "sport"),
    ("bbc-fulltext/tech", "tech"),
):
    buildDocument(folder, category)

In [2]:
from gensim.models import doc2vec
from gensim.models.doc2vec import TaggedDocument
from collections import namedtuple
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk import download

# Fetch the NLTK resources needed by the tokenizing helpers in this cell.
download('stopwords') # For stopword removal
download('punkt') # For tokenizer (NLTK releases that load the pickled Punkt models)
download('punkt_tab') # For tokenizer (recent NLTK releases load 'punkt_tab' instead)

def removeStopwords(text):
    """Return the tokens of *text* that are alphabetic and not English stopwords.

    Args:
        text: Sequence of token strings.

    Returns:
        A new list holding the surviving tokens in their original order.
    """
    # Removing stopwords improved results with BBC news data, but test with and without stop words.
    # A set makes each membership test constant-time instead of a scan of the
    # whole stopword list for every token.
    stop_words = set(stopwords.words('english'))
    return [w for w in text if w not in stop_words and w.isalpha()]

def text2tokens(text):
    """Lower-case *text*, tokenize it with NLTK and filter it via removeStopwords."""
    tokens = word_tokenize(text.lower())
    return removeStopwords(tokens)

# One TaggedDocument per entry of `documents`; the single tag is the
# document's position in that list.
doc2vec_corpus = [
    TaggedDocument(words=text2tokens(body), tags=[position])
    for position, body in enumerate(documents)
]

# The model parameters below can impact the outcome.
# 1. vector_size - Vector size. 100 worked best with the BBC news data set. Tried various between 50 to 300 before choosing 100.
# 2. window - context window, i.e. the number of words on the left and right of a word that
# defines a "context" for learning the meaning of the word. Context window of 1 gave the best result (tried between 1 and 10)
# ..... probably due to the very small size of documents / vocabulary.
#
# `vector_size` and `epochs` are the current gensim names for the former `size` and `iter`
# keywords, which gensim 4 no longer accepts.
# NOTE(review): alpha == min_alpha means the learning rate never decays during training -
# confirm this is intended rather than a leftover from a manual training loop.

model = doc2vec.Doc2Vec(doc2vec_corpus, vector_size = 100, negative = 5, window = 1, epochs = 20, min_count = 2, workers = 4, alpha=0.025, min_alpha=0.025)
model.save("bbc_news_doc2vec.model")
print("Doc2Vec Model Saved")



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dhiraj\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Dhiraj\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Doc2Vec Model Saved


In [3]:
# Collect the learned vector of every document, looked up by its integer tag
# 0 .. documentCount - 1.
# NOTE(review): in gensim 4 `docvecs` is a deprecated alias of `model.dv` - switch
# once older gensim versions no longer need to be supported.
docVectors = [model.docvecs[docId] for docId in range(documentCount)]

In [4]:
from sklearn.cluster import KMeans
num_clusters = 20

# n_init is pinned to 10 (the long-standing scikit-learn default) so the number of
# k-means restarts, and therefore the clustering, does not change with the installed
# scikit-learn version; newer releases default to n_init='auto'.
km = KMeans(n_clusters = num_clusters, n_init = 10, random_state = 99999)
km.fit(docVectors)

# Cluster id of each document, in the same order as docVectors.
clusters = km.labels_.tolist()

In [5]:
import nltk
from nltk.stem import WordNetLemmatizer

def wordFrequencyFilter(text, max_word_count, freq):
    """Return the most frequent lemmatized nouns of *text* as a single string.

    The text is lower-cased, tokenized and POS-tagged with NLTK. Tokens tagged
    NN, NNS, NNP or NNPS are lemmatized with WordNetLemmatizer and counted.
    From the ``max_word_count`` most common lemmas, only those that are purely
    alphabetic, longer than two characters and counted at least ``freq`` times
    are kept, so fewer than ``max_word_count`` words may be returned.

    Args:
        text: Raw document text.
        max_word_count: How many of the most common lemmas to consider.
        freq: Minimum count a lemma needs in order to be kept.

    Returns:
        The kept lemmas, most frequent first, each preceded by one space
        (an empty string when nothing qualifies).
    """
    nounTags = ('NN', 'NNS', 'NNP', 'NNPS')
    lemmatizer = WordNetLemmatizer()

    tagged = nltk.pos_tag(nltk.word_tokenize(text.lower()))
    lemmas = [lemmatizer.lemmatize(token) for token, pos in tagged if pos in nounTags]

    common = nltk.FreqDist(lemmas).most_common(max_word_count)

    # Exclude words of one or two characters.
    # Exclude words counted fewer than `freq` times.
    # Only include alphabetic strings.
    return ''.join(
        ' ' + lemma
        for lemma, count in common
        if lemma.isalpha() and len(lemma) > 2 and count >= freq
    )

def extractKeywords(file):
    """Read *file* and return its keyword string from wordFrequencyFilter.

    Args:
        file: Path of the text file to read.

    Returns:
        The string produced by ``wordFrequencyFilter`` for the file contents.
    """
    # The context manager closes the handle even if reading fails.
    with open(file, 'r') as myFile:
        data = myFile.read()

    # 10 most frequent words, minimum frequency 2
    return wordFrequencyFilter(data, 10, 2)
    

# One row per document: Cluster, Original Category, File Reference, Keywords.
# `clusters` and `documentReference` are parallel lists, so they are walked in step.
myAnalysis = [
    [cluster, category, path, extractKeywords(path)]
    for cluster, (category, path) in zip(clusters, documentReference)
]

import pandas as pd

# Tabulate the per-document analysis rows under named columns.
myLabels = ['Cluster', 'Category', 'File', 'Keywords']
df = pd.DataFrame(data=myAnalysis, columns=myLabels)

In [6]:
# Write the analysis, ordered by cluster id, as a tab-separated file.
outputFile = "ClusterAnalysis.csv"
df = df.sort_values(by='Cluster', ascending=True)
df.to_csv(outputFile, sep='\t', encoding='utf-8')

### Display the key phrases from each cluster

In [12]:
# Generic words that should never be reported as cluster key words.
customStopwords = ['man', 'woman', 'men', 'women', 'year', 'people', 'male', 'female',
                   'world', 'month', 'week', 'company']

def clusterKeyPatterns(text, wordcount):
    """Return the dominant tokens of *text* as one space-prefixed string.

    The text is tokenized with NLTK and the ``wordcount`` most common tokens
    are taken; of those, tokens listed in ``customStopwords`` or occurring
    only once are dropped. Each remaining token is preceded by one space.
    """
    ranked = nltk.FreqDist(nltk.word_tokenize(text)).most_common(wordcount)
    kept = [
        token
        for token, hits in ranked
        if token not in customStopwords and hits > 1
    ]
    return ''.join(' ' + token for token in kept)
    
# Gather the keyword strings of all documents belonging to each cluster.
# Every cluster gets its own accumulator, and the pieces are joined as plain
# text so that no list representation ends up in the text that is tokenized.
keywordsByCluster = [[] for _ in range(num_clusters)]

for row in myAnalysis:
    # row = [Cluster, Category, File, Keywords]
    keywordsByCluster[row[0]].append(str(row[3]))

# One combined keyword string per cluster id.
clusterKeywords = [' '.join(parts) for parts in keywordsByCluster]

for num in range(0, num_clusters):
    # Obtain the top xx words in each cluster
    listofwords = clusterKeyPatterns(clusterKeywords[num], 5)
    print("Key words in cluster - ", num)
    print(listofwords, "\n")


Key words in cluster -  0
 firm share profit 

Key words in cluster -  1
 film award star show 

Key words in cluster -  2
 government lord law home right 

Key words in cluster -  3
 yukos oil firm court 

Key words in cluster -  4
 country government minister 

Key words in cluster -  5
 champion match seed 

Key words in cluster -  6
 technology game sony 

Key words in cluster -  7
 service phone broadband 

Key words in cluster -  8
 security software user program 

Key words in cluster -  9
 club game player chelsea champion 

Key words in cluster -  10
 court drug case charge 

Key words in cluster -  11
 england game wale ireland france 

Key words in cluster -  12
 government plan tax 

Key words in cluster -  13
 game time title phone 

Key words in cluster -  14
 site search information blog 

Key words in cluster -  15
 growth economy rate market 

Key words in cluster -  16
 election labour blair party brown 

Key words in cluster -  17
 party election leader minister 

Ke

The results above are perfect!!