"From Word Embedding to Document Distances"

http://proceedings.mlr.press/v37/kusnerb15.pdf

Dataset -
http://mlg.ucd.ie/datasets/bbc.html

Citation - D. Greene and P. Cunningham. "Practical Solutions to the Problem of Diagonal Dominance in Kernel Document Clustering", Proc. ICML 2006.

Consists of 2225 documents from the BBC news website corresponding to stories in five topical areas from 2004-2005.

Class Labels: 5 (business, entertainment, politics, sport, tech)

- 510 business
- 386 entertainment
- 417 politics
- 511 sports
- 401 tech

We will add the first following documents to our corpus. The remaining I will use for testing purposes.
- 500 business
- 350 entertainment
- 400 politics
- 500 sports
- 390 tech

In [1]:
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk
from pathlib import Path
import gensim

query_list = []
w2v_corpus = []
query_count = 0

sample = {'business': 500, 'entertainment': 350, 'politics': 400, 'sport': 500, 'tech': 390}
customStopwords = ['man', 'woman', 'men', 'women', 'year', 'people', 'male', 'female', 
                  'world', 'month', 'week', 'year', 'number']

def wordFrequencyFilter(text, max_word_count, freq):
    text = text.lower()
    tokens = nltk.word_tokenize(text)
    tagged = nltk.pos_tag(tokens)
    lemmatizer = WordNetLemmatizer()
    
    wordList = []
    for tag in tagged:
        if ((tag[1] == 'NN') or (tag[1] == 'NNS') or (tag[1] == 'NNP') or (tag[1] == 'NNPS')):
            tagLemma = lemmatizer.lemmatize(tag[0])
            wordList.append(tagLemma)

    freqDist = nltk.FreqDist(wordList)
    common = freqDist.most_common(max_word_count)
        
    mainText = ''
    for word in common:
        # Exclude words less than 2 characters long.
        # Exclude words with frequency count greater than freq
        # Only include alphabetic strings
        
        if ((word[0].isalpha()) and (len(word[0]) > 2) and ((word[1] < freq) == False) and (word[0] not in customStopwords)):
            mainText = mainText + ' ' + word[0]
    
    return mainText

def removeDuplicates(text, size):
    tokens = nltk.word_tokenize(text)
    freqDist = nltk.FreqDist(tokens)
    
    mainText = ''
    # Keep words that appear in 2% of the news articles
    freq = int(round((size * 2)/100))
    for word in freqDist.items():
        if ((word[1] < freq) == False):
            mainText = mainText + ' ' + word[0]
    
    return mainText

def dataSetup(folder, category):
    megaText = ''
    size = sample[category]
    allDocs = Path(folder).glob('**/*')
    count = 0
    global query_count

    for news in allDocs:
        file = open(news, "r")
        data_read = file.read()
        
        if (count < size):
            # Include text in our reference text & w2v training data
            megaText = megaText + ' ' + wordFrequencyFilter(data_read, 10, 2)
            
            ## Only use the first two paragraph to train w2v
            tempData = data_read
            sentences = tempData.split('\n')
            while '' in sentences:
                sentences.remove('')
            
            sentence = sentences[0] + ' ' + sentences[1]
            w2v_corpus.append(sentence)
            ## w2v training data selected
            
            count = count + 1
            
        else:
            # Save it as a query string
            # I want at least 3 strings in the query
            data = wordFrequencyFilter(data_read, 10, 4)
            
            if (len((data.split())) < 3):
                # Relax frequency rule
                data = wordFrequencyFilter(data_read, 10, 3)
            
            query_list.append([])
            query_list[query_count].append(category)
            query_list[query_count].append(data)
            query_list[query_count].append(data_read)
            query_count = query_count + 1
    
    # Include in function call below minimum no. of news a word should have appeared in
    megaText = removeDuplicates(megaText, size)
    
    return megaText

businessText = dataSetup("bbc-fulltext/business", "business")
entertainmentText = dataSetup("bbc-fulltext/entertainment", "entertainment")
politicsText = dataSetup("bbc-fulltext/politics", "politics")
sportText = dataSetup("bbc-fulltext/sport", "sport")
techText = dataSetup("bbc-fulltext/tech", "tech")

print("\n")
print("BUSINESS - ", businessText, "\n")
print("ENTERTAINMENT - ", entertainmentText, "\n")
print("POLITICS - ", politicsText, "\n")
print("SPORT - ", sportText, "\n")
print("TECHNOLOGY - ", techText, "\n")





BUSINESS -   profit sale quarter time dollar deficit currency euro government yukos unit price cost firm share takeover japan economy growth figure job rate president unemployment india exchange minister production report court case company demand group rise car deal airline china market investment investor country business bank export oil bankruptcy state offer tax analyst sec stock shareholder executive debt deutsche budget spending consumer plan product worldcom fraud insurance december interest trade stake drug official earnings economist bid talk club 

ENTERTAINMENT -   christmas concert film life producer story book award sale winner child actress show actor boy theatre category role act artist star prize list london company director chart series brother movie festival comedy audience bbc ceremony oscar nomination aviator day group academy drama berlin cinema box office studio court magazine event night hollywood ticket hit record school fan performance network time place sing

In [2]:
from gensim.models import Word2Vec
import pandas as pd
from IPython.display import display

# Train w2v with w2v_corpus data
model = Word2Vec(w2v_corpus, workers=3, size=100)

# Use the Google trained model instead
# model = gensim.models.KeyedVectors.load_word2vec_format('D:/Google/GoogleNews-vectors-negative300.bin.gz', binary=True) 

In [3]:
def myTokenizer(text):
    tokens = nltk.word_tokenize(text)
    return tokens

bTokens = myTokenizer(businessText)
eTokens = myTokenizer(entertainmentText)
pTokens = myTokenizer(politicsText)
sTokens = myTokenizer(sportText)
tTokens = myTokenizer(techText)

In [4]:
def calcBooster(text):
    bBoost = eBoost = pBoost = sBoost = tBoost = 1
    tokens = myTokenizer(text)
    count = 0
    
    for token in tokens:
        count = count + 1
        if (token in bTokens):
            eBoost = eBoost + 1
            pBoost = pBoost + 1
            sBoost = sBoost + 1
            tBoost = tBoost + 1
    
        if (token in eTokens):
            bBoost = bBoost + 1
            pBoost = pBoost + 1
            sBoost = sBoost + 1
            tBoost = tBoost + 1
    
        if (token in pTokens):
            bBoost = bBoost + 1
            eBoost = eBoost + 1
            sBoost = sBoost + 1
            tBoost = tBoost + 1
    
        if (token in sTokens):
            bBoost = bBoost + 1
            eBoost = eBoost + 1
            pBoost = pBoost + 1
            tBoost = tBoost + 1
    
        if (token in tTokens):
            bBoost = bBoost + 1
            eBoost = eBoost + 1
            pBoost = pBoost + 1
            sBoost = sBoost + 1

    return bBoost, eBoost, pBoost, sBoost, tBoost

def calculateDistance(text, category):
    businessDistance = model.wmdistance(businessText, text)
    entertainmentDistance = model.wmdistance(entertainmentText, text)
    politicsDistance = model.wmdistance(politicsText, text)
    sportDistance = model.wmdistance(sportText, text)
    techDistance = model.wmdistance(techText, text)
    
    # Apply booster where there are exact matching words between query and reference text
    # Rationale for applying booster is that the words are significant (NN, NNS, NNP, NNPS)
    bBoost, eBoost, pBoost, sBoost, tBoost = calcBooster(text)
    B = businessDistance * bBoost
    E = entertainmentDistance * eBoost
    P = politicsDistance * pBoost
    S = sportDistance * sBoost
    T = sportDistance * tBoost
    
    
    myArray = [('Business', B), ('Entertainment', E), 
               ('Politics', P), ('Sport', S), ('Tech', T)]
    
    myLabels = ['Category', 'Distance']
    df = pd.DataFrame(myArray, columns=myLabels)
    df = df.sort_values('Distance', ascending=True)

    print("Query Category - ", category)
    print("Query Key Words - ", text)
    
    display(df)
  
    return

In [5]:
query = query_list[10][1]
query_classification = query_list[10][0]

calculateDistance(query, query_classification)

Query Category -  entertainment
Query Key Words -   industry film workforce


Unnamed: 0,Category,Distance
1,Entertainment,1.316977
4,Tech,1.490974
2,Politics,2.23422
0,Business,2.251894
3,Sport,2.484956


In [6]:
query = query_list[45][1]
query_classification = query_list[45][0]

calculateDistance(query, query_classification)

Query Category -  entertainment
Query Key Words -   film character series


Unnamed: 0,Category,Distance
1,Entertainment,0.872897
4,Tech,1.364191
2,Politics,1.758752
0,Business,1.79633
3,Sport,1.818922


In [7]:
query = query_list[80][1]
query_classification = query_list[80][0]

calculateDistance(query, query_classification)

Query Category -  tech
Query Key Words -   user rate scam service company customer


Unnamed: 0,Category,Distance
4,Tech,1.913369
0,Business,2.431185
1,Entertainment,2.701026
2,Politics,2.769206
3,Sport,3.444065


In [8]:
query = query_list[25][1]
query_classification = query_list[25][0]

calculateDistance(query, query_classification)

Query Category -  entertainment
Query Key Words -   bollywood buhecha dvd film


Unnamed: 0,Category,Distance
4,Tech,1.231224
1,Entertainment,1.825941
3,Sport,2.462447
2,Politics,2.472827
0,Business,2.664179


In [9]:
query = query_list[70][1]
query_classification = query_list[70][0]

calculateDistance(query, query_classification)

Query Category -  sport
Query Key Words -   match nadal


Unnamed: 0,Category,Distance
3,Sport,0.650421
4,Tech,1.300841
1,Entertainment,1.41673
2,Politics,1.440139
0,Business,1.479252


In [10]:
query = query_list[0][1]
query_classification = query_list[0][0]

calculateDistance(query, query_classification)

Query Category -  business
Query Key Words -   argentina water state government firm tariff investment suez contribution


Unnamed: 0,Category,Distance
0,Business,1.0102
2,Politics,2.249463
4,Tech,2.406279
1,Entertainment,2.742377
3,Sport,2.807325


In [11]:
query = query_list[60][1]
query_classification = query_list[60][0]

calculateDistance(query, query_classification)

Query Category -  politics
Query Key Words -   campaign labour election


Unnamed: 0,Category,Distance
2,Politics,0.74039
4,Tech,1.368569
3,Sport,1.710711
1,Entertainment,1.966678
0,Business,2.063622


In [12]:
query = query_list[65][1]
query_classification = query_list[65][0]

calculateDistance(query, query_classification)

Query Category -  sport
Query Key Words -   safin final tennis title player confidence


Unnamed: 0,Category,Distance
3,Sport,1.063844
4,Tech,1.418458
0,Business,2.169376
2,Politics,2.347879
1,Entertainment,2.426738


In [13]:
query = query_list[55][1]
query_classification = query_list[55][0]

calculateDistance(query, query_classification)

Query Category -  politics
Query Key Words -   sayeed standard committee association vote


Unnamed: 0,Category,Distance
2,Politics,0.422336
1,Entertainment,1.249357
0,Business,1.271761
3,Sport,1.273725
4,Tech,1.273725


In [14]:
query = query_list[15][1]
query_classification = query_list[15][0]

calculateDistance(query, query_classification)

Query Category -  entertainment
Query Key Words -   davis star ossie


Unnamed: 0,Category,Distance
1,Entertainment,0.573917
3,Sport,1.13586
4,Tech,1.13586
0,Business,1.229875
2,Politics,1.235415


In [15]:
query = query_list[72][1]
query_classification = query_list[72][0]

calculateDistance(query, query_classification)

Query Category -  sport
Query Key Words -   mirza crowd bondarenko set


Unnamed: 0,Category,Distance
3,Sport,0.402458
1,Entertainment,0.717801
2,Politics,0.775488
4,Tech,0.804915
0,Business,0.824136
