<a href="https://colab.research.google.com/github/chilung/EmotionX2020/blob/master/lab1_0686028.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Recommend Similar News Articles
This notebook demonstrates how to use bag-of-word vectors and cosine similarity for news article recommendation.

In [1]:
import logging

logging.basicConfig(level=logging.DEBUG, format='%(message)s')

logging.debug('Hello Debug')
logging.info('Hello Info')
logging.warning('Hello Warning')
logging.error('Hello Error')
logging.critical('Hello Critical')

Hello Debug
Hello Info
Hello Error
Hello Critical


In [0]:
import re
import pandas as pd
from io import open
import glob
import os
import numpy as np
import math
from collections import OrderedDict
from collections import Counter
import torch
import torchvision
import heapq
import unicodedata
import string

#Fetching the Corpus
`get_corpus()` reads the CSV file, and then return a list of the news headlines

In [3]:
num_of_documents = 5354

def get_corpus():
  df = pd.read_csv('https://raw.githubusercontent.com/bshmueli/108-nlp/master/reuters.csv') # https://bit.ly/nlp-reuters
  logging.debug("Dataset columns: {}".format(df.columns))
  logging.debug("Dataset size: {}".format(len(df)))
  corpus = df.title.to_list()[0:num_of_documents]
  return corpus

if logging.getLogger().getEffectiveLevel() == logging.DEBUG:
  test_corpus = get_corpus()
  #for i, c in enumerate(test_corpus):
  #  logging.debug("document {}: {}".format(i+1, c))
  logging.debug("len of corpus: {}".format(len(test_corpus)))

Dataset columns: Index(['title', 'content'], dtype='object')
Dataset size: 5354
len of corpus: 5354


In [0]:
def lowerCase(inputStr):
    return inputStr.lower()

In [5]:
import urllib.request as urllib

def get_stop_words(stop_words_source):
  stop_words_byte = urllib.urlopen(stopwords_url)

  stop_words = []
  for line in stop_words_byte:
    stop_words = stop_words + [w for w in line.decode("utf-8").split('\n') if w != '']
  stop_words = [''] + stop_words
  return(stop_words)

stopwords_url = "https://bit.ly/nlp-stopwords"
stop_words = get_stop_words(stopwords_url)
if logging.getLogger().getEffectiveLevel() == logging.DEBUG:
  logging.debug(stop_words)

['', 'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than

In [0]:
def split_on_space(document):
  return re.split(r' ', document)

In [0]:
def remove_stop_words(words):
  return [w for w in words if w not in stop_words]

In [0]:
def split_on_punctation(words):
  return [re.split(r'\W+', w)[0] for w in words]

In [9]:
def tokenize(document):
  logging_level = logging.getLogger().getEffectiveLevel()
  logging.getLogger().setLevel(logging.INFO)
  
  words = split_on_space(document) # remain the words like you're unchanged so that we can remove them out in the following process
  logging.debug("Step 1 split on space: {}".format(words))

  words = [lowerCase(w) for w in words]
  logging.debug("Step 2 lower case: {}".format(words))

  words = remove_stop_words(words) # the first process of removing stop word such as you're in 
  logging.debug("Step 3 1st remove stop words: {}".format(words))
  
  words = split_on_punctation(words) # remove punctation words
  logging.debug("Step 4 split on punctation: {}".format(words))
 
  words = remove_stop_words(words) # the first process of removing stop word such as you're in 
  logging.debug("Step 5 2nd remove stop words: {}".format(words))
  
  logging.getLogger().setLevel(logging_level)
  return words

if logging.getLogger().getEffectiveLevel() == logging.DEBUG:
  test_str = "you'll you'd me my myself I am here. But ,[]\& is strange to me??!! How do you think"
  logging.debug(test_str)
  logging.debug(tokenize(test_str))

you'll you'd me my myself I am here. But ,[]\& is strange to me??!! How do you think
['strange', 'think']


#Computing word frequencies
`get_vocab(corpus)` computes the word frequencies in a given corpus. It also collect word frequencies of individual document in the given corpus. It returns two items. The first item is a list of 2-tuples, `vocab`: each tuple contains the token and its frequency. The second item is two dimension list, `doc_vocab`: a list of individual document's list of 2-tuples, each tuple contains the token and its frequency.

In [10]:
def get_vocab(corpus):
  vocabulary = Counter()
  doc_vocab = {}
  for document in corpus:
    logging_level = logging.getLogger().getEffectiveLevel()
    logging.getLogger().setLevel(logging.INFO)

    logging.debug("origin document: {}".format(document))
    tokens = tokenize(document)
    logging.debug("token: {}".format(tokens))
    doc_vocab_collection = Counter()
    doc_vocab_collection.update(tokens)
    doc_vocab_collection['LENGTH'] = len(tokens)
    
    doc_vocab[document] = doc_vocab_collection
    vocabulary.update(tokens)
  
  logging.getLogger().setLevel(logging.DEBUG)
  return vocabulary, doc_vocab

if logging.getLogger().getEffectiveLevel() == logging.DEBUG:
  # logging.getLogger().setLevel(logging.INFO)
  test_corpus = get_corpus()
  test_vocab, test_doc_vocab = get_vocab(test_corpus)
  logging.debug("vocab: {}".format(test_vocab))
  logging.debug("doc_vocab: {}".format(test_doc_vocab))

Dataset columns: Index(['title', 'content'], dtype='object')
Dataset size: 5354


In [11]:
#
# 'cal_vocab_idf' calculates the idf of each word.
# The idf of word is given by log of the result of total number of documents divided by the number of documents in which the word happens.
#

def cal_vocab_idf(vocab, doc_vocab):
  vocab_idf = Counter()
  for i, token in enumerate(vocab):
    word, freq = token
    vocab_idf[word] = 0
    for document in doc_vocab:
      if word in doc_vocab[document]:
        vocab_idf[word] = vocab_idf[word] + 1
    vocab_idf[word] = math.log(len(doc_vocab) / vocab_idf[word])
  return vocab_idf

if logging.getLogger().getEffectiveLevel() == logging.DEBUG:
  test_corpus = get_corpus()
  test_vocab, test_doc_vocab = get_vocab(test_corpus)
  test_vocab = test_vocab.most_common(100)
  logging.debug("corpus: {}".format(test_corpus))
  logging.debug("vocabpus: {}".format(test_vocab))
  logging.debug("doc_vocab: {}".format(test_doc_vocab))
  test_vocab_idf = cal_vocab_idf(test_vocab, test_doc_vocab)
  logging.debug("test_vocab_idf: {}".format(test_vocab_idf))

Dataset columns: Index(['title', 'content'], dtype='object')
Dataset size: 5354
vocabpus: [('u', 1149), ('trump', 750), ('says', 308), ('exclusive', 236), ('new', 221), ('sources', 190), ('china', 181), ('deal', 168), ('oil', 163), ('billion', 150), ('court', 145), ('wall', 144), ('state', 140), ('obama', 135), ('may', 132), ('north', 130), ('fed', 123), ('dollar', 116), ('street', 113), ('clinton', 110), ('korea', 110), ('house', 106), ('talks', 101), ('data', 100), ('syria', 95), ('russia', 91), ('police', 89), ('islamic', 86), ('stocks', 86), ('apple', 84), ('trade', 84), ('brazil', 83), ('south', 83), ('eu', 82), ('election', 81), ('plan', 79), ('iran', 77), ('million', 76), ('bid', 75), ('white', 72), ('two', 72), ('first', 71), ('back', 71), ('probe', 69), ('security', 68), ('global', 68), ('japan', 68), ('bank', 67), ('investors', 66), ('ceo', 65), ('fight', 65), ('case', 65), ('vote', 63), ('seeks', 63), ('republican', 63), ('senate', 63), ('shares', 62), ('top', 62), ('growth'

In [12]:
def cal_doc_tfidf_vec(vocab, doc_vocab, vocb_idf):
  doc_tfidf_vec = {}
  for document in doc_vocab:
    doc_tfidf_vec[document] = []
    for i, token in enumerate(vocab): 
      word, freq = token
      # doc_vocab[document][word]: the count of the word in this document
      # doc_vocab[document]['LENGTH']: the count of total words in this document
      # vocb_idf[word]: the word's idf
      doc_tfidf_vec[document] = doc_tfidf_vec[document] + [(doc_vocab[document][word] / doc_vocab[document]['LENGTH']) * vocb_idf[word]]
  return doc_tfidf_vec

if logging.getLogger().getEffectiveLevel() == logging.DEBUG:
  test_corpus = get_corpus()
  test_vocab, test_doc_vocab = get_vocab(test_corpus)
  test_vocab = test_vocab.most_common(100)
  test_vocab_idf = cal_vocab_idf(test_vocab, test_doc_vocab)
  #logging.debug("corpus: {}".format(test_corpus))
  #logging.debug("vocabpus: {}".format(test_vocab))
  #logging.debug("doc_vocab: {}".format(test_doc_vocab))
  #logging.debug("test_vocab_idf: {}".format(test_vocab_idf))
  
  test_doc_tfidf_vec = cal_doc_tfidf_vec(test_vocab, test_doc_vocab, test_vocab_idf)
  logging.debug("test_doc_tfidf_vec: {}".format(test_doc_tfidf_vec))

Dataset columns: Index(['title', 'content'], dtype='object')
Dataset size: 5354
IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



#Compute BoW (Bag-of-Words) Vector
`doc_to_vec(doc, vocab)` returns a bag-of-words vector for document `doc`, corresponding to the presence of a word in `vocab`

In [0]:
def doc2vec(doc):
  words = tokenize(doc)
  return [1 if token in words else 0 for token, freq in vocab]

Compute the Bag-of-Words vector for each document

Cosine similarity between two numerical vectors

In [0]:
def cosine_similarity(vec_a, vec_b):
  assert len(vec_a) == len(vec_b)
  if sum(vec_a) == 0 or sum(vec_b) == 0:
    return 0 # hack
  a_b = sum(i[0] * i[1] for i in zip(vec_a, vec_b))
  a_2 = sum([i*i for i in vec_a])
  b_2 = sum([i*i for i in vec_b])
  return a_b/(math.sqrt(a_2) * math.sqrt(b_2))

In [15]:
# tfidf_vector = False
tfidf_vector = True

if tfidf_vector:
  #
  # calculate the similiarity based on word tfidf vector
  #
  def doc_similarity(doc_a, doc_b):
    # logging.debug("tfidf {}, {}".format(doc_a, doc_b))
    return cosine_similarity(doc_tfidf_vec[doc_a], doc_tfidf_vec[doc_b])
else:
  #
  # calculate the similiarity based on word bow vector
  #
  def doc_similarity(doc_a, doc_b):
    # logging.debug("bow {}, {}".format(doc_a, doc_b))
    return cosine_similarity(doc2vec(doc_a), doc2vec(doc_b))
  
if logging.getLogger().getEffectiveLevel() == logging.DEBUG:
  print("strange")
  test_corpus = get_corpus()
  test_vocab, test_doc_vocab = get_vocab(test_corpus)
  test_vocab = test_vocab.most_common(100)
  test_vocab_idf = cal_vocab_idf(test_vocab, test_doc_vocab)
  doc_tfidf_vec = cal_doc_tfidf_vec(test_vocab, test_doc_vocab, test_vocab_idf)

  seed_doc = test_corpus[5]
  logging.debug('> "{}"'.format(seed_doc))
  similarities = [doc_similarity(seed_doc, doc) for id, doc in enumerate(test_corpus)]
  logging.debug(similarities)



strange


Dataset columns: Index(['title', 'content'], dtype='object')
Dataset size: 5354
> "New York, New Jersey 10-year transport plan could include rail tunnel"
[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.39764983433389256, 0, 0.0, 0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0, 0.0, 0.0, 0, 0, 0.0, 0, 0.0, 0.0, 0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0, 0.0, 0.0, 0.0, 0.0, 0, 0.0, 0.0, 0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.4447004166881674, 0.0, 0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0, 0.0, 0.0, 0, 0.0, 0, 0.0, 0.3485339504293299, 0.0, 0.0, 0.0, 0.0, 0.0, 0, 0.0, 0, 0.390943905515151, 0.0, 0.0, 0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0, 0, 0.2611255723827717, 0.0, 0.0, 0.0, 0, 0, 0.31405199975757436, 0.7222595832948128, 0.0, 0.0, 0.0, 0.0, 0, 0.0, 0.29651447264165676, 0.0, 0.0, 0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0, 0.0, 0.0, 0.41354669

# Find Similar Documents
Find and print the $k$ most similar titles to a given title

In [0]:
def k_similar(seed_id, k):
  seed_doc = corpus[seed_id]
  print('> "{}"'.format(seed_doc))
  similarities = [doc_similarity(seed_doc, doc) for id, doc in enumerate(corpus)]
  logging.debug("Similiarities: {}".format(similarities))
  top_indices = sorted(range(len(similarities)), key=lambda i: similarities[i])[-k:] # https://stackoverflow.com/questions/13070461/get-indices-of-the-top-n-values-of-a-list
  nearest = [[corpus[id], similarities[id]] for id in top_indices]
  print()
  for story in reversed(nearest):
    print('* "{}" ({})'.format(story[0], story[1]))

# Test our program

In [21]:
corpus = get_corpus()
vocab, doc_vocab = get_vocab(corpus)
vocab = vocab.most_common(100)
vocab_idf = cal_vocab_idf(vocab, doc_vocab)
doc_tfidf_vec = cal_doc_tfidf_vec(vocab, doc_vocab, vocab_idf)

k_similar(28, 5)

Dataset columns: Index(['title', 'content'], dtype='object')
Dataset size: 5354
Similiarities: [0.0, 0.05403268529304371, 0.20928476469036178, 0.0, 0.0, 0.0, 0.0, 0.07055273674150349, 0.0, 0.0, 0.0, 0, 0.0, 0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.07566534975274185, 0.062061551505394934, 0, 0.0, 0.20928476469036178, 0, 0, 1.0, 0, 0.0, 0.11897146907397291, 0, 0.05508987212820124, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0, 0.0, 0.0, 0.20928476469036178, 0.0, 0.0, 0.043622504686908786, 0.0, 0.0, 0.0, 0, 0.0, 0.0, 0.0, 0.04016015842604726, 0.0, 0.41723269205401964, 0.0, 0.0, 0.0, 0.20928476469036178, 0.20928476469036178, 0, 0.0, 0.07333297679746148, 0.0, 0.0, 0, 0.0, 0.0, 0, 0.0, 0.31382964446179373, 0.08278983267133783, 0.0, 0.0, 0.04906657630264249, 0.0, 0.0, 0.0, 0.07276332153844298, 0.0688016153450314, 0, 0.0, 0.0, 0.07341703582999837, 0.0, 0.6128793478436706, 0.0, 0.0, 0.0, 0.0, 0, 0.0, 0.044724919537180496, 0, 0.0, 0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0, 0.0, 0, 0.055168232432727976, 0.07

> "U.S. banks’ dismal first quarter may spell trouble for 2016"

* "U.S. banks’ dismal first quarter may spell trouble for 2016" (1.0)
* "U.S. banks’ post-election rally may be just an appetizer" (0.8155447390056827)
* "U.S. small cap funds shine in first quarter but may have harder road ahead" (0.7901765024263359)
* "Small banks rally pauses but may not be over yet" (0.7882341711625722)
* "Biggest U.S. banks clear first hurdle in Fed’s annual stress tests" (0.7507645888142283)
