<a href="https://colab.research.google.com/github/chilung/EmotionX2020/blob/master/lab1_0686028.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Recommend Similar News Articles
This notebook demonstrates how to use bag-of-word vectors and cosine similarity for news article recommendation.

In [1]:
import logging

logging.basicConfig(level=logging.INFO, format='%(message)s')

logging.debug('Hello Debug')
logging.info('Hello Info')
logging.warning('Hello Warning')
logging.error('Hello Error')
logging.critical('Hello Critical')

Hello Info
Hello Error
Hello Critical


In [0]:
import re
import pandas as pd
from io import open
import glob
import os
import numpy as np
import math
from collections import OrderedDict
from collections import Counter
import torch
import torchvision
import heapq
import unicodedata
import string

#The Implementation of Corpus in Title or Content
1.   use Contents instead of Titles. The works will be: (1) modify get_corpus to return corpus, a list of titles and corpus_collection, collecting (title, content) pairs. (2) modify get_vocab to get the index from corpus_title and vocab from corpus content. (3) modify doc_similarity's word bow vector by getting content from corpus[doc]. (4) modify k_similar by getting title from corpus_collection.
2.   Switch between title and content by just modify the corpus_collection as  (title, content) pairs.

#Fetching the Corpus
`get_corpus()` reads the CSV file, and then return a list of the news headlines

In [0]:
def get_corpus():
  df = pd.read_csv('https://raw.githubusercontent.com/bshmueli/108-nlp/master/reuters.csv') # https://bit.ly/nlp-reuters
  logging.debug("Dataset columns: {}".format(df.columns))
  logging.debug("Dataset size: {}".format(len(df)))
  
  # num_of_documents = 40
  num_of_documents = len(df)
  
  corpus_title = df.title.to_list()[0:num_of_documents]
  corpus_content = df.content.to_list()[0:num_of_documents]

  corpus_collection = {}
  for i in range(len(corpus_title)):
    corpus_collection[corpus_title[i]] = corpus_content[i]
    # corpus_collection[corpus_title[i]] = corpus_title[i] # for swithing between title or content, just use this line and comment the previous line
  return corpus_title, corpus_collection

if logging.getLogger().getEffectiveLevel() == logging.DEBUG:
  test_corpus, test_corpus_collection = get_corpus()
  logging.debug("len of corpus: {}".format(len(test_corpus)))

  logging.debug("corpus {}".format(test_corpus))
  logging.debug("corpus_collection: {}".format(test_corpus_collection))

In [0]:
def lowerCase(inputStr):
    return inputStr.lower()

In [0]:
import urllib.request as urllib

def get_stop_words(stop_words_source):
  stop_words_byte = urllib.urlopen(stopwords_url)

  stop_words = []
  for line in stop_words_byte:
    stop_words = stop_words + [w for w in line.decode("utf-8").split('\n') if w != '']
  stop_words = [''] + stop_words
  return(stop_words)

stopwords_url = "https://bit.ly/nlp-stopwords"
stop_words = get_stop_words(stopwords_url)
if logging.getLogger().getEffectiveLevel() == logging.DEBUG:
  logging.debug(stop_words)

In [0]:
def split_on_space(document):
  return re.split(r' ', document)

In [0]:
def remove_stop_words(words):
  return [w for w in words if w not in stop_words]

In [0]:
def split_on_punctation(words):
  return [re.split(r'\W+', w)[0] for w in words]

In [0]:
def tokenize(document):
  logging_level = logging.getLogger().getEffectiveLevel()
  logging.getLogger().setLevel(logging.INFO)
  
  words = split_on_space(document) # remain the words like you're unchanged so that we can remove them out in the following process
  logging.debug("Step 1 split on space: {}".format(words))

  words = [lowerCase(w) for w in words]
  logging.debug("Step 2 lower case: {}".format(words))

  words = remove_stop_words(words) # the first process of removing stop word such as you're in 
  logging.debug("Step 3 1st remove stop words: {}".format(words))
  
  words = split_on_punctation(words) # remove punctation words
  logging.debug("Step 4 split on punctation: {}".format(words))
 
  words = remove_stop_words(words) # the first process of removing stop word such as you're in 
  logging.debug("Step 5 2nd remove stop words: {}".format(words))
  
  logging.getLogger().setLevel(logging_level)
  return words

if logging.getLogger().getEffectiveLevel() == logging.DEBUG:
  test_str = "you'll you'd me my myself I am here. But ,[]\& is strange to me??!! How do you think"
  logging.debug(test_str)
  logging.debug(tokenize(test_str))

#Computing word frequencies
`get_vocab(corpus)` computes the word frequencies in a given corpus. It also collect word frequencies of individual document in the given corpus. It returns two items. The first item is a list of 2-tuples, `vocab`: each tuple contains the token and its frequency. The second item is two dimension list, `doc_vocab`: a list of individual document's list of 2-tuples, each tuple contains the token and its frequency.

In [0]:
def get_vocab(test_corpus_collection):
  logging_level = logging.getLogger().getEffectiveLevel()
  logging.getLogger().setLevel(logging.INFO)

  vocabulary = Counter()
  doc_vocab = {}
  for title in test_corpus_collection:

    logging.debug("origin document: {}".format(test_corpus_collection[title]))
    tokens = tokenize(test_corpus_collection[title])
    logging.debug("token: {}".format(tokens))
    doc_vocab_collection = Counter()
    doc_vocab_collection.update(tokens)
    doc_vocab_collection['LENGTH'] = len(tokens)
    
    doc_vocab[title] = doc_vocab_collection
    vocabulary.update(tokens)
  
  logging.getLogger().setLevel(logging_level)
  return vocabulary, doc_vocab

if logging.getLogger().getEffectiveLevel() == logging.DEBUG:
  test_corpus, test_corpus_collection = get_corpus()
  test_vocab, test_doc_vocab = get_vocab(test_corpus_collection)
  logging.debug("vocab: {}".format(test_vocab))
  logging.debug("doc_vocab: {}".format(test_doc_vocab))

#Compute Words' IDF and TF-IDF

In [0]:
#
# 'cal_vocab_idf' calculates the idf of each word.
# The idf of word is given by log of the result of total number of documents divided by the number of documents in which the word happens.
#

def cal_vocab_idf(vocab, doc_vocab):
  vocab_idf = Counter()
  for i, token in enumerate(vocab):
    word, freq = token
    vocab_idf[word] = 0
    for document in doc_vocab:
      if word in doc_vocab[document]:
        vocab_idf[word] = vocab_idf[word] + 1
    vocab_idf[word] = math.log(len(doc_vocab) / vocab_idf[word])
  return vocab_idf

if logging.getLogger().getEffectiveLevel() == logging.DEBUG:
  test_corpus, test_corpus_collection = get_corpus()
  test_vocab, test_doc_vocab = get_vocab(test_corpus_collection)
  test_vocab = test_vocab.most_common(100)
  logging.debug("corpus: {}".format(test_corpus))
  logging.debug("vocabpus: {}".format(test_vocab))
  logging.debug("doc_vocab: {}".format(test_doc_vocab))
  test_vocab_idf = cal_vocab_idf(test_vocab, test_doc_vocab)
  logging.debug("test_vocab_idf: {}".format(test_vocab_idf))

In [0]:
def cal_doc_tfidf_vec(vocab, doc_vocab, vocb_idf):
  doc_tfidf_vec = {}
  for document in doc_vocab:
    doc_tfidf_vec[document] = []
    for i, token in enumerate(vocab): 
      word, freq = token
      # doc_vocab[document][word]: the count of the word in this document
      # doc_vocab[document]['LENGTH']: the count of total words in this document
      # vocb_idf[word]: the word's idf
      doc_tfidf_vec[document] = doc_tfidf_vec[document] + [(doc_vocab[document][word] / doc_vocab[document]['LENGTH']) * vocb_idf[word]]
  return doc_tfidf_vec

if logging.getLogger().getEffectiveLevel() == logging.DEBUG:
  test_corpus, test_corpus_collection = get_corpus()
  test_vocab, test_doc_vocab = get_vocab(test_corpus_collection)
  test_vocab = test_vocab.most_common(100)
  test_vocab_idf = cal_vocab_idf(test_vocab, test_doc_vocab)
  #logging.debug("corpus: {}".format(test_corpus))
  #logging.debug("vocabpus: {}".format(test_vocab))
  #logging.debug("doc_vocab: {}".format(test_doc_vocab))
  #logging.debug("test_vocab_idf: {}".format(test_vocab_idf))
  
  test_doc_tfidf_vec = cal_doc_tfidf_vec(test_vocab, test_doc_vocab, test_vocab_idf)
  logging.debug("test_doc_tfidf_vec: {}".format(test_doc_tfidf_vec))

#Compute BoW (Bag-of-Words) Vector
`doc_to_vec(doc, vocab)` returns a bag-of-words vector for document `doc`, corresponding to the presence of a word in `vocab`

Compute the Bag-of-Words vector for each document

In [0]:
def doc2vec(doc):
  words = tokenize(corpus_collection[doc])
  return [1 if token in words else 0 for token, freq in vocab]

Cosine similarity between two numerical vectors

In [0]:
def cosine_similarity(vec_a, vec_b):
  assert len(vec_a) == len(vec_b)
  if sum(vec_a) == 0 or sum(vec_b) == 0:
    return 0 # hack
  a_b = sum(i[0] * i[1] for i in zip(vec_a, vec_b))
  a_2 = sum([i*i for i in vec_a])
  b_2 = sum([i*i for i in vec_b])
  return a_b/(math.sqrt(a_2) * math.sqrt(b_2))

In [0]:
# tfidf_vector = False
tfidf_vector = True

if tfidf_vector:
  #
  # calculate the similiarity based on word tfidf vector
  #
  def doc_similarity(doc_a, doc_b):
    # logging.debug("tfidf {}, {}".format(doc_a, doc_b))
    return cosine_similarity(doc_tfidf_vec[doc_a], doc_tfidf_vec[doc_b])
else:
  #
  # calculate the similiarity based on word bow vector
  #
  def doc_similarity(doc_a, doc_b):
    # logging.debug("bow {}, {}".format(doc_a, doc_b))
    return cosine_similarity(doc2vec(doc_a), doc2vec(doc_b))
  
if logging.getLogger().getEffectiveLevel() == logging.DEBUG:
  test_corpus, test_corpus_collection = get_corpus()
  test_vocab, test_doc_vocab = get_vocab(test_corpus_collection)
  test_vocab = test_vocab.most_common(100)
  test_vocab_idf = cal_vocab_idf(test_vocab, test_doc_vocab)
  doc_tfidf_vec = cal_doc_tfidf_vec(test_vocab, test_doc_vocab, test_vocab_idf)

  seed_doc = test_corpus[5]
  logging.debug('> "{}"'.format(seed_doc))
  similarities = [doc_similarity(seed_doc, doc) for id, doc in enumerate(test_corpus)]
  logging.debug(similarities)

# Find Similar Documents
Find and print the $k$ most similar titles to a given title

In [0]:
def k_similar(seed_id, k):
  seed_title = corpus[seed_id]
  print('> "{}"'.format(seed_title))
  similarities = [doc_similarity(seed_title, title) for id, title in enumerate(corpus)]
  logging.debug("Similiarities: {}".format(similarities))
  top_indices = sorted(range(len(similarities)), key=lambda i: similarities[i])[-k:] # https://stackoverflow.com/questions/13070461/get-indices-of-the-top-n-values-of-a-list
  nearest = [[corpus[id], similarities[id]] for id in top_indices]
  print()
  for story in reversed(nearest):
    print('* "{}" ({})'.format(story[0], story[1]))

# Test our program

In [17]:
corpus, corpus_collection = get_corpus()
vocab, doc_vocab = get_vocab(corpus_collection)
vocab = vocab.most_common(1000)
vocab_idf = cal_vocab_idf(vocab, doc_vocab)
doc_tfidf_vec = cal_doc_tfidf_vec(vocab, doc_vocab, vocab_idf)

k_similar(686028 % 1000, 5)

> "U.S. banks’ dismal first quarter may spell trouble for 2016"

* "U.S. banks’ dismal first quarter may spell trouble for 2016" (1.0)
* "Citigroup’s first-quarter results suggest tough year ahead" (0.5321500708114473)
* "Goldman posts weakest results in four years, revenue tumbles 40 percent" (0.5198766373065498)
* "Investors look for trough in profit downturn" (0.48185289938530335)
* "Wells Fargo searching for rainmaker in bid to boost dealmaking franchise" (0.47481922359348333)
