<a href="https://colab.research.google.com/github/chilung/EmotionX2020/blob/master/lab1_0686028.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Recommend Similar News Articles
This notebook demonstrates how to use bag-of-words vectors and cosine similarity for news article recommendation.

In [65]:
import logging

# Configure the root logger: threshold DEBUG, and print only the message text
# (no level name or timestamp).
logging.basicConfig(level=logging.DEBUG, format='%(message)s')

# Emit one record per severity as a quick check of which levels are displayed.
logging.debug('Hello Debug')
logging.info('Hello Info')
logging.warning('Hello Warning')
logging.error('Hello Error')
logging.critical('Hello Critical')

Hello Debug
Hello Info
Hello Error
Hello Critical


In [0]:
import re
import pandas as pd
from io import open
import glob
import os
import numpy as np
import math
from collections import OrderedDict
from collections import Counter
import torch
import torchvision
import heapq
import unicodedata
import string

# Fetching the Corpus
`get_corpus()` reads the CSV file and returns a list of the news headlines.

In [67]:
# How many headlines (taken from the start of the dataset) make up the corpus.
num_of_documents = 20

def get_corpus():
  """Download the Reuters CSV and return its leading headlines.

  Reads the remote CSV into a DataFrame, logs its columns and row count at
  DEBUG level, and returns the first `num_of_documents` entries of the
  `title` column.

  Returns:
    list: the selected headline values, in dataset order.
  """
  source_url = 'https://raw.githubusercontent.com/bshmueli/108-nlp/master/reuters.csv' # https://bit.ly/nlp-reuters
  frame = pd.read_csv(source_url)
  logging.debug("Dataset columns: {}".format(frame.columns))
  logging.debug("Dataset size: {}".format(len(frame)))
  all_titles = frame.title.to_list()
  return all_titles[:num_of_documents]

# Smoke test for get_corpus(): fetch the corpus and log every headline with a
# 1-based index, followed by the number of headlines returned.
if __debug__:
  test_corpus = get_corpus()
  for i, c in enumerate(test_corpus):
    logging.debug("document {}: {}".format(i+1, c))
  logging.debug("len of corpus: {}".format(len(test_corpus)))

Dataset columns: Index(['title', 'content'], dtype='object')
Dataset size: 5354
document 1: Exclusive: Apple makes iPhone screen fixes easier as states mull repair laws
document 2: Oil ends lower on OPEC output hike; U.S. stockpile rise seen
document 3: Patience an asset for U.S. businesses seeking opportunities in Cuba
document 4: Modi’s BJP vows to strip Muslim immigrants of vote in Assam
document 5: Oprah effect fails to lift Weight Watchers sales; shares plunge
document 6: New York, New Jersey 10-year transport plan could include rail tunnel
document 7: Israel calls on powers to punish Iran for its missile tests
document 8: Former U.S. national security adviser Brzezinski dies at 89
document 9: G7 ministers look to persuade Russia to abandon Syria’s Assad
document 10: Mosul train tunnel reveals assault course for elite Islamic State fighters
document 11: Trump says rival Cruz’s Canadian birthplace could be ’big problem’: Washington Post
document 12: Silicon Valley fights for its li

In [0]:
def lowerCase(inputStr):
    """Return `inputStr` converted to lower case via its `.lower()` method."""
    lowered = inputStr.lower()
    return lowered

In [69]:
import urllib.request as urllib

def get_stop_words(stop_words_source):
  """Fetch a newline-separated stop-word list from a URL.

  Args:
    stop_words_source: URL to open with `urllib.urlopen`. The content is
      decoded as UTF-8 and is expected to hold one stop word per line.

  Returns:
    list: the stop words in file order, preceded by the empty string ''.
    The empty string is included so that empty tokens are filtered out
    together with the real stop words.
  """
  # Start with '' so that empty tokens are treated as stop words too.
  stop_words = ['']
  # Use the parameter (not a module-level global) and close the response
  # deterministically once all lines have been read.
  with urllib.urlopen(stop_words_source) as response:
    for line in response:
      stop_words.extend(w for w in line.decode("utf-8").split('\n') if w != '')
  return stop_words

# Location of the stop-word list (shortened URL).
stopwords_url = "https://bit.ly/nlp-stopwords"
# Module-level stop-word list; remove_stop_words() reads this global.
stop_words = get_stop_words(stopwords_url)
# Log the loaded list for inspection.
if __debug__:
  logging.debug(stop_words)

['', 'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than

In [0]:
def split_on_space(document):
  """Split `document` at every single space character.

  Consecutive spaces produce empty strings in the result, and an empty
  input yields [''].
  """
  single_space = re.compile(r' ')
  return single_space.split(document)

In [0]:
def remove_stop_words(words):
  """Return the items of `words` that are not in the global `stop_words`.

  Order and duplicates of the surviving items are preserved.
  """
  kept = []
  for candidate in words:
    if candidate in stop_words:
      continue
    kept.append(candidate)
  return kept

In [0]:
def split_on_punctation(words):
  """Reduce each word to the part before its first run of non-word characters.

  Each word is split on the pattern \\W+ and only the first piece is kept,
  so 'u.s.' becomes 'u' and '10-year' becomes '10'. A word that starts with
  a non-word character becomes the empty string ''.

  Returns a list with exactly one entry per input word.
  """
  non_word_run = re.compile(r'\W+')
  heads = []
  for word in words:
    pieces = non_word_run.split(word)
    heads.append(pieces[0])
  return heads

In [73]:
def tokenize(document):
  """Turn a document string into a list of lower-case, stop-word-free tokens.

  Pipeline:
    1. split on single spaces (contractions such as "you're" stay intact),
    2. lower-case every piece,
    3. remove stop words (this catches contractions before they are cut),
    4. cut every piece at its first run of non-word characters,
    5. remove stop words again (pieces that became stop words or '' in step 4).

  While running, the root logger level is raised to INFO so the per-step
  debug messages below are muted; the previous level is restored on exit,
  including when one of the steps raises.

  Args:
    document: the text to tokenize.

  Returns:
    list: the remaining tokens, in document order.
  """
  root_logger = logging.getLogger()
  logging_level = root_logger.getEffectiveLevel()
  root_logger.setLevel(logging.INFO)

  try:
    words = split_on_space(document) # keep words like you're unchanged so the stop-word pass below can remove them
    logging.debug("Step 1 split on space: {}".format(words))

    words = [lowerCase(w) for w in words]
    logging.debug("Step 2 lower case: {}".format(words))

    words = remove_stop_words(words) # first pass: removes stop words such as you're
    logging.debug("Step 3 1st remove stop words: {}".format(words))

    words = split_on_punctation(words) # cut each word at its first punctuation run
    logging.debug("Step 4 split on punctation: {}".format(words))

    words = remove_stop_words(words) # second pass: removes stop words and '' produced by step 4
    logging.debug("Step 5 2nd remove stop words: {}".format(words))

    return words
  finally:
    # Always restore the caller's level so a failure here cannot leave the
    # root logger muted.
    root_logger.setLevel(logging_level)

# Smoke test for tokenize(): log a sample sentence and its tokens.
if __debug__:
  # The backslash is written as "\\&" because "\&" is not a valid escape
  # sequence; the resulting string is unchanged (a backslash followed by '&').
  test_str = "you'll you'd me my myself I am here. But ,[]\\& is strange to me??!! How do you think"
  logging.debug(test_str)
  logging.debug(tokenize(test_str))

you'll you'd me my myself I am here. But ,[]\& is strange to me??!! How do you think
['strange', 'think']


# Computing word frequencies
`get_vocab(corpus)` computes the word frequencies in a given corpus. It also collects the word frequencies of each individual document in the corpus. It returns two items. The first item, `vocab`, is a `Counter` mapping each token to its frequency over the whole corpus; calling `.most_common(n)` on it yields a list of 2-tuples, each containing a token and its frequency. The second item, `doc_vocab`, is a dictionary keyed by document: each value is a `Counter` of that document's token frequencies, plus a `LENGTH` entry holding the document's total token count.

In [74]:
def get_vocab(corpus):
  """Count token frequencies over a corpus and per document.

  Each document is tokenized with `tokenize`. While running, the root logger
  level is raised to INFO so the per-document debug messages are muted; the
  level that was active on entry is restored on exit, including when
  tokenization raises.

  Args:
    corpus: iterable of document strings.

  Returns:
    tuple: `(vocabulary, doc_vocab)` where
      - `vocabulary` is a Counter mapping token -> frequency over the corpus;
      - `doc_vocab` is a dict mapping each document string to a Counter of
        that document's token frequencies, with an extra 'LENGTH' key holding
        the document's token count. Documents with identical text share one
        entry (the last one processed wins).
  """
  root_logger = logging.getLogger()
  # Capture the level once, before it is changed, so it can be restored
  # exactly instead of being forced to DEBUG.
  logging_level = root_logger.getEffectiveLevel()
  root_logger.setLevel(logging.INFO)

  vocabulary = Counter()
  doc_vocab = {}
  try:
    for document in corpus:
      logging.debug("origin document: {}".format(document))
      tokens = tokenize(document)
      logging.debug("token: {}".format(tokens))

      doc_vocab_collection = Counter(tokens)
      # 'LENGTH' is upper case and tokens are lower-cased by tokenize, so
      # this bookkeeping key does not collide with a real token.
      doc_vocab_collection['LENGTH'] = len(tokens)

      doc_vocab[document] = doc_vocab_collection
      vocabulary.update(tokens)
  finally:
    root_logger.setLevel(logging_level)
  return vocabulary, doc_vocab

# Smoke test for get_vocab(): re-fetch the corpus, then log the corpus-wide
# counter and the per-document counters.
if __debug__:
  # logging.getLogger().setLevel(logging.INFO)
  test_corpus = get_corpus()
  test_vocab, test_doc_vocab = get_vocab(test_corpus)
  logging.debug("vocab: {}".format(test_vocab))
  logging.debug("doc_vocab: {}".format(test_doc_vocab))

Dataset columns: Index(['title', 'content'], dtype='object')
Dataset size: 5354
vocab: Counter({'u': 3, 'sales': 2, 'shares': 2, 'new': 2, 'could': 2, 'tunnel': 2, 'iran': 2, 'syria': 2, 'says': 2, 'johnson': 2, 'exclusive': 1, 'apple': 1, 'makes': 1, 'iphone': 1, 'screen': 1, 'fixes': 1, 'easier': 1, 'states': 1, 'mull': 1, 'repair': 1, 'laws': 1, 'oil': 1, 'ends': 1, 'lower': 1, 'opec': 1, 'output': 1, 'hike': 1, 'stockpile': 1, 'rise': 1, 'seen': 1, 'patience': 1, 'asset': 1, 'businesses': 1, 'seeking': 1, 'opportunities': 1, 'cuba': 1, 'modi': 1, 'bjp': 1, 'vows': 1, 'strip': 1, 'muslim': 1, 'immigrants': 1, 'vote': 1, 'assam': 1, 'oprah': 1, 'effect': 1, 'fails': 1, 'lift': 1, 'weight': 1, 'watchers': 1, 'plunge': 1, 'york': 1, 'jersey': 1, '10': 1, 'transport': 1, 'plan': 1, 'include': 1, 'rail': 1, 'israel': 1, 'calls': 1, 'powers': 1, 'punish': 1, 'missile': 1, 'tests': 1, 'former': 1, 'national': 1, 'security': 1, 'adviser': 1, 'brzezinski': 1, 'dies': 1, '89': 1, 'g7': 1, 'mi

# Compute BoW (Bag-of-Words) Vector
`doc2vec(doc)` returns a bag-of-words vector for document `doc`: one entry per word in the global `vocab`, set to 1 if the word is present in the document and 0 otherwise.

In [0]:
def doc2vec(doc):
  """Build a binary bag-of-words vector for `doc`.

  The document is tokenized with `tokenize`; the result has one entry per
  `(token, freq)` pair of the global `vocab`, in `vocab` order: 1 when the
  token occurs in the document, 0 otherwise.
  """
  present = tokenize(doc)
  vector = []
  for token, freq in vocab:
    if token in present:
      vector.append(1)
    else:
      vector.append(0)
  return vector


Compute the Bag-of-Words vector for each document

Cosine similarity between two numerical vectors

In [0]:
def cosine_similarity(vec_a, vec_b):
  """Return the cosine of the angle between two equal-length numeric vectors.

  Args:
    vec_a, vec_b: sequences of numbers with the same length.

  Returns:
    The dot product divided by the product of the Euclidean norms, or 0 when
    either vector has zero norm (the cosine is undefined there).

  Raises:
    AssertionError: if the vectors differ in length.
  """
  assert len(vec_a) == len(vec_b)
  a_2 = sum(x * x for x in vec_a)
  b_2 = sum(x * x for x in vec_b)
  # Guard on the squared norms rather than on sum(vec): a vector such as
  # [1, -1] sums to 0 but is not a zero vector.
  if a_2 == 0 or b_2 == 0:
    return 0
  a_b = sum(x * y for x, y in zip(vec_a, vec_b))
  return a_b/(math.sqrt(a_2) * math.sqrt(b_2))

In [0]:
def doc_similarity(doc_a, doc_b):
  """Return the cosine similarity of the `doc2vec` vectors of two documents."""
  vec_a = doc2vec(doc_a)
  vec_b = doc2vec(doc_b)
  return cosine_similarity(vec_a, vec_b)


# Find Similar Documents
Find and print the $k$ most similar titles to a given title

In [0]:
def k_similar(seed_id, k):
  """Print the `k` documents of the global `corpus` most similar to a seed.

  The seed document is printed first, then up to `k` documents in order of
  decreasing `doc_similarity` to the seed, each with its score. The seed
  itself is part of the corpus and is therefore a candidate as well.

  Args:
    seed_id: index of the seed document in `corpus`.
    k: number of documents to print; values <= 0 print no documents.

  Returns:
    None. Results are written with `print`.
  """
  seed_doc = corpus[seed_id]
  print('> "{}"'.format(seed_doc))
  similarities = [doc_similarity(seed_doc, doc) for doc in corpus]
  # Indices sorted by ascending similarity; the best k are at the end.
  # https://stackoverflow.com/questions/13070461/get-indices-of-the-top-n-values-of-a-list
  ranked = sorted(range(len(similarities)), key=lambda i: similarities[i])
  # ranked[-0:] would be the whole list, so k <= 0 is handled explicitly.
  top_indices = ranked[-k:] if k > 0 else []
  print()
  for idx in reversed(top_indices):
    print('* "{}" ({})'.format(corpus[idx], similarities[idx]))

# Test our program

In [80]:
#
# 'cal_vocab_idf' calculates the idf of each word.
# The idf of a word is the log of the total number of documents divided by the number of documents in which the word occurs.
#

def cal_vocab_idf(vocab, doc_vocab):
  """Compute the inverse document frequency (idf) of every vocabulary word.

  idf(word) = log(N / df(word)), where N is the number of entries in
  `doc_vocab` and df(word) is the number of those entries that contain the
  word as a key.

  Args:
    vocab: iterable of `(word, frequency)` 2-tuples; only the word is used.
    doc_vocab: dict mapping each document to a mapping of its token counts.

  Returns:
    Counter: word -> idf. A word that occurs in no document gets 0.0 instead
    of raising ZeroDivisionError; such a word has term frequency 0 in every
    document, so its tf-idf weight is 0 either way.
  """
  vocab_idf = Counter()
  total_docs = len(doc_vocab)
  for word, freq in vocab:
    doc_freq = sum(1 for counts in doc_vocab.values() if word in counts)
    vocab_idf[word] = math.log(total_docs / doc_freq) if doc_freq else 0.0
  return vocab_idf

# Smoke test for cal_vocab_idf(): rebuild the corpus and vocabulary, keep the
# 100 most common tokens, and log the inputs and the resulting idf values.
if __debug__:
  test_corpus = get_corpus()
  test_vocab, test_doc_vocab = get_vocab(test_corpus)
  test_vocab = test_vocab.most_common(100)
  logging.debug("corpus: {}".format(test_corpus))
  # Label fixed from "vocabpus": this line dumps the vocabulary.
  logging.debug("vocab: {}".format(test_vocab))
  logging.debug("doc_vocab: {}".format(test_doc_vocab))
  test_vocab_idf = cal_vocab_idf(test_vocab, test_doc_vocab)
  logging.debug("test_vocab_idf: {}".format(test_vocab_idf))

Dataset columns: Index(['title', 'content'], dtype='object')
Dataset size: 5354
corpus: ['Exclusive: Apple makes iPhone screen fixes easier as states mull repair laws', 'Oil ends lower on OPEC output hike; U.S. stockpile rise seen', 'Patience an asset for U.S. businesses seeking opportunities in Cuba', 'Modi’s BJP vows to strip Muslim immigrants of vote in Assam', 'Oprah effect fails to lift Weight Watchers sales; shares plunge', 'New York, New Jersey 10-year transport plan could include rail tunnel', 'Israel calls on powers to punish Iran for its missile tests', 'Former U.S. national security adviser Brzezinski dies at 89', 'G7 ministers look to persuade Russia to abandon Syria’s Assad', 'Mosul train tunnel reveals assault course for elite Islamic State fighters', 'Trump says rival Cruz’s Canadian birthplace could be ’big problem’: Washington Post', 'Silicon Valley fights for its life on immigration', 'Oracle lawyer says Google’s Android generated $31 billion revenue', 'Rubio falters 

In [81]:
def cal_doc_tfidf_vec(vocab, doc_vocab, vocb_idf):
  """Build a tf-idf vector for every document.

  For each document and each vocabulary word the weight is
  (count of the word in the document / document token count) * idf(word).

  Args:
    vocab: iterable of `(word, frequency)` 2-tuples; fixes the vector order.
    doc_vocab: dict mapping each document to its token counts; the entry
      under 'LENGTH' is the document's total token count.
    vocb_idf: mapping word -> idf.

  Returns:
    dict: document -> list of tf-idf weights, one per vocabulary word. A
    document whose token count is 0 gets an all-zero vector instead of
    raising ZeroDivisionError.
  """
  doc_tfidf_vec = {}
  for document, counts in doc_vocab.items():
    length = counts['LENGTH']
    if length == 0:
      # No tokens survived tokenization, so every term frequency is 0.
      doc_tfidf_vec[document] = [0.0 for _ in vocab]
      continue
    doc_tfidf_vec[document] = [
        (counts[word] / length) * vocb_idf[word] for word, freq in vocab
    ]
  return doc_tfidf_vec

# Smoke test for cal_doc_tfidf_vec(): rebuild the corpus, the 100 most common
# tokens and their idf values, then log the per-document tf-idf vectors.
if __debug__:
  test_corpus = get_corpus()
  test_vocab, test_doc_vocab = get_vocab(test_corpus)
  test_vocab = test_vocab.most_common(100)
  test_vocab_idf = cal_vocab_idf(test_vocab, test_doc_vocab)
  #logging.debug("corpus: {}".format(test_corpus))
  #logging.debug("vocabpus: {}".format(test_vocab))
  #logging.debug("doc_vocab: {}".format(test_doc_vocab))
  #logging.debug("test_vocab_idf: {}".format(test_vocab_idf))
  
  test_doc_tfidf_vec = cal_doc_tfidf_vec(test_vocab, test_doc_vocab, test_vocab_idf)
  logging.debug("test_doc_tfidf_vec: {}".format(test_doc_tfidf_vec))

Dataset columns: Index(['title', 'content'], dtype='object')
Dataset size: 5354
test_doc_tfidf_vec: {'Exclusive: Apple makes iPhone screen fixes easier as states mull repair laws': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.2723392975958174, 0.2723392975958174, 0.2723392975958174, 0.2723392975958174, 0.2723392975958174, 0.2723392975958174, 0.2723392975958174, 0.2723392975958174, 0.2723392975958174, 0.2723392975958174, 0.2723392975958174, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], 'Oil ends lower on OPEC output hike; U.S. stockpile rise seen': [0.18971199848858813, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0

In [82]:
# Build the module-level state used by doc2vec() and k_similar():
# `corpus` (list of headlines) and `vocab` (the 100 most common tokens as
# (token, frequency) tuples).
corpus = get_corpus()
vocab, doc_vocab = get_vocab(corpus)
vocab = vocab.most_common(100)
vocab_idf = cal_vocab_idf(vocab, doc_vocab)
# NOTE(review): doc_tfidf_vec is computed here, but k_similar() ranks documents
# through doc_similarity() -> doc2vec(), i.e. binary bag-of-words vectors; the
# tf-idf vectors are not used by the ranking in the code shown.
doc_tfidf_vec = cal_doc_tfidf_vec(vocab, doc_vocab, vocab_idf)

print(vocab)
# Print the 5 headlines most similar to the headline at index 10.
k_similar(10, 5)

Dataset columns: Index(['title', 'content'], dtype='object')
Dataset size: 5354


[('u', 3), ('sales', 2), ('shares', 2), ('new', 2), ('could', 2), ('tunnel', 2), ('iran', 2), ('syria', 2), ('says', 2), ('johnson', 2), ('exclusive', 1), ('apple', 1), ('makes', 1), ('iphone', 1), ('screen', 1), ('fixes', 1), ('easier', 1), ('states', 1), ('mull', 1), ('repair', 1), ('laws', 1), ('oil', 1), ('ends', 1), ('lower', 1), ('opec', 1), ('output', 1), ('hike', 1), ('stockpile', 1), ('rise', 1), ('seen', 1), ('patience', 1), ('asset', 1), ('businesses', 1), ('seeking', 1), ('opportunities', 1), ('cuba', 1), ('modi', 1), ('bjp', 1), ('vows', 1), ('strip', 1), ('muslim', 1), ('immigrants', 1), ('vote', 1), ('assam', 1), ('oprah', 1), ('effect', 1), ('fails', 1), ('lift', 1), ('weight', 1), ('watchers', 1), ('plunge', 1), ('york', 1), ('jersey', 1), ('10', 1), ('transport', 1), ('plan', 1), ('include', 1), ('rail', 1), ('israel', 1), ('calls', 1), ('powers', 1), ('punish', 1), ('missile', 1), ('tests', 1), ('former', 1), ('national', 1), ('security', 1), ('adviser', 1), ('brzezi