<a href="https://colab.research.google.com/github/chilung/EmotionX2020/blob/master/lab2_0686028.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import logging

# Configure the root logger at INFO with a bare message format; at this level
# logging.debug() calls are suppressed.
logging.basicConfig(level=logging.INFO, format='%(message)s')

# Emit one message per severity to show which levels are displayed.
logging.debug('Hello Debug')
logging.info('Hello Info')
logging.warning('Hello Warning')
logging.error('Hello Error')
logging.critical('Hello Critical')

Hello Info
Hello Error
Hello Critical


In [0]:
import nltk
from nltk import ngrams
from nltk import pos_tag
import re
import pandas as pd
from io import open
import glob
import os
import numpy as np
import math
from collections import OrderedDict
from collections import Counter
import torch
import torchvision
import heapq
import unicodedata
import string

# Hands-On Lab2 Descriptions

1. Calculate and print the 
* 5 most frequent 2-grams 
* from the Reuters news dataset (content) available at bit.ly/nlp-reuters
* where both tokens are PROPER NOUNS
* using NLTK word_tokenize, POS tagger
* No need to remove punctuation, no need to remove stop words
2. Calculate and print the
* 5 most similar articles to seed_id = <student_ID> % 1000
* from the Buzzfeed news dataset (content), available at bit.ly/nlp-buzzfeed
* Tokens are lemma + POS (e.g., “give_VERB”)
* using the SpaCy POS tagger and tokenizer
* Use en_core_web_sm model
* Remove stopwords
* TF-IDF with 512 features (most common tokens)
* Can use TF-IDF code from here: Medium article
* Can use library function to compute cosine_similarity
3. Use the same Notebook

# Task 1: Subroutines
Calculate and print the
* 5 most frequent 2-grams
* from the Reuters news dataset (content) available at bit.ly/nlp-reuters
* where both tokens are PROPER NOUNS
* using NLTK word_tokenize, POS tagger
* No need to remove punctuation, no need to remove stop words

In [3]:
# Fetch the NLTK data packages for the stop-word list and the 'punkt' tokenizer.
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
nltk.download('punkt')
# stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [0]:
def get_corpus(corpus_path, req_size, to_lower = True, title_only=False):
  """Read a CSV with `title` and `content` columns into a title list and a title-keyed dict.

  Args:
    corpus_path: path or URL handed to `pd.read_csv`.
    req_size: maximum number of rows to keep, or the string 'ALL' for every row.
    to_lower: when True, lower-case both titles and contents.
    title_only: when True, each dict value is the title itself instead of the content.

  Returns:
    (titles, corpus): `titles` is the list of (possibly normalised) titles in
    file order; `corpus` maps each title to its content (or to the title when
    `title_only` is True). Missing titles/contents are replaced by "NULL".
    Rows sharing a title collapse to a single dict entry (the last one wins).
  """
  df = pd.read_csv(corpus_path)
  logging.debug("Dataset columns: {}".format(df.columns))
  logging.debug("Dataset size: {}".format(len(df)))

  total_rows = len(df)
  # 'ALL' (or any size not below the row count) selects every row.
  if req_size == 'ALL' or not req_size < total_rows:
    limit = total_rows
  else:
    limit = req_size

  titles = df.title.to_list()[0:limit]
  contents = df.content.to_list()[0:limit]

  corpus = {}
  for position, (title, content) in enumerate(zip(titles, contents)):
    title = "NULL" if pd.isnull(title) else title
    content = "NULL" if pd.isnull(content) else content

    if to_lower == True:
      title = title.lower()
      content = content.lower()

    # The returned title list carries the normalised title as well.
    titles[position] = title
    corpus[title] = title if title_only == True else content

  return titles, corpus

# Debug-only smoke test: runs only when the root logger's effective level is DEBUG.
# Loads the first 100 Reuters rows and logs up to 50 titles and contents.
if logging.getLogger().getEffectiveLevel() == logging.DEBUG:
  corpus_title, corpus = get_corpus('https://raw.githubusercontent.com/bshmueli/108-nlp/master/reuters.csv', 100, to_lower=False, title_only=False)
  logging.debug(corpus_title[0:50])
  for index, title in enumerate(corpus):
    if index < 50:
      logging.debug("index: {}, content:{}".format(index, corpus[title]))

In [0]:
# (spaced form, merged form) pairs: abbreviations written with an inner space
# and the compact spelling they are rewritten to.
sep_words = [
  ('U. S.', 'U.S.'),
  ('U. N.', 'U.N.'),
]

def sep_word_preprocessing(doc):
  """Return `doc` with every spaced abbreviation in `sep_words` replaced by its merged form."""
  merged_doc = doc
  for spaced, compact in sep_words:
    merged_doc = merged_doc.replace(spaced, compact)
  return merged_doc

# Debug-only check: runs only when the root logger's effective level is DEBUG.
# NOTE(review): looks up one hard-coded title in `corpus`; raises KeyError if
# that title is not among the loaded documents.
if logging.getLogger().getEffectiveLevel() == logging.DEBUG:
  logging.debug(sep_word_preprocessing(corpus['Exclusive: Apple makes iPhone screen fixes easier as states mull repair laws']))

In [0]:
#
# use library word_tokenize
#
def get_tokens_nltk(corpus):
  """Tokenize every document of `corpus` with NLTK and return one flat token list.

  Args:
    corpus: dict mapping a title to the document text.

  Returns:
    A single list with the `word_tokenize` tokens of all documents, in the
    dict's iteration order. Each document first goes through
    `sep_word_preprocessing`.
  """
  tokens = []
  for index, title in enumerate(corpus):
    if (index % 1000) == 0:
      logging.info("word_tokenize document {}".format(index))
    doc = sep_word_preprocessing(corpus[title])
    # Extend in place: `tokens = tokens + [...]` re-copied the whole
    # accumulated list for every document (quadratic in corpus size).
    tokens.extend(word_tokenize(doc))

  return tokens

# Debug-only check: runs only when the root logger's effective level is DEBUG.
# Tokenizes the currently loaded `corpus` and logs the first 100 tokens.
if logging.getLogger().getEffectiveLevel() == logging.DEBUG:
  tokens = get_tokens_nltk(corpus)
  logging.debug(tokens[0:100])

In [0]:
#
# use stopwords
#
def remove_stopwords(tokens):
  """Return the tokens that are not NLTK English stop words, preserving order.

  The input is processed in slices of 100000 tokens purely so that progress
  can be logged; the result is the same as filtering the whole list at once.

  Args:
    tokens: list of hashable tokens (strings in this notebook).

  Returns:
    A new list containing every token not present in
    `stopwords.words('english')`. Matching is exact and case-sensitive.
  """
  logging.info("Total Tokens: {}".format(len(tokens)))

  # Build the stop-word set once. The original evaluated
  # stopwords.words('english') for every single token and searched the
  # resulting list linearly, which dominated the run time.
  stop_set = frozenset(stopwords.words('english'))

  step = 100000
  total = len(tokens)
  full_chunks = total // step
  new_tokens = []

  for i in range(full_chunks):
    start, end = i*step, (i+1)*step
    logging.info("Process removing stop words: {} to {}".format(start, end))
    new_tokens.extend(token for token in tokens[start:end] if token not in stop_set)

  # Remaining tail (possibly empty) after the last full slice.
  tail_start = full_chunks*step
  logging.info("Process removing stop words: {} to {}".format(tail_start, total))
  new_tokens.extend(token for token in tokens[tail_start:total] if token not in stop_set)

  return new_tokens

# Debug-only check: runs only when the root logger's effective level is DEBUG.
# Rebinds the global `tokens` to its stop-word-filtered version.
if logging.getLogger().getEffectiveLevel() == logging.DEBUG:
  tokens = remove_stopwords(tokens)
  logging.debug(tokens[0:100])

In [0]:
def get_ngram(tokens, n):
  """Return the n-grams that NLTK's `ngrams` produces for `tokens`, materialised as a list."""
  gram_iter = ngrams(tokens, n)
  return list(gram_iter)

# Debug-only check: runs only when the root logger's effective level is DEBUG.
# Builds 2-grams from the global `tokens` and logs the first 100.
if logging.getLogger().getEffectiveLevel() == logging.DEBUG:
  two_grams = get_ngram(tokens, 2)
  logging.debug(two_grams[0:100])

In [9]:
#
# Download the NLTK 'averaged_perceptron_tagger' package, needed for
# part-of-speech tagging.
#
nltk.download('averaged_perceptron_tagger')

def to_token_pos(tokens):
  """Tag `tokens` with NLTK's `pos_tag` and return its result (token/tag pairs)."""
  return pos_tag(tokens)

# Debug-only check: runs only when the root logger's effective level is DEBUG.
# Tags the global `tokens` and logs the first 100 (token, tag) pairs.
if logging.getLogger().getEffectiveLevel() == logging.DEBUG:
  logging.debug(to_token_pos(tokens)[0:100])

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [0]:
def find_proper_noun_pairs(two_grams):
  """Keep the 2-grams whose two tokens are both tagged 'NNP'.

  Args:
    two_grams: sequence of (token, token) pairs.

  Returns:
    (nnp_word_pairs, nnp_word_pos_pairs): the first list holds the matching
    (word1, word2) tuples; the second holds the same pairs with each word
    suffixed by '_' and its tag.

  NOTE(review): each pair is passed to `to_token_pos` on its own, so the
  tagger only sees two tokens at a time, without the surrounding sentence.
  Tagging the full token stream once and pairing the tags afterwards would
  likely be more accurate and faster, but it would change the results, so it
  is not done here.
  """
  nnp_word_pairs = []
  nnp_word_pos_pairs = []

  logging.info("Total Two-Gram Pairs: {}".format(len(two_grams)))
  for index, pair in enumerate(two_grams):
    if (index % 100000) == 0:
      logging.info("Checking Proper Noun Pairs. Progress: {}".format(index))
    (word1, pos1), (word2, pos2) = to_token_pos(pair)
    if pos1 == 'NNP' and pos2 == 'NNP':
      # append() instead of `lst = lst + [x]`, which copied the whole list
      # on every match (quadratic in the number of matches).
      nnp_word_pairs.append((word1, word2))
      nnp_word_pos_pairs.append((word1+'_'+pos1, word2+'_'+pos2))

  return nnp_word_pairs, nnp_word_pos_pairs

# Debug-only check: runs only when the root logger's effective level is DEBUG.
# Filters the global `two_grams` and logs the first 100 results of each list.
if logging.getLogger().getEffectiveLevel() == logging.DEBUG:
  proper_noun_pairs, proper_noun_pos_pairs = find_proper_noun_pairs(two_grams)
  logging.debug(proper_noun_pairs[0:100])
  logging.debug(proper_noun_pos_pairs[0:100])

In [0]:
def nnp_pairs_counter(proper_noun_pairs):
  """Count how often each pair occurs in `proper_noun_pairs`.

  Logs the total number of pairs, then a progress line every 100000 pairs.
  Returns a `Counter` keyed by pair.
  """
  logging.info("Total Proper Noun Pairs: {}".format(len(proper_noun_pairs)))
  pair_counts = Counter()
  for position, pair in enumerate(proper_noun_pairs):
    if position % 100000 == 0:
      logging.info("Counter Collection, Progress: {}".format(position))
    pair_counts[pair] += 1

  return pair_counts

# Debug-only check: runs only when the root logger's effective level is DEBUG.
# Counts the global `proper_noun_pairs` and logs the counter and its top 5.
if logging.getLogger().getEffectiveLevel() == logging.DEBUG:
  nnp_pairs_collection = nnp_pairs_counter(proper_noun_pairs)
  logging.debug(nnp_pairs_collection)
  logging.debug(nnp_pairs_collection.most_common(5))

# Task 1: Main Function

In [12]:
# Task 1 pipeline: load -> tokenize -> drop stop words -> 2-grams -> keep NNP pairs -> count.

# Load the full Reuters dataset with its original casing.
corpus_title, corpus = get_corpus('https://raw.githubusercontent.com/bshmueli/108-nlp/master/reuters.csv', 'ALL', to_lower=False, title_only=False)
logging.debug("corpus document number: {}".format(len(corpus)))
#logging.debug(corpus)

# One flat token list across all documents.
tokens = get_tokens_nltk(corpus)
logging.debug(tokens)

# NOTE(review): the lab statement says stop words need not be removed.
# Dropping them before building 2-grams makes tokens that were separated by a
# stop word adjacent, so some counted pairs were not contiguous in the text.
tokens = remove_stopwords(tokens)
logging.debug(tokens)

# 2-grams are formed over the single flat list, so they also span document boundaries.
two_grams = get_ngram(tokens, 2)
logging.debug(two_grams)

proper_noun_pairs, proper_noun_pos_pairs = find_proper_noun_pairs(two_grams)
logging.debug(proper_noun_pairs)
logging.debug(proper_noun_pos_pairs)

# Report the five most frequent proper-noun pairs.
nnp_pairs_collection = nnp_pairs_counter(proper_noun_pairs)
logging.debug(nnp_pairs_collection)
logging.info(nnp_pairs_collection.most_common(5))

word_tokenize document 0
word_tokenize document 1000
word_tokenize document 2000
word_tokenize document 3000
word_tokenize document 4000
word_tokenize document 5000
Total Tokens: 4094546
Process removing stop words: 0 to 100000
Process removing stop words: 100000 to 200000
Process removing stop words: 200000 to 300000
Process removing stop words: 300000 to 400000
Process removing stop words: 400000 to 500000
Process removing stop words: 500000 to 600000
Process removing stop words: 600000 to 700000
Process removing stop words: 700000 to 800000
Process removing stop words: 800000 to 900000
Process removing stop words: 900000 to 1000000
Process removing stop words: 1000000 to 1100000
Process removing stop words: 1100000 to 1200000
Process removing stop words: 1200000 to 1300000
Process removing stop words: 1300000 to 1400000
Process removing stop words: 1400000 to 1500000
Process removing stop words: 1500000 to 1600000
Process removing stop words: 1600000 to 1700000
Process removing stop

# Task 1: Output Result

In [13]:
# Task 1 answer: the five most frequent proper-noun 2-grams with their counts.
logging.info(nnp_pairs_collection.most_common(5))

[(('Donald', 'Trump'), 3212), (('New', 'York'), 2442), (('Islamic', 'State'), 1952), (('President', 'Donald'), 1922), (('North', 'Korea'), 1650)]


# Task 2: Subroutines
Calculate and print the
* 5 most similar articles to seed_id = student_ID % 1000
* from the Buzzfeed news dataset (content), available at bit.ly/nlp-buzzfeed
* Tokens are lemma + POS (e.g., “give_VERB”)
* using the SpaCy POS tagger and tokenizer
* Use en_core_web_sm model
* Remove stopwords
* TF-IDF with 512 features (most common tokens)
* Can use TF-IDF code from here: Medium article
* Can use library function to compute cosine_similarity
* Use the same Notebook

In [0]:
import spacy
# Shared spaCy pipeline (small English model) used by get_tokens_spacy.
nlp = spacy.load("en_core_web_sm")

#
# use library spacy nlp
#
def get_tokens_spacy(document):
  """Run `document` through the global spaCy pipeline `nlp` and return its tokens as a list."""
  return list(nlp(document))

# Debug-only check: runs only when the root logger's effective level is DEBUG.
# Loads 10 Buzzfeed rows (title_only=True, so each value is the title itself)
# and collects the spaCy tokens of every entry into the global `tokens`.
if logging.getLogger().getEffectiveLevel() == logging.DEBUG:
  corpus_title, corpus = get_corpus('https://raw.githubusercontent.com/bshmueli/108-nlp/master/buzzfeed.csv', 10, to_lower=False, title_only=True)
  logging.debug("corpus document number: {}".format(len(corpus)))
  logging.debug("Corpus:\n{}".format(corpus))
  
  tokens = []
  for title in corpus:
    tokens = tokens + get_tokens_spacy(corpus[title])
  logging.debug(tokens)

In [0]:
#
# use stopwords from spacy tokens
#
def remove_stop_punct_space(tokens):
  """Return the tokens whose `is_stop`, `is_punct` and `is_space` flags are all falsy, in order."""
  kept = []
  for token in tokens:
    if not token.is_stop and not token.is_punct and not token.is_space:
      kept.append(token)
  return kept

# Debug-only check: runs only when the root logger's effective level is DEBUG.
# Rebinds the global `tokens` to the filtered spaCy tokens.
if logging.getLogger().getEffectiveLevel() == logging.DEBUG:
  tokens = remove_stop_punct_space(tokens)
  logging.debug(tokens)

In [0]:
#
# change token to lemma+pos
#
def token_to_lemma_pos(tokens):
  """Map each token to the string '<lemma_>_<pos_>' (e.g. 'give_VERB')."""
  labelled = []
  for token in tokens:
    labelled.append('_'.join((token.lemma_, token.pos_)))
  return labelled

# Debug-only check: runs only when the root logger's effective level is DEBUG.
# Rebinds the global `tokens` from spaCy tokens to 'lemma_POS' strings.
if logging.getLogger().getEffectiveLevel() == logging.DEBUG:
  tokens = token_to_lemma_pos(tokens)
  logging.debug(tokens)

In [0]:
def tokenize(document, verbose=False):
  """Turn a document string into a list of 'lemma_POS' tokens.

  Steps: spaCy tokenization, removal of stop words / punctuation / whitespace
  tokens, then conversion of each remaining token to 'lemma_POS'. When
  `verbose` is True each intermediate result is logged at DEBUG level.
  """
  show_steps = (verbose == True)

  spacy_tokens = get_tokens_spacy(document)
  if show_steps:
    logging.debug("NLP tokens:\n{}".format(spacy_tokens))

  content_tokens = remove_stop_punct_space(spacy_tokens)
  if show_steps:
    logging.debug("Remove Stopwords:\n{}".format(content_tokens))

  labelled_tokens = token_to_lemma_pos(content_tokens)
  if show_steps:
    logging.debug("Lemma+POS:\n{}".format(labelled_tokens))

  return labelled_tokens

# Debug-only check: runs only when the root logger's effective level is DEBUG.
# Loads 100 Buzzfeed rows and tokenizes the first 10 documents verbosely.
if logging.getLogger().getEffectiveLevel() == logging.DEBUG:
  corpus_title, corpus = get_corpus('https://raw.githubusercontent.com/bshmueli/108-nlp/master/buzzfeed.csv', 100, to_lower=False, title_only=False)
  logging.debug("corpus document number: {}".format(len(corpus)))
  logging.debug("Corpus:\n{}".format(corpus))
  
  for index, title in enumerate(corpus):
    if index < 10:
      tokens = tokenize(corpus[title], verbose=True)
      logging.debug(tokens)

In [0]:
def tokens_counter(tokens):
  """Return a `Counter` with the number of occurrences of each item in `tokens`."""
  counts = Counter()
  for token in tokens:
    counts[token] += 1

  return counts

# Debug-only check: runs only when the root logger's effective level is DEBUG.
# Counts the global `tokens` and logs the counter and its five most common entries.
if logging.getLogger().getEffectiveLevel() == logging.DEBUG:
  tokens_collection = tokens_counter(tokens)
  logging.debug(tokens_collection)
  logging.debug(tokens_collection.most_common(5))

In [19]:
# Side experiment: print how NLTK's TweetTokenizer splits a sample sentence.
from nltk.tokenize import TweetTokenizer
tokenizer = TweetTokenizer()
print(tokenizer.tokenize("Taiwan gives 400,000 masks to U.S. under cooperation arrangement. #TaiwanCanHelp :)"))

['Taiwan', 'gives', '400,000', 'masks', 'to', 'U', '.', 'S', '.', 'under', 'cooperation', 'arrangement', '.', '#TaiwanCanHelp', ':)']


In [20]:
# Side experiment: print how NLTK's sent_tokenize splits a sample text into sentences.
from nltk.tokenize import sent_tokenize
print(sent_tokenize("Hello! and welcome to U.S.! How are you? I feel so happy... and you?"))

['Hello!', 'and welcome to U.S.!', 'How are you?', 'I feel so happy... and you?']


# Computing word frequencies
`get_vocab(corpus_collection)` computes token frequencies for a corpus and for each individual document in it. It returns two items. The first, `vocabulary`, is a `Counter` mapping each token to its frequency across the whole corpus. The second, `doc_vocab`, is a dictionary mapping each document title to a `Counter` of that document's token frequencies, plus a `'LENGTH'` entry holding the document's total token count.

In [0]:
def get_vocab(corpus_collection, verbose=False):
  """Count tokens over a whole corpus and per document.

  Args:
    corpus_collection: dict mapping a title to the document text.
    verbose: when True, log each raw document and its tokens at DEBUG level.

  Returns:
    (vocabulary, doc_vocab): `vocabulary` is a Counter of token frequencies
    over all documents; `doc_vocab` maps each title to a Counter of that
    document's token frequencies with an extra 'LENGTH' key holding the
    document's token count.
  """
  log_details = (verbose == True)
  vocabulary = Counter()
  doc_vocab = {}

  for title, text in corpus_collection.items():
    if log_details:
      logging.debug("origin document: {}".format(text))
    tokens = tokenize(text)
    if log_details:
      logging.debug("token: {}".format(tokens))

    per_doc_counts = Counter(tokens)
    # Sentinel entry: total number of tokens in this document.
    per_doc_counts['LENGTH'] = len(tokens)
    doc_vocab[title] = per_doc_counts

    vocabulary.update(tokens)

  return vocabulary, doc_vocab

# Debug-only check: runs only when the root logger's effective level is DEBUG.
# Builds the vocabulary for the first 20 Buzzfeed rows and logs both results.
if logging.getLogger().getEffectiveLevel() == logging.DEBUG:
  corpus_title, corpus = get_corpus('https://raw.githubusercontent.com/bshmueli/108-nlp/master/buzzfeed.csv', 20, to_lower=False, title_only=False)
  logging.debug("corpus document number: {}".format(len(corpus)))
  logging.debug("Corpus:\n{}".format(corpus))
  
  vocab, doc_vocab = get_vocab(corpus, verbose=True)
  logging.debug("vocab: {}".format(vocab))
  logging.debug("doc_vocab: {}".format(doc_vocab))

# Compute Words' IDF and TF-IDF

In [0]:
#
# 'cal_vocab_idf' calculates the idf of each word.
# The idf of word is given by log of the result of total number of documents divided by the number of documents in which the word happens.
#

def cal_vocab_idf(vocab, doc_vocab):
  """Compute the inverse document frequency of every word in `vocab`.

  idf(word) = log(number of documents / number of documents containing word)

  Args:
    vocab: iterable of (word, frequency) 2-tuples; only the word is used.
    doc_vocab: dict mapping a document key to a container of that document's
      words (a Counter in this notebook); membership is tested with `in`.

  Returns:
    A Counter mapping each word to its idf. A word that occurs in no document
    gets an idf of 0.0 instead of raising ZeroDivisionError.
  """
  total_docs = len(doc_vocab)
  vocab_idf = Counter()
  for word, _freq in vocab:
    doc_freq = sum(1 for document in doc_vocab if word in doc_vocab[document])
    if doc_freq == 0:
      # Unseen word: avoid dividing by zero; its term frequency is 0 anyway.
      vocab_idf[word] = 0.0
    else:
      vocab_idf[word] = math.log(total_docs / doc_freq)
  return vocab_idf

# Debug-only check: runs only when the root logger's effective level is DEBUG.
# Rebinds the global `vocab` from a Counter to its 100 most common
# (word, frequency) tuples before computing the idf values.
if logging.getLogger().getEffectiveLevel() == logging.DEBUG:
  vocab = vocab.most_common(100)
  vocab_idf = cal_vocab_idf(vocab, doc_vocab)
  logging.debug("test_vocab_idf: {}".format(vocab_idf))

In [0]:
def cal_doc_tfidf_vec(vocab, doc_vocab, vocb_idf):
  """Build one TF-IDF vector per document, with one component per `vocab` word.

  Component = (count of word in document / document length) * idf(word)

  Args:
    vocab: sequence of (word, frequency) 2-tuples; fixes the vector layout.
    doc_vocab: dict mapping a document key to a Counter of its word counts,
      including a 'LENGTH' entry with the document's total token count.
    vocb_idf: mapping from word to its idf.

  Returns:
    dict mapping each document key to a list of floats, ordered like `vocab`.
    A document whose 'LENGTH' is 0 gets an all-zero vector instead of raising
    ZeroDivisionError.
  """
  words = [word for word, _freq in vocab]
  doc_tfidf_vec = {}
  for document in doc_vocab:
    counts = doc_vocab[document]
    length = counts['LENGTH']
    if length == 0:
      # No tokens survived tokenization: every term frequency is 0.
      doc_tfidf_vec[document] = [0.0] * len(words)
      continue
    # Built in one pass; the original re-copied the vector for every word.
    doc_tfidf_vec[document] = [(counts[word] / length) * vocb_idf[word] for word in words]
  return doc_tfidf_vec

# Debug-only check: runs only when the root logger's effective level is DEBUG.
# Computes and logs the TF-IDF vectors from the current globals.
if logging.getLogger().getEffectiveLevel() == logging.DEBUG:
  doc_tfidf_vec = cal_doc_tfidf_vec(vocab, doc_vocab, vocab_idf)
  logging.debug("test_doc_tfidf_vec: {}".format(doc_tfidf_vec))

Compute BoW (Bag-of-Words) Vector
`doc2vec(doc)` returns a bag-of-words vector for document `doc`: one entry per word in `vocab`, set to 1 if the word is present in the document and 0 otherwise.

Compute the Bag-of-Words vector for each document

In [0]:
def doc2vec(doc):
  """Return the bag-of-words presence vector of the document stored under key `doc`.

  Reads the module-level `corpus` (title -> text) and `vocab` (sequence of
  (word, frequency) tuples). The result has one entry per `vocab` word: 1 if
  the word occurs among the document's tokens, else 0.
  """
  # Fix: the original read `corpus_collection`, which is only a parameter name
  # of get_vocab and never exists as a global, so any call raised NameError.
  # A set makes each membership test O(1).
  words = set(tokenize(corpus[doc]))
  return [1 if token in words else 0 for token, freq in vocab]

Cosine similarity between two numerical vectors

In [0]:
def cosine_similarity(vec_a, vec_b):
  """Return the cosine of the angle between two equal-length numeric vectors.

  Args:
    vec_a, vec_b: sequences of numbers with the same length.

  Returns:
    dot(a, b) / (|a| * |b|), or 0 when either vector has zero norm (the
    cosine is undefined there; 0 is the convention this notebook uses).

  Raises:
    AssertionError: if the vectors differ in length.
  """
  assert len(vec_a) == len(vec_b)
  a_2 = sum(i*i for i in vec_a)
  b_2 = sum(i*i for i in vec_b)
  # Guard on the squared norms, not on sum(vec): a vector such as [1, -1]
  # sums to 0 but is not the zero vector and must not short-circuit to 0.
  if a_2 == 0 or b_2 == 0:
    return 0
  a_b = sum(x * y for x, y in zip(vec_a, vec_b))
  return a_b/(math.sqrt(a_2) * math.sqrt(b_2))

In [0]:
# tfidf_vector = False
# Selects which document representation doc_similarity compares:
# True -> TF-IDF vectors, False -> bag-of-words vectors.
tfidf_vector = True

if tfidf_vector:
  #
  # calculate the similarity based on word tfidf vector
  #
  def doc_similarity(doc_a, doc_b):
    """Cosine similarity of the vectors stored in the global `doc_tfidf_vec` under keys `doc_a` and `doc_b`."""
    # logging.debug("tfidf {}, {}".format(doc_a, doc_b))
    return cosine_similarity(doc_tfidf_vec[doc_a], doc_tfidf_vec[doc_b])
else:
  #
  # calculate the similarity based on word bow vector
  #
  def doc_similarity(doc_a, doc_b):
    """Cosine similarity of the bag-of-words vectors that `doc2vec` returns for `doc_a` and `doc_b`."""
    # logging.debug("bow {}, {}".format(doc_a, doc_b))
    return cosine_similarity(doc2vec(doc_a), doc2vec(doc_b))
  
# Debug-only check: runs only when the root logger's effective level is DEBUG.
# Logs the similarity of the sixth title (index 5) against every key of `corpus`.
if logging.getLogger().getEffectiveLevel() == logging.DEBUG:
  seed_doc = corpus_title[5]
  logging.debug('> "{}"'.format(seed_doc))
  similarities = [doc_similarity(seed_doc, doc) for id, doc in enumerate(corpus)]
  logging.debug(similarities)

# Find Similar Documents
Find and print the $k$ most similar titles to a given title

In [0]:
def k_similar(seed_id, k):
  """Print the `k` titles most similar to the title at index `seed_id`.

  Reads the module-level `corpus_title` list and scores every title against
  the seed with `doc_similarity`. The seed itself is part of the ranking.

  Args:
    seed_id: index into `corpus_title` of the seed document.
    k: number of results to print; values <= 0 print no results.

  Output:
    The seed title, a blank line, then one line per result in descending
    similarity. Among equal scores, the later index is listed first (same
    order as before this fix).
  """
  seed_title = corpus_title[seed_id]
  print('> "{}"'.format(seed_title))
  similarities = [doc_similarity(seed_title, title) for title in corpus_title]
  logging.debug("Similiarities: {}".format(similarities))
  # Rank indices by ascending similarity, then read from the top end.
  # https://stackoverflow.com/questions/13070461/get-indices-of-the-top-n-values-of-a-list
  ranked = sorted(range(len(similarities)), key=lambda i: similarities[i])
  # Fix: the original sliced `[-k:]`, which for k == 0 is `[0:]` and therefore
  # returned every document instead of none.
  top_indices = ranked[::-1][:max(k, 0)]
  print()
  for doc_index in top_indices:
    print('* "{}" ({})'.format(corpus_title[doc_index], similarities[doc_index]))

# Task 2: Main Function

In [0]:
# Task 2 pipeline: load the full Buzzfeed dataset, count tokens, keep the 512
# most common tokens as features, then build idf values and per-document
# TF-IDF vectors. These globals are read later by doc_similarity / k_similar.
corpus_title, corpus = get_corpus('https://raw.githubusercontent.com/bshmueli/108-nlp/master/buzzfeed.csv', 'ALL', to_lower=False, title_only=False)
vocab, doc_vocab = get_vocab(corpus)
# From here on `vocab` is a list of (token, frequency) tuples, no longer a Counter.
vocab = vocab.most_common(512)
vocab_idf = cal_vocab_idf(vocab, doc_vocab)
doc_tfidf_vec = cal_doc_tfidf_vec(vocab, doc_vocab, vocab_idf)

In [29]:
#
# My Student ID = 0686028, so seed_id = 686028 % 1000 = 28
#
k_similar(686028 % 1000, 5)


> "A Man On A Bus Was Seen With A 30 Gallon Tub Of Cinnabon Frosting And People Have So Many Questions"

* "A Man On A Bus Was Seen With A 30 Gallon Tub Of Cinnabon Frosting And People Have So Many Questions" (1.0)
* "Hollywood’s Forgotten Gay Romance" (0.4048519047948762)
* "How The Best Podcast Of The Year Was Made" (0.38040469572531127)
* "Invasion Of The Big-Brained Sci-Fi Blockbuster!" (0.30025812091745924)
* "Sean Penn Says He Has “Nothin’ To Hide” Regarding El Chapo Interview" (0.2750500512155769)


In [30]:
#
# The article at (my student id % 1000) is too generic to judge the results by,
# so a few more seed articles are tested here.
#
k_similar(55, 5)
print('\n\n')
k_similar(33, 5) 

> "Zika Is Spreading Fast In Puerto Rico, Could Reach 25% Of Population"

* "Zika Is Spreading Fast In Puerto Rico, Could Reach 25% Of Population" (0.9999999999999998)
* "Zika Virus Updates: Few Pregnant Travelers Are Infected With Zika" (0.9304581159208551)
* "3 Zika Deaths In Venezuela" (0.8963314154130989)
* "Everything You Need To Know About Zika Virus In 148 Seconds" (0.8926574979212561)
* "Zika Panic Spreads Among Pregnant Women" (0.8725727710572433)



> "Snap Inc.’s Growth Is Pissing Off Its Neighbors"

* "Snap Inc.’s Growth Is Pissing Off Its Neighbors" (1.0000000000000002)
* "Despite Legal Troubles, This Startup Is Trying To Do Right By Workers" (0.4435912350072317)
* "This Startup Is Trying To Help Its Workers While Fighting Regulations" (0.44211342774763224)
* "How High-Flying Zenefits Fell To Earth" (0.3577373234338207)
* "The Convicted Con Artist Of The Winter White House" (0.33425852576818466)
