In [17]:
import logging

# Ask the root logger to report everything from DEBUG upwards and to print
# only the message text of each record.
logging.basicConfig(format='%(message)s', level=logging.DEBUG)

# Emit one record at each standard severity to check the configuration.
logging.log(logging.DEBUG, 'Hello Debug')
logging.log(logging.INFO, 'Hello Info')
logging.log(logging.WARNING, 'Hello Warning')
logging.log(logging.ERROR, 'Hello Error')
logging.log(logging.CRITICAL, 'Hello Critical')

Hello Debug
Hello Info
Hello Error
Hello Critical


In [18]:
from io import open
import glob
import os
import numpy as np
import math
from collections import OrderedDict
import torch
import torchvision
import heapq
import unicodedata
import string

In [19]:
# Characters that are stripped from a line before it is split into words.
# The backslash is spelled as an explicit "\\" escape: the former "\{" and "\}"
# are not valid escape sequences and make newer Python versions emit a warning.
# The set of characters in the string is unchanged.
punctuation_letter = ",./;'[]<>?:\"\\{}!@#$%^&*()_+-=~`"

In [20]:
def findFiles(path):
    """Return the paths matching the glob pattern ``path``, sorted by name.

    Args:
        path: A glob pattern such as ``'textFile/*.txt'``.

    Returns:
        A list of matching path strings in ascending order; empty when
        nothing matches.
    """
    # glob.glob gives no ordering guarantee (it depends on the file system),
    # so sort to make the document order reproducible between runs.
    return sorted(glob.glob(path))

In [21]:
def removeLetter(inputStr, removeLetter):
    """Return ``inputStr`` with every item of ``removeLetter`` deleted.

    ``removeLetter`` is iterated item by item (character by character when
    it is a string) and each item is removed wherever it occurs.
    """
    cleaned = inputStr
    for unwanted in removeLetter:
        cleaned = cleaned.replace(unwanted, "")
    return cleaned

# unit test of removeLetter
if __debug__:
    oldstr = "Men Mon"
    logging.debug(oldstr)
    stripped = removeLetter(oldstr, "e ")
    logging.debug(stripped)
    # Check the result instead of only logging it, so a regression in
    # removeLetter stops the notebook rather than passing silently.
    assert stripped == "MnMon"

Men Mon
MnMon


In [22]:
def lowerCase(inputStr):
    """Return the result of calling ``lower()`` on ``inputStr``."""
    lowered = inputStr.lower()
    return lowered

# unit test of lowerCase
if __debug__:
    oldstr = "Men Mon"
    logging.debug(oldstr)
    lowered = lowerCase(oldstr)
    logging.debug(lowered)
    # Check the result instead of only logging it, so a regression in
    # lowerCase stops the notebook rather than passing silently.
    assert lowered == "men mon"

Men Mon
men mon


In [23]:
# Whitelist of characters that survive unicodeToAscii: the ASCII letters plus
# space, period, comma, semicolon and apostrophe.
all_letters = string.ascii_letters + " .,;'"

def unicodeToAscii(inputStr):
    """Reduce ``inputStr`` to the characters listed in ``all_letters``.

    The text is decomposed with NFD normalisation so that accented letters
    become a base letter followed by combining marks; the combining marks
    (Unicode category 'Mn') and any character outside ``all_letters`` are
    then dropped.
    """
    kept = []
    for ch in unicodedata.normalize('NFD', inputStr):
        if unicodedata.category(ch) == 'Mn':
            # Combining mark left over from the decomposition.
            continue
        if ch not in all_letters:
            continue
        kept.append(ch)
    return ''.join(kept)

In [24]:
def readLines(filename, encoding='big5'):
    """Read a text file and return its non-empty lines reduced to ASCII.

    Args:
        filename: Path of the text file to read.
        encoding: Text encoding used to decode the file. Defaults to
            'big5', the encoding that was previously hard-coded here.

    Returns:
        A list with one string per non-empty line of the file, each passed
        through ``unicodeToAscii``.
    """
    # The context manager closes the file as soon as it has been read; the
    # earlier bare open(...).read() never closed the handle explicitly.
    with open(filename, encoding=encoding) as handle:
        lines = handle.read().strip().split('\n')
    return [unicodeToAscii(line) for line in lines if line != '']

if __debug__:
    # Smoke check: run readLines over every sample file and log both the
    # per-file result and the accumulated list.
    all_lines = []

    for filename in findFiles('textFile/*.txt'):
        lines = readLines(filename)
        logging.debug('lines: {}'.format(lines))
        all_lines.extend(lines)

    logging.debug('all_lines: {}'.format(all_lines))

lines: ['This is a test.', 'Test .', 'Test ', 'Test ', 'Test ', 'Test ']
lines: ['I am here.', 'I am not there.', 'WHere are you', 'Where is she']
lines: ['This is a dog.', 'This is a cat.', 'Dog is not a cat.']
all_lines: ['This is a test.', 'Test .', 'Test ', 'Test ', 'Test ', 'Test ', 'I am here.', 'I am not there.', 'WHere are you', 'Where is she', 'This is a dog.', 'This is a cat.', 'Dog is not a cat.']


In [25]:
def readWords(filename, encoding='big5'):
    """Read a text file and return its words, lower-cased and without punctuation.

    Each line is reduced to ASCII with ``unicodeToAscii``, stripped of the
    characters in ``punctuation_letter``, lower-cased and split into words.

    Args:
        filename: Path of the text file to read.
        encoding: Text encoding used to decode the file. Defaults to
            'big5', the encoding that was previously hard-coded here.

    Returns:
        A list of the words of the file in reading order. The list never
        contains empty strings.
    """
    # The context manager closes the file as soon as it has been read; the
    # earlier bare open(...).read() never closed the handle explicitly.
    with open(filename, encoding=encoding) as handle:
        lines = handle.read().strip().split('\n')

    words = []
    for line in lines:
        line = removeLetter(unicodeToAscii(line), punctuation_letter)
        # split() without an argument collapses runs of spaces and returns
        # nothing for a blank line. The former split(' ') produced '' "words"
        # for doubled spaces and for lines that were only whitespace once
        # the punctuation had been removed.
        words.extend(lowerCase(line).split())
    return words

if __debug__:
    # Smoke check: run readWords over every sample file and log both the
    # per-file result and the accumulated list.
    all_words = []

    for filename in findFiles('textFile/*.txt'):
        words = readWords(filename)
        logging.debug('words: {}'.format(words))
        all_words.extend(words)

    logging.debug('all_words: {}'.format(all_words))

words: ['this', 'is', 'a', 'test', 'test', 'test', 'test', 'test', 'test']
words: ['i', 'am', 'here', 'i', 'am', 'not', 'there', 'where', 'are', 'you', 'where', 'is', 'she']
words: ['this', 'is', 'a', 'dog', 'this', 'is', 'a', 'cat', 'dog', 'is', 'not', 'a', 'cat']
all_words: ['this', 'is', 'a', 'test', 'test', 'test', 'test', 'test', 'test', 'i', 'am', 'here', 'i', 'am', 'not', 'there', 'where', 'are', 'you', 'where', 'is', 'she', 'this', 'is', 'a', 'dog', 'this', 'is', 'a', 'cat', 'dog', 'is', 'not', 'a', 'cat']


In [26]:
def sortedUniqueWords(wordList):
    """Return the distinct entries of ``wordList`` in ascending order.

    Args:
        wordList: An iterable of words.

    Returns:
        A NumPy array holding each distinct word once, sorted.
    """
    # np.unique already returns its result sorted, so the extra sorted() pass
    # that used to precede it was redundant. list() keeps accepting any
    # iterable, as sorted() did.
    return np.unique(list(wordList))
    
if __debug__:
    # Smoke check: collect the words of every sample file, then log the
    # sorted bag of unique words built from them.
    all_words = []

    for filename in findFiles('textFile/*.txt'):
        words = readWords(filename)
        logging.debug('words: {}'.format(words))
        all_words.extend(words)

    logging.debug('all_words: {}'.format(all_words))

    unique_sorted_all_words = sortedUniqueWords(all_words)
    logging.debug('unique_sorted_all_words: {}'.format(unique_sorted_all_words))

words: ['this', 'is', 'a', 'test', 'test', 'test', 'test', 'test', 'test']
words: ['i', 'am', 'here', 'i', 'am', 'not', 'there', 'where', 'are', 'you', 'where', 'is', 'she']
words: ['this', 'is', 'a', 'dog', 'this', 'is', 'a', 'cat', 'dog', 'is', 'not', 'a', 'cat']
all_words: ['this', 'is', 'a', 'test', 'test', 'test', 'test', 'test', 'test', 'i', 'am', 'here', 'i', 'am', 'not', 'there', 'where', 'are', 'you', 'where', 'is', 'she', 'this', 'is', 'a', 'dog', 'this', 'is', 'a', 'cat', 'dog', 'is', 'not', 'a', 'cat']
unique_sorted_all_words: ['a' 'am' 'are' 'cat' 'dog' 'here' 'i' 'is' 'not' 'she' 'test' 'there'
 'this' 'where' 'you']


In [27]:
#
# words_in_a_document: maps each document name (file name without directory
#                      and extension) to the list of words read from that file
# sorted_unique_all_words: sorted bag of words gathered from all documents
# all_word_count: two-level ordered dictionary. Level 0 is keyed by document name;
#                 level 1 holds <key, value> = <word, word count> pairs for that
#                 document, one pair for every word in the bag of words.
#                 The special key 'LENGTH' records the document's total word count.
#

all_words = []
words_in_a_document = {}

for filename in findFiles('textFile/*.txt'):
    words = readWords(filename)
    logging.debug('file: {}'.format(filename))
    logging.debug('words: {}'.format(words))
    # Key the document by its file name without directory and extension.
    base_name = os.path.basename(filename)
    words_in_a_document[os.path.splitext(base_name)[0]] = words
    all_words.extend(words)

logging.debug('\nwords_in_a_document: {}'.format(words_in_a_document))

logging.debug('\nall_words: {}'.format(all_words))
sorted_unique_all_words = sortedUniqueWords(all_words)
logging.debug('sorted_unique_all_words: {}'.format(sorted_unique_all_words))

all_word_count = OrderedDict()

for document, words in words_in_a_document.items():
    # 'LENGTH' goes in first, followed by one count per bag-of-words entry.
    wordCount = OrderedDict([('LENGTH', len(words))])
    for w in sorted_unique_all_words:
        wordCount[w] = words.count(w)
    all_word_count[document] = wordCount

logging.debug("\nall_word_count")
logging.debug(all_word_count)

file: textFile\test 1.txt
words: ['this', 'is', 'a', 'test', 'test', 'test', 'test', 'test', 'test']
file: textFile\test 2.txt
words: ['i', 'am', 'here', 'i', 'am', 'not', 'there', 'where', 'are', 'you', 'where', 'is', 'she']
file: textFile\test 3.txt
words: ['this', 'is', 'a', 'dog', 'this', 'is', 'a', 'cat', 'dog', 'is', 'not', 'a', 'cat']

words_in_a_document: {'test 1': ['this', 'is', 'a', 'test', 'test', 'test', 'test', 'test', 'test'], 'test 2': ['i', 'am', 'here', 'i', 'am', 'not', 'there', 'where', 'are', 'you', 'where', 'is', 'she'], 'test 3': ['this', 'is', 'a', 'dog', 'this', 'is', 'a', 'cat', 'dog', 'is', 'not', 'a', 'cat']}

all_words: ['this', 'is', 'a', 'test', 'test', 'test', 'test', 'test', 'test', 'i', 'am', 'here', 'i', 'am', 'not', 'there', 'where', 'are', 'you', 'where', 'is', 'she', 'this', 'is', 'a', 'dog', 'this', 'is', 'a', 'cat', 'dog', 'is', 'not', 'a', 'cat']
sorted_unique_all_words: ['a' 'am' 'are' 'cat' 'dog' 'here' 'i' 'is' 'not' 'she' 'test' 'there'
 'th

In [28]:
#
# Term frequency (tf) of every bag-of-words entry in every document.
# words_tf maps document name -> ordered dictionary of <word, tf>.
#

words_tf = OrderedDict()
num_bow = len(sorted_unique_all_words)
logging.debug("number of bag of words: {}\n".format(num_bow))

for document, counts_in_document in all_word_count.items():
    total_words = counts_in_document['LENGTH']
    tf = OrderedDict()
    for word, occurrences in counts_in_document.items():
        if word == 'LENGTH':
            # Bookkeeping entry holding the total word count, not a word.
            continue
        # 0.01 is added to both the count and the length, so a word that
        # does not occur in the document still gets a small non-zero tf.
        tf[word] = (occurrences + 0.01) / (total_words + 0.01)
    logging.debug("document {}".format(document))
    logging.debug("{}\n".format(tf))
    words_tf[document] = tf
logging.debug("words tf")
logging.debug("{}\n".format(words_tf))

number of bag of words: 15

document test 1
OrderedDict([('a', 0.1120976692563818), ('am', 0.0011098779134295228), ('are', 0.0011098779134295228), ('cat', 0.0011098779134295228), ('dog', 0.0011098779134295228), ('here', 0.0011098779134295228), ('i', 0.0011098779134295228), ('is', 0.1120976692563818), ('not', 0.0011098779134295228), ('she', 0.0011098779134295228), ('test', 0.6670366259711432), ('there', 0.0011098779134295228), ('this', 0.1120976692563818), ('where', 0.0011098779134295228), ('you', 0.0011098779134295228)])

document test 2
OrderedDict([('a', 0.0007686395080707148), ('am', 0.15449654112221367), ('are', 0.0776325903151422), ('cat', 0.0007686395080707148), ('dog', 0.0007686395080707148), ('here', 0.0776325903151422), ('i', 0.15449654112221367), ('is', 0.0776325903151422), ('not', 0.0776325903151422), ('she', 0.0776325903151422), ('test', 0.0007686395080707148), ('there', 0.0776325903151422), ('this', 0.0007686395080707148), ('where', 0.15449654112221367), ('you', 0.07763259

In [29]:
#
# Inverse document frequency (idf) of every bag-of-words entry.
# words_idf maps word -> log(documents / number of documents containing the word).
#

words_idf = OrderedDict()
# 0.1 is added to the number of documents, so the ratio below stays above 1
# even for a word that occurs in every document.
documents = len(all_word_count) + 0.1

for word in sorted_unique_all_words:
    # Count the documents in which the word occurs at least once.
    doc_count = 0
    for document, counts_in_document in all_word_count.items():
        if counts_in_document[word] != 0:
            doc_count += 1
    logging.debug("word {}\'s document count = {}".format(word, doc_count))
    words_idf[word] = math.log(documents / doc_count)
logging.debug("\nwords idf")
logging.debug(words_idf)

word a's document count = 2
word am's document count = 1
word are's document count = 1
word cat's document count = 1
word dog's document count = 1
word here's document count = 1
word i's document count = 1
word is's document count = 3
word not's document count = 2
word she's document count = 1
word test's document count = 1
word there's document count = 1
word this's document count = 2
word where's document count = 1
word you's document count = 1

words idf
OrderedDict([('a', 0.4382549309311553), ('am', 1.1314021114911006), ('are', 1.1314021114911006), ('cat', 1.1314021114911006), ('dog', 1.1314021114911006), ('here', 1.1314021114911006), ('i', 1.1314021114911006), ('is', 0.03278982282299097), ('not', 0.4382549309311553), ('she', 1.1314021114911006), ('test', 1.1314021114911006), ('there', 1.1314021114911006), ('this', 0.4382549309311553), ('where', 1.1314021114911006), ('you', 1.1314021114911006)])


In [30]:
#
# tf-idf of every bag-of-words entry in every document, kept in Python dictionaries.
# words_tfidf maps document name -> ordered dictionary of <word, tf * idf>.
# NOTE(review): the earlier header also mentioned a numpy array named
# tfidf_array; no such array is built in this cell.
#

words_tfidf = OrderedDict()

for document in all_word_count:
    tfidf = OrderedDict()
    for word in all_word_count[document]:
        if word == 'LENGTH':
            # Bookkeeping entry holding the total word count, not a word.
            continue
        tf_value = words_tf[document][word]
        idf_value = words_idf[word]
        tfidf_value = tf_value * idf_value
        logging.debug("word: {:10s}, tf: {:.6f}, idf: {:.6f}, tf*idf: {:.6f}".format(
            word, tf_value, idf_value, tfidf_value))
        tfidf[word] = tfidf_value

    logging.debug("document {}".format(document))
    logging.debug("{}\n".format(tfidf))
    words_tfidf[document] = tfidf

logging.debug("words tfidf")
logging.debug("{}\n".format(words_tfidf))

word: a         , tf: 0.112098, idf: 0.438255, tf*idf: 0.049127
word: am        , tf: 0.001110, idf: 1.131402, tf*idf: 0.001256
word: are       , tf: 0.001110, idf: 1.131402, tf*idf: 0.001256
word: cat       , tf: 0.001110, idf: 1.131402, tf*idf: 0.001256
word: dog       , tf: 0.001110, idf: 1.131402, tf*idf: 0.001256
word: here      , tf: 0.001110, idf: 1.131402, tf*idf: 0.001256
word: i         , tf: 0.001110, idf: 1.131402, tf*idf: 0.001256
word: is        , tf: 0.112098, idf: 0.032790, tf*idf: 0.003676
word: not       , tf: 0.001110, idf: 0.438255, tf*idf: 0.000486
word: she       , tf: 0.001110, idf: 1.131402, tf*idf: 0.001256
word: test      , tf: 0.667037, idf: 1.131402, tf*idf: 0.754687
word: there     , tf: 0.001110, idf: 1.131402, tf*idf: 0.001256
word: this      , tf: 0.112098, idf: 0.438255, tf*idf: 0.049127
word: where     , tf: 0.001110, idf: 1.131402, tf*idf: 0.001256
word: you       , tf: 0.001110, idf: 1.131402, tf*idf: 0.001256
document test 1
OrderedDict([('a', 0.049

In [31]:
#
# Print the tf-idf values as a table: one row per word, one column per document.
#

# Header row: a blank 20-character cell followed by one 20-character cell
# per document name.
header_cells = ["{:20s}".format("")]
for document in words_tfidf:
    header_cells.append("{:20s}".format(document))
print("".join(header_cells))

# Body rows: the word in a 10-character cell, then one 20-character value
# per document with 10 decimal places.
for word in sorted_unique_all_words:
    row_cells = ["{:10s}".format(word)]
    for document in words_tfidf:
        row_cells.append("{:20.10f}".format(words_tfidf[document][word]))
    print("".join(row_cells))

                    test 1              test 2              test 3              
a                 0.0491273563        0.0003368601        0.1013948764
am                0.0012557182        0.1747977128        0.0008696404
are               0.0012557182        0.0878336766        0.0008696404
cat               0.0012557182        0.0008696404        0.1747977128
dog               0.0012557182        0.0008696404        0.1747977128
here              0.0012557182        0.0878336766        0.0008696404
i                 0.0012557182        0.1747977128        0.0008696404
is                0.0036756627        0.0025455589        0.0075862695
not               0.0004864095        0.0340228655        0.0340228655
she               0.0012557182        0.0878336766        0.0008696404
test              0.7546866471        0.0008696404        0.0008696404
there             0.0012557182        0.0878336766        0.0008696404
this              0.0491273563        0.0003368601        0.0677088

In [32]:
from collections import Counter 

# Number of entries to log from each end of a document's tf-idf ranking.
top_k = 3
for document, scores in words_tfidf.items():
    # Counter is used for its most_common() ranking of the tf-idf values.
    k = Counter(scores)
    ranked = k.most_common()
    logging.debug(document)
    # Highest-scoring words first.
    logging.debug(k.most_common(top_k))
    # Walk the full ranking backwards to get the lowest-scoring words.
    logging.debug(ranked[:-top_k-1:-1])

test 1
[('test', 0.7546866470656509), ('a', 0.0491273562974991), ('this', 0.0491273562974991)]
[('not', 0.00048640946829207025), ('you', 0.001255718214751499), ('where', 0.001255718214751499)]
test 2
[('am', 0.1747977128437442), ('i', 0.1747977128437442), ('where', 0.1747977128437442)]
[('this', 0.0003368600545204883), ('a', 0.0003368600545204883), ('test', 0.0008696403624066876)]
test 3
[('cat', 0.1747977128437442), ('dog', 0.1747977128437442), ('a', 0.10139487641066698)]
[('you', 0.0008696403624066876), ('where', 0.0008696403624066876), ('there', 0.0008696403624066876)]
