In [None]:
import csv
from itertools import count
import operator
import math

EPSILON = 0.000001


def calc_term_doc_given_author(prob_map, counts):
    """
    How likely is the document, given the counts of words in the doc
    and the authors prob_map
    """
    prob = 1
    return prob


# If a word is in a probability dictionary, return its probability
# otherwise, return epsilon
def get_word_prob(word_prob_map, word):
    """
    Gets probability of a word:
    Returns probability if word exists, EPSILON if not
    """
    if word in word_prob_map:
        return word_prob_map[word]
    #return EPSILON

# From a file name, approximate the probability of a word
# being generated from the same distribution as the file.
# Assume that each word is produced independently, regardless
# of order.
def make_word_prob_map(fileName):
    """
    Calculates word probabilities:
    1. Counts word frequencies
    2. Converts counts to probabilities
    Returns: word probability dictionary
    """
    wordMap, nWords = make_word_count_map(fileName)
    probabilityMap = {}
    for word in wordMap:
        count = wordMap[word]
        p = float(count) / nWords
        probabilityMap[word] = p
    return probabilityMap

# From a file name, count the number of times each word exists
# in that file. Return the result as a map (aka a dictionary)
def make_word_count_map(fileName):
    """
    Reads a file and counts word frequencies:
    1. Opens file
    2. Splits into words
    3. Standardizes each word
    4. Counts occurrences
    Returns: (wordMap, total word count)
    """
    wordMap = {}
    nWords = 0
    with open(fileName ,encoding="utf-8") as f:
        for line in f:
            words = line.split(' ')
            for word in words:
                word = standardize(word)
                add_word_to_count_map(wordMap, word)
                nWords+= 1
    return wordMap, nWords

# Add a word to a count map. Makes sure not to crash if the
# word has not been seen before.
def add_word_to_count_map(wordMap, word):
    """
    Updates word count in dictionary:
    1. Skips stop words
    2. Initializes count if new word
    3. Increments count if existing word
    """
    if is_stop(word):
        return
    if not word in wordMap:
        wordMap[word] = 0
    wordMap[word] += 1


def standardize(word):
    """
    Standardizes words by:
    1. Converting to lowercase
    2. Removing punctuation
    3. Keeping only alphabetic characters
    """
    standard = word.lower().strip()
    # remove punctuation
    standard = ''.join([i for i in standard if i.isalpha()])
    return standard

def is_stop(word):
    """
    Removes common words that don't help in analysis
    """
    stop_words = ['to', 'i', 'the', 'and', 'of']
    return word in stop_words


def main():
    # Calculate all the ps and qs
    # Eg hamiltonWordProb['congress'] = 0.005
    # hamilton_word_prob['piech'] = 0.0
    # hamilton_word_prob['the'] = 0.001

    hamilton_word_prob = make_word_prob_map('hamilton.txt')
    madison_word_prob = make_word_prob_map('madison.txt')

    

    # Get the word count of the unknown document
    # Eg unknown_doc_count['congress'] = 5
    unknown_doc_count, n_words = make_word_count_map('unknown.txt')

    print("hamilton['congress']\t", hamilton_word_prob['congress'])
    print("madison['congress']\t",  madison_word_prob['congress'])
    print("doc_count['congress']\t", unknown_doc_count['congress'])
    print("n_words", n_words)

    hamilton_term = calc_term_doc_given_author(hamilton_word_prob, unknown_doc_count)
    print('---'*10)
    madison_term = calc_term_doc_given_author(madison_word_prob, unknown_doc_count)
    print("Hamilton Term\t", hamilton_term)
    print("Madison Term\t",madison_term)

if __name__ == '__main__':
    main()

n_words 1112
------------------------------
Hamilton Term	 1
Madison Term	 1


In [None]:
import csv
from itertools import count
import operator
import math

EPSILON = 0.000001



def calc_term_doc_given_author(prob_map, counts):
    """
    How likely is the document, given the counts of words in the doc
    and the authors prob_map
    """
    prob = 1
    for word, c_i in counts.items():
        p_word = get_word_prob(prob_map, word)
        prob *= p_word ** c_i
        print("word: ", word,"| prob_word: ", p_word, "| c_i: ", c_i, "| prob: ", prob)
    return prob
    # prob = 0
    # for word, c_i in counts.items():
    #     p_word = get_word_prob(prob_map, word)
    #     prob += math.log(p_word) * c_i
    # return prob


# If a word is in a probability dictionary, return its probability
# otherwise, return epsilon
def get_word_prob(word_prob_map, word):
    if word in word_prob_map:
        return word_prob_map[word]
    return EPSILON

# From a file name, approximate the probability of a word
# being generated from the same distribution as the file.
# Assume that each word is produced independently, regardless
# of order.
def make_word_prob_map(fileName):
    wordMap, nWords = make_word_count_map(fileName)
    probabilityMap = {}
    for word in wordMap:
        count = wordMap[word]
        p = float(count) / nWords
        probabilityMap[word] = p
    return probabilityMap

# From a file name, count the number of times each word exists
# in that file. Return the result as a map (aka a dictionary)
def make_word_count_map(fileName):
    wordMap = {}
    nWords = 0
    with open(fileName ,encoding='utf-8') as f:
        for line in f:
            words = line.split(' ')
            for word in words:
                word = standardize(word)
                add_word_to_count_map(wordMap, word)
                nWords+= 1
    return wordMap, nWords

# Add a word to a count map. Makes sure not to crash if the
# word has not been seen before.
def add_word_to_count_map(wordMap, word):
    if is_stop(word):
        return
    if not word in wordMap:
        wordMap[word] = 0
    wordMap[word] += 1

# Standardizes a word. For now, we are just going to make it
# lower case.
def standardize(word):
    standard = word.lower().strip()
    # remove punctuation
    standard = ''.join([i for i in standard if i.isalpha()])
    return standard

def is_stop(word):
    stop_words = ['to', 'i', 'the', 'and', 'of']
    return word in stop_words

def main():
    # Calculate all the ps and qs
    # Eg hamiltonWordProb['congress'] = 0.005
    # hamilton_word_prob['piech'] = 0.0
    # hamilton_word_prob['the'] = 0.001

    hamilton_word_prob = make_word_prob_map('hamilton.txt')
    madison_word_prob = make_word_prob_map('madison.txt')

    

    # Get the word count of the unknown document
    # Eg unknown_doc_count['congress'] = 5
    unknown_doc_count, n_words = make_word_count_map('unknown.txt')

    #print("hamilton['congress']\t", hamilton_word_prob['congress'])
    #print("madison['congress']\t",  madison_word_prob['congress'])
    #print("doc_count['congress']\t", unknown_doc_count['congress'])
    print("n_words", n_words)

    hamilton_term = calc_term_doc_given_author(hamilton_word_prob, unknown_doc_count)
    print('---'*10)
    madison_term = calc_term_doc_given_author(madison_word_prob, unknown_doc_count)
    print("Hamilton Term\t", hamilton_term)
    print("Madison Term\t",madison_term)

if __name__ == '__main__':
    main()

n_words 1112
word:  everchanging | prob_word:  1e-06 | c_i:  2 | prob:  1e-12
word:  landscape | prob_word:  0.0011376564277588168 | c_i:  2 | prob:  1.294262147620952e-18
word:  global | prob_word:  0.012514220705346985 | c_i:  8 | prob:  7.784894483745558e-34
word:  affairs | prob_word:  0.004550625711035267 | c_i:  2 | prob:  1.6121110805656044e-38
word:   | prob_word:  0.04209328782707622 | c_i:  43 | prob:  1.1182720485364787e-97
word:  in | prob_word:  0.015927189988623434 | c_i:  19 | prob:  7.748034060716292e-132
word:  a | prob_word:  0.012514220705346985 | c_i:  18 | prob:  4.389955346677659e-166
word:  world | prob_word:  0.004550625711035267 | c_i:  6 | prob:  3.898403607633612e-180
word:  that | prob_word:  1e-06 | c_i:  2 | prob:  3.898403607633612e-192
word:  never | prob_word:  1e-06 | c_i:  1 | prob:  3.898403607633612e-198
word:  stands | prob_word:  1e-06 | c_i:  1 | prob:  3.8984036076336115e-204
word:  still | prob_word:  1e-06 | c_i:  3 | prob:  3.898403607633611e

In [None]:
import csv
from itertools import count
import operator
import math

EPSILON = 0.000001

def main():
    # Calculate all the ps and qs
    # Eg hamiltonWordProb['congress'] = 0.005
    # hamilton_word_prob['piech'] = 0.0
    # hamilton_word_prob['the'] = 0.001

    hamilton_word_prob = make_word_prob_map('hamilton.txt')
    madison_word_prob = make_word_prob_map('madison.txt')

    

    # Get the word count of the unknown document
    # Eg unknown_doc_count['congress'] = 5
    unknown_doc_count, n_words = make_word_count_map('unknown.txt')

    #print("hamilton['congress']\t", hamilton_word_prob['congress'])
    #print("madison['congress']\t",  madison_word_prob['congress'])
    #print("doc_count['congress']\t", unknown_doc_count['congress'])
    print("n_words", n_words)

    hamilton_term = calc_log_pr_doc_given_author(hamilton_word_prob, unknown_doc_count)
    madison_term = calc_log_pr_doc_given_author(madison_word_prob, unknown_doc_count)
    print("log P(D|H)\t", hamilton_term)
    print("log P(D|M)\t",madison_term)

    print('diff\t', hamilton_term - madison_term)

def calc_log_pr_doc_given_author(prob_map, counts):
    """
    How likely is the document, given the counts of words in the doc
    and the authors prob_map
    """
    log_prob = math.log(1)
    for word_i, c_i in counts.items():
        p_i = get_word_prob(prob_map, word_i)
        log_prob += c_i * math.log(p_i)
    return log_prob


def calcLogProbDoc(wordProbMap, countMap):
    logProb = math.log(1)
    for wordi in countMap:
        ci = countMap[wordi]
        pi = get_word_prob(wordProbMap, wordi)
        logProb += ci * math.log(pi)
    return logProb

# If a word is in a probability dictionary, return its probability
# otherwise, return epsilon
def get_word_prob(word_prob_map, word):
    if word in word_prob_map:
        return word_prob_map[word]
    return EPSILON

# From a file name, approximate the probability of a word
# being generated from the same distribution as the file.
# Assume that each word is produced independently, regardless
# of order.
def make_word_prob_map(fileName):
    wordMap, nWords = make_word_count_map(fileName)
    probabilityMap = {}
    for word in wordMap:
        count = wordMap[word]
        p = float(count) / nWords
        probabilityMap[word] = p
    return probabilityMap

# From a file name, count the number of times each word exists
# in that file. Return the result as a map (aka a dictionary)
def make_word_count_map(fileName):
    wordMap = {}
    nWords = 0
    with open(fileName , encoding='utf-8') as f:
        for line in f:
            words = line.split(' ')
            for word in words:
                word = standardize(word)
                add_word_to_count_map(wordMap, word)
                nWords+= 1
    return wordMap, nWords

# Add a word to a count map. Makes sure not to crash if the
# word has not been seen before.
def add_word_to_count_map(wordMap, word):
    if not word in wordMap:
        wordMap[word] = 0
    wordMap[word] += 1 

# Standardizes a word. For now, we are just going to make it
# lower case.
def standardize(word):
    standard = word.lower().strip()
    # remove punctuation
    standard = ''.join([i for i in standard if i.isalpha()])
    return standard

if __name__ == '__main__':
    main()

n_words 2172
log P(D|H)	 -14251.749082287015
log P(D|M)	 -12898.137397440358
diff	 -1353.6116848466572
