In [18]:
import pandas as pd
import math as math

In [8]:
def load(path):
	df = None
	'''YOUR CODE HERE'''
	df = pd.read_csv(path, encoding='latin-1')
	'''END'''
	return df

In [2]:
def prior(df):
	ham_prior = 0
	spam_prior = 0
	'''YOUR CODE HERE'''
	#calculate the prior probability of each class
	ham_prior = df['label'].value_counts()[1]/df.shape[0]
	spam_prior = df['label'].value_counts()[0]/df.shape[0]
	'''END'''
	return ham_prior, spam_prior

In [3]:
def likelihood(df):
	ham_like_dict = {}
	spam_like_dict = {}
	'''YOUR CODE HERE'''
	#alpha = 1
	#ham_alpha_count = 0
	#spam_alpha_count = 0
	ham_word_count = {}
	spam_word_count = {}

	#create a list of all text from ham emails
	ham_mail = df.loc[df['label'] == 'ham', 'text'].values

	#iterate through each word in each email, counting the number of emails that contain a given word
	for mail in ham_mail:
		words = []
		for word in mail.split():
			if word not in words:
				words.append(word)
		for word in words:
			if word not in ham_word_count:
				ham_word_count[word] = 1  # + alpha
				#ham_alpha_count += 1
			else:
				ham_word_count[word] += 1

			# if word not in spam_word_count:
			# 	spam_word_count[word] = alpha
			# 	spam_alpha_count += 1

	#create a list of all text from spam emails
	spam_mail = df.loc[df['label'] == 'spam', 'text'].values

	#iterate through each word in each email, counting the number of emails that contain a given word
	for mail in spam_mail:
		words = []
		for word in mail.split():
			if word not in words:
				words.append(word)
		for word in words:
			if word not in spam_word_count:
				spam_word_count[word] = 1  # + alpha
				# spam_alpha_count += 1
			else:
				spam_word_count[word] += 1

			# if word not in ham_word_count:
			# 	ham_word_count[word] = alpha
			# 	ham_alpha_count += 1

	#calculate the likelihood of each word in the 'ham' class
	ham_like_dict = {k: v/(df['label'].value_counts()[1])
                  for k, v in ham_word_count.items()}
	# ham_like_dict = {k: v/(df['label'].value_counts()[1] + ham_alpha_count)
    #               for k, v in ham_word_count.items()}

	#calculate the likelihood of each word in the 'spam' class
	spam_like_dict = {k: v/(df['label'].value_counts()[0])
                   for k, v in spam_word_count.items()}
	# spam_like_dict = {k: v/(df['label'].value_counts()[0] + spam_alpha_count)
    #                for k, v in spam_word_count.items()}
	'''END'''

	return ham_like_dict, spam_like_dict

In [49]:
def predict(ham_prior, spam_prior, ham_like_dict, spam_like_dict, text):
	'''
	prediction function that uses prior and likelihood structure to compute proportional posterior for a single line of text
	'''
	#ham_spam_decision = 1 if classified as spam, 0 if classified as normal/ham
	ham_spam_decision = None

	'''YOUR CODE HERE'''
	p_words_ham = 0
	p_words_spam = 0

	for i in text.split():
		if i in ham_like_dict:
			p_words_ham += math.log10(ham_like_dict[i])

	for i in text.split():
		if i in spam_like_dict:
			p_words_spam += math.log10(spam_like_dict[i])


	ham_numerator  = math.log10(ham_prior) + p_words_ham
	ham_denominator = math.log10(ham_prior) + p_words_ham + math.log10(spam_prior) + p_words_spam
	print(
		"ham_numerator: ", ham_numerator,
		"ham_denominator: ", ham_denominator
	)

	spam_numerator  = math.log10(spam_prior) + p_words_spam
	spam_denominator = math.log10(ham_prior) + p_words_ham + math.log10(spam_prior) + p_words_spam
	print(
		"spam_numerator: ", spam_numerator,
		"spam_denominator: ", spam_denominator
	)

	scale = 100
	#ham_posterior = posterior probability that the email is normal/ham
	ham_posterior = (10**((ham_numerator - ham_denominator)/scale))

	#spam_posterior = posterior probability that the email is spam
	spam_posterior = (10**((spam_numerator - spam_denominator)/scale))
		
	if ham_posterior > spam_posterior:
		ham_spam_decision = 0
	else:
		ham_spam_decision = 1

	'''END'''
	return ham_spam_decision


In [22]:
def metrics(ham_prior, spam_prior, ham_dict, spam_dict, df):
	'''
	Calls "predict"
	'''
	hh = 0  # true negatives, truth = ham, predicted = ham
	hs = 0  # false positives, truth = ham, pred = spam
	sh = 0  # false negatives, truth = spam, pred = ham
	ss = 0  # true positives, truth = spam, pred = spam
	num_rows = df.shape[0]
	for i in range(num_rows):
		roi = df.iloc[i, :]
		roi_text = roi.text
		roi_label = roi.label_num
		guess = predict(ham_prior, spam_prior, ham_dict, spam_dict, roi_text)
		if roi_label == 0 and guess == 0:
			hh += 1
		elif roi_label == 0 and guess == 1:
			hs += 1
		elif roi_label == 1 and guess == 0:
			sh += 1
		elif roi_label == 1 and guess == 1:
			ss += 1

	acc = (ss + hh)/(ss+hh+sh+hs)
	precision = (ss)/(ss + hs)
	recall = (ss)/(ss + sh)
	return acc, precision, recall

In [24]:
df = load('TRAIN_balanced_ham_spam.csv')
df_test = load('TEST_balanced_ham_spam.csv')
ham_prior, spam_prior = prior(df)
ham_like_dict, spam_like_dict = likelihood(df)

In [50]:
print(metrics(ham_prior, spam_prior, ham_like_dict, spam_like_dict, df_test))

ham_numerator:  -117.71750839341192 ham_denominator:  -256.14301386851423
spam_numerator:  -138.42550547510234 spam_denominator:  -256.14301386851423
ham_numerator:  -271.9164481106084 ham_denominator:  -585.6029438974319
spam_numerator:  -313.68649578682346 spam_denominator:  -585.6029438974319
ham_numerator:  -219.85864864545826 ham_denominator:  -457.6156285938821
spam_numerator:  -237.7569799484238 spam_denominator:  -457.6156285938821
ham_numerator:  -30.429721784946015 ham_denominator:  -72.70524486468224
spam_numerator:  -42.27552307973623 spam_denominator:  -72.70524486468224
ham_numerator:  -134.642656578031 ham_denominator:  -274.97283092432104
spam_numerator:  -140.33017434629005 spam_denominator:  -274.97283092432104
ham_numerator:  -197.63011804210285 ham_denominator:  -392.9994634785702
spam_numerator:  -195.36934543646737 spam_denominator:  -392.9994634785702
ham_numerator:  -850.4132633380106 ham_denominator:  -1571.1029218083518
spam_numerator:  -720.6896584703412 spam