In [2]:
# Import needed functions
from csv import reader
from math import sqrt
from math import exp
from math import pi

In [3]:
# Load csv file and retun as a list of list
def load_csv(filename):
	count = 0
	dataset = list()
	with open(filename, 'r', encoding="utf8") as file:
		csv_reader = reader(file)
		for row in csv_reader:
			# remove column names
			if count == 0:
				count = 1
				continue
			dataset.append(row)
	return dataset

def convert_columns_to_numerical(dataset):
	'''
	Converts categorical value to numerical

	Returns
	dataset (list) : list of list of dataset where categorical values are converted into numerical values
	'''
	for row in dataset:
		# Female->0, Male->1
		if row[0] == "Female":
			row[0] = 0
		else:
			row[0] = 1
		
		# No->0, Yes-> 1
		if row[-1] == "NO":
			row[-1] = 0
		else:
			row[-1] = 1
	return dataset

def string_to_int(dataset):
	'''
	Converts string type to int type

	Returns
	dataset (list) : list of list of dataset where all values are int type
	'''
	for row in dataset:
		for i in range(len(row)):
			# if value is int type, continue
			if isinstance(row[i], int):
				continue
			# else, cast value to int type
			else:
				try:
					row[i] = int(row[i])
				except ValueError:
					continue
	return dataset

def split_labels(dataset):
	'''
	Split the dataset by class labels

	Returns
	separated_dataset (dict) : dictonary of dataset where the key are the labels and values are the row of data
	'''
	# initialize an empty dictionary
	# TODO: change and not use a dictionary
	separated_dataset = dict()
	for i in range(len(dataset)):
		row = dataset[i]
		label_val = row[-1]
		if (label_val not in separated_dataset):
			separated_dataset[label_val] = list()
		separated_dataset[label_val].append(row)
	return separated_dataset

def calculate_mean(list_numbers):
	'''
	Calculate the mean value from a list of numbers

	Returns
	mean (float) : mean value
	'''
	sum = 0
	for num in list_numbers:
		sum += num
	mean = sum / len(list_numbers)
	return mean
 
def calculate_standard_deviation(numbers):
	'''
	Calculate the standard deviation from a list of numbers

	Returns
	standard_deviation (float) : standard deviation value
	'''
	mean = calculate_mean(numbers)
	sum_square_difference = 0
	for num in numbers:
		sum_square_difference += (num - mean) ** 2
	variance = sum_square_difference / (len(numbers) - 1)
	standard_deviation = sqrt(variance)
	return standard_deviation

# Calculate the mean, standard deviation and row count for each column in a dataset
def summarize_dataset(dataset):
	summaries = [(calculate_mean(column), calculate_standard_deviation(column), len(column)) for column in zip(*dataset)]
	# remove last colum which is the label column
	del(summaries[-1])
	return summaries

# Split dataset by class then return (mean, standard deviation, count) for each column
def summarize_by_class(dataset):
	separated = split_labels(dataset)
	summaries = dict()
	for class_value, rows in separated.items():
		summaries[class_value] = summarize_dataset(rows)
	return summaries

# Calculate the Gaussian probability distribution function for x
def calculate_probability(x, mean, stdev):
	exponent = exp(-((x-mean)**2 / (2 * stdev**2 )))
	return (1 / (sqrt(2 * pi) * stdev)) * exponent

# Calculate the probabilities of predicting each class for a given row
def calculate_class_probabilities(summaries, row):
	total_rows = sum([summaries[label][0][2] for label in summaries])
	probabilities = dict()
	for class_value, class_summaries in summaries.items(): # returns key and value pair
		probabilities[class_value] = summaries[class_value][0][2]/float(total_rows)
		for i in range(len(class_summaries)):
			mean, stdev, count = class_summaries[i]
			probabilities[class_value] *= calculate_probability(row[i], mean, stdev)
	return probabilities

# Predict the class for a given row
def predict(summaries, row):
	probabilities = calculate_class_probabilities(summaries, row)
	best_label, best_prob = None, -1
	for class_value, probability in probabilities.items():
		if best_label is None or probability > best_prob:
			best_prob = probability
			best_label = class_value
	return best_label

# Naive Bayes Algorithm
def naive_bayes(train, test):
	summarize = summarize_by_class(train)
	predictions = list()
	for row in test:
		output = predict(summarize, row)
		predictions.append(output)
	return(predictions)

# # Test Naive Bayes on Iris Dataset
# seed(1)
# filename = 'iris.csv'
# dataset = load_csv(filename)
# for i in range(len(dataset[0])-1):
# 	str_column_to_float(dataset, i)
# # convert class column to integers
# str_column_to_int(dataset, len(dataset[0])-1)
# # evaluate algorithm
# n_folds = 5
# scores = evaluate_algorithm(dataset, naive_bayes, n_folds)
# print('Scores: %s' % scores)
# print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))

In [4]:
def calculate_mean(numbers):
	sum = 0
	for num in numbers:
		sum += num
	return sum / len(numbers)

def std_dev(numbers):
	mean = calculate_mean(numbers)
	variance = sum([(x-mean)**2 for x in numbers]) / float(len(numbers)-1)
	return sqrt(variance)

def calculate_standard_deviation(numbers):
    mean = calculate_mean(numbers)
    sum_square_difference = 0
    for num in numbers:
        sum_square_difference += (num - mean) ** 2
    variance = sum_square_difference / (len(numbers) - 1)
    return sqrt(variance)

my_numbers = [1, 2, 3, 4, 5]

print(std_dev(my_numbers))
print(calculate_standard_deviation(my_numbers))
convert_columns_to_numerical()

1.5811388300841898
1.5811388300841898


TypeError: convert_columns_to_numerical() missing 1 required positional argument: 'dataset'

In [5]:
dataset = load_csv("dataset.csv")
dataset = convert_columns_to_numerical(dataset)
dataset = string_to_int(dataset)
seperated_dataset = split_labels(dataset)

# for label in seperated_dataset:
# 	print("label: ", label, "row count: ", len(seperated_dataset[label]))
# 	for row in seperated_dataset[label]:
# 		print(row)

summaries = summarize_by_class(dataset)
# for label in summary:
# 	print(label)
# 	for row in summary[label]:
# 		print(row)
prediciton_str = []
prediciton = []
for row in dataset:	
	probabilities = calculate_class_probabilities(summaries, row)
	print(probabilities)
	if probabilities[0] > probabilities[1]:
		prediciton.append(0)
		prediciton_str.append('NO')
	else:
		prediciton.append(1)
		prediciton_str.append('YES')

correct_count = 0.0
for i in range(len(prediciton)):
	if prediciton[i] == dataset[i][-1]:
		correct_count += 1
print("accuracy: ", correct_count/len(prediciton)*100)

{0: 1.0955969665301928e-08, 1: 9.369439267950225e-10}
{0: 6.43865508957679e-09, 1: 2.1579455897975952e-10}
{0: 3.464230055766331e-12, 1: 2.1410293659855624e-11}
{0: 2.0365405985997924e-10, 1: 9.700598586454303e-11}
{0: 2.7189765393840607e-08, 1: 5.183765917950581e-10}
{0: 4.7868117425327086e-09, 1: 1.934773781746855e-10}
{0: 2.0467979182390154e-08, 1: 9.87051495516838e-10}
{0: 2.479995085832816e-08, 1: 3.9075390342956917e-10}
{0: 1.2585645141919645e-09, 1: 2.531316776290722e-10}
{0: 2.9109271982711396e-08, 1: 1.0852397764198597e-09}
{0: 2.5649708771438153e-08, 1: 6.101177895841379e-10}
{0: 1.824318030633619e-10, 1: 5.335696013959939e-11}
{0: 9.385280746832347e-09, 1: 2.3321109719990585e-10}
{0: 7.0589066750106245e-09, 1: 2.877860261311471e-10}
{0: 3.966141205832723e-09, 1: 3.74032455520172e-10}
{0: 2.3254091000286526e-12, 1: 1.4370200990329167e-11}
{0: 8.717714890501178e-09, 1: 2.7600955679559636e-10}
{0: 1.6655277734607316e-13, 1: 4.1124358116997115e-11}
{0: 6.4203914239597005e-09, 1:

In [31]:
summaries

{0: [(0.6666666666666666, 0.47639306734033077, 48),
  (8.729166666666666, 10.720311966192133, 48),
  (2.2916666666666665, 3.3514578176587717, 48),
  (1.875, 2.017213160816232, 48),
  (0.8541666666666666, 1.1106751508790236, 48),
  (2.4166666666666665, 3.3759684715007894, 48),
  (1.4791666666666667, 1.2714472125040805, 48),
  (1.3541666666666667, 2.2072374505522894, 48),
  (5.208333333333333, 4.287330833252652, 48)],
 1: [(0.5576923076923077, 0.5015060275070906, 52),
  (9.115384615384615, 11.95523323946274, 52),
  (3.673076923076923, 3.353202471891047, 52),
  (2.7115384615384617, 2.4521822091204806, 52),
  (1.5192307692307692, 1.7205834010064738, 52),
  (3.576923076923077, 4.594737628670407, 52),
  (1.8846153846153846, 1.8958556054250062, 52),
  (1.8846153846153846, 2.3485819347155843, 52),
  (9.942307692307692, 24.956757473472866, 52)]}

In [24]:
summary = summarize_by_class(dataset)
for label in summary:
	print(label)
	for row in summary[label]:
		print(row)

0
(2.7420144012, 0.9265683289298018, 5)
(3.0054686692, 1.1073295894898725, 5)
1
(7.6146523718, 1.2344321550313704, 5)
(2.9914679790000003, 1.4541931384601618, 5)


In [29]:
dataset

[[0, 4, 3, 2, 1, 0, 2, 2, 1, 0],
 [0, 8, 0, 2, 0, 1, 0, 0, 4, 0],
 [1, 5, 0, 0, 0, 14, 2, 0, 15, 0],
 [1, 7, 0, 3, 0, 0, 5, 0, 0, 0],
 [1, 3, 2, 1, 0, 2, 1, 0, 6, 1],
 [0, 23, 1, 1, 0, 0, 1, 1, 3, 0],
 [0, 6, 3, 2, 1, 1, 1, 1, 3, 1],
 [1, 6, 0, 2, 0, 1, 1, 0, 5, 0],
 [0, 1, 7, 3, 0, 7, 1, 0, 2, 1],
 [1, 3, 2, 2, 1, 0, 2, 2, 8, 1],
 [1, 6, 0, 3, 0, 2, 1, 2, 4, 1],
 [1, 6, 0, 1, 3, 4, 2, 7, 3, 0],
 [1, 6, 0, 1, 0, 0, 0, 3, 6, 0],
 [0, 3, 1, 0, 0, 1, 2, 3, 4, 0],
 [0, 7, 3, 5, 0, 1, 1, 0, 3, 0],
 [1, 11, 2, 0, 0, 16, 1, 1, 12, 1],
 [1, 6, 0, 0, 2, 0, 1, 0, 8, 1],
 [1, 19, 5, 8, 0, 7, 1, 0, 20, 1],
 [1, 3, 0, 1, 1, 0, 3, 0, 10, 0],
 [1, 1, 3, 1, 3, 1, 2, 2, 2, 0],
 [1, 3, 2, 1, 1, 1, 4, 1, 1, 0],
 [1, 4, 2, 1, 1, 1, 1, 1, 4, 1],
 [1, 9, 1, 3, 0, 2, 1, 4, 2, 1],
 [0, 5, 3, 5, 2, 1, 2, 1, 0, 1],
 [1, 7, 2, 1, 2, 1, 2, 0, 2, 1],
 [0, 9, 0, 0, 0, 0, 1, 4, 1, 0],
 [1, 6, 2, 5, 1, 3, 3, 1, 10, 1],
 [1, 18, 2, 6, 2, 4, 5, 2, 9, 0],
 [1, 25, 4, 2, 0, 7, 2, 0, 17, 0],
 [1, 10, 12, 0, 0, 2, 2, 0, 11