Restore the `panda_df` dataframe that the main file saved with `%store`.

In [167]:
# Restore `panda_df` from IPython's %store database; this only works if an
# earlier session ran `%store panda_df`.
# NOTE(review): panda_df is not referenced in the cells visible here -- confirm it is still needed.
%store -r panda_df

In [168]:
# Make predictions with Gaussian Naive Bayes (implemented from scratch) on the dataset/small_*.csv data
from csv import reader
from math import sqrt
from math import exp
from math import pi

In [169]:
# Load a CSV file
def load_csv(filename):
	"""Read a CSV file into a list of rows, skipping empty lines.

	Args:
		filename: Path of the CSV file to read.

	Returns:
		A list with one list of strings per non-empty row, in file order.
		No type conversion is performed and a header row, if present, is kept.

	Raises:
		OSError: If the file cannot be opened.
	"""
	# newline='' hands newline handling to the csv module, so line breaks
	# embedded inside quoted fields are preserved instead of being rewritten
	# by universal-newline translation.
	with open(filename, 'r', newline='') as file:
		return [row for row in reader(file) if row]

In [170]:
# Convert string column to float
def str_column_to_float(dataset, column):
	"""Convert one column of every row to float, in place.

	String cells are stripped of surrounding whitespace before conversion.
	Cells that are not strings (for example values that were already
	converted) are passed to float() directly, so calling this again on a
	converted column -- e.g. when the notebook cell is re-run -- does not
	fail with AttributeError.

	Args:
		dataset: List of mutable rows (lists); modified in place.
		column: Index of the column to convert.

	Raises:
		ValueError: If a cell cannot be parsed as a float.
	"""
	for row in dataset:
		value = row[column]
		if isinstance(value, str):
			value = value.strip()
		row[column] = float(value)

In [171]:
# Group the rows of a dataset by class value, returns a dictionary
def separate_by_class(dataset):
	"""Bucket rows by their last element.

	Args:
		dataset: Sequence of rows; the last element of each row is used as
			the class value.

	Returns:
		A dict mapping each class value to the list of rows (the same row
		objects, in their original order) that carry it.
	"""
	grouped = {}
	for record in dataset:
		# setdefault creates the bucket the first time a class value is seen
		grouped.setdefault(record[-1], []).append(record)
	return grouped

In [None]:
# Arithmetic mean of a list of numbers
def mean(numbers):
	"""Return the sum of ``numbers`` divided by their count, as a float.

	Raises:
		ZeroDivisionError: If ``numbers`` is empty.
	"""
	total = sum(numbers)
	count = float(len(numbers))
	return total / count

In [None]:
# Sample standard deviation of a list of numbers
def stdev(numbers):
	"""Return the standard deviation of ``numbers`` using an n - 1 denominator.

	Raises:
		ZeroDivisionError: If ``numbers`` has exactly one element (the
			denominator is then zero) or is empty (raised by mean()).
	"""
	centre = mean(numbers)
	squared_diffs = sum((value - centre) ** 2 for value in numbers)
	return sqrt(squared_diffs / float(len(numbers) - 1))

In [None]:
# Per-column (mean, stdev, count) for a dataset, excluding the last column
def summarize_dataset(dataset):
	"""Summarise every column of ``dataset`` except the last one.

	Args:
		dataset: Sequence of equally long rows.

	Returns:
		A list of ``(mean, stdev, count)`` tuples, one per column, with the
		entry for the last column removed.

	Note:
		The statistics are computed for the last column as well before its
		entry is discarded, so that column must also be numeric.
	"""
	summaries = []
	for column in zip(*dataset):
		summaries.append((mean(column), stdev(column), len(column)))
	# discard the entry belonging to the final column
	del summaries[-1]
	return summaries

In [None]:
# Split dataset by class then calculate statistics for each column of each class
def summarize_by_class(dataset):
	"""Build the per-class column summaries.

	Args:
		dataset: Sequence of rows whose last element is the class value.

	Returns:
		A dict mapping each class value to the result of
		``summarize_dataset`` applied to the rows of that class.
	"""
	return {
		class_value: summarize_dataset(rows)
		for class_value, rows in separate_by_class(dataset).items()
	}

In [None]:
# Gaussian probability density function evaluated at x
def calculate_probability(x, mean, stdev):
	"""Return the normal density with the given mean and stdev at ``x``.

	Args:
		x: Point at which to evaluate the density.
		mean: Mean of the distribution.
		stdev: Standard deviation of the distribution.

	Raises:
		ZeroDivisionError: If ``stdev`` is zero.
	"""
	deviation = x - mean
	exponent = exp(-(deviation**2 / (2 * stdev**2)))
	normaliser = sqrt(2 * pi) * stdev
	return (1 / normaliser) * exponent

In [None]:
# Score every class for a given row
def calculate_class_probabilities(summaries, row):
	"""Compute an unnormalised Naive Bayes score per class for ``row``.

	Args:
		summaries: Dict mapping class value to a list of
			``(mean, stdev, count)`` tuples, one per feature column.
		row: Sequence of feature values, indexed in the same order as the
			per-class summaries; extra trailing elements are ignored.

	Returns:
		A dict mapping each class value to its prior (class row count over
		total row count) multiplied by the Gaussian density of every feature.

	NOTE(review): the score is a plain product of densities, so for rows
	with many features it can underflow to 0.0 for every class -- consider
	summing logs if that shows up in practice.
	"""
	# the count stored with the first feature is the number of rows in the class
	total_rows = sum(class_stats[0][2] for class_stats in summaries.values())
	probabilities = {}
	for class_value, class_summaries in summaries.items():
		score = class_summaries[0][2] / float(total_rows)
		for position, (mu, sigma, _count) in enumerate(class_summaries):
			score *= calculate_probability(row[position], mu, sigma)
		probabilities[class_value] = score
	return probabilities

In [172]:
# Pick the most likely class for a given row
def predict(summaries, row):
	"""Return the class value with the highest score for ``row``.

	Args:
		summaries: Per-class summaries as accepted by
			``calculate_class_probabilities``.
		row: Sequence of feature values.

	Returns:
		The class value whose score is largest; on ties the class
		encountered first wins. ``None`` if ``summaries`` is empty.
	"""
	scores = calculate_class_probabilities(summaries, row)
	winner, top_score = None, -1
	for candidate, score in scores.items():
		# keep the current winner unless this candidate strictly beats it
		if winner is not None and not score > top_score:
			continue
		winner, top_score = candidate, score
	return winner

In [173]:
# Load the training split used to fit the Naive Bayes model
filename = 'dataset/small_train.csv'
dataset = load_csv(filename)
# Drop the first row -- presumably the CSV header; confirm against the file
del dataset[0]


In [174]:
# Convert every column except the last one (the class column) from str to float, in place
for i in range(len(dataset[0])-1):
	str_column_to_float(dataset, i)

In [175]:
# convert class column (the last column) to integers
# NOTE(review): str_column_to_int is not defined in the cells shown here; it
# must come from another cell or session -- confirm before Restart & Run All.
str_column_to_int(dataset, len(dataset[0])-1)

[1] => 0
[0] => 1


{'1': 1, '0': 0}

In [176]:
# fit model: per-class (mean, stdev, count) summaries for every feature column
model = summarize_by_class(dataset)

In [188]:
# Load the test split and pick a single row to classify.
# NOTE(review): unlike the training data, the first row is not dropped here, so
# index 27 counts from the header line if the file has one -- confirm this is
# the intended row.
test_filename = 'dataset/small_test.csv'
test_dataset = load_csv(test_filename)
row = test_dataset[27]


In [192]:
# Parse every field of the selected test row into a float
row = list(map(float, row))

In [193]:
# predict the label for the selected test row using the fitted summaries
label = predict(model, row)
# Print the full input row alongside the predicted class label
print('Data=%s, Predicted: %s' % (row, label))

Data=[39.0, 186.0, 45.0, 203.0, 51.0, 217.0, 57.0, 230.0, 65.0, 241.0, 77.0, 249.0, 92.0, 254.0, 108.0, 256.0, 121.0, 253.0, 131.0, 247.0, 137.0, 236.0, 141.0, 225.0, 143.0, 212.0, 143.0, 198.0, 141.0, 184.0, 138.0, 170.0, 135.0, 157.0, 49.0, 183.0, 56.0, 175.0, 64.0, 169.0, 75.0, 166.0, 84.0, 167.0, 101.0, 161.0, 109.0, 155.0, 117.0, 152.0, 125.0, 151.0, 132.0, 155.0, 97.0, 174.0, 101.0, 185.0, 104.0, 195.0, 108.0, 206.0, 96.0, 212.0, 103.0, 213.0, 109.0, 213.0, 114.0, 209.0, 118.0, 205.0, 63.0, 185.0, 69.0, 180.0, 75.0, 178.0, 82.0, 181.0, 76.0, 183.0, 70.0, 185.0, 108.0, 172.0, 112.0, 165.0, 118.0, 162.0, 124.0, 163.0, 121.0, 167.0, 115.0, 170.0, 86.0, 226.0, 95.0, 224.0, 104.0, 221.0, 110.0, 221.0, 116.0, 218.0, 123.0, 215.0, 131.0, 212.0, 126.0, 221.0, 121.0, 227.0, 115.0, 230.0, 108.0, 231.0, 98.0, 231.0, 89.0, 226.0, 105.0, 225.0, 111.0, 224.0, 117.0, 221.0, 128.0, 214.0, 118.0, 221.0, 113.0, 224.0, 106.0, 225.0, 32.727911, 3.064631, 10.175547, 4.342047, 7.241858, 4.022362, 7.70