In [20]:
from pgmpy.models import MarkovModel
import pandas as pd
import numpy as np

train_data = pd.read_csv('./data/cleaned_train.csv', index_col=0)
test_data = pd.read_csv('./data/cleaned_test.csv', index_col=0)
# train_data.drop(columns=0, axis=0, inplace=True)
columns = list(train_data.columns)
continuos = ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
discrete = list(set(columns).difference(set(continuos)))

def get_discrete_options(text):
	return [x.strip().lower() for x in text.split(',')]



discrete_options = {
	'native-country': get_discrete_options('United-States, Cambodia, England, Puerto-Rico, Canada, Germany, Outlying-US(Guam-USVI-etc), India, Japan, Greece, South, China, Cuba, Iran, Honduras, Philippines, Italy, Poland, Jamaica, Vietnam, Mexico, Portugal, Ireland, France, Dominican-Republic, Laos, Ecuador, Taiwan, Haiti, Columbia, Hungary, Guatemala, Nicaragua, Scotland, Thailand, Yugoslavia, El-Salvador, Trinadad&Tobago, Peru, Hong, Holand-Netherlands'),
	'race': get_discrete_options('White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black'),
	'relationship': get_discrete_options('Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried'),
	'occupation': get_discrete_options('Tech-support, Craft-repair, Other-service, Sales, Exec-managerial, Prof-specialty, Handlers-cleaners, Machine-op-inspct, Adm-clerical, Farming-fishing, Transport-moving, Priv-house-serv, Protective-serv, Armed-Forces'),
    'marital-status': get_discrete_options('Married-civ-spouse, Divorced, Never-married, Separated, Widowed, Married-spouse-absent, Married-AF-spouse'),
	'education': get_discrete_options('Bachelors, Some-college, 11th, HS-grad, Prof-school, Assoc-acdm, Assoc-voc, 9th, 7th-8th, 12th, Masters, 1st-4th, 10th, Doctorate, 5th-6th, Preschool'),
	'workclass': get_discrete_options('Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov, State-gov, Without-pay, Never-worked'),
	'income': ['<=50k', '>50k'],
	'sex': ['female', 'male'],
}

def tokenize(var, value):
	return discrete_options[var].index(value) + 1

train_data = train_data[continuos + discrete]
test_data = test_data[continuos + discrete]
for i, row in train_data.iterrows():
    for field in discrete:
        train_data.at[i, field] = tokenize(field, row[field].strip().lower())
for i, row in test_data.iterrows():
    for field in discrete:
        test_data.at[i, field] = tokenize(field, row[field].strip().lower())
test_data.to_csv('./data/test.csv')
# train_data.to_csv('./data/train.csv')

In [19]:
columns = list(train_data.columns)
train_data['race'].unique()

array([1, 5, 2, 3, 4], dtype=object)

In [None]:
s = len(continuos)
r = len(discrete)

def cond_cont(x, y, x_ind, B_s, a_s, ro_s):
	res = a_s
	for j in range(r):
		res += ro_s[j][y[j]]
	for t in range(s):
		if x_ind != t:
			res -= B_s[t]*x[t]
	res = (-1*B_s[x_ind]/2)*(res / B_s[x_ind] - x[x_ind])**2
	res += np.log(B_s[x_ind])/2 - np.log(2*np.pi)/2
	return res

def cond_discrete(x, y, y_ind, phi_r, ro):
	def b_l(l):
	    res = 0
	    for i in range(s):
	        res += ro[i, y_ind][l]*x[i]
	    res += phi_r[y_ind][l, l]
	    for j in range(r):
	        if j != y_ind:
	            res += phi_r[j][l, y[j]]
	var = discrete[y_ind]
	options = len(discrete_options[var])
	probs = np.zeros(options)
	for i in range(options):
		probs[i] = b_l(i)
	probs /= np.sum(probs)
	return probs[y[y_ind]]

def log_likelihood_instance(row):
	res = 0
	beta = np.random.rand(s*s).reshape(s,s)
	alpha = np.random.rand(s)
	ro = np.empty((s, r))
	for i in range(s):
		for j, var in enumerate(discrete):
			options_length = len(discrete_options[var])
			ro[i, j] = np.random.rand(options_length)
			ro[i, j] /= np.sum(ro[i, j])

	phi = np.empty((r, r))
	for i in range(r):
		for j in range(r):
			len1 = len(discrete_options[discrete[i]])
			len2 = len(discrete_options[discrete[j]])
			phi[i, j] = np.random.rand(len1*len2).reshape(len1, len2)

	for i in range(s):
		res += cond_cont(row[:s], row[s:], i, B_s=beta[i], a_s=alpha[i], ro_s=ro[i])
	for j in range(r):
		res += cond_discrete(row[:s], row[s:], j, phi_r=phi[j], ro=ro)
	return res

def log_likelihood():
	res = 0
	for i, row in train_data.iter_rows():
		res += log_likelihood_instance(row)
	return res

In [24]:
preds = pd.read_csv('./data/predictions.csv', index_col=0)
true = pd.read_csv('./data/test.csv', index_col=0)
true = true.values[:,-1]
preds = preds.values[:, -1]

In [33]:
res = preds == true

In [34]:
s = res.sum() / preds.shape(1)

TypeError: 'tuple' object is not callable