# Assignment 9 - Data Science in Finance and Insurance
Submission by: Dennis Goldenberg

In [155]:
#import statements
import numpy as np
import pandas as pd;
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder

In [156]:
# Build the 10-row training set.
# Each string below spells out one observation per character, in row order.
category = pd.Categorical(["C1", "C1", "C1", "C1",
                           "C2", "C2", "C2", "C2", "C2", "C2"])
quality = pd.Categorical(list("BGGGBBGGGG"))      # B / G levels
temperature = pd.Categorical(list("HCHHCCHHHC"))  # H / C levels
value = [86, 15, 40, 33, 25, 38, 73, 79, 28, 50]

# Assemble the columns into the training frame (column order matters for display)
train_set = pd.DataFrame(
    {
        "category": category,
        "quality": quality,
        "temperature": temperature,
        "value": value,
    }
)
train_set

Unnamed: 0,category,quality,temperature,value
0,C1,B,H,86
1,C1,G,C,15
2,C1,G,H,40
3,C1,G,H,33
4,C2,B,C,25
5,C2,B,C,38
6,C2,G,H,73
7,C2,G,H,79
8,C2,G,H,28
9,C2,G,C,50


In [157]:
# Ordinally encode the categorical columns: each column gets its own
# LabelEncoder, and the string labels are overwritten with integer codes.
categorical_columns = ['category', 'quality', 'temperature']
for column in categorical_columns:
    train_set[column] = LabelEncoder().fit_transform(train_set[column])
train_set

Unnamed: 0,category,quality,temperature,value
0,0,0,1,86
1,0,1,0,15
2,0,1,1,40
3,0,1,1,33
4,1,0,0,25
5,1,0,0,38
6,1,1,1,73
7,1,1,1,79
8,1,1,1,28
9,1,1,0,50


In [159]:
# Class priors: relative frequency of each encoded category (0, then 1)
class_counts = train_set['category'].value_counts()
priors = class_counts.loc[[0, 1]].to_numpy() / len(train_set)

## Multinomial Naive Bayes

### Manual Implementation of Multinomial

In [160]:
def _level_probs(class_rows, feature, n_levels=2):
    """Estimate P(feature = k | class) for k = 0, ..., n_levels - 1.

    Parameters
    ----------
    class_rows : pd.DataFrame
        Training rows that all belong to one class.
    feature : str
        Name of an integer-encoded column in ``class_rows``.
    n_levels : int, default 2
        Number of encoded levels (0 .. n_levels - 1) to report.

    Returns
    -------
    np.ndarray
        Relative frequency of each level within ``class_rows``. A level that
        never occurs gets probability 0.0 (no smoothing is applied) instead
        of raising a KeyError.
    """
    counts = class_rows[feature].value_counts().reindex(range(n_levels), fill_value=0)
    return counts.to_numpy() / len(class_rows)


# Split the training rows by class with a boolean mask. This keeps genuinely
# duplicated observations (the concat/drop_duplicates(keep=False) idiom would
# silently discard them) and does not depend on the frame having a default
# 0..n-1 index.
is_class_1 = train_set['category'] != 0
train_1s = train_set[is_class_1]
train_0s = train_set[~is_class_1]

## P(quality = 0| Y = 0) and P(quality = 1| Y = 0)
p_qy0 = _level_probs(train_0s, 'quality')
## P(quality = 0| Y = 1) and P(quality = 1| Y = 1)
p_qy1 = _level_probs(train_1s, 'quality')
# Rows index the class, columns index the quality level
p_qy = np.vstack((p_qy0, p_qy1))

## P(temperature = 0| Y = 0) and P(temperature = 1| Y = 0)
p_ty0 = _level_probs(train_0s, 'temperature')
## P(temperature = 0| Y = 1) and P(temperature = 1| Y = 1)
p_ty1 = _level_probs(train_1s, 'temperature')
# Rows index the class, columns index the temperature level
p_ty = np.vstack((p_ty0, p_ty1))

Implementing the multinomial Bayesian formula for the posterior probabilities, for category C1 (encoded as 0):
$$\begin{align*}
&\widehat{\mathbb{P}}(\text{C} = 0|\text{Q} = x_1, \text{T} = x_2)\\
&= \frac{\pi_{0} * \mathbb{P}(\text{Q} = x_1|\text{C} = 0)\mathbb{P}(\text{T} = x_2|\text{C} = 0)}{\pi_{0} * \mathbb{P}(\text{Q} = x_1|\text{C} = 0)\mathbb{P}(\text{T} = x_2|\text{C} = 0) + \pi_{1} * \mathbb{P}(\text{Q} = x_1|\text{C} = 1)\mathbb{P}(\text{T} = x_2|\text{C} = 1)}
\end{align*}$$
And noting that we have only 2 categories:
$$
\begin{equation*}
\widehat{\mathbb{P}}(\text{C} = 1|\text{Q} = x_1, \text{T} = x_2) = 1 - \widehat{\mathbb{P}}(\text{C} = 0|\text{Q} = x_1, \text{T} = x_2)
\end{equation*}
$$

In [161]:
# Encoded feature levels of every training row, used as column indices
# into the conditional probability tables.
quality_level = train_set['quality'].to_numpy()
temperature_level = train_set['temperature'].to_numpy()

# Numerator of Bayes' formula for each class: prior * P(Q | C) * P(T | C)
joint_c0 = priors[0] * p_qy[0, quality_level] * p_ty[0, temperature_level]
joint_c1 = priors[1] * p_qy[1, quality_level] * p_ty[1, temperature_level]

# Normalise by the sum over both classes to get the posterior of class 0;
# with two classes, the posterior of class 1 is its complement.
p_0MB = joint_c0 / (joint_c0 + joint_c1)
p_1MB = 1 - p_0MB

# One row per observation, one column per class
probs_mb = np.column_stack((p_0MB, p_1MB))

## Gaussian Naive Bayes

In [162]:
# The estimator is given a 2-D feature matrix, so 'value' is kept as a single
# column; the same matrix is used for fitting and for the in-sample predictions.
value_features = train_set[['value']].to_numpy()
gb_fit = GaussianNB().fit(value_features, train_set['category'])
probs_gb = gb_fit.predict_proba(value_features)

## Manipulating the Predicted Probability Formulas to get final predictions

To get the probabilities of the Naive Bayes model with all 3 features (quality, temperature, and value) included in predictions, I first manipulate the posterior odds, using the conditional independence of the features given the class:
$$
\begin{align*}
\frac{\widehat{\mathbb{P}}(Y = 0|X)}{\widehat{\mathbb{P}}(Y = 1|X)} &= \frac{\frac{L_0(Q = X_1,T = X_2) * L_0(V = x_3)}{D}}{\frac{L_1(Q = X_1,T = X_2) * L_1(V = x_3)}{D}} * \frac{\pi_0}{\pi_1}\\
&= \frac{L_0(Q = X_1,T = X_2) * L_0(V = x_3)}{L_1(Q = X_1,T = X_2) * L_1(V = x_3)} * \frac{\pi_0}{\pi_1}\\
&= \frac{L_0(Q = X_1,T = X_2)}{L_1(Q = X_1,T = X_2)} * \frac{L_0(V = x_3)}{L_1(V = x_3)} * \frac{\pi_0}{\pi_1}\\
&= \frac{p_{MB}(Y = 0|(X_1, X_2))}{p_{MB}(Y = 1|(X_1, X_2))} * \frac{\pi_1}{\pi_0} * \frac{p_{GB}(Y = 0|X_3)}{p_{GB}(Y = 1|X_3)} * \frac{\pi_1}{\pi_0} * \frac{\pi_0}{\pi_1}\\
&= \text{predicted probability ratios}
\end{align*}
$$

In [163]:
# Posterior odds of each model are (likelihood ratio) * (prior odds), so
# multiplying by the inverse prior odds recovers the likelihood ratio alone.
inverse_prior_odds = priors[1] / priors[0]
prior_odds = priors[0] / priors[1]

mb_posterior_odds = probs_mb[:, 0] / probs_mb[:, 1]
gb_posterior_odds = probs_gb[:, 0] / probs_gb[:, 1]

mb_likelihood = mb_posterior_odds * inverse_prior_odds
gb_likelihood = gb_posterior_odds * inverse_prior_odds

# Combine both likelihood ratios and apply the prior odds exactly once
pred_prob_ratios = mb_likelihood * gb_likelihood * prior_odds

I then solve for the individual probabilities, utilizing the fact that:
$$
\begin{equation*}
 \widehat{\mathbb{P}}(Y = 0|X) + \widehat{\mathbb{P}}(Y = 1|X) = 1
\end{equation*}
$$

In [164]:
# Convert the odds r = P(Y = 0 | X) / P(Y = 1 | X) back into probabilities:
# since the two posteriors sum to 1, P(Y = 1 | X) = 1 / (1 + r).
odds_plus_one = 1 + pred_prob_ratios
p_2 = 1 / odds_plus_one   # probability of encoded class 1
p_1 = 1 - p_2             # probability of encoded class 0

In [165]:
# Collect the final predicted probabilities, one column per class label
result_probs = pd.DataFrame(np.column_stack((p_1, p_2)), columns=["C1", "C2"])
result_probs

Unnamed: 0,C1,C2
0,0.43854,0.56146
1,0.380723,0.619277
2,0.493019,0.506981
3,0.524647,0.475353
4,0.229592,0.770408
5,0.182424,0.817576
6,0.481648,0.518352
7,0.504002,0.495998
8,0.553333,0.446667
9,0.225065,0.774935
