# Assignment 9 - Data Science in Finance and Insurance
Submission by: Dennis Goldenberg

In [4]:
#import statements
import numpy as np
import pandas as pd;
from sklearn.naive_bayes import CategoricalNB, GaussianNB, MultinomialNB
from sklearn.preprocessing import LabelEncoder

In [5]:
# Build the toy training data set

# Feature columns, one entry per observation (rows 0-3 are C1, rows 4-9 are C2)
category = pd.Categorical([label for label, reps in (("C1", 4), ("C2", 6)) for _ in range(reps)])
quality = pd.Categorical(list("BGGGBBGGGG"))
temperature = pd.Categorical(list("HCHHCCHHHC"))
value = [86, 15, 40, 33, 25, 38, 73, 79, 28, 50]

# Assemble the training frame; the dict order fixes the column order
train_columns = {
    "category": category,
    "quality": quality,
    "temperature": temperature,
    "value": value,
}
train_set = pd.DataFrame(train_columns)
train_set

Unnamed: 0,category,quality,temperature,value
0,C1,B,H,86
1,C1,G,C,15
2,C1,G,H,40
3,C1,G,H,33
4,C2,B,C,25
5,C2,B,C,38
6,C2,G,H,73
7,C2,G,H,79
8,C2,G,H,28
9,C2,G,C,50


In [6]:
# Ordinal-encode the categorical columns: every label is replaced by an integer
# code, and the encoded values overwrite the original column in train_set.
for col in ('category', 'quality', 'temperature'):
    le = LabelEncoder()
    train_set[col] = le.fit_transform(train_set[col])
train_set

Unnamed: 0,category,quality,temperature,value
0,0,0,1,86
1,0,1,0,15
2,0,1,1,40
3,0,1,1,33
4,1,0,0,25
5,1,0,0,38
6,1,1,1,73
7,1,1,1,79
8,1,1,1,28
9,1,1,0,50


In [7]:
# Class priors: share of training rows carrying each encoded category (0, then 1)
class_counts = train_set['category'].value_counts()
priors = class_counts.loc[[0, 1]].to_numpy() / len(train_set)
priors

array([0.4, 0.6])

## Multinomial Naive Bayes

In [79]:
# Fit scikit-learn's multinomial naive Bayes on the two encoded categorical
# features and score the training rows themselves.
# NOTE(review): MultinomialNB models feature values as counts, so the 0/1 label
# codes are treated as counts here -- confirm this is intended; CategoricalNB
# (imported at the top but unused) may be the closer match for these features.
mb_features = train_set[['quality', 'temperature']]
mb_fit = MultinomialNB().fit(mb_features, train_set['category'])
probs_mb = mb_fit.predict_proba(mb_features)
probs_mb

array([[0.42857143, 0.57142857],
       [0.375     , 0.625     ],
       [0.40298507, 0.59701493],
       [0.40298507, 0.59701493],
       [0.4       , 0.6       ],
       [0.4       , 0.6       ],
       [0.40298507, 0.59701493],
       [0.40298507, 0.59701493],
       [0.40298507, 0.59701493],
       [0.375     , 0.625     ]])

### Manual Implementation of Multinomial

In [85]:
# Split the training rows by encoded class with a boolean mask. The mask looks
# at the class column only, so no row is lost when two observations of the same
# class happen to be identical (concat + drop_duplicates(keep=False) would
# silently discard both of them).
is_class_0 = train_set['category'] == 0
train_0s = train_set[is_class_0]
train_1s = train_set[~is_class_0]


def _level_probs(subset, column, n_levels=2):
    """Relative frequency of each encoded level 0..n_levels-1 of `column` in `subset`.

    Levels that never occur in `subset` get probability 0 instead of raising a
    KeyError.
    """
    counts = subset[column].value_counts().reindex(range(n_levels), fill_value=0)
    return counts.to_numpy() / len(subset)


## P(quality = 0| Y = 0) and P(quality = 1| Y = 0)
p_qy0 = _level_probs(train_0s, 'quality')
## P(quality = 0| Y = 1) and P(quality = 1| Y = 1)
p_qy1 = _level_probs(train_1s, 'quality')
# Table layout: row = class, column = quality level
p_qy = np.vstack((p_qy0, p_qy1))

## P(temperature = 0| Y = 0) and P(temperature = 1| Y = 0)
p_ty0 = _level_probs(train_0s, 'temperature')
## P(temperature = 0| Y = 1) and P(temperature = 1| Y = 1)
p_ty1 = _level_probs(train_1s, 'temperature')
# Table layout: row = class, column = temperature level
p_ty = np.vstack((p_ty0, p_ty1))

Implementing the multinomial naive Bayes formula for the posterior probabilities, for category C1 (encoded as $\text{C} = 0$):
$$\begin{align*}
&\widehat{\mathbb{P}}(\text{C} = 0|\text{Q} = x_1, \text{T} = x_2)\\
&= \frac{\pi_{0} * \mathbb{P}(\text{Q} = x_1|\text{C} = 0)\mathbb{P}(\text{T} = x_2|\text{C} = 0)}{\pi_{0} * \mathbb{P}(\text{Q} = x_1|\text{C} = 0)\mathbb{P}(\text{T} = x_2|\text{C} = 0) + \pi_{1} * \mathbb{P}(\text{Q} = x_1|\text{C} = 1)\mathbb{P}(\text{T} = x_2|\text{C} = 1)}
\end{align*}$$
And noting that we have only 2 categories:
$$
\begin{equation*}
\widehat{\mathbb{P}}(\text{C} = 1|\text{Q} = x_1, \text{T} = x_2) = 1 - \widehat{\mathbb{P}}(\text{C} = 0|\text{Q} = x_1, \text{T} = x_2)
\end{equation*}
$$

In [86]:
# Encoded feature levels of every training row, used as column indices into
# the conditional-probability tables (row = class, column = level).
q_idx = train_set['quality'].to_numpy()
t_idx = train_set['temperature'].to_numpy()

# Unnormalised posteriors: prior * P(quality | class) * P(temperature | class),
# computed for all rows at once instead of looping with iterrows().
joint_0 = priors[0] * p_qy[0, q_idx] * p_ty[0, t_idx]
joint_1 = priors[1] * p_qy[1, q_idx] * p_ty[1, t_idx]

# Normalise over the two classes
p_0MB = joint_0 / (joint_0 + joint_1)
p_1MB = 1 - p_0MB

# NOTE: this rebinds probs_mb, replacing the scikit-learn MultinomialNB
# probabilities stored under the same name earlier in the notebook.
probs_mb = np.vstack((p_0MB, p_1MB)).T
probs_mb

array([[0.42857143, 0.57142857],
       [0.27272727, 0.72727273],
       [0.52941176, 0.47058824],
       [0.52941176, 0.47058824],
       [0.2       , 0.8       ],
       [0.2       , 0.8       ],
       [0.52941176, 0.47058824],
       [0.52941176, 0.47058824],
       [0.52941176, 0.47058824],
       [0.27272727, 0.72727273]])

## Gaussian Naive Bayes

In [81]:
# Gaussian naive Bayes on the single numeric feature. The column is reshaped to
# a one-column 2-D array before being passed to fit / predict_proba.
value_column = train_set['value'].to_numpy().reshape(-1, 1)
gb_fit = GaussianNB().fit(value_column, train_set['category'])
probs_gb = gb_fit.predict_proba(value_column)

In [82]:
# Combine the two models. Each model's odds of class 0 vs class 1 are multiplied
# by priors[1]/priors[0] to strip the prior; the product of the two resulting
# ratios is then scaled by priors[0]/priors[1] so the prior enters exactly once.
inverse_prior_odds = priors[1] / priors[0]
mb_odds = probs_mb[:, 0] / probs_mb[:, 1]
gb_odds = probs_gb[:, 0] / probs_gb[:, 1]
mb_ratios = mb_odds * inverse_prior_odds
gb_ratios = gb_odds * inverse_prior_odds
pred_prob_ratios = mb_ratios * gb_ratios * (priors[0] / priors[1])

In [84]:
# Convert the combined class-0 : class-1 odds back into probabilities.
# p_1 is the probability of class 1; the displayed complement is class 0.
p_1 = 1.0 / (pred_prob_ratios + 1.0)
1 - p_1

array([0.43854012, 0.38072284, 0.49301851, 0.52464652, 0.2295921 ,
       0.18242375, 0.48164762, 0.50400237, 0.55333349, 0.22506505])