In [None]:
import pandas as pd
import pymc3 as pm
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline


**Goal:** Which factor amongst `['yr_construction', 'period_op', 'months in use']` contributes most to the number of damages that a ship incurs?

**Method:** Poisson regression - the key here is that the likelihood function is a Poisson likelihood, which models counts.

In [None]:
# Load the ship damage records from the local datasets folder.
df = pd.read_csv('datasets/ship-damage.txt')

# Replace `months` with its base-10 logarithm, computed on the whole column
# at once.
# NOTE(review): np.log10 maps 0 to -inf; confirm the file has no zero-month rows.
df['months'] = np.log10(df['months'])

# Preview the first rows after the transform.
df.head()

In [None]:
# Scatter of damage counts against the `months` column. That column was
# log10-transformed when the data was loaded, so the x-axis is on a log scale;
# the labels say so explicitly so the figure can be read on its own.
fig, ax = plt.subplots()
ax.scatter(x=df['months'], y=df['n_damages'])
ax.set_xlabel('log10(months)')
ax.set_ylabel('n_damages')
ax.set_title('Damage count vs. log10(months)');

In [None]:
# Poisson regression with a log link: the log of the expected damage count is
# a linear combination of yr_construction, period_op and months.
with pm.Model() as model:
    # One coefficient per predictor, in the order used in the sum below.
    # NOTE(review): sd=100**2 is a standard deviation of 10,000. If a variance
    # of 100**2 was intended, this should be sd=100 -- confirm before changing.
    betas = pm.Normal('betas', mu=0, sd=100**2, shape=(3, 1))

    # Despite its name, `n_damages` is the linear predictor (the log of the
    # expected count), not a count. The model has no intercept term.
    n_damages = (
        betas[0] * df['yr_construction']
        + betas[1] * df['period_op']
        + betas[2] * df['months']
    )

    # Inverse link: exponentiate symbolically with pm.math.exp rather than
    # relying on NumPy dispatching np.exp onto a symbolic tensor.
    n_damages_like = pm.Poisson(
        'likelihood',
        mu=pm.math.exp(n_damages),
        observed=df['n_damages'],
    )

    # Fixed seed so a Restart-and-Run-All reproduces the same trace.
    trace = pm.sample(draws=2000, random_seed=42)

In [None]:
# Plot the samples stored in `trace` for a visual check of the sampler output.
pm.traceplot(trace)

In [None]:
# Draw the posterior plots for the samples in `trace`, coloured beige.
pm.plot_posterior(trace, color='beige')

In [None]:
# Show PyMC3's tabular summary of the samples stored in `trace`.
pm.summary(trace)