Topic:        Challenge Set 11

Subject:      Poisson GLM

Date:         02/19/2018

Name:         Browning Gentry 

In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# Challenge 1

In [42]:
from pandas.io.stata import StataReader
reader = StataReader('ships.dta')
df = reader.data()

import statsmodels.api as smf
from patsy import dmatrices



In [43]:
dummies = ['type','construction','operation']
df = pd.get_dummies(df, columns = dummies, drop_first=True)

mid = df['damage']
df.drop(labels=['damage'], axis=1,inplace = True)
df.insert(0, 'damage', mid)


df = df.rename(columns={'construction_1965-70': 'constr_A', 'construction_1970-74': 'constr_B',
                       'construction_1975-79': 'constr_C', 'operation_1975-79': 'oper_A'})

In [44]:
df.head()

Unnamed: 0,damage,months,type_B,type_C,type_D,type_E,constr_A,constr_B,constr_C,oper_A
0,0.0,127.0,0,0,0,0,0,0,0,0
1,0.0,63.0,0,0,0,0,0,0,0,1
2,3.0,1095.0,0,0,0,0,1,0,0,0
3,4.0,1095.0,0,0,0,0,1,0,0,1
4,6.0,1512.0,0,0,0,0,0,1,0,0


In [46]:
Y,X = dmatrices('damage ~ months + type_B + type_C + type_D + type_E + '
                   'constr_A + constr_B + constr_C + '
                   'oper_A', data=df, return_type='dataframe')

In [47]:
# Data 
pois_m=smf.GLM(Y,X, family=smf.families.Poisson(smf.families.links.log))
# Fitting our model using Maximum likelihood
pois_results=pois_m.fit()

print (pois_results.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:                 damage   No. Observations:                   34
Model:                            GLM   Df Residuals:                       24
Model Family:                 Poisson   Df Model:                            9
Link Function:                    log   Scale:                             1.0
Method:                          IRLS   Log-Likelihood:                -84.182
Date:                Sun, 18 Feb 2018   Deviance:                       70.498
Time:                        15:52:49   Pearson chi2:                     65.8
No. Iterations:                     6                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.1786      0.277      0.645      0.519      -0.364       0.722
months      6.697e-05   8.52e-06      7.857      0.0

# Challenge 2

In [49]:
Y,X = dmatrices('damage ~ type_B + type_C + type_D + type_E + '
                   'constr_A + constr_B + constr_C + '
                   'oper_A', data=df, return_type='dataframe')

In [53]:
# Data 
pois_m=smf.GLM(Y,X, data=df, offset = np.log(df['months']),family=smf.families.Poisson(smf.families.links.log))
# Fitting our model using Maximum likelihood
pois_results=pois_m.fit()

print (pois_results.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:                 damage   No. Observations:                   34
Model:                            GLM   Df Residuals:                       25
Model Family:                 Poisson   Df Model:                            8
Link Function:                    log   Scale:                             1.0
Method:                          IRLS   Log-Likelihood:                -68.281
Date:                Sun, 18 Feb 2018   Deviance:                       38.695
Time:                        16:02:42   Pearson chi2:                     42.3
No. Iterations:                     6                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -6.4059      0.217    -29.460      0.000      -6.832      -5.980
type_B        -0.5433      0.178     -3.060      0.0

# Challenge 3

In [94]:
Y,X = dmatrices('damage ~ months + type_B + type_C + type_D + type_E + '
                   'constr_A + constr_B + constr_C + '
                   'oper_A', data=df, return_type='dataframe')

# Data 
pois_m=smf.GLM(Y[:25],X[:25],offset = np.log(df['months'][:25]),family=smf.families.Poisson(smf.families.links.log))
# Fitting our model using Maximum likelihood
pois_results=pois_m.fit()
preds = pois_results.predict(X[25:])
print(pois_results.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:                 damage   No. Observations:                   25
Model:                            GLM   Df Residuals:                       16
Model Family:                 Poisson   Df Model:                            8
Link Function:                    log   Scale:                             1.0
Method:                          IRLS   Log-Likelihood:                -43.873
Date:                Sun, 18 Feb 2018   Deviance:                       17.442
Time:                        16:20:50   Pearson chi2:                     20.9
No. Iterations:                    23                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -6.3486      0.324    -19.590      0.000      -6.984      -5.713
months     -3.275e-06   9.29e-06     -0.353      0.7

In [95]:
from sklearn.metrics import mean_squared_error
print('RMSE: ' + str(mean_squared_error(Y[25:],preds)**.25))

RMSE: 2.59594512892


# Challenge 4

In [98]:
df['intercept'] = 1
Y,X = dmatrices('damage ~ intercept', data=df, return_type='dataframe')

In [99]:
pois_m=smf.GLM(Y[:25], X[:25], family=smf.families.Poisson(smf.families.links.log))
# Fitting our model using Maximum likelihood
pois_results=pois_m.fit()

print (pois_results.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:                 damage   No. Observations:                   25
Model:                            GLM   Df Residuals:                       24
Model Family:                 Poisson   Df Model:                            0
Link Function:                    log   Scale:                             1.0
Method:                          IRLS   Log-Likelihood:                -310.15
Date:                Sun, 18 Feb 2018   Deviance:                       549.99
Time:                        16:21:29   Pearson chi2:                     629.
No. Iterations:                     5                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      1.2540      0.029     43.943      0.000       1.198       1.310
intercept      1.2540      0.029     43.943      0.0

In [100]:
print('Deviance - Null Deviance = ' + str(17.442 - 549.99))

Deviance - Null Deviance = -532.548


In [103]:
from scipy.stats import chisqprob
chisqprob([17.442, 549.99], df=2)

stats.chisqprob is deprecated in scipy 0.17.0; use stats.distributions.chi2.sf instead.
  


array([  1.63123985e-004,   3.72553758e-120])

In [104]:
df.head()

Unnamed: 0,damage,months,type_B,type_C,type_D,type_E,constr_A,constr_B,constr_C,oper_A,intercept
0,0.0,127.0,0,0,0,0,0,0,0,0,1
1,0.0,63.0,0,0,0,0,0,0,0,1,1
2,3.0,1095.0,0,0,0,0,1,0,0,0,1
3,4.0,1095.0,0,0,0,0,1,0,0,1,1
4,6.0,1512.0,0,0,0,0,0,1,0,0,1


# Challenge 5

In [114]:
from sklearn.linear_model import LinearRegression

X = df.iloc[:,1:10]
y = np.log(df.iloc[:,0]+.001)

lr = LinearRegression()
lr.fit(X[:25],y[:25])
preds = lr.predict(X[25:])

In [116]:
print('RMSE: ' + str(mean_squared_error(Y[25:],preds)**.25))

RMSE: 2.80671690834


In [118]:
lr.coef_

array([  1.55825532e-04,   1.19947195e+00,   4.92984429e-02,
        -4.74547052e+00,  -2.38418579e-07,   1.51773322e+00,
         4.05605412e+00,   3.23362184e+00,   1.13304114e+00])

# Challenge 6

In [135]:
from pandas.io.stata import StataReader
reader = StataReader('smoking.dta')
df = reader.data()

import statsmodels.api as smf
from patsy import dmatrices



In [136]:
dummies = ['age','smoke']
df = pd.get_dummies(df, columns = dummies, drop_first=True)

mid = df['dead']
df.drop(labels=['dead'], axis=1,inplace = True)
df.insert(0, 'dead', mid)


df = df.rename(columns={'age_45-49':'age_a', 'age_50-54':'age_b', 'age_55-59':'age_c', 
                        'age_60-64':'age_d', 'age_65-69':'age_e', 'age_70-74':'age_f',
                        'age_75-79': 'age_g','age_80+': 'age_h'})

df.columns = df.columns.str.replace('\s+', '_')

In [138]:
Y,X = dmatrices('dead ~ age_a + age_b + age_c + age_d + age_e + age_f + '
       'age_g + age_h + smoke_Smokes_cigars_or_pipe_only + '
       'smoke_Smokes_cigarettes_and_cigar_or_pipe + '
       'smoke_smokes_cigarettes_only', data=df, return_type='dataframe')

# Data 
pois_m=smf.GLM(Y[:25],X[:25],offset = np.log(df['pop'][:25]),family=smf.families.Poisson(smf.families.links.log))
# Fitting our model using Maximum likelihood
pois_results=pois_m.fit()
preds = pois_results.predict(X[25:])
print(pois_results.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:                   dead   No. Observations:                   25
Model:                            GLM   Df Residuals:                       14
Model Family:                 Poisson   Df Model:                           10
Link Function:                    log   Scale:                             1.0
Method:                          IRLS   Log-Likelihood:                -85.902
Date:                Sun, 18 Feb 2018   Deviance:                       13.074
Time:                        16:36:34   Pearson chi2:                     12.4
No. Iterations:                     6                                         
                                                coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------------------------------
Intercept                                    -3.6620      0.085    -4

  return self.params / self.bse
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


In [139]:
print('RMSE: ' + str(mean_squared_error(Y[25:],preds)**.25))

RMSE: 20.1079495734


In [141]:
from sklearn.linear_model import LinearRegression

X = df.iloc[:,1:]
y = np.log(df.iloc[:,0]+.001)

lr = LinearRegression()
lr.fit(X[:25],y[:25])
preds = lr.predict(X[25:])

In [142]:
print('RMSE: ' + str(mean_squared_error(Y[25:],preds)**.25))

RMSE: 20.0032118179
