# Poisson distribution Shenanigans

https://towardsdatascience.com/an-illustrated-guide-to-the-poisson-regression-model-50cccba15958

In [21]:
import sys
import datetime

import pandas as pd
import numpy as np
from patsy import dmatrices
import statsmodels.api as sm
from scipy.stats import poisson

sys.path.append("..")

import src.utils as utils

In [7]:
#!{sys.executable} -m pip install patsy statsmodels

Collecting patsy
  Downloading https://files.pythonhosted.org/packages/ea/0c/5f61f1a3d4385d6bf83b83ea495068857ff8dfb89e74824c6e9eb63286d8/patsy-0.5.1-py2.py3-none-any.whl (231kB)
[K    100% |████████████████████████████████| 235kB 2.1MB/s ta 0:00:01
[?25hCollecting statsmodels
  Downloading https://files.pythonhosted.org/packages/cb/83/540fd83238a18abe6c2d280fa8e489ac5fcefa1f370f0ca1acd16ae1b860/statsmodels-0.11.1-cp36-cp36m-manylinux1_x86_64.whl (8.7MB)
[K    100% |████████████████████████████████| 8.7MB 168kB/s ta 0:00:011
Installing collected packages: patsy, statsmodels
Successfully installed patsy-0.5.1 statsmodels-0.11.1


In [8]:
# import tweets
df = utils.rdt_tweets()

In [9]:
# get daily counts
by_day = df.groupby(df.index.date).count()

In [15]:
# get last year of data
year_date = by_day.iloc[len(by_day)-1].name - datetime.timedelta(days=365)
by_day_year = by_day[year_date:]


## Create Training Data

We'll do it this way for now, but in the future, it will be a different split

In [17]:
mask = np.random.rand(len(by_day_year)) < 0.8
df_train = by_day_year[mask]
df_test = by_day_year[~mask]
print('Training data set length='+str(len(df_train)))
print('Testing data set length='+str(len(df_test)))

Training data set length=294
Testing data set length=72


## Super fucking basic poisson

tiers: 
```
prior_counts = {
  "149" : sum(by_day["sum_7"] <= 149),
  "150-159" : sum((by_day["sum_7"] >= 150) & (by_day["sum_7"] < 160)),
  "160-169" : sum((by_day["sum_7"] >= 160) & (by_day["sum_7"] < 170)),
  "170-179" : sum((by_day["sum_7"] >= 170) & (by_day["sum_7"] < 180)),
  "180-189" : sum((by_day["sum_7"] >= 180) & (by_day["sum_7"] < 190)),
  "190-199" : sum((by_day["sum_7"] >= 190) & (by_day["sum_7"] < 200)),
  "200-209" : sum((by_day["sum_7"] >= 200) & (by_day["sum_7"] < 210)),
  "210-219" : sum((by_day["sum_7"] >= 210) & (by_day["sum_7"] < 219)),
  "220" : sum(by_day["sum_7"] >= 220)
}
```


In [35]:
mu = int(df_train["ID"].mean().round())

week = mu * 7
print(week)

196


In [31]:
rv = poisson(week)

In [41]:
weekly_likelihoods = {
    "149<":rv.cdf(149),
    "150-159":rv.cdf(159) - rv.cdf(149),
    "160-169":rv.cdf(169) - rv.cdf(159),
    "170-179":rv.cdf(179) - rv.cdf(169),
    "180-189":rv.cdf(189) - rv.cdf(179),
    "190-199":rv.cdf(199) - rv.cdf(189),
    "200-209":rv.cdf(209) - rv.cdf(199),
    "210-219":rv.cdf(219) - rv.cdf(209),
    "220>":1 - rv.cdf(220)
}

In [42]:
weekly_likelihoods

{'149<': 0.00027482576838815776,
 '150-159': 0.003376058328624801,
 '160-169': 0.02340681808394423,
 '170-179': 0.09122212041226974,
 '180-189': 0.20630753064510746,
 '190-199': 0.27841963162602085,
 '200-209': 0.22979595015869514,
 '210-219': 0.1185671641686934,
 '220>': 0.04208432731798306}

In [32]:
rv.cdf(149)

0.00027482576838815776

In [34]:
rv.cdf(159) - rv.cdf(149)

0.003376058328624801

## And for a given day

Currently there is a day and 3 hours left and he already has 192 tweets. Let's assume he tweets for 20 hours out of the day, so 3 hours is 0.15, let's round up to 0.2

In [57]:
mu = df_train["ID"].mean()
days_left = 1.2
tweets_so_far = 197

def difference_dist(rv, upper, lower, tweets_so_far):
    adj_upper = upper - tweets_so_far
    adj_lower = lower - tweets_so_far
    
    return rv.cdf(adj_upper) - rv.cdf(adj_lower)

def remaining_odds(mu, days_left):
    likelihoods = dict()
    
    adj_mu = int(round(mu * days_left))
    
    rv = poisson(adj_mu)
    
    # add the lower bound to it
    likelihoods["<149"] = rv.cdf(149-tweets_so_far)
    
    likelihoods["150-159"] = difference_dist(rv, 159, 150, tweets_so_far)
    likelihoods["160-169"] = difference_dist(rv, 169, 160, tweets_so_far)
    likelihoods["170-179"] = difference_dist(rv, 179, 170, tweets_so_far)
    likelihoods["180-189"] = difference_dist(rv, 189, 180, tweets_so_far)
    likelihoods["190-199"] = difference_dist(rv, 199, 190, tweets_so_far)
    likelihoods["200-209"] = difference_dist(rv, 209, 200, tweets_so_far)
    likelihoods["210-219"] = difference_dist(rv, 219, 210, tweets_so_far)

    likelihoods[">220"] = 1 - rv.cdf(220-tweets_so_far)
    
    return likelihoods
    
remaining_odds(mu, days_left)

{'<149': 0.0,
 '150-159': 0.0,
 '160-169': 0.0,
 '170-179': 0.0,
 '180-189': 0.0,
 '190-199': 2.695165634942316e-12,
 '200-209': 2.4895526421990978e-05,
 '210-219': 0.02803483294672782,
 '>220': 0.9567073249868225}