# Poisson distribution Shenanigans

https://towardsdatascience.com/an-illustrated-guide-to-the-poisson-regression-model-50cccba15958

In [2]:
import sys
import datetime

import pandas as pd
import numpy as np
from patsy import dmatrices
import statsmodels.api as sm
from scipy.stats import poisson

sys.path.append("..")

import src.utils as utils

In [3]:
#!{sys.executable} -m pip install patsy statsmodels

In [4]:
# import tweets
df = utils.tweets('realDonaldTrump')

In [5]:
# get daily counts
by_day = df.groupby(df.index.date).count()

In [6]:
# get last year of data
year_date = by_day.iloc[len(by_day)-1].name - datetime.timedelta(days=365)
by_day_year = by_day[year_date:]


## Create Training Data

We'll do it this way for now, but in the future, it will be a different split

In [7]:
mask = np.random.rand(len(by_day_year)) < 0.8
df_train = by_day_year[mask]
df_test = by_day_year[~mask]
print('Training data set length='+str(len(df_train)))
print('Testing data set length='+str(len(df_test)))

Training data set length=283
Testing data set length=83


## Super fucking basic poisson

tiers: 
```
prior_counts = {
  "149" : sum(by_day["sum_7"] <= 149),
  "150-159" : sum((by_day["sum_7"] >= 150) & (by_day["sum_7"] < 160)),
  "160-169" : sum((by_day["sum_7"] >= 160) & (by_day["sum_7"] < 170)),
  "170-179" : sum((by_day["sum_7"] >= 170) & (by_day["sum_7"] < 180)),
  "180-189" : sum((by_day["sum_7"] >= 180) & (by_day["sum_7"] < 190)),
  "190-199" : sum((by_day["sum_7"] >= 190) & (by_day["sum_7"] < 200)),
  "200-209" : sum((by_day["sum_7"] >= 200) & (by_day["sum_7"] < 210)),
  "210-219" : sum((by_day["sum_7"] >= 210) & (by_day["sum_7"] < 219)),
  "220" : sum(by_day["sum_7"] >= 220)
}
```


In [8]:
def weekly_likelihoods(data, lower, buckets=9, gap=10):
    mu = data.mean().round()
    week = int(mu * 7)
    rv = poisson(week)
    
    weekly_dict = dict()
    
    for i in range(buckets):
        if i == 0:
            weekly_dict[str(lower)] = rv.cdf(lower) * 100
        elif i == buckets - 1:
            lower_bound = (lower + i * gap) - (gap - 1)
            weekly_dict[str(lower_bound)] = (1 - rv.cdf(lower_bound)) * 100
        else:
            lower_bound = (lower + i * gap) - (gap - 1)
            upper_bound = (lower + i * gap)
            
            label = f"{lower_bound}-{upper_bound}"
            
            weekly_dict[label] = (rv.cdf(upper_bound) - rv.cdf(lower_bound)) * 100
            
    return weekly_dict
    
weekly_likelihoods(by_day_year["ID"], 149)

{'149': 0.14862694630635806,
 '150-159': 1.2235170569628409,
 '160-169': 5.897339054899048,
 '170-179': 15.925442913761156,
 '180-189': 24.86613284747075,
 '190-199': 23.08287360188668,
 '200-209': 13.058592051315799,
 '210-219': 4.603609819183085,
 '220': 1.243205861520269}

In [9]:
mu = int(df_train["ID"].mean().round())

week = mu * 7
print(week)

189


In [10]:
rv = poisson(week)

In [11]:
weekly_likelihoods = {
    "149<":rv.cdf(149),
    "150-159":rv.cdf(159) - rv.cdf(149),
    "160-169":rv.cdf(169) - rv.cdf(159),
    "170-179":rv.cdf(179) - rv.cdf(169),
    "180-189":rv.cdf(189) - rv.cdf(179),
    "190-199":rv.cdf(199) - rv.cdf(189),
    "200-209":rv.cdf(209) - rv.cdf(199),
    "210-219":rv.cdf(219) - rv.cdf(209),
    "220>":1 - rv.cdf(220)
}

In [12]:
weekly_likelihoods

{'149<': 0.0014862694630635807,
 '150-159': 0.012662458017805606,
 '160-169': 0.061984838402712135,
 '170-179': 0.17063247381916818,
 '180-189': 0.27256674122834484,
 '190-199': 0.25968206179322206,
 '200-209': 0.15118344912923365,
 '210-219': 0.05496378983777939,
 '220>': 0.01243205861520269}

In [13]:
rv.cdf(149)

0.0014862694630635807

In [14]:
rv.cdf(159) - rv.cdf(149)

0.012662458017805606

## And for a given day

Currently there is a day and 3 hours left and he already has 192 tweets. Let's assume he tweets for 20 hours out of the day, so 3 hours is 0.15, let's round up to 0.2

In [15]:

days_left = 5.2
tweets_so_far = 16

def difference_dist(rv, upper, lower, tweets_so_far):
    adj_upper = upper - tweets_so_far
    adj_lower = lower - tweets_so_far
    
    return rv.cdf(adj_upper) - rv.cdf(adj_lower)

def remaining_odds(data, days_left, tweets_so_far, lower, buckets=9, gap=10):
    likelihoods = dict()
    
    mu = data.mean()
    
    adj_mu = int(round(mu * days_left))
    
    rv = poisson(adj_mu)
    
    
    for i in range(buckets):
        if i == 0:
            likelihoods[str(lower)] = rv.cdf(lower - tweets_so_far)
        elif i == buckets - 1:
            lower_bound = (lower + i * gap) - (gap - 1)
            likelihoods[str(lower_bound)] = (1 - rv.cdf(lower_bound - tweets_so_far))
            pass
        else:
            lower_bound = (lower + i * gap) - (gap - 1)
            upper_bound = (lower + i * gap)
            
            label = f"{lower_bound}-{upper_bound}"
            
            likelihoods[label] = difference_dist(rv,
                                                 upper_bound,
                                                 lower_bound,
                                                 tweets_so_far)
            
    # format dict
    for thing in likelihoods:
        likelihoods[thing] = float(f"{likelihoods[thing]:.3f}")*100
            
            
    
    return likelihoods
    
remaining_odds(by_day_year["ID"], days_left, tweets_so_far, 169)

{'169': 83.3,
 '170-179': 10.9,
 '180-189': 2.7,
 '190-199': 0.4,
 '200-209': 0.0,
 '210-219': 0.0,
 '220-229': 0.0,
 '230-239': 0.0,
 '240': 0.0}

## Actual weekly

In [16]:
days_left = 7
tweets_so_far = 0
bucket = 169

remaining_odds(by_day_year["ID"], days_left, tweets_so_far, bucket)

{'169': 5.0,
 '170-179': 12.6,
 '180-189': 22.900000000000002,
 '190-199': 24.7,
 '200-209': 16.3,
 '210-219': 6.7,
 '220-229': 1.7000000000000002,
 '230-239': 0.3,
 '240': 0.0}

## And now POTUS

In [17]:
# import tweets
df = utils.tweets('potus')
by_day = df.groupby(df.index.date).count()

In [18]:
days_left = 7
tweets_so_far = 0
lower = 64
gap = 5

remaining_odds(by_day["ID"], days_left, tweets_so_far, lower, gap=gap)

{'64': 18.9,
 '65-69': 16.7,
 '70-74': 18.5,
 '75-79': 14.7,
 '80-84': 8.5,
 '85-89': 3.5999999999999996,
 '90-94': 1.2,
 '95-99': 0.3,
 '100': 0.1}

In [19]:
days_left = 0.2
tweets_so_far = 93

remaining_odds(by_day["ID"], days_left, tweets_so_far, lower, gap=gap)

{'64': 0.0,
 '65-69': 0.0,
 '70-74': 0.0,
 '75-79': 0.0,
 '80-84': 0.0,
 '85-89': 0.0,
 '90-94': 40.6,
 '95-99': 31.900000000000002,
 '100': 0.1}

# And White House

In [20]:
# import tweets
df = utils.tweets('whitehouse')
by_day = df.groupby(df.index.date).count()

### Weekly

In [21]:
days_left = 7
tweets_so_far = 0
lower = 199
gap = 10

remaining_odds(by_day["ID"], days_left, tweets_so_far, lower, gap=gap)

{'199': 70.89999999999999,
 '200-209': 16.3,
 '210-219': 6.7,
 '220-229': 1.7000000000000002,
 '230-239': 0.3,
 '240-249': 0.0,
 '250-259': 0.0,
 '260-269': 0.0,
 '270': 0.0}

In [22]:
days_left = 6.2
tweets_so_far = 27

remaining_odds(by_day["ID"], days_left, tweets_so_far, lower, gap=gap)

{'199': 58.099999999999994,
 '200-209': 22.1,
 '210-219': 10.6,
 '220-229': 3.0,
 '230-239': 0.5,
 '240-249': 0.1,
 '250-259': 0.0,
 '260-269': 0.0,
 '270': 0.0}