## Okay, let's see how well we can process a non-normal distribution

In [1]:
import sys
from collections import Counter

from scipy.stats import gengamma
import pandas as pd

sys.path.append("..")

from src.tweetdownloader import TweetDownloader
import src.utils as utils

In [2]:
# Twitter handle of the account whose tweets are loaded below.
first = "realDonaldTrump"

In [3]:
# Load the tweets for the handle in `first`. Per the log output recorded for
# this cell, this creates an API client and writes ../data/realDonaldTrump.csv.
tweets = TweetDownloader(to_load=first)

INFO:root:API created
INFO:root:Liberating realDonaldTrump
INFO:root:	Page: 0
INFO:root:Found: 1 duplicate rows
INFO:root:Writing ../data/realDonaldTrump.csv


In [4]:
# Build the working dataset from the downloaded tweets. The name suggests a
# 90-day window — TODO confirm against utils.setup_data.
tweets_90_days = utils.setup_data(tweets.dataset)

# NOTE(review): everything below looks like leftover scratch work. It reads
# `by_day` and `daily_count`, neither of which is defined in the cells shown
# here, and no output is recorded for it — confirm whether it is still needed.

# Literal parameter values; they are replaced by the fitted values just below.
k=7.5
w=0.6
loc=-1.75
scale=1.01

# Fit a generalized gamma distribution to by_day["ID"].
k, w, loc, scale = gengamma.fit(by_day["ID"])

# Freeze the distribution with the fitted parameters.
rv = gengamma(k,w,loc=loc,scale=scale)

# Print mean, variance, skew and kurtosis of the frozen distribution.
print(rv.stats(moments="mvsk"))

# Draw 10000 random variates from it.
r = rv.rvs(size=10000)

#plt.hist(r)


# The result of this expression is not assigned to anything.
r.round().astype(int)

# Tally the rounded draws into a one-column frame indexed (and sorted) by value.
fake = pd.DataFrame.from_dict(Counter(r.round().astype(int)), orient='index', columns=['count_fake']).sort_index()

# Normalise the tallies so they sum to 1.
fake["count_fake"] = fake["count_fake"]/fake["count_fake"].sum()

# Plot the simulated frequencies alongside daily_count['probability'].
pd.concat([daily_count['probability'], fake], axis=1).plot()

In [5]:
# Fit a generalized gamma distribution to the "id" column, freeze it as `rv`,
# and print its mean, variance, skew and kurtosis.
fit_params = gengamma.fit(tweets_90_days["id"])
k, w, loc, scale = fit_params
rv = gengamma(k, w, loc=loc, scale=scale)
print(rv.stats(moments="mvsk"))

(array(29.64779683), array(345.02234775), array(1.42739532), array(3.38559602))


In [6]:
# Simulation settings: `days` columns are simulated, each with `x` random draws.
days, x = 7, 10_000

In [7]:
# Simulate `days` columns of `x` draws each from the fitted distribution,
# rounded to integers and keyed by the day index as a string ("0", "1", ...).
d_dict = {}
for i in range(days):
    a = rv.rvs(size=x)
    a = a.round().astype(int)
    d_dict[str(i)] = a

In [8]:
# One column per simulated day, one row per simulation run.
df = pd.DataFrame(d_dict)

In [9]:
# Display the simulated draws (rows: simulation runs, columns: days).
df

Unnamed: 0,0,1,2,3,4,5,6
0,16,17,17,13,65,6,32
1,52,45,28,13,26,62,61
2,68,16,17,45,13,24,57
3,10,15,36,44,27,56,22
4,21,20,12,24,29,27,51
...,...,...,...,...,...,...,...
9995,11,49,50,44,18,32,24
9996,80,43,40,28,24,21,4
9997,35,13,16,16,27,22,12
9998,32,39,55,11,43,30,22


In [10]:
# Total over all simulated days for each run (row-wise sum), then display it.
summed = df.sum(axis="columns")
summed

0       166
1       287
2       240
3       210
4       184
       ... 
9995    228
9996    240
9997    141
9998    232
9999    208
Length: 10000, dtype: int64

In [11]:
# Tally how many runs produced each total.
occurrences = Counter(summed)
counts = pd.Series(occurrences)

In [12]:
# Display the tally: index is the total, value is the number of runs with it.
counts

166    70
287    16
240    56
210    87
184    80
       ..
382     1
376     1
353     1
378     1
414     1
Length: 309, dtype: int64

In [13]:
# Bucket the simulated totals. NOTE(review): 169 looks like the lowest bucket
# edge (the normalised output further down starts at "169") — confirm against
# utils.prior_bucket_maker.
prior_counts = utils.prior_bucket_maker(summed, 169)

In [14]:
# One-column frame ("Counts") indexed by the keys of `prior_counts`
# (presumably bucket labels — see the output of the next cell).
weeklies = pd.DataFrame.from_dict(
    prior_counts,
    orient="index",
    columns=["Counts"],
)

In [15]:
# Normalise the bucket counts so they sum to 1 and display the result.
bucket_totals = weeklies["Counts"]
bucket_totals / bucket_totals.sum()

169        0.2287
170-179    0.0749
180-189    0.0833
190-199    0.0837
200-209    0.0785
210-219    0.0749
220-229    0.0708
230-239    0.0607
240        0.2445
Name: Counts, dtype: float64

In [16]:
# Scenario inputs: tweets already counted and the time still to go
# (presumably measured in days, going by the name).
tweets_so_far = 212
days_to_go = 1.66

In [17]:
# Split the remaining time into an integer part and the leftover fraction.
whole_days = int(days_to_go)  # int() truncates toward zero
remainder = days_to_go - whole_days  # fractional part, as a float

In [18]:
# Display the fractional part; binary floating point makes it print as
# 0.6599999999999999 rather than 0.66.
remainder

0.6599999999999999

In [19]:
# Simulate whole_days + 1 columns of `x` rounded draws each, keyed "0", "1", ...
# The last column (key str(whole_days)) is later scaled by `remainder`.
# After the loop, `i` is left equal to whole_days.
d_dict = dict()
for i in range(whole_days+1):
    a = rv.rvs(size=x).round().astype(int)
    d_dict[str(i)] = a

In [20]:
# One column per remaining (whole or partial) day, one row per simulation run.
df = pd.DataFrame(d_dict)

In [21]:
# Scale the last column by the fractional part of the remaining time so that it
# stands for a partial day, then round back to integers.
# The column is taken from the frame itself instead of via the loop variable
# `i` left behind by an earlier cell: if cells are run out of order, `i` may
# hold a stale value and select a missing or wrong column.
# NOTE: re-running this cell scales the column again; rebuild `df` first.
partial_day = df.columns[-1]
df[partial_day] = (df[partial_day] * remainder).round().astype(int)
df

Unnamed: 0,0,1
0,16,24
1,20,34
2,39,32
3,49,11
4,37,16
...,...,...
9995,24,8
9996,24,32
9997,23,21
9998,27,18


In [22]:
# Projected total per run: simulated remaining days plus the tweets already
# counted; then display it.
simulated_rest = df.sum(axis="columns")
summed = simulated_rest + tweets_so_far
summed

0       252
1       266
2       283
3       272
4       265
       ... 
9995    244
9996    268
9997    256
9998    257
9999    262
Length: 10000, dtype: int64

In [23]:
# Bucket the projected totals (tweets so far + simulated remainder).
# NOTE(review): 169 looks like the lowest bucket edge (the normalised output
# further down starts at "169") — confirm against utils.prior_bucket_maker.
prior_counts = utils.prior_bucket_maker(summed, 169)

In [24]:
# One-column frame ("Counts") indexed by the keys of `prior_counts`
# (presumably bucket labels — see the output of the next cell).
weeklies = pd.DataFrame.from_dict(
    prior_counts,
    orient="index",
    columns=["Counts"],
)

In [25]:
# Normalise the bucket counts so they sum to 1 and display the result.
bucket_totals = weeklies["Counts"]
bucket_totals / bucket_totals.sum()

169        0.0000
170-179    0.0000
180-189    0.0000
190-199    0.0000
200-209    0.0000
210-219    0.0007
220-229    0.0301
230-239    0.1182
240        0.8510
Name: Counts, dtype: float64