# Poisson underdispersion

In [1]:
%matplotlib notebook

import numpy as np
import pandas as pd
import pylab as plt
import seaborn as sns
import matplotlib

In [2]:
def poissonsamplevar(mu, n, perc=5, nrep=100, rng=None):
    """Monte Carlo percentile of the sample variance of Poisson data.

    Simulates ``nrep`` independent samples, each made of ``n`` draws from a
    Poisson distribution with mean ``mu``, computes the variance of every
    sample (``np.var`` default, i.e. ``ddof=0``) and returns the ``perc``-th
    percentile of those ``nrep`` variances.  An observed variance below the
    returned value is lower than what ``perc`` percent of simulated Poisson
    samples produce.

    Parameters
    ----------
    mu : float
        Mean (rate) of the Poisson distribution; must be a valid ``lam`` for
        the generator's ``poisson`` method (non-negative).
    n : int
        Number of observations in each simulated sample.
    perc : float, optional
        Percentile (0-100) of the simulated variances to return.
    nrep : int, optional
        Number of simulated samples.
    rng : optional
        Any object exposing ``poisson(lam, size=...)``, such as
        ``np.random.RandomState`` or ``np.random.default_rng(seed)``.  When
        omitted, the global ``np.random`` state is used, so ``np.random.seed``
        still controls the result.

    Returns
    -------
    float
        The ``perc``-th percentile of the simulated sample variances.
    """
    if rng is None:
        rng = np.random
    # A single (nrep, n) draw replaces the Python-level loop over repetitions;
    # each row is one simulated sample.
    samples = rng.poisson(mu, size=(nrep, n))
    return np.percentile(np.var(samples, axis=1), perc)

In [3]:
# Daily COVID-19 figures per country, downloaded from Our World in Data.
df = pd.read_csv('https://covid.ourworldindata.org/data/owid-covid-data.csv?raw=true')
countries = np.unique(df['location'])

win = 10      # number of daily values per window
cutoff = 3    # a country is printed only when MORE than this many windows are flagged
nrep = 1000   # Monte Carlo repetitions per window inside poissonsamplevar

# The identical scan is applied to both columns, so it is written once and
# looped over instead of being copy-pasted per column.
for section, (title, column) in enumerate([('== DEATHS ==', 'new_deaths'),
                                           ('== CASES ==', 'new_cases')]):
    if section > 0:
        print('')  # blank line between the two sections
    print(title)

    for country in countries:
        t = df.loc[df['location'] == country, column].values
        # Drop missing and negative daily values before windowing.
        t = t[~np.isnan(t)]
        t = t[t >= 0]

        # Only scan series holding more than 10 windows' worth of values.
        if t.size > win*10:
            n_windows = t.size // win  # a trailing partial window is ignored
            windows = t[:n_windows*win].reshape(n_windows, win)
            # A window is flagged when its variance falls below the default
            # (5th) percentile of simulated Poisson variances at the same mean.
            flags = [np.var(tt) < poissonsamplevar(np.mean(tt), win, nrep=nrep)
                     for tt in windows]
            if sum(flags) > cutoff:
                # Country name truncated/padded to 20 characters, then one
                # symbol per window: '*' flagged, '.' not flagged.
                print(f'{country[:20]:20}' + ''.join('*' if f else '.' for f in flags))

== DEATHS ==
Albania             ...........*.....**..**....*...*.*.*.......*.
Algeria             ......***..*.*.**.....*.*.....*.*.******.****
Azerbaijan          ........*.***....****......**..*.**....**.*..
Belarus             .************.***....*.********************
Cuba                ..............................**.**.*.......
Egypt               ..........*........*.*****.*..****..**..*.**.
El Salvador         ...........*..**...****..**.**.***....*.***
Kuwait              ............*.*......*.*.......**..........
Moldova             ........................****......*..*.....*
Saudi Arabia        ..*.**..........*.**.*********.*************
Serbia              ...*...**....*..**.**.....**..*****.*.***.**
Syria               ...............***.******..******.*.....***
Turkey              ..........******.****.**....*.*..***........
United Arab Emirates....*..*.**.*..............*.....*.....*.***
Uzbekistan          ..........*****..*...*.....................
Venezuela    

Every symbol (dot or asterisk) corresponds to a window of 10 consecutive daily values. Asterisks mark windows whose variance is lower than the 5th percentile of the variance of simulated Poisson samples with the observed average ($p<0.05$), i.e. windows that show statistically significant underdispersion. All countries in the Johns Hopkins dataset (as distributed by Our World in Data) were tested, but only countries with more than 3 asterisks are shown here. Daily numbers of COVID-19 deaths and cases were both analyzed.

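NOTE: Four non-consecutive asterisks may not be suspicious enough on their own: with about 45 windows per country, each tested at the 5% level, roughly two asterisks are expected by chance alone.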

## Russian regions

In [4]:
# Source spreadsheet:
# https://docs.google.com/spreadsheets/d/1nCxvNcuZGNswsf97mliLikmUIsOrOGZtL-VI7xfN-Zw

win = 10      # number of daily values per window
cutoff = 3    # a region is printed only when MORE than this many windows are flagged
nrep = 1000   # Monte Carlo repetitions per window inside poissonsamplevar

# CSV export endpoint of the spreadsheet; the sheet id (gid) is appended below.
sheet_url = ('https://docs.google.com/spreadsheets/d/'
             '1nCxvNcuZGNswsf97mliLikmUIsOrOGZtL-VI7xfN-Zw/export?format=csv&gid=')

# The identical scan is applied to both sheets, so it is written once and
# looped over instead of being copy-pasted per sheet.
for section, (title, gid) in enumerate([('== DEATHS ==', '375550280'),
                                        ('== CASES ==', '1771324359')]):
    if section > 0:
        print('')  # blank line between the two sections

    df = pd.read_csv(sheet_url + gid).values
    print(title)

    for row in range(df.shape[0]):
        # Column 0 holds the region name; the remaining columns are reversed
        # and differenced to obtain per-step increments.
        # NOTE(review): this presumes cumulative counts stored newest-first -- confirm.
        t = np.diff(df[row, 1:][::-1])

        # Only scan series holding more than 10 windows' worth of values.
        if t.size > win*10:
            n_windows = t.size // win  # a trailing partial window is ignored
            flags = np.zeros(n_windows, dtype=bool)
            for i in range(n_windows):
                tt = t[i*win:(i+1)*win]
                mean_count = np.mean(tt)
                # The increments are not filtered here, so a window mean can be
                # negative or NaN; that is not a valid Poisson rate, so such a
                # window is left unflagged instead of being simulated.
                if not mean_count >= 0:
                    continue
                flags[i] = np.var(tt) < poissonsamplevar(mean_count, win, nrep=nrep)
            if flags.sum() > cutoff:
                # Region name truncated/padded to 20 characters, then one
                # symbol per window: '*' flagged, '.' not flagged.
                print(f'{df[row,0][:20]:20}' + ''.join('*' if f else '.' for f in flags))

== DEATHS ==
Алтайский край      .......................*..*.*.*..*.........*
Астраханская область..............**....*..******..*.***********
Белгородская область...........................*..*...********..
Владимирская область........*.**.*................**.*.*........
Волгоградская област................*......*********.*********.*
Вологодская область ........................*........***********
Забайкальский край  .....................**..........***..***.**
Ивановская область  .................................*.*.*....**
Иркутская область   .....................*....*.*.**.***********
Кабардино-Балкарская.................*.*...**.....**..***.......
Кемеровская область .......................****..*.*......*.....
Кировская область   ..............*..**.*******.*.****.***.*..**
Краснодарский край  ............**.....*************.***********
Красноярский край   .............*.*...***..*.....*..***********
Курская область     .....................................**..*.*
Липецкая обл

Ярославская область ........***********....**********.*********
