# Analyzing Coin Flips

> Data and some of the main ideas originally from [Can you Fake Coin Tosses?](https://faculty.math.illinois.edu/~hildebr/fakerandomness/)

It turns out that humans are rather bad random number generators. If you ask a person to simulate a random sequence (e.g., flipping a coin a number of time), they will almost always introduce patterns in the sequence that you would not see if it were genuinely random.  A cool example of this in action is the [mind reader app](http://mindreaderpro.appspot.com/) put together by Yoav Freund's group at UC San Diego.


In this notebook we'll be using conditional probabilities as a way to understand the difference between real and fake coin tosses.

In [1]:
import pandas as pd

df_fake = pd.read_csv('coin_flips.csv')
df_fake

Unnamed: 0,student,flips
0,math199chp2017fall1,0000011001000011101010011101111010001110110100...
1,math199chp2017fall2,0010101100010111100101011000101100100011011001...
2,math199chp2017fall3,0000101010001010011111001011110010110000000101...
3,math199chp2017fall4,1101001110101001110101100001101101000100111010...
4,math199chp2017fall5,0010100111011011010110000110010011000101100110...
5,math199chp2017fall6,0010110001011110101011001101010001101011110010...
6,math199chp2017fall7,0001011010010111100101001001110110100100001101...
7,math199chp2017fall8,1011010011100101101000011101001110100110100110...
8,math199chp2017fall9,0011101100101001001011011000011010110111011101...
9,math199chp2017fall10,0010100011110101100100111001011001101000110101...


In [2]:
df_fake.iloc[0]['flips']

'0000011001000011101010011101111010001110110100111101000110010111100011111110011101100001011100010110001101000001010110100010100101000101101111101100101111010111010010111110010111001010101001011110010010000101'

In [2]:
from random import randint
def sample_random_flips(x):
    return ''.join(str(randint(0,1)) for _ in range(len(x)))

df_real = df_fake.copy()
df_real['flips'] = df_fake['flips'].map(sample_random_flips)

In [3]:
#!pip install regex
import regex as re

def count_overlapping(text, search_for):
    return len(re.findall(search_for, text, overlapped=True))

def make_seq_column(df, seq):
    df['seq_' + seq] = df['flips'].map(lambda x: count_overlapping(x, seq)/(len(x) - len(seq) + 1))

In [4]:
import itertools

def populate_length_n_seqs(n):
    for s in itertools.product(*([['0', '1']]*n)):
        make_seq_column(df_real, ''.join(s))
        make_seq_column(df_fake, ''.join(s))

for n in range(10):
    populate_length_n_seqs(n)

In [48]:
df_fake

Unnamed: 0,student,flips,seq_,seq_0,seq_1,seq_00,seq_01,seq_10,seq_11,seq_000,...,seq_111110110,seq_111110111,seq_111111000,seq_111111001,seq_111111010,seq_111111011,seq_111111100,seq_111111101,seq_111111110,seq_111111111
0,math199chp2017fall1,0000011001000011101010011101111010001110110100...,1.0,0.480769,0.519231,0.21256,0.270531,0.2657,0.251208,0.092233,...,0.005,0.0,0.0,0.005,0.0,0.0,0.005,0.0,0.0,0.0
1,math199chp2017fall2,0010101100010111100101011000101100100011011001...,1.0,0.533113,0.466887,0.255814,0.27907,0.275748,0.189369,0.086667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,math199chp2017fall3,0000101010001010011111001011110010110000000101...,1.0,0.529412,0.470588,0.289941,0.236686,0.236686,0.236686,0.142857,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,math199chp2017fall4,1101001110101001110101100001101101000100111010...,1.0,0.5,0.5,0.195767,0.306878,0.306878,0.190476,0.069149,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,math199chp2017fall5,0010100111011011010110000110010011000101100110...,1.0,0.477987,0.522013,0.158228,0.322785,0.316456,0.202532,0.031847,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,math199chp2017fall6,0010110001011110101011001101010001101011110010...,1.0,0.49345,0.50655,0.153509,0.337719,0.337719,0.171053,0.044053,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,math199chp2017fall7,0001011010010111100101001001110110100100001101...,1.0,0.507692,0.492308,0.196911,0.312741,0.30888,0.181467,0.069767,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,math199chp2017fall8,1011010011100101101000011101001110100110100110...,1.0,0.489796,0.510204,0.189744,0.302564,0.302564,0.205128,0.061856,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,math199chp2017fall9,0011101100101001001011011000011010110111011101...,1.0,0.466019,0.533981,0.165854,0.302439,0.297561,0.234146,0.04902,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,math199chp2017fall10,0010100011110101100100111001011001101000110101...,1.0,0.505338,0.494662,0.2,0.307143,0.303571,0.189286,0.043011,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [90]:
from functools import lru_cache

# Note: we are implicitly passing in df as a way to get around the fact that it is not hashable
@lru_cache(maxsize=10**5)
def get_conditional_probs(conditioning_seq):
    followed_by_1 = df['seq_' + conditioning_seq + '1'] * (df['flips'].map(len)-2)
    followed_by_0 = df['seq_' + conditioning_seq + '0'] * (df['flips'].map(len)-2)
    p_1_given_conditioning_seq = followed_by_1.sum()
    p_0_given_conditioning_seq = followed_by_0.sum()
    return p_0_given_conditioning_seq / (p_1_given_conditioning_seq + p_0_given_conditioning_seq), \
            p_1_given_conditioning_seq / (p_1_given_conditioning_seq + p_0_given_conditioning_seq)

In [91]:
df = df_fake
get_conditional_probs('010')

(0.33636133110210625, 0.6636386688978937)

In [123]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import time

def get_llrs(flips):
    llrs = []
    for s in flips:
        context = []
        llr = 0
        for flip in s:
            context_to_use = ''.join(context[-5:])
            p_0, p_1 = get_conditional_probs(context_to_use)
            if flip == '0':
                llr += np.log(p_0) - np.log(0.5)
            else:
                llr += np.log(p_1) - np.log(0.5)
            context.append(flip)
        llrs.append(llr)
    return llrs

all_roc_scores = []
for iter in range(20):
    df_fake_train, df_fake_test = train_test_split(df_fake)
    get_conditional_probs.cache_clear()
    df = df_fake_train
    llrs_positive = np.array(get_llrs(df_fake_test['flips']))
    llrs_negative = np.array(get_llrs(df_real['flips']))
    all_llrs = np.concatenate((llrs_negative, llrs_positive))
    all_targets = np.concatenate((np.zeros(llrs_negative.shape), np.ones(llrs_positive.shape)))
    all_roc_scores.append(roc_auc_score(all_targets, all_llrs))

print(np.array(all_roc_scores).mean())

0.9160753880266077
