In [1]:
import pandas as pd
import csv
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
import math
from scipy.stats import chi2
import gzip
from joblib import Parallel, delayed
from tqdm.notebook import trange, tqdm
import glob

In [3]:
# Each 200-bit trial sum has expected mean = 100 and standard deviation = sqrt(200)/2, approximately 7.071.
# The per-egg standard deviations computed further down confirm this for the healthy eggs.

In [4]:
# Theoretical standard deviation of a 200-bit trial sum: sqrt(n * p * (1 - p)) with n = 200, p = 0.5.
# NOTE(review): not referenced anywhere else in the visible notebook.
STD_200_FLIPS = math.sqrt(200)/2
# Value passed as n_jobs to joblib's Parallel when the egg files are loaded.
N_JOBS = 8

In [5]:
def split_into_chunks(data, chunks=8):
    """Cut a sequence into consecutive slices of at most ``chunks`` items.

    Despite its name, ``chunks`` is the size of each slice, not the number of
    slices. The last slice is shorter when ``len(data)`` is not a multiple of
    ``chunks``; an empty sequence yields an empty list.
    """
    pieces = []
    for start in range(0, len(data), chunks):
        stop = start + chunks
        pieces.append(data[start:stop])
    return pieces

def flatten(t):
    """Concatenate the items of every sub-iterable in ``t`` into one flat list.

    Only one level of nesting is removed; the order of the items is preserved.
    """
    flat = []
    for sublist in t:
        flat.extend(sublist)
    return flat

In [8]:
# Create real data
def read_eggs_by_chunk(chunk, gen_synthetic_version=False, seed=None):
    """Load a batch of gzip-compressed egg CSV files into DataFrames.

    Each file is scanned for its header row, i.e. the first row whose first
    field is ``'12'``. The fields of that row from the fourth one onwards become
    the egg column names, and every row after it is parsed as data with the
    columns ``rowtype``, ``timestamp``, ``HRtimestamp`` followed by the eggs.

    Parameters
    ----------
    chunk : iterable of str
        Paths of the gzip-compressed CSV files to read.
    gen_synthetic_version : bool, default False
        When True, return for every file a copy in which each egg value is
        replaced by a Binomial(200, 0.5) draw. Cells that are NaN in the real
        data stay NaN in the synthetic copy.
    seed : optional, default None
        Forwarded to ``np.random.default_rng``. ``None`` keeps the previous
        behaviour (fresh, non-reproducible entropy). When this function is
        dispatched once per chunk, give every chunk a different seed; reusing
        one seed would make all chunks draw the same random stream.

    Returns
    -------
    list of pandas.DataFrame
        One frame per file, in the order of ``chunk``.

    Raises
    ------
    ValueError
        If a file contains no ``'12'`` header row.
    """
    # One generator per call, so successive files continue the same stream.
    rng = np.random.default_rng(seed) if gen_synthetic_version else None

    datas = []
    for filename in chunk:
        # Reset per file: a missing header must never fall back to the names
        # found in the previous file.
        egg_names = None
        header_rows = 0
        with gzip.open(filename, mode='rt') as f:
            for line in csv.reader(f):
                header_rows += 1
                # csv.reader yields [] for blank lines, hence the emptiness check.
                if line and line[0] == '12':
                    egg_names = line[3:]
                    break
        if egg_names is None:
            raise ValueError(f"no egg header row (type '12') found in {filename!r}")

        # The file was just read through gzip.open, so state the compression
        # explicitly instead of relying on the file extension.
        data = pd.read_csv(
            filename,
            skiprows=header_rows,
            names=['rowtype', 'timestamp', 'HRtimestamp'] + egg_names,
            compression='gzip',
        )

        if not gen_synthetic_version:
            datas.append(data)
            continue

        syn_data = data.copy()
        # Remember where the real eggs have no reading before overwriting them.
        mask = data.iloc[:, 3:].isna()
        syn_data.iloc[:, 3:] = rng.binomial(200, 0.5, mask.shape)
        syn_data.iloc[:, 3:] = syn_data.iloc[:, 3:].where(~mask, other=np.nan)

        # The synthetic copy must have exactly as many gaps as the real data.
        assert mask.sum().sum() == syn_data.iloc[:, 3:].isna().sum().sum()

        datas.append(syn_data)

    return datas


In [9]:
# `data` does not exist yet at this point in the notebook: it is first assigned
# further down, once the per-file frames have been combined. The stray
# inspection that used to sit here raised NameError on a fresh kernel, so it
# was removed.

In [6]:
# Collect every 2019 egg summary file, batch the paths eight at a time and let
# joblib read the batches in parallel; each job returns a list of DataFrames.
file_chunks = split_into_chunks(
    glob.glob('eggsummary/2019/*'),
    8,
)
datas = Parallel(n_jobs=N_JOBS)(
    delayed(read_eggs_by_chunk)(batch) for batch in tqdm(file_chunks)
)

  0%|          | 0/46 [00:00<?, ?it/s]

In [7]:
# Parallel returned one list of DataFrames per chunk; merge them into a single flat list.
# NOTE(review): this rebinds `datas`, so running the cell a second time would flatten the
# DataFrames themselves (iterating a DataFrame yields its column labels).
datas = flatten(datas)

In [8]:
# Stack the per-file frames into one DataFrame, keeping each file's own row index.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0.
data = pd.concat(datas)

In [9]:
# Drop the list of per-file frames now that they have been combined into `data`.
del datas

In [10]:
# Looking for rotten eggs
# Collects the column names of eggs whose statistics look faulty.
# NOTE(review): nothing in the visible notebook reads BAD_EGGS; the later
# drop calls use their own hard-coded list of egg names.
BAD_EGGS = []

In [11]:
# Standard deviation of every egg column (the first three columns are
# rowtype, timestamp and HRtimestamp). A healthy egg should sit close to the
# theoretical sqrt(200)/2, about 7.071.
data.iloc[:,3:].std(axis=0)

1       7.073840
37      7.070001
112     7.068353
226     7.072796
228     7.074319
1021    7.070751
1070    7.069762
1092    7.070643
1237    7.075864
2000    7.071978
2028    7.071450
2052         NaN
2080    7.071639
2083    7.069925
2178    7.074951
2221    7.071985
2232    9.259247
2241    7.073217
2250    7.071182
3101    7.072264
3104    7.071726
3106    7.070836
3247    7.068929
4002    7.071169
4234    7.071718
3066    7.071010
3108    7.071270
2049    7.041942
108     7.078792
2220    7.073993
110     7.072190
dtype: float64

In [12]:
# Flag the two eggs that stand out in the table above:
# 2052 has no computable standard deviation (NaN) and 2232 is far too wide (about 9.26).
BAD_EGGS.extend(['2052', '2232'])

In [13]:
# Show the eggs flagged so far.
BAD_EGGS

['2052', '2232']

In [14]:
# Egg-wise Z score normalization
# Centres every egg column on the theoretical mean (100) and divides it by
# that column's own empirical standard deviation.
# NOTE(review): this overwrites `data` in place, so re-running the cell would
# normalize values that are already normalized.
# NOTE(review): STD_200_FLIPS is defined earlier but not used here - confirm
# that scaling by the empirical standard deviation is intended.
# NOTE(review): when the notebook is run top to bottom, the real data written to
# 'data2019.parquet' has passed through this cell while the synthetic data has
# no equivalent step - confirm which of the two is intended.
data.iloc[:,3:] = (data.iloc[:,3:]-100)/data.iloc[:,3:].std(axis=0)

In [15]:
# Inspect the egg columns after normalization.
data.iloc[:,3:]

Unnamed: 0,1,37,112,226,228,1021,1070,1092,1237,2000,...,3106,3247,4002,4234,3066,3108,2049,108,2220,110
0,0.989562,-0.282885,-2.546562,0.282774,-2.544414,0.000000,-0.424342,2.121448,0.282651,-0.141403,...,-0.707130,0.848785,1.979871,-0.707042,,,,,,
1,0.706830,0.141443,-1.131805,1.413868,-1.272207,0.282855,0.282895,-0.565719,-0.141325,-0.707016,...,0.141426,-0.141464,0.424258,0.848450,,,,,,
2,0.989562,0.707214,0.990330,-1.131094,-0.141356,-0.282855,-0.848685,-0.282860,-1.978557,-1.414031,...,1.697112,1.414642,-0.141419,-1.131267,,,,,,
3,0.282732,0.000000,-0.707378,0.706934,-0.989494,-1.414277,-1.414475,0.565719,0.282651,1.272628,...,0.141426,-1.556106,1.979871,-1.272675,,,,,,
4,0.000000,-2.263083,-0.282951,-0.282774,0.706782,0.282855,0.424342,0.990009,-1.978557,0.707016,...,-0.424278,-0.565857,-0.989935,-1.979717,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86395,-0.989562,-0.565771,-1.697708,-1.272481,,-0.848566,0.282895,,0.565302,0.000000,...,-0.424278,-0.565857,,-0.565633,-0.282845,,0.426019,,0.282726,1.272590
86396,0.989562,0.141443,0.282951,-0.141387,,-1.555705,0.707237,,0.989278,0.424209,...,0.141426,0.424392,,-0.565633,0.424268,,-1.136050,,-0.424089,0.848394
86397,0.000000,-0.282885,-1.131805,-0.565547,,-0.424283,-1.414475,,0.565302,0.989822,...,0.282852,1.414642,,0.424225,0.565690,,-0.284013,,0.565451,0.424197
86398,-0.706830,0.000000,0.424427,0.141387,,1.272849,-0.707237,,1.978557,-0.989822,...,-0.141426,0.000000,,-1.414084,-2.969873,,1.420063,,0.989540,-0.424197


In [8]:

# Put the rows in timestamp order.
data.sort_values(by='timestamp', inplace=True)

# This is true only for test 2019 year - we know those are broken
data.drop(
    columns=['2049', '2232', '228', '2052', '3104', '4002'],
    inplace=True,
)

In [9]:
# Write the real-data frame to a Parquet file in the working directory.
data.to_parquet('data2019.parquet')

In [10]:
# Drop the real-data frame before the synthetic data set is built under the same name.
del data

In [11]:
# Same file batches as for the real data, but this time each job returns the
# synthetic counterpart of every file it reads.
file_chunks = split_into_chunks(
    glob.glob('eggsummary/2019/*'),
    8,
)
datas = Parallel(n_jobs=N_JOBS)(
    delayed(read_eggs_by_chunk)(batch, True) for batch in tqdm(file_chunks)
)

  0%|          | 0/46 [00:00<?, ?it/s]

In [12]:
# Merge the per-chunk lists of synthetic DataFrames into a single flat list.
# NOTE(review): this rebinds `datas`, so running the cell a second time would flatten the
# DataFrames themselves (iterating a DataFrame yields its column labels).
datas = flatten(datas)

In [13]:
# Stack the per-file synthetic frames into one DataFrame, keeping each file's
# own row index. pd.concat replaces DataFrame.append, which was deprecated in
# pandas 1.4 and removed in pandas 2.0.
data = pd.concat(datas)
# Put the rows in timestamp order.
data.sort_values('timestamp',inplace=True)

# This is true only for test 2019 year - we know those are broken
data.drop(['2049','2232','228','2052','3104','4002'],axis=1,inplace=True)

In [14]:
# Write the synthetic frame to a Parquet file in the working directory.
data.to_parquet('syn_data2019.parquet')

In [16]:
# Drop the synthetic frame now that it has been written out.
# NOTE(review): the list `datas` from the synthetic load is still bound at this point.
del data

## data created successfully