# Family trees
Natalia Vélez, December 2019--January 2020

Next to-do: Choose subset of the data to run the analysis?

In [55]:
%matplotlib inline

import os, re, glob, random
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns
from os.path import join as opj
from tqdm import tqdm_notebook

sns.set_context('paper')

Helper functions:

In [56]:
gsearch = lambda *args: glob.glob(opj(*args))
str_extract = lambda pattern, s: re.search(pattern, s).group(0)

## Load data

Find data files:

In [57]:
data_dir = '../data/'
data_files = gsearch(data_dir, 'publicLifeLogData', '**', '*y.txt')
print('%i files found' % len(data_files))

9046 files found


Load all files:

In [58]:
data_list = []
empty_files = []
for f in tqdm_notebook(data_files):
    try:
        tmp_d = pd.read_csv(f, sep =' ', header=None)
        data_list.append(tmp_d)
    except:
        empty_files.append(f)

HBox(children=(IntProgress(value=0, max=9046), HTML(value='')))

In [44]:
%time raw_data = pd.concat(data_list)
raw_data.head()

CPU times: user 6.95 s, sys: 6.75 s, total: 13.7 s
Wall time: 16.6 s


Unnamed: 0,0,1,2,3,4,5,6,7,8
0,B,1547745009,18361.0,a9cce9560e9ec414fc5f47defd2fec48a9453828,F,"(-4198,1951)",noParent,pop=1,chain=1
1,D,1547745377,18361.0,a9cce9560e9ec414fc5f47defd2fec48a9453828,age=20.12,F,"(-4236,1866)",hunger,pop=0
0,B,1533340847,2206.0,3e5fc11e9399744ba47ee55df3f03e612cad475b,F,"(5355,-12870)",noParent,pop=1,chain=1
1,D,1533343607,2206.0,3e5fc11e9399744ba47ee55df3f03e612cad475b,age=60.00,F,"(5494,-12870)",oldAge,pop=0
2,B,1533343736,2207.0,3e5fc11e9399744ba47ee55df3f03e612cad475b,F,"(5487,-12872)",noParent,pop=1,chain=1


In [45]:
print('Found %i birth/death events' % raw_data.shape[0])

Found 13722127 birth/death events


Deaths:

In [46]:
death_raw = raw_data[raw_data.iloc[:,0] == 'D']
death_raw.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
1,D,1547745377,18361.0,a9cce9560e9ec414fc5f47defd2fec48a9453828,age=20.12,F,"(-4236,1866)",hunger,pop=0
1,D,1533343607,2206.0,3e5fc11e9399744ba47ee55df3f03e612cad475b,age=60.00,F,"(5494,-12870)",oldAge,pop=0
3,D,1533346355,2207.0,3e5fc11e9399744ba47ee55df3f03e612cad475b,age=57.65,F,"(5489,-12873)",hunger,pop=0
5,D,1533349254,2208.0,3e5fc11e9399744ba47ee55df3f03e612cad475b,age=60.00,F,"(5237,-12105)",oldAge,pop=0
8,D,1533392786,2209.0,3e5fc11e9399744ba47ee55df3f03e612cad475b,age=41.30,F,"(5243,-12108)",disconnect,pop=1


Births:

In [47]:
birth_raw = raw_data[raw_data.iloc[:,0] == 'B']
birth_raw.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,B,1547745009,18361.0,a9cce9560e9ec414fc5f47defd2fec48a9453828,F,"(-4198,1951)",noParent,pop=1,chain=1
0,B,1533340847,2206.0,3e5fc11e9399744ba47ee55df3f03e612cad475b,F,"(5355,-12870)",noParent,pop=1,chain=1
2,B,1533343736,2207.0,3e5fc11e9399744ba47ee55df3f03e612cad475b,F,"(5487,-12872)",noParent,pop=1,chain=1
4,B,1533346494,2208.0,3e5fc11e9399744ba47ee55df3f03e612cad475b,F,"(5163,-11877)",noParent,pop=1,chain=1
6,B,1533391148,2209.0,3e5fc11e9399744ba47ee55df3f03e612cad475b,F,"(5240,-12102)",noParent,pop=1,chain=1


Clean up data:

In [48]:
shared_header = ['event', 'timestamp', 'playerID', 'hash', 'age', 'sex',
                 'location', 'parent', 'cause_of_death', 'pop', 'chain']

In [60]:
death_data = death_raw.copy()
# Insert missing columns
death_data.insert(7, 'parent', np.nan)
death_data.insert(10, 'chain', np.nan)
death_data.columns = shared_header

for i, row in tqdm_notebook(death_data.iterrows(), total=death_data.shape[0]):
    age = str_extract('(?<=age=)\d+\.\d+', row['age'])
    age = float(age)
    
    pop = str_extract('(?<=pop=)[0-9]+', row['pop'])
    pop = int(pop)
    
    death_data.at[i, 'age'] = age
    death_data.at[i, 'pop'] = pop

death_data.head()

HBox(children=(IntProgress(value=0, max=6855695), HTML(value='')))

KeyboardInterrupt: 

In [None]:
birth_data = birth_raw.copy()
# Insert missing columns
birth_data.insert(5, 'age', np.nan)
birth_data.insert(8, 'cause_of_death', np.nan)
birth_data.columns = shared_header

for i, row in tqdm_notebook(birth_data.iterrows(), total=birth_data.shape[0]):
    pop = str_extract('(?<=pop=)[0-9]+', row['pop'])
    pop = int(pop)
    
    chain = str_extract('(?<=chain=)[0-9]+', row['chain'])
    chain = int(chain)
    
    birth_data.at[i, 'pop'] = pop
    birth_data.at[i, 'chain'] = chain

birth_data.head()

### Deaths

Age of death:

In [None]:
print('Plotting %i deaths' % len(death_data))
sns.distplot(death_data['age'], bins=5)

In [None]:
?sns.distplot

Causes of death: