# File formats

There are many different file formats in widespread use within data science. In this lecture, we will review common file formats and their trade-offs, and how to choose an appropriate file format. We will also review the mechanics of reading/parsing different file formats, and how to write to them.

In [229]:
import datetime
import decimal
import xml.etree.cElementTree as ET

import numpy as np
import pandas as pd
import h5py
from faker import Faker
import simplejson
from json2xml import json2xml, readfromjson, readfromstring
import fastavro 
from fastavro import json_writer, json_reader, writer, reader, parse_schema
from rec_avro import to_rec_avro_destructive, from_rec_avro_destructive, rec_avro_schema
import fastparquet

In [31]:
import warnings
warnings.simplefilter('ignore', pd.errors.PerformanceWarning)

## Reading data from different file formats

### CSV

#### When the CSV file can be read as is

In [12]:
df = pd.read_csv('data/profiles.csv')

In [32]:
df.head(1)

Unnamed: 0,job,company,ssn,residence,current_location,blood_group,website,username,name,sex,address,mail,birthdate
0,Research scientist (physical sciences),"آل عواض, آل سلطان and آل مقطة",113-90-3545,"456 سرحان Inlet Apt. 218\nEast سجى, WV 44739","(Decimal('24.333449'), Decimal('-31.313933'))",AB+,"['https://lhkyr-al.com/', 'http://al.com/']",mhnhrth,الأستاذة أسيل حجار,F,1063 داوود Stravenue Suite 188\nSouth ريتالsta...,alqyl@yahoo.com,1965-05-22


In [31]:
df.loc[0]

job                            Research scientist (physical sciences)
company                                 آل عواض, آل سلطان and آل مقطة
ssn                                                       113-90-3545
residence                456 سرحان Inlet Apt. 218\nEast سجى, WV 44739
current_location        (Decimal('24.333449'), Decimal('-31.313933'))
blood_group                                                       AB+
website                   ['https://lhkyr-al.com/', 'http://al.com/']
username                                                      mhnhrth
name                                               الأستاذة أسيل حجار
sex                                                                 F
address             1063 داوود Stravenue Suite 188\nSouth ريتالsta...
mail                                                  alqyl@yahoo.com
birthdate                                                  1965-05-22
Name: 0, dtype: object

#### When scrubbing of rows may be needed

In [2]:
import csv

In [16]:
rows = []
with open('data/profiles.csv') as f:
    reader = csv.reader(f)
    for row in reader:
        rows.append(row)

In [27]:
list(map(len, rows))

[13, 13, 13, 13]

In [28]:
rows[:2]

[['job',
  'company',
  'ssn',
  'residence',
  'current_location',
  'blood_group',
  'website',
  'username',
  'name',
  'sex',
  'address',
  'mail',
  'birthdate'],
 ['Research scientist (physical sciences)',
  'آل عواض, آل سلطان and آل مقطة',
  '113-90-3545',
  '456 سرحان Inlet Apt. 218\nEast سجى, WV 44739',
  "(Decimal('24.333449'), Decimal('-31.313933'))",
  'AB+',
  "['https://lhkyr-al.com/', 'http://al.com/']",
  'mhnhrth',
  'الأستاذة أسيل حجار',
  'F',
  '1063 داوود Stravenue Suite 188\nSouth ريتالstad, NE 15082',
  'alqyl@yahoo.com',
  '1965-05-22']]

In [45]:
df = pd.DataFrame(rows[1:], columns=rows[0])

In [46]:
df.head(1)

Unnamed: 0,job,company,ssn,residence,current_location,blood_group,website,username,name,sex,address,mail,birthdate
0,Research scientist (physical sciences),"آل عواض, آل سلطان and آل مقطة",113-90-3545,"456 سرحان Inlet Apt. 218\nEast سجى, WV 44739","(Decimal('24.333449'), Decimal('-31.313933'))",AB+,"['https://lhkyr-al.com/', 'http://al.com/']",mhnhrth,الأستاذة أسيل حجار,F,1063 داوود Stravenue Suite 188\nSouth ريتالsta...,alqyl@yahoo.com,1965-05-22


### Tab-delimited

Same as CSV, just change separator.

#### Direct reading into DataFrame

In [29]:
df = pd.read_csv('data/profiles.txt', sep='\t')

In [30]:
df.head()

Unnamed: 0,job,company,ssn,residence,current_location,blood_group,website,username,name,sex,address,mail,birthdate
0,Research scientist (physical sciences),"آل عواض, آل سلطان and آل مقطة",113-90-3545,"456 سرحان Inlet Apt. 218\nEast سجى, WV 44739","(Decimal('24.333449'), Decimal('-31.313933'))",AB+,"['https://lhkyr-al.com/', 'http://al.com/']",mhnhrth,الأستاذة أسيل حجار,F,1063 داوود Stravenue Suite 188\nSouth ريتالsta...,alqyl@yahoo.com,1965-05-22
1,"Surveyor, building control","Mitchell, Carroll and Cooper",055-43-6596,"21433 Lucero Way Apt. 512\nErinberg, FL 46113","(Decimal('45.5171425'), Decimal('-68.251421'))",A-,['https://www.levine.com/'],lboyd,Kelly Miller,F,"9516 Thomas Falls\nWest Randall, OR 31568",xrichardson@yahoo.com,1903-11-22
2,可靠度工程师,图龙信息传媒有限公司,350212197208264282,北京市柳县永川昆明路D座 447897,"(Decimal('-63.7154915'), Decimal('119.882111'))",AB+,"['https://www.uj.com/', 'http://www.lei.cn/', ...",yinjun,王雷,M,甘肃省宁德市长寿崔路S座 747320,weiqin@gmail.com,2014-12-29


#### Row by row processing

In [24]:
rows = []
with open('data/profiles.txt') as f:
    reader = csv.reader(f, delimiter='\t')
    for row in reader:
        rows.append(row)

In [26]:
list(map(len, rows))

[13, 13, 13, 13]

### JSON

JSON is the most popular format for sharing information over the web. Most data retrieval APIs will return JSON.m

In [55]:
import json

In [66]:
with open('data/profiles.json') as f:
    profiles = json.load(f)

In [67]:
len(profiles)

3

In [68]:
profiles[0]

{'job': 'Research scientist (physical sciences)',
 'company': 'آل عواض, آل سلطان and آل مقطة',
 'ssn': '113-90-3545',
 'residence': '456 سرحان Inlet Apt. 218\nEast سجى, WV 44739',
 'current_location': [24.333449, -31.313933],
 'blood_group': 'AB+',
 'website': ['https://lhkyr-al.com/', 'http://al.com/'],
 'username': 'mhnhrth',
 'name': 'الأستاذة أسيل حجار',
 'sex': 'F',
 'address': '1063 داوود Stravenue Suite 188\nSouth ريتالstad, NE 15082',
 'mail': 'alqyl@yahoo.com',
 'birthdate': None}

#### Using a REST API to retrieve JSON data

In [54]:
%%bash

curl -o data/pokemon.json https://pokeapi.co/api/v2/pokemon/23

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100  185k    0  185k    0     0  1428k      0 --:--:-- --:--:-- --:--:-- 1430k


In [56]:
with open('data/pokemon.json') as f:
    pokemon = json.load(f)

In [58]:
pokemon.keys()

dict_keys(['abilities', 'base_experience', 'forms', 'game_indices', 'height', 'held_items', 'id', 'is_default', 'location_area_encounters', 'moves', 'name', 'order', 'species', 'sprites', 'stats', 'types', 'weight'])

In [59]:
pokemon['name']

'ekans'

In [65]:
pokemon['abilities']

[{'ability': {'name': 'unnerve',
   'url': 'https://pokeapi.co/api/v2/ability/127/'},
  'is_hidden': True,
  'slot': 3},
 {'ability': {'name': 'shed-skin',
   'url': 'https://pokeapi.co/api/v2/ability/61/'},
  'is_hidden': False,
  'slot': 2},
 {'ability': {'name': 'intimidate',
   'url': 'https://pokeapi.co/api/v2/ability/22/'},
  'is_hidden': False,
  'slot': 1}]

### XML

In [84]:
tree = ET.parse('data/profiles.xml')
root = tree.getroot()

In [86]:
root.tag

'duke'

In [90]:
ET.dump(root)

<duke>
    <employee>
        <address>1063 داوود Stravenue Suite 188
        South ريتالstad, NE 15082</address>
        <birthdate>None</birthdate>
        <blood_group>AB+</blood_group>
        <company>آل عواض, آل سلطان and آل مقطة</company>
        <current_location>24.333449</current_location>
        <current_location>-31.313933</current_location>
        <job>Research scientist (physical sciences)</job>
        <mail>alqyl@yahoo.com</mail>
        <name>الأستاذة أسيل حجار</name>
        <residence>456 سرحان Inlet Apt. 218
        East سجى, WV 44739</residence>
        <sex>F</sex>
        <ssn>113-90-3545</ssn>
        <username>mhnhrth</username>
        <website>https://lhkyr-al.com/</website>
        <website>http://al.com/</website>
    </employee>
    <employee>
        <address>9516 Thomas Falls
        West Randall, OR 31568</address>
        <birthdate>None</birthdate>
        <blood_group>A-</blood_group>
        <company>Mitchell, Carroll and Cooper</company>
        

In [108]:
for employee in root:
    for elem in employee:
        print(f'{elem.tag:>20}: {elem.text}')
    break

             address: 1063 داوود Stravenue Suite 188
        South ريتالstad, NE 15082
           birthdate: None
         blood_group: AB+
             company: آل عواض, آل سلطان and آل مقطة
    current_location: 24.333449
    current_location: -31.313933
                 job: Research scientist (physical sciences)
                mail: alqyl@yahoo.com
                name: الأستاذة أسيل حجار
           residence: 456 سرحان Inlet Apt. 218
        East سجى, WV 44739
                 sex: F
                 ssn: 113-90-3545
            username: mhnhrth
             website: https://lhkyr-al.com/
             website: http://al.com/


In [96]:
root.findall('.')

[<Element 'duke' at 0x1270c99f8>]

In [109]:
root.findall('./')

[<Element 'employee' at 0x1270c9e08>,
 <Element 'employee' at 0x1270c9408>,
 <Element 'employee' at 0x126e309f8>]

In [118]:
root.findall('.//')[:5]

[<Element 'employee' at 0x1270c9e08>,
 <Element 'address' at 0x1270c9a98>,
 <Element 'birthdate' at 0x1270c99a8>,
 <Element 'blood_group' at 0x1270c9958>,
 <Element 'company' at 0x1270c9908>]

In [119]:
for item in root.findall('.//company'):
    print(item.text)

آل عواض, آل سلطان and آل مقطة
Mitchell, Carroll and Cooper
图龙信息传媒有限公司


### HDF5

Like XML and JSON, HDF5 files store hierarchical data that can be annotated. The strong points of HDF5 are its ability to store large numerical data sets so that selective loading of parts of the data into memory for analysis is possible. HDF5 are also easy to use for people familiar with `numpy` and widely used in the scientific community.

There are two popular libraries for working with HDF5. Pandas uses `pytables`, and the stored schema can be quite unintuitive, but that does not matter since we usually just use Pandas to read it back in.

#### Pandas and `tables`

In [148]:
import tables

In [149]:
f = tables.open_file('data/profiles.h5')

In [150]:
f

File(filename=data/profiles.h5, title='', mode='r', root_uep='/', filters=Filters(complevel=0, shuffle=False, bitshuffle=False, fletcher32=False, least_significant_digit=None))
/ (RootGroup) ''
/duke (Group) ''
/duke/axis0 (Array(14,)) ''
  atom := StringAtom(itemsize=11, shape=(), dflt=b'')
  maindim := 0
  flavor := 'numpy'
  byteorder := 'irrelevant'
  chunkshape := None
/duke/axis1 (Array(3,)) ''
  atom := Int64Atom(shape=(), dflt=0)
  maindim := 0
  flavor := 'numpy'
  byteorder := 'little'
  chunkshape := None
/duke/block0_items (Array(1,)) ''
  atom := StringAtom(itemsize=9, shape=(), dflt=b'')
  maindim := 0
  flavor := 'numpy'
  byteorder := 'irrelevant'
  chunkshape := None
/duke/block0_values (Array(3, 1)) ''
  atom := Int64Atom(shape=(), dflt=0)
  maindim := 0
  flavor := 'numpy'
  byteorder := 'little'
  chunkshape := None
/duke/block1_items (Array(2,)) ''
  atom := StringAtom(itemsize=10, shape=(), dflt=b'')
  maindim := 0
  flavor := 'numpy'
  byteorder := 'irrelevant'
 

In [162]:
f.root.duke.axis0[:]

array([b'location_x', b'location_y', b'job', b'company', b'ssn',
       b'residence', b'blood_group', b'website', b'username', b'name',
       b'sex', b'address', b'mail', b'birthdate'], dtype='|S11')

In [163]:
f.root.duke.axis1[:]

array([0, 1, 2])

In [169]:
f.root.duke.block1_items[:], f.root.duke.block1_values[:]

(array([b'location_x', b'location_y'], dtype='|S10'),
 array([[ 24.333449 , -31.313933 ],
        [ 45.5171425, -68.251421 ],
        [-63.7154915, 119.882111 ]]))

In [170]:
f.close()

#### Reading into `pandas`

In [171]:
df = pd.read_hdf('data/profiles.h5')

In [172]:
df

Unnamed: 0,location_x,location_y,job,company,ssn,residence,blood_group,website,username,name,sex,address,mail,birthdate
0,24.333449,-31.313933,Research scientist (physical sciences),"آل عواض, آل سلطان and آل مقطة",113-90-3545,"456 سرحان Inlet Apt. 218\nEast سجى, WV 44739",AB+,"[https://lhkyr-al.com/, http://al.com/]",mhnhrth,الأستاذة أسيل حجار,F,1063 داوود Stravenue Suite 188\nSouth ريتالsta...,alqyl@yahoo.com,1965-05-22
1,45.517142,-68.251421,"Surveyor, building control","Mitchell, Carroll and Cooper",055-43-6596,"21433 Lucero Way Apt. 512\nErinberg, FL 46113",A-,[https://www.levine.com/],lboyd,Kelly Miller,F,"9516 Thomas Falls\nWest Randall, OR 31568",xrichardson@yahoo.com,1903-11-22
2,-63.715491,119.882111,可靠度工程师,图龙信息传媒有限公司,350212197208264282,北京市柳县永川昆明路D座 447897,AB+,"[https://www.uj.com/, http://www.lei.cn/, http...",yinjun,王雷,M,甘肃省宁德市长寿崔路S座 747320,weiqin@gmail.com,2014-12-29


#### Using `h5py`

For actually working directly with HDF5, I find `h5py` more intuitive.

In [197]:
import os

In [215]:
filename = 'data/simulations.h5'
if os.path.exists(filename):
    os.remove(filename)
f = h5py.File(filename)

In [216]:
import pendulum

In [217]:
start = pendulum.now()
stop = start.add(days=3)
for day in pendulum.period(start, stop):
    g = f.create_group(day.format('ddd'))
    g.attrs['date'] = day.format('LLL')
    g.attrs['analyst'] = 'Mario'
    for expt in range(3):
        data = np.random.poisson(size=(100, 100))
        ds = g.create_dataset(f'expt-{expt:02d}', data=data)

In [218]:
f.close()

In [219]:
f = h5py.File(filename, 'r')

In [220]:
list(f.keys())

['Fri', 'Mon', 'Sat', 'Sun']

In [221]:
list(f['Sat'].attrs.keys())

['date', 'analyst']

In [222]:
f['Sat'].attrs['analyst']

'Mario'

In [223]:
f['Sat'].attrs['date']

'August 31, 2019 7:29 PM'

In [224]:
list(f['Sat'].keys())

['expt-00', 'expt-01', 'expt-02']

In [225]:
f['Sat']['expt-01'][5:10, 5:10]

array([[0, 1, 2, 1, 1],
       [0, 1, 2, 2, 2],
       [1, 0, 1, 1, 0],
       [2, 3, 3, 2, 5],
       [0, 2, 1, 1, 2]])

In [227]:
f['Sat']['expt-01'][5:10, 5:10].sum(axis=0)

array([ 3,  7,  9,  7, 10])

## Avro

In [235]:
%%bash --out s
fastavro --schema data/profiles.avro

In [236]:
schema = eval(s.replace('true', 'True'))

In [238]:
schema

{'__rec_avro_schema__': True,
 'type': 'record',
 'name': 'rec_avro.rec_object',
 'fields': [{'name': '_',
   'type': [{'type': 'map',
     'values': ['null',
      'boolean',
      'int',
      'long',
      'float',
      'double',
      'string',
      'bytes',
      'rec_avro.rec_object']},
    {'type': 'array',
     'items': ['null',
      'boolean',
      'int',
      'long',
      'float',
      'double',
      'string',
      'bytes',
      'rec_avro.rec_object']}]}]}

In [240]:
with open('data/profiles.avro', 'rb') as f:
    avro_reader = reader(f, reader_schema=schema)
    for record in avro_reader:
        print(record)

{'_': {'job': 'Research scientist (physical sciences)', 'company': 'آل عواض, آل سلطان and آل مقطة', 'ssn': '113-90-3545', 'residence': '456 سرحان Inlet Apt. 218\nEast سجى, WV 44739', 'current_location': {'_': [24.33344841003418, -31.313932418823242]}, 'blood_group': 'AB+', 'website': {'_': ['https://lhkyr-al.com/', 'http://al.com/']}, 'username': 'mhnhrth', 'name': 'الأستاذة أسيل حجار', 'sex': 'F', 'address': '1063 داوود Stravenue Suite 188\nSouth ريتالstad, NE 15082', 'mail': 'alqyl@yahoo.com', 'birthdate': None}}
{'_': {'job': 'Surveyor, building control', 'company': 'Mitchell, Carroll and Cooper', 'ssn': '055-43-6596', 'residence': '21433 Lucero Way Apt. 512\nErinberg, FL 46113', 'current_location': {'_': [45.51714324951172, -68.25141906738281]}, 'blood_group': 'A-', 'website': {'_': ['https://www.levine.com/']}, 'username': 'lboyd', 'name': 'Kelly Miller', 'sex': 'F', 'address': '9516 Thomas Falls\nWest Randall, OR 31568', 'mail': 'xrichardson@yahoo.com', 'birthdate': None}}
{'_': 

#### Avro to JSON

In [242]:
with open('data/profiles.avro', 'rb') as f:
    avro_reader = reader(f, reader_schema=schema)
    for record in avro_reader:
        print(from_rec_avro_destructive(record))

{'job': 'Research scientist (physical sciences)', 'company': 'آل عواض, آل سلطان and آل مقطة', 'ssn': '113-90-3545', 'residence': '456 سرحان Inlet Apt. 218\nEast سجى, WV 44739', 'current_location': [24.33344841003418, -31.313932418823242], 'blood_group': 'AB+', 'website': ['https://lhkyr-al.com/', 'http://al.com/'], 'username': 'mhnhrth', 'name': 'الأستاذة أسيل حجار', 'sex': 'F', 'address': '1063 داوود Stravenue Suite 188\nSouth ريتالstad, NE 15082', 'mail': 'alqyl@yahoo.com', 'birthdate': None}
{'job': 'Surveyor, building control', 'company': 'Mitchell, Carroll and Cooper', 'ssn': '055-43-6596', 'residence': '21433 Lucero Way Apt. 512\nErinberg, FL 46113', 'current_location': [45.51714324951172, -68.25141906738281], 'blood_group': 'A-', 'website': ['https://www.levine.com/'], 'username': 'lboyd', 'name': 'Kelly Miller', 'sex': 'F', 'address': '9516 Thomas Falls\nWest Randall, OR 31568', 'mail': 'xrichardson@yahoo.com', 'birthdate': None}
{'job': '可靠度工程师', 'company': '图龙信息传媒有限公司', 'ssn'

### Parquet

In [246]:
parq = fastparquet.ParquetFile('data/profiles.parq')

In [249]:
parq.columns

['location_x',
 'location_y',
 'job',
 'company',
 'ssn',
 'residence',
 'blood_group',
 'website',
 'username',
 'name',
 'sex',
 'address',
 'mail',
 'birthdate']

In [250]:
df = parq.to_pandas()

In [251]:
df.head(1)

Unnamed: 0,location_x,location_y,job,company,ssn,residence,blood_group,website,username,name,sex,address,mail,birthdate
0,24.333449,-31.313933,Research scientist (physical sciences),"آل عواض, آل سلطان and آل مقطة",113-90-3545,"456 سرحان Inlet Apt. 218\nEast سجى, WV 44739",AB+,"[https://lhkyr-al.com/, http://al.com/]",mhnhrth,الأستاذة أسيل حجار,F,1063 داوود Stravenue Suite 188\nSouth ريتالsta...,alqyl@yahoo.com,1965-05-22


In [253]:
df = pd.read_parquet('data/profiles.parq')

In [254]:
df.head(1)

Unnamed: 0,location_x,location_y,job,company,ssn,residence,blood_group,website,username,name,sex,address,mail,birthdate
0,24.333449,-31.313933,Research scientist (physical sciences),"آل عواض, آل سلطان and آل مقطة",113-90-3545,"456 سرحان Inlet Apt. 218\nEast سجى, WV 44739",AB+,"[https://lhkyr-al.com/, http://al.com/]",mhnhrth,الأستاذة أسيل حجار,F,1063 داوود Stravenue Suite 188\nSouth ريتالsta...,alqyl@yahoo.com,1965-05-22


## Writing data to different file formats

### Creating fake profiles

In [4]:
fakes = [
    Faker('zh_CN'), 
    Faker('ar_SA'), 
    Faker('en_US'), 
]

In [5]:
np.random.seed(1)

In [6]:
n = 3
p = [0.3, 0.2, 0.5]
locales = np.random.choice(len(fakes), size=n, p=p)

In [7]:
profiles = [fakes[locale].profile() for locale in locales]

In [8]:
profiles

[{'job': 'Research scientist (physical sciences)',
  'company': 'آل عواض, آل سلطان and آل مقطة',
  'ssn': '113-90-3545',
  'residence': '456 سرحان Inlet Apt. 218\nEast سجى, WV 44739',
  'current_location': (Decimal('24.333449'), Decimal('-31.313933')),
  'blood_group': 'AB+',
  'website': ['https://lhkyr-al.com/', 'http://al.com/'],
  'username': 'mhnhrth',
  'name': 'الأستاذة أسيل حجار',
  'sex': 'F',
  'address': '1063 داوود Stravenue Suite 188\nSouth ريتالstad, NE 15082',
  'mail': 'alqyl@yahoo.com',
  'birthdate': datetime.date(1965, 5, 22)},
 {'job': 'Surveyor, building control',
  'company': 'Mitchell, Carroll and Cooper',
  'ssn': '055-43-6596',
  'residence': '21433 Lucero Way Apt. 512\nErinberg, FL 46113',
  'current_location': (Decimal('45.5171425'), Decimal('-68.251421')),
  'blood_group': 'A-',
  'website': ['https://www.levine.com/'],
  'username': 'lboyd',
  'name': 'Kelly Miller',
  'sex': 'F',
  'address': '9516 Thomas Falls\nWest Randall, OR 31568',
  'mail': 'xrichard

### Convert to Parquet-compatible data types

In [9]:
df = pd.DataFrame(profiles)

In [10]:
df.to_csv('profiles.csv', index=False)

In [11]:
df.to_csv('profiles.txt', index=False, sep='\t')

In [12]:
df.birthdate = pd.to_datetime(df.birthdate)
df = (
    df.current_location.
    apply(pd.Series).
    merge(df, left_index=True, right_index=True).
    drop('current_location', axis=1).
    rename({0: 'location_x', 1: 'location_y'}, axis=1)
)
df['location_x'] = df['location_x'].astype('float')
df['location_y'] = df['location_y'].astype('float')

In [13]:
df.to_hdf('profiles.h5', key='duke')

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block2_values] [items->['job', 'company', 'ssn', 'residence', 'blood_group', 'website', 'username', 'name', 'sex', 'address', 'mail']]

  pytables.to_hdf(path_or_buf, key, self, **kwargs)


In [14]:
fastparquet.write('profiles.parq', df)

In [15]:
from sqlalchemy import create_engine
engine = create_engine('sqlite:///data/profiles.sqlite', echo=False)

In [16]:
df['website'] = df.website.apply(lambda s: ','.join(s))

In [17]:
df.to_sql('duke', con=engine, if_exists='replace', index_label='id')

In [74]:
def converter(o):
    if isinstance(o, datetime.datetime):
        return o.__str__()
    if isinstance(o, decimal.Decimal):
        return o.__str__()

In [19]:
with open('profiles.json', 'w') as f:
    simplejson.dump(profiles , f, default=converter)

In [76]:
with open('data/profiles.xml', 'w') as f:
    data = readfromstring(simplejson.dumps(profiles , f, default=converter))
    f.write(json2xml.Json2xml({'employee': data}, wrapper="duke").to_xml())

In [21]:
ps = simplejson.load(open('profiles.json'))

In [22]:
avro_objects = [to_rec_avro_destructive(rec) for rec in ps]

In [23]:
with open('profiles.avro', 'wb') as f_out:
    writer(f_out, parse_schema(rec_avro_schema()), avro_objects)

In [24]:
%%bash --out s
fastavro --schema profiles.avro

In [25]:
schema = eval(s.replace('true', 'True'))

In [26]:
schema

{'__rec_avro_schema__': True,
 'type': 'record',
 'name': 'rec_avro.rec_object',
 'fields': [{'name': '_',
   'type': [{'type': 'map',
     'values': ['null',
      'boolean',
      'int',
      'long',
      'float',
      'double',
      'string',
      'bytes',
      'rec_avro.rec_object']},
    {'type': 'array',
     'items': ['null',
      'boolean',
      'int',
      'long',
      'float',
      'double',
      'string',
      'bytes',
      'rec_avro.rec_object']}]}]}

In [233]:
with open('data/profiles.avro', 'rb') as f:
    avro_reader = reader(f, reader_schema=schema)
    for record in avro_reader:
        print(record)

NameError: name 'schema' is not defined

In [28]:
with open('profiles.avro', 'rb') as f:
    avro_reader = reader(f, reader_schema=schema)
    for record in avro_reader:
        print(from_rec_avro_destructive(record))

{'job': 'Research scientist (physical sciences)', 'company': 'آل عواض, آل سلطان and آل مقطة', 'ssn': '113-90-3545', 'residence': '456 سرحان Inlet Apt. 218\nEast سجى, WV 44739', 'current_location': [24.33344841003418, -31.313932418823242], 'blood_group': 'AB+', 'website': ['https://lhkyr-al.com/', 'http://al.com/'], 'username': 'mhnhrth', 'name': 'الأستاذة أسيل حجار', 'sex': 'F', 'address': '1063 داوود Stravenue Suite 188\nSouth ريتالstad, NE 15082', 'mail': 'alqyl@yahoo.com', 'birthdate': None}
{'job': 'Surveyor, building control', 'company': 'Mitchell, Carroll and Cooper', 'ssn': '055-43-6596', 'residence': '21433 Lucero Way Apt. 512\nErinberg, FL 46113', 'current_location': [45.51714324951172, -68.25141906738281], 'blood_group': 'A-', 'website': ['https://www.levine.com/'], 'username': 'lboyd', 'name': 'Kelly Miller', 'sex': 'F', 'address': '9516 Thomas Falls\nWest Randall, OR 31568', 'mail': 'xrichardson@yahoo.com', 'birthdate': None}
{'job': '可靠度工程师', 'company': '图龙信息传媒有限公司', 'ssn'

In [None]:
root = ET.Element("root")
doc = ET.SubElement(root, "doc")

ET.SubElement(doc, "field1", name="blah").text = "some value1"
ET.SubElement(doc, "field2", name="asdfasd").text = "some vlaue2"

tree = ET.ElementTree(root)
tree.write("filename.xml")

In [None]:
: %pprint

In [None]:
pprint(tree)

In [None]:
! cat filename.xml