# Introductory examples

## 1.usa.gov data from bit.ly

In [None]:
# IPython magic: show the current working directory, against which the
# relative data path used in the following cells is resolved.
%pwd

In [None]:
# Path (relative to the working directory) of the data file; later cells
# parse each of its lines as JSON.
path = 'ch02/usagov_bitly_data2012-03-16-1331923249.txt'

In [None]:
# Peek at the first raw line of the file.  The ``with`` block closes the file
# handle deterministically instead of leaving it to the garbage collector.
with open(path) as f:
    first_line = f.readline()
# Last expression in the cell, so the line is still displayed.
first_line

In [None]:
import json
path = 'ch02/usagov_bitly_data2012-03-16-1331923249.txt'
# Parse every line of the file as one JSON document.  The ``with`` block
# guarantees the file handle is closed once the records are loaded.
with open(path) as f:
    records = [json.loads(line) for line in f]

In [None]:
# Show the first parsed record.
records[0]

In [None]:
# Look up the 'tz' field of the first record; as the cell's last expression,
# its repr is what gets displayed.
records[0]['tz']

In [None]:
# print() writes the str() form of the same value instead of its repr.
print(records[0]['tz'])

### Counting time zones in pure Python

In [None]:
time_zones = [rec['tz'] for rec in records if 'tz' in rec]

In [None]:
time_zones[:10]

In [None]:
def get_counts(sequence):
    """Count how many times each distinct item occurs in ``sequence``.

    Returns a plain ``dict`` mapping item -> number of occurrences.
    """
    tally = {}
    for item in sequence:
        # dict.get supplies 0 the first time an item is seen.
        tally[item] = tally.get(item, 0) + 1
    return tally

In [None]:
from collections import defaultdict

def get_counts2(sequence):
    """Count occurrences of each item in ``sequence``.

    Returns a ``defaultdict(int)``, so looking up an unseen key yields 0
    (and inserts that key) instead of raising ``KeyError``.
    """
    tally = defaultdict(int)  # missing keys start at int() == 0
    for item in sequence:
        tally[item] = tally[item] + 1
    return tally

In [None]:
# Tally the time zones with the dict-based get_counts helper.
counts = get_counts(time_zones)

In [None]:
# Number of collected values equal to 'America/New_York'.
counts['America/New_York']

In [None]:
# Total number of time-zone values that were counted.
len(time_zones)

In [None]:
def top_counts(count_dict, n=10):
    """Return the ``n`` largest entries of ``count_dict``.

    Parameters
    ----------
    count_dict : mapping of key -> count
    n : int, default 10
        How many entries to return.  Values <= 0 yield an empty list.

    Returns
    -------
    list of ``(count, key)`` tuples sorted in ascending order, so the most
    frequent entry is last.  Ties on count are ordered by key.
    """
    # A slice of ``[-0:]`` would return the whole list, and a negative n would
    # silently drop the smallest entries, so handle both explicitly.
    if n <= 0:
        return []
    value_key_pairs = sorted((count, key) for key, count in count_dict.items())
    return value_key_pairs[-n:]

In [None]:
# The ten largest (count, tz) pairs, in ascending order of count.
top_counts(counts)

In [None]:
from collections import Counter

In [None]:
# Counter builds the tally directly; this rebinds ``counts``.
counts = Counter(time_zones)

In [None]:
# Ten most common time zones, most frequent first.
counts.most_common(10)

### Counting time zones with pandas

In [None]:
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
from __future__ import division
from numpy.random import randn
import numpy as np
import os
import matplotlib.pyplot as plt
import pandas as pd
# Default figure size for subsequently created matplotlib figures.
plt.rc('figure', figsize=(10, 6))
# Print NumPy floating-point values with 4 digits of precision.
np.set_printoptions(precision=4)

In [None]:
import json
path = 'ch02/usagov_bitly_data2012-03-16-1331923249.txt'
# Read all lines inside a ``with`` block so the file handle is closed before
# parsing; ``lines`` is still bound afterwards, as before.
with open(path) as f:
    lines = f.readlines()
# Each line is parsed as one JSON document.
records = [json.loads(line) for line in lines]

In [None]:
from pandas import DataFrame, Series
import pandas as pd

# Build a DataFrame from the list of parsed records.
frame = DataFrame(records)
# Last expression in the cell, so the frame is displayed.
frame

In [None]:
# First ten values of the 'tz' column.
frame['tz'][:10]

In [None]:
# Frequency of each distinct 'tz' value; value_counts sorts by descending
# count, so the slice shows the ten most frequent.
tz_counts = frame['tz'].value_counts()
tz_counts[:10]

In [None]:
# Label absent time zones as 'Missing' ...
clean_tz = frame['tz'].fillna('Missing')
# ... and empty-string time zones as 'Unknown'.
is_blank = clean_tz == ''
clean_tz[is_blank] = 'Unknown'
# Recount with the two placeholder labels included.
tz_counts = clean_tz.value_counts()
tz_counts[:10]

In [None]:
# Open a new 10x4 figure.
# NOTE(review): with the inline backend the figure may not persist into the
# next cell, in which case this size is not applied to the plot below — confirm.
plt.figure(figsize=(10, 4))

In [None]:
# Horizontal bar chart of the first ten entries of tz_counts.
tz_counts[:10].plot(kind='barh', rot=0)

In [None]:
# Inspect a few individual values of the 'a' column
# (presumably the user-agent string of each request — verify against the data).
frame['a'][1]

In [None]:
frame['a'][50]

In [None]:
frame['a'][51]

In [None]:
results = Series([x.split()[0] for x in frame.a.dropna()])
results[:5]

In [None]:
results.value_counts()[:8]

In [None]:
# Keep only the rows whose 'a' value is present (not null).
cframe = frame[frame.a.notnull()]

In [None]:
# Label each row by whether its 'a' string contains the substring 'Windows'.
is_windows = cframe['a'].str.contains('Windows')
operating_system = np.where(is_windows, 'Windows', 'Not Windows')
operating_system[:5]

In [None]:
by_tz_os = cframe.groupby(['tz', operating_system])

In [None]:
agg_counts = by_tz_os.size().unstack().fillna(0)
agg_counts[:10]

In [None]:
# Use to sort in ascending order
indexer = agg_counts.sum(1).argsort()
indexer[:10]

In [None]:
count_subset = agg_counts.take(indexer)[-10:]
count_subset

In [None]:
# Open a new figure with the default size.
plt.figure()

In [None]:
# Stacked horizontal bars: one bar per row of count_subset, split by column.
count_subset.plot(kind='barh', stacked=True)

In [None]:
plt.figure()

In [None]:
normed_subset = count_subset.div(count_subset.sum(1), axis=0)
normed_subset.plot(kind='barh', stacked=True)