# Intro

## Bit.ly

In [None]:
%pylab inline

In [None]:
# Location of the Bit.ly sample file; it is read line by line as JSON below.
path = 'data/01/usagov_bitly_data2012-03-16-1331923249.txt'

In [None]:
import json
import numpy as np
from pandas import DataFrame, Series
import pandas as pd

In [None]:
# Parse the file as one JSON document per line.
# The context manager closes the file handle even if a line fails to parse;
# the original left the handle open until garbage collection.
with open(path, 'rb') as raw_lines:
    records = [json.loads(line) for line in raw_lines]

In [None]:
# Tabulate the parsed records (presumably one JSON object per record, so the
# object keys become columns -- confirm against the data file).
frame = DataFrame(records)

In [None]:
frame.info()

In [None]:
frame['tz'][:10]

In [None]:
# Frequency of each distinct value in the 'tz' column, most frequent first.
tz_counts = frame['tz'].value_counts()

In [None]:
tz_counts[:10]

In [None]:
# Label records that have no 'tz' value at all as 'Missing'.
clean_tz = frame['tz'].fillna('Missing')

In [None]:
# Records whose 'tz' is present but empty are labelled 'Unknown'.
is_blank = clean_tz == ''
clean_tz[is_blank] = 'Unknown'


In [None]:

# Recount using the cleaned labels so 'Missing' and 'Unknown' show up as
# categories of their own (this replaces the earlier tz_counts).
tz_counts = clean_tz.value_counts()

In [None]:
tz_counts[:10]

In [None]:
# Horizontal bar chart of the first ten entries of tz_counts.
tz_counts[:10].plot(kind='barh', rot=0)


In [None]:
# First whitespace-delimited token of every non-null agent string.
results = Series([agent.split()[0] for agent in frame['a'].dropna()])

In [None]:
# Keep only the records that carry an agent string.
cframe = frame[frame['a'].notnull()]

# Tag each remaining record by whether its agent string mentions Windows.
operating_system = np.where(
    cframe['a'].str.contains('Windows'), 'Windows', 'Not Windows'
)

# Count records per (time zone, OS tag); absent combinations become 0.
by_tz_os = cframe.groupby(['tz', operating_system])
agg_counts = by_tz_os.size().unstack().fillna(0)

# Order time zones by total count and keep the ten largest.
indexer = agg_counts.sum(axis=1).argsort()
count_subset = agg_counts.take(indexer)[-10:]
count_subset.plot(kind='barh', stacked=True)

# Same ten time zones, with each row rescaled to sum to 1.
row_totals = count_subset.sum(axis=1)
normed_subset = count_subset.div(row_totals, axis=0)
normed_subset.plot(kind='barh', stacked=True)

## Movies

In [None]:
import os

encoding = 'latin1'

upath = os.path.expanduser('data/01/users.dat')
rpath = os.path.expanduser('data/01/ratings.dat')
mpath = os.path.expanduser('data/01/movies.dat')

unames = ['user_id', 'gender', 'age', 'occupation', 'zip']
rnames = ['user_id', 'movie_id', 'rating', 'timestamp']
mnames = ['movie_id', 'title', 'genres']

# '::' is a multi-character separator, which only the Python parser engine
# supports. Requesting that engine explicitly avoids the ParserWarning pandas
# emits when it has to fall back from the C engine on its own.
users = pd.read_csv(upath, sep='::', header=None, names=unames,
                    encoding=encoding, engine='python')
ratings = pd.read_csv(rpath, sep='::', header=None, names=rnames,
                      encoding=encoding, engine='python')
movies = pd.read_csv(mpath, sep='::', header=None, names=mnames,
                     encoding=encoding, engine='python')

In [None]:
users.info()

In [None]:
users[:3]

In [None]:
ratings[:3]

In [None]:
movies[:3]

In [None]:
# Join ratings to users, then to movies. With no explicit key, each merge
# joins on the column names the two frames share.
data = ratings.merge(users).merge(movies)

In [None]:
data.info()

In [None]:
# First row of the merged frame. `.ix` was removed from pandas, so the row is
# selected by position with `.iloc` instead.
data.iloc[0]

In [None]:
# Mean rating per title (rows), split by gender (columns).
mean_ratings = data.pivot_table(
    values='rating',
    index='title',
    columns='gender',
    aggfunc='mean',
)
mean_ratings[:3]

In [None]:
# Number of rating rows for each title.
title_groups = data.groupby('title')
ratings_by_title = title_groups.size()
ratings_by_title[:3]

In [None]:
# Titles that received at least 250 ratings.
has_enough_ratings = ratings_by_title >= 250
active_titles = ratings_by_title.index[has_enough_ratings]
active_titles

In [None]:
# Keep only the titles in active_titles. `.ix` was removed from pandas;
# `.loc` performs the same label-based row selection.
mean_ratings = mean_ratings.loc[active_titles]
mean_ratings[:3]

In [None]:
# Titles ordered by the 'F' column, highest mean rating first.
# `sort_index(by=...)` was removed from pandas; `sort_values` replaces it.
top_female_ratings = mean_ratings.sort_values(by='F', ascending=False)
top_female_ratings[:3]

In [None]:
# Measure rating disagreement: 'diff' is the 'M' mean minus the 'F' mean, so
# the most negative values (titles rated higher in 'F') sort first.
mean_ratings['diff'] = mean_ratings['M'] - mean_ratings['F']
# `sort_index(by=...)` was removed from pandas; `sort_values` replaces it.
sorted_by_diff = mean_ratings.sort_values(by='diff')

In [None]:
# Smallest 'diff' values: titles rated higher by women than by men.
sorted_by_diff.iloc[:3]

In [None]:
# Largest 'diff' values: reverse the row order, then take the first three.
sorted_by_diff.iloc[::-1].iloc[:3]

In [None]:
# Standard deviation of 'rating' for each title.
ratings_std_by_title = data.groupby('title')['rating'].std()
# Keep only the titles in active_titles. `.ix` was removed from pandas;
# `.loc` performs the same label-based selection.
ratings_std_by_title = ratings_std_by_title.loc[active_titles]

In [None]:
# Titles with the largest rating spread. `Series.order` was removed from
# pandas; `sort_values` replaces it.
ratings_std_by_title.sort_values(ascending=False)[:3]

## Baby Names

In [None]:
# The file is read without a header row; column names are supplied here.
names1880 = pd.read_csv('data/01/yob1880.txt', names=['name', 'sex', 'births'])
names1880[:3]

In [None]:
names1880.groupby('sex')['births'].sum()

In [None]:
years = range(1880, 2011)
pieces = []
columns = ['name', 'sex', 'births']
for year in years:
    # Loop-specific names keep this cell from overwriting the `path` and
    # `frame` variables that the Bit.ly section defines and still relies on.
    year_path = 'data/01/yob{0}.txt'.format(year)
    year_frame = pd.read_csv(year_path, names=columns)
    # The year is taken from the file name and stored as a column.
    year_frame['year'] = year
    pieces.append(year_frame)
# Stack the yearly frames into one and renumber the rows from 0.
names = pd.concat(pieces, ignore_index=True)

In [None]:
names[:3]

In [None]:
import json
from pandas import DataFrame, Series
import pandas as pd

In [None]:
# Total births per year (rows) and sex (columns).
# The string alias 'sum' selects pandas' own grouped sum and matches the
# aggfunc='mean' style used for the ratings pivot; recent pandas versions warn
# when a callable such as np.sum is passed here.
total_births = names.pivot_table('births', index='year', columns='sex', aggfunc='sum')
total_births.tail()

In [None]:
total_births.plot()

In [None]:
def add_prop(group):
    """Return a copy of ``group`` with a ``prop`` column added.

    ``prop`` is each row's ``births`` divided by the sum of ``births`` over
    the whole of ``group``. The caller's frame is not modified; the original
    implementation wrote the new column into its argument.

    Parameters
    ----------
    group : DataFrame
        Frame with a numeric ``births`` column.

    Returns
    -------
    DataFrame
        Copy of ``group``, same index, with the ``prop`` column set.
    """
    # Work on a copy so the input frame is left as it was.
    result = group.copy()
    births = result.births.astype(float)
    result['prop'] = births / births.sum()
    return result

In [None]:
# Add the per-(year, sex) proportion column to every row.
# group_keys=False keeps the original row index. add_prop returns a frame
# indexed like its input, and pandas >= 2.0 would otherwise prepend the
# (year, sex) keys to the index, which makes the later
# groupby(['year', 'sex']) calls ambiguous between index levels and columns.
names = names.groupby(['year', 'sex'], group_keys=False).apply(add_prop)

In [None]:
names[:3]

In [None]:
names[-3:]

In [None]:
# Sanity check: within every (year, sex) group the 'prop' values should sum
# to 1, up to floating-point tolerance.
np.allclose(names.groupby(['year', 'sex']).prop.sum(), 1)

In [None]:
def get_top_1000(group):
    """Return the (at most) 1000 rows of ``group`` with the largest ``births``.

    Rows are ordered by ``births`` descending. ``sort_index(by=...)`` was
    removed from pandas, so the ordering uses ``sort_values``.

    Parameters
    ----------
    group : DataFrame
        Frame with a ``births`` column.

    Returns
    -------
    DataFrame
        The first 1000 rows of the sorted frame, or all of them if fewer.
    """
    return group.sort_values(by='births', ascending=False)[:1000]

In [None]:
# Apply get_top_1000 to every (year, sex) group and combine the results.
grouped = names.groupby(['year', 'sex'])
top_1000 = grouped.apply(get_top_1000)
# Replace whatever index the apply produced with plain 0..n-1 row numbers.
top_1000.index = np.arange(len(top_1000))
top_1000[:3]

In [None]:
# Split the top-1000 table by the value of the 'sex' column.
boys = top_1000[top_1000['sex'] == 'M']
girls = top_1000[top_1000['sex'] == 'F']

In [None]:
# Births per year (rows) and name (columns) within the top-1000 table.
# The string alias 'sum' selects pandas' own grouped sum and matches the
# aggfunc='mean' style used for the ratings pivot; recent pandas versions warn
# when the `sum` callable is passed here.
total_births = top_1000.pivot_table('births', index='year', columns='name', aggfunc='sum')
total_births.info()

In [None]:
# Yearly birth counts for four names, drawn as one subplot per name.
subset = total_births[['John', 'Harry', 'Mary', 'Marilyn']]
subset.plot(subplots=True, figsize=(12, 10), grid=False, title="# births / year")

In [None]:
# Share of all births covered by the top-1000 names, per year and sex.
table = top_1000.pivot_table('prop', index='year', columns='sex', aggfunc='sum')
# The x ticks run one per decade from 1880. The original upper bound of 1020
# made the range empty, so no x ticks were drawn at all.
table.plot(title='Sum of top_1000.prop by year and sex',
           yticks=np.linspace(0, 1.2, 13),
           xticks=range(1880, 2020, 10))

In [None]:
# Rows of `boys` for the year 2010.
df = boys[boys.year == 2010]
df[:3]

In [None]:
# Running total of 'prop', starting from the most popular name.
# `ascending` must be the boolean False: the original passed the string
# 'False', which is truthy and therefore did not request a descending sort.
# `sort_index(by=...)` was also removed from pandas; `sort_values` replaces it.
prop_cumsum = df.sort_values(by='prop', ascending=False).prop.cumsum()
prop_cumsum[:3]

In [None]:
# Zero-based position at which the running total first reaches 0.5.
# get_quantile_count adds 1 to the same quantity to turn it into a count.
prop_cumsum.values.searchsorted(0.5)

In [None]:
def get_quantile_count(group, q=0.5):
    """Count how many of the largest ``prop`` values are needed to reach ``q``.

    Rows are ordered by ``prop`` descending and accumulated. The result is
    the zero-based position at which the running total first reaches ``q``,
    plus one. ``sort_index(by=...)`` was removed from pandas, so the ordering
    uses ``sort_values``.

    Parameters
    ----------
    group : DataFrame
        Frame with a numeric ``prop`` column.
    q : float, default 0.5
        Cumulative share to reach.

    Returns
    -------
    int
        Number of rows, taken in descending ``prop`` order, needed to reach ``q``.
    """
    ordered = group.sort_values(by='prop', ascending=False)
    # searchsorted returns a zero-based position; +1 converts it to a count.
    return ordered.prop.cumsum().values.searchsorted(q) + 1

In [None]:
# One quantile count per (year, sex) group, reshaped so that years are rows
# and the two sexes are columns.
diversity = (
    top_1000
    .groupby(['year', 'sex'])
    .apply(get_quantile_count)
    .unstack('sex')
)
diversity[:3]

In [None]:
diversity.plot(title="# popular names in top 50%")

In [None]:
def get_last_letter(x):
    """Return the final character of ``x``."""
    return x[-1]


# Last character of every name, as a Series named 'last_letter'.
last_letters = names['name'].map(get_last_letter)
last_letters.name = 'last_letter'

In [None]:
# Births per last letter (rows) and per (sex, year) column pair.
# The string alias 'sum' selects pandas' own grouped sum and matches the
# aggfunc='mean' style used for the ratings pivot; recent pandas versions warn
# when the `sum` callable is passed here.
table = names.pivot_table('births', index=last_letters, columns=['sex', 'year'], aggfunc='sum')
# Keep three representative years for each sex.
subtable = table.reindex(columns=[1910, 1960, 2010], level='year')
subtable[:3]

In [None]:
subtable.sum()

In [None]:
# Turn each (sex, year) column into proportions by dividing by its total.
column_totals = subtable.sum().astype(float)
letter_prop = subtable / column_totals

# One panel per sex, stacked vertically.
fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(10, 8))
letter_prop['M'].plot(kind='bar', rot=0, ax=axes[0], title='Male')
letter_prop['F'].plot(kind='bar', rot=0, ax=axes[1], title='Female')

In [None]:
# Proportions over every year this time, not just the three-year subtable.
letter_prop = table / table.sum().astype(float)
# Rows for the letters d, n and y within the 'M' columns, transposed so that
# years become the rows. `.ix` was removed from pandas; `.loc` performs the
# same label-based selection.
dny_ts = letter_prop.loc[['d', 'n', 'y'], 'M'].T
dny_ts[:3]

In [None]:
dny_ts.plot()

In [None]:
# Distinct names in the top-1000 table whose lowercase form contains 'lesl'.
all_names = top_1000['name'].unique()
mask = np.array(['lesl' in candidate.lower() for candidate in all_names])
lesley_like = all_names[mask]
lesley_like[:3]

In [None]:
# Rows of the top-1000 table whose name is one of the matched variants.
is_lesley_like = top_1000['name'].isin(lesley_like)
filtered = top_1000[is_lesley_like]
# Total births per variant.
filtered.groupby('name')['births'].sum()[:3]

In [None]:
# Births of the matched names per year (rows) and sex (columns).
# The string alias 'sum' selects pandas' own grouped sum and matches the
# aggfunc='mean' style used for the ratings pivot; recent pandas versions warn
# when the `sum` callable is passed here.
table = filtered.pivot_table('births', index='year', columns='sex', aggfunc='sum')
# Normalise each year so the two columns add up to 1.
table = table.div(table.sum(axis=1), axis=0)
table.plot(style={'M': 'k-', 'F': 'k--'})