In [None]:
import numpy as np
import pandas as pd
import os
import json

# Data directory: <current working directory>/4-class/files
# (os.path.abspath('') resolves to the directory the kernel was started in)
files_path = os.path.join(os.path.abspath(''), '4-class', 'files')

In [None]:
"""
PROJECT 1
Service between Bitly and US gov to provide a feed of anonymous data gathered from users who shorten links
ending with .gov or .mil  --> service started in 2011, ended in 2017
"""

In [None]:
# Each line of proj1.txt is one standalone JSON document; parse them all into dicts.
file = os.path.join(files_path, 'proj1.txt')
# `with` closes the handle as soon as parsing is done; JSON text is UTF-8,
# so do not depend on the platform's default encoding.
with open(file, encoding='utf-8') as fh:
    records = [json.loads(line) for line in fh]
records[0]

In [None]:
#
# Comparison: counting time zones in pure Python
#
# time_zones = [record['tz'] for record in records]  # KeyError because not all data has tz!

In [None]:
# Collect the 'tz' value of every record that actually has that key
time_zones = [entry['tz'] for entry in records if 'tz' in entry]
time_zones[:10]

In [None]:
len(time_zones)

In [None]:
from collections import defaultdict

def get_counts(data):
    """Tally how often each item of ``data`` occurs.

    Returns a ``defaultdict(int)``: looking up an item that never appeared
    yields 0 instead of raising ``KeyError``.
    """
    tally = defaultdict(int)
    for item in data:
        tally[item] = tally[item] + 1
    return tally

tz_counts = get_counts(time_zones)

In [None]:
tz_counts['America/New_York']

In [None]:
tz_counts['Europe/Lisbon']

In [None]:
# top 10
def top_counts(count_dict, n):
    """Return the ``n`` largest ``(count, key)`` pairs of ``count_dict``, biggest first.

    Pairs are compared as tuples, so equal counts are ordered by key, also descending.
    """
    ranked = sorted(((total, label) for label, total in count_dict.items()), reverse=True)
    return ranked[:n]

top_counts(tz_counts, 10)

In [None]:
from collections import Counter
Counter(time_zones).most_common(10)

In [None]:
# # # #
# Counting time zones with pandas
# # # #
df = pd.DataFrame(records)
df.info()  # info on the columns

In [None]:
df['tz'][:10]  # no `if` guard needed here: records without a 'tz' key show up as NaN

In [None]:
# Counting, and top10
tz_counts = df['tz'].value_counts()
tz_counts[:10]  # though notice second place is empty string... As in the pure python...

In [None]:
# Cleaning: give absent values and blank strings their own labels
clean_tz = (
    df['tz']
    .fillna('Missing')                      # no tz value at all
    .mask(lambda tz: tz == '', 'Unknown')   # tz present but an empty string
)
tz_counts = clean_tz.value_counts()
tz_counts[:10]  # notice Unknown and Missing creeping into the top10...

In [None]:
# Do horizontal bar plot
# Import seaborn, typical plotting lib for data science, with added features
# https://seaborn.pydata.org
import seaborn as sns
subset = tz_counts[:10]
sns.barplot(x=subset.values, y=subset.index)

In [None]:
# Checking app used to perform URL shortening
df['a'][:10]

In [None]:
# First whitespace-separated token of every non-null 'a' value (a rough "browser" label)
results = pd.Series([agent.split()[0] for agent in df['a'].dropna()])
results[:10]

In [None]:
#
# Decompose top time zones into Windows and non Windows
#
# Keep only rows that have an 'a' value. The explicit .copy() makes other_df an
# independent frame, so adding the 'os' column below neither triggers
# SettingWithCopyWarning nor risks writing through to df.
other_df = df[df['a'].notnull()].copy()
# Label each row by whether its 'a' string mentions Windows
other_df['os'] = np.where(other_df['a'].str.contains('Windows'), 'Windows', 'Not Windows')
other_df['os'][:10]

In [None]:
# Count rows per (tz, os) pair, then pivot the os labels into columns
by_tz_os = other_df.groupby(['tz', 'os'])
group_sizes = by_tz_os.size()  # rows per group, the groupby analogue of value_counts
agg_counts = group_sizes.unstack().fillna(0)  # pairs that never occur become 0 instead of NaN
agg_counts[:10]

In [None]:
# sort agg to check top overall tzs here. First sort through tz
agg_counts.sum(1).argsort()[:10]  # sum through axis 1

In [None]:
# Row positions that would sort the zones by their overall count, smallest first
zone_totals = agg_counts.sum(axis=1)
indexer = zone_totals.argsort()
# the last ten positions are the ten busiest zones; the per-os columns are kept
agg_counts.take(indexer[-10:])

In [None]:
# method to easily show the biggest values
agg_counts.sum(1).nlargest(10)

In [None]:
# Long format for plotting: one entry per (tz, os) pair instead of one column per os
top_zone_counts = agg_counts.take(indexer[-10:])
count_subset = top_zone_counts.stack()
count_subset

In [None]:
# Flatten the multi-indexed Series into a DataFrame: the index levels become ordinary
# columns, the values go into a column named 'total', and rows get a 0..N-1 index.
count_subset = count_subset.reset_index(name='total')
count_subset

In [None]:
count_subset[:10]  # Dataframe now bigger, since index is not multilevel

In [None]:
sns.barplot(x='total', y='tz', hue='os', data=count_subset)  # hue is, in essence, the data we want

In [None]:
# let's normalize the group percentages
def norm_total(group):
    """Return a copy of ``group`` with a ``normed_total`` column added.

    ``normed_total`` is each row's ``total`` divided by the sum of ``total``
    over the whole group, i.e. the row's share of the group.

    The input frame is left untouched: functions passed to ``groupby.apply``
    must not mutate the group they receive.
    """
    return group.assign(normed_total=group['total'] / group['total'].sum())

# Same per-zone shares that norm_total computes group by group, but vectorised:
# transform('sum') broadcasts each zone's total back onto its rows, so the result
# keeps count_subset's index and its 'tz' column on every pandas version
# (groupby.apply on a frame that still holds the grouping column is deprecated).
results = count_subset.assign(
    normed_total=count_subset['total'] / count_subset.groupby('tz')['total'].transform('sum')
)
sns.barplot(x='normed_total', y='tz', hue='os', data=results)

In [None]:
"""
PROJECT 2
US Baby Names 1880-2010, provided by the United States Social Security Administration (SSA)
"""
proj2_folder = os.path.join(files_path, 'proj2')
# The yearly files have no header row, so the column names are supplied here
path_1880 = os.path.join(proj2_folder, 'yob1880.txt')
names1880 = pd.read_csv(path_1880, names=['name', 'sex', 'births'])
names1880

In [None]:
names1880.groupby('sex').births.sum()  # quick stat

In [None]:
# One file per birth year: load each, tag its rows with the year, then stack them all
years = range(1880, 2011)
columns = ['name', 'sex', 'births']

pieces = [
    pd.read_csv(os.path.join(proj2_folder, f'yob{year}.txt'), names=columns).assign(year=year)
    for year in years
]

# ignore_index=True replaces each file's own row numbers with one continuous index
names = pd.concat(pieces, ignore_index=True)
names

In [None]:
# Aggregate: total births per year, one column per sex.
# aggfunc is given by name; passing the builtin `sum` is deprecated in recent pandas.
total_births = names.pivot_table('births', index='year', columns='sex', aggfunc='sum')
total_births

In [None]:
total_births.plot(title='Total births by sex and year')

In [None]:
# Percentage of babies with given name
def add_prop(group):
    """Return a copy of ``group`` with a ``prop`` column added.

    ``prop`` is each row's ``births`` divided by the sum of ``births`` over
    the whole group, i.e. the fraction of the group's babies given that name.

    The input frame is left untouched: functions passed to ``groupby.apply``
    must not mutate the group they receive.
    """
    return group.assign(prop=group['births'] / group['births'].sum())

# Same proportions add_prop computes per (year, sex) group, but vectorised:
# transform('sum') broadcasts each group's total births back onto its rows.
# Unlike groupby.apply, this leaves names with its original flat index and its
# 'year'/'sex' columns on every pandas version, so later groupby(['year', 'sex'])
# calls stay unambiguous.
names = names.assign(
    prop=names['births'] / names.groupby(['year', 'sex'])['births'].transform('sum')
)
names

In [None]:
# sanity check
names.groupby(['year', 'sex']).prop.sum()

In [None]:
# extract subset of data for easier analysis -> top 1000 names for each sex/year comb

# Order rows by year, then sex, then births (largest first), so the first 1000 rows
# of every (year, sex) group are that group's most popular names.
# groupby().head() keeps the grouping columns and the sorted row order; the former
# groupby().apply(...) operated on the grouping columns, which is deprecated since
# pandas 2.2.
ranked_names = names.sort_values(by=['year', 'sex', 'births'], ascending=[True, True, False])
grouped = ranked_names.groupby(['year', 'sex'])
top1000 = grouped.head(1000).reset_index(drop=True)  # drop=True: fresh 0..N-1 index, old one discarded
top1000  # dataset is now only ~260.000 instead of ~1.690.000

In [None]:
# checking name trends
boys = top1000[top1000.sex == 'M']
girls = top1000[top1000.sex == 'F']
# births per year with one column per name (both sexes combined).
# aggfunc is given by name; passing the builtin `sum` is deprecated in recent pandas.
total_births = top1000.pivot_table('births', index='year', columns='name', aggfunc='sum')
total_births.info()

In [None]:
# Yearly birth counts for a handful of names, one column per name
# (selecting a name that is not a column of total_births raises KeyError)
subset = total_births[['John', 'Harry', 'Jacob', 'Mary', 'Marilyn', 'Elizabeth']]
# subplots=True draws each selected column in its own panel
subset.plot(subplots=True, figsize=(12, 10), grid=False, title='Number of births per year')

In [None]:
# measuring increase in naming diversity -> proportion of babies with name in top1000
# aggfunc is given by name; passing the builtin `sum` is deprecated in recent pandas.
table = top1000.pivot_table('prop', index='year', columns='sex', aggfunc='sum')
table.plot(title='Sum of table1000.prop by year and sex', yticks=np.linspace(0, 1.2, 13), xticks=range(1880, 2020, 10))

In [None]:
# considering just boys name in 2010
df = boys[boys.year == 2010]
df

In [None]:
prop_cumsum = df.sort_values(by='prop', ascending=False).prop.cumsum()
prop_cumsum[:10]

In [None]:
# Check how many of the most popular names it takes to reach 50%
prop_cumsum.values.searchsorted(0.5) + 1 # since arrays are 0-indexed

In [None]:
# Repeat the 50% check on the boys rows for 1900
df = boys.loc[boys['year'] == 1900]
ranked_1900 = df.sort_values(by='prop', ascending=False)
in1900 = ranked_1900['prop'].cumsum()
in1900.values.searchsorted(0.5) + 1

In [None]:
# now for all years, using top1000
def get_quantile_count(group, q=0.5):
    """Count how many of the largest ``prop`` rows are needed to reach share ``q``.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows carrying a ``prop`` column.
    q : float, default 0.5
        Cumulative proportion to reach.

    Returns
    -------
    int (NumPy integer)
        1-based number of rows, taken in descending ``prop`` order, at which the
        running sum of ``prop`` first reaches ``q``. If the running sum never
        reaches ``q`` the result is ``len(group) + 1``.
    """
    running_share = group['prop'].sort_values(ascending=False).cumsum()
    # searchsorted yields the 0-based insertion position; +1 turns it into a count
    return running_share.values.searchsorted(q) + 1

# Select only 'prop' before apply: get_quantile_count needs nothing else, and handing
# the grouping columns to groupby.apply is deprecated since pandas 2.2.
diversity = (
    top1000.groupby(['year', 'sex'])[['prop']]
    .apply(get_quantile_count)
    .unstack('sex')  # one column per sex, indexed by year
)
diversity.head()

In [None]:
diversity.plot(title='Number of popular names in top 50%')

In [None]:
# nft certificate? :P