In [1]:
import pandas as pd
import numpy as np
import altair as alt
import matplotlib.pyplot as plt
import os
import re

## scans per center per month

In [2]:
selection = alt.selection_point(fields=['name'], bind='legend')

scans_per_month_chart = alt.Chart("https://raw.githubusercontent.com/ers6/ia_scanning_labor_data/main/csv_files/scans_per_center_per_month.csv").mark_bar().encode(
    x= alt.X('month_year:T', axis=alt.Axis(labelAngle=-4), title="Months"),
    y= alt.Y('books_scanned:Q', title="Books Scanned"),
#     color=alt.Color('name:N', legend=alt.Legend(columns=8, symbolLimit=0)),
    order=alt.Order('name:N',sort='ascending'),
#     opacity=alt.condition(selection, alt.value(1), alt.value(0.15)),
    color=alt.condition(selection, alt.Color('name:N', legend=alt.Legend(columns=8, symbolLimit=0)), alt.value("black"))
    
).add_params(selection).configure_legend(
  orient='bottom'
).properties(
    # Adjust chart width and height to match size of legend
    width="container",
    height=400
).interactive()

scans_per_month_chart.save('/Users/e.schwartz/Documents/GitHub/ia_scanning_labor_data/center_visuals/scans_per_month_chart.json')
scans_per_month_chart.save('/Users/e.schwartz/Documents/GitHub/ia_scanning_labor_data/center_visuals/scans_per_month_chart.html')

In [3]:
scans_per_month_chart

## Scans per month for each center

In [4]:
data = pd.read_csv("https://raw.githubusercontent.com/ers6/ia_scanning_labor_data/main/csv_files/scans_per_center_per_month.csv")[['name', 'month_year', 'books_scanned']]

In [5]:
def make_scans_per_month(center_data, path):
    scans_per_month_chart = alt.Chart(center_data).mark_bar().encode(
        x= alt.X('month_year:T', axis=alt.Axis(labelAngle=-4), title="Months"),
        y= alt.Y('books_scanned:Q', title="Books Scanned")
    ).configure_legend(
      orient='bottom'
    ).properties(
        # Adjust chart width and height to match size of legend
        width="container",
        height=400
    ).interactive()
    scans_per_month_chart.save(path+'scans_per_month.json')


In [6]:
make_scans_per_month(data.loc[data['name']== 'UNC Chapel Hill'])

TypeError: make_scans_per_month() missing 1 required positional argument: 'path'

In [None]:

centers = data['name'].unique().tolist()

for center in centers: 
    this_file = re.sub(' ', "_", str(center))
    this_file = re.sub('\.', '', this_file)
    this_file = re.sub(',', '', this_file).lower()
    try:
        os.mkdir('/Users/e.schwartz/Documents/Github/ia_scanning_labor_data/center_visuals/' +this_file+'/')
    except FileExistsError:
        pass
    make_scans_per_month(data.loc[data['name']== center], '/Users/e.schwartz/Documents/Github/ia_scanning_labor_data/center_visuals/' +this_file+'/')

## Ratio of Pages to Workers at each center each month

In [7]:
scans_workers = pd.read_csv("https://raw.githubusercontent.com/ers6/ia_scanning_labor_data/main/csv_files/scans_and_workers_month_stats.csv")[['name', 'month_year', 'books_scanned', 'operator', 'pages_scanned', 'scans_to_workers', 'pages_to_workers']]

allen = scans_workers.loc[scans_workers['name']== 'Allen County Public Library Geneaology Center']

allen

Unnamed: 0,name,month_year,books_scanned,operator,pages_scanned,scans_to_workers,pages_to_workers
0,Allen County Public Library Geneaology Center,2002-01,4,3,1366.0,1.333333,455.333333
1,Allen County Public Library Geneaology Center,2008-05,78,11,37763.0,7.090909,3433.000000
2,Allen County Public Library Geneaology Center,2008-06,583,21,240186.0,27.761905,11437.428571
3,Allen County Public Library Geneaology Center,2008-07,664,22,274770.0,30.181818,12489.545455
4,Allen County Public Library Geneaology Center,2008-08,641,21,257029.0,30.523810,12239.476190
...,...,...,...,...,...,...,...
176,Allen County Public Library Geneaology Center,2023-02,447,24,168444.0,18.625000,7018.500000
177,Allen County Public Library Geneaology Center,2023-03,1269,23,253923.0,55.173913,11040.130435
178,Allen County Public Library Geneaology Center,2023-04,1178,22,165378.0,53.545455,7517.181818
179,Allen County Public Library Geneaology Center,2023-05,1406,23,149438.0,61.130435,6497.304348


In [8]:
# data is a subset of a dataframe representing a single center
def make_scatters(data, file_name):
    file_name = file_name + 'pages_to_workers_ratio_over_time_scatters.json'
    alt.Chart(data).mark_circle(size=60).encode(
        x='month_year',
        y='pages_to_workers',
        tooltip=['month_year', 'pages_to_workers', 'pages_scanned', 'operator']
    ).interactive().save(file_name)

In [9]:
import re

centers = scans_workers['name'].unique().tolist()

for center in centers: 
    this_file = re.sub(' ', "_", str(center))
    this_file = re.sub('\.', '', this_file)
    this_file = re.sub(',', '', this_file).lower()
#     print(this_file)
#     os.mkdir('/Users/e.schwartz/Documents/Github/ia_scanning_labor_data/center_visuals/' +this_file+'/')
    make_scatters(scans_workers.loc[scans_workers['name']== center], '/Users/e.schwartz/Documents/Github/ia_scanning_labor_data/center_visuals/' +this_file+'/')

FileNotFoundError: [Errno 2] No such file or directory: '/Users/e.schwartz/Documents/Github/ia_scanning_labor_data/center_visuals/california_acaddemy_of_sciences/pages_to_workers_ratio_over_time_scatters.json'

In [None]:
pd.read_csv("https://raw.githubusercontent.com/ers6/ia_scanning_labor_data/main/csv_files/scans_and_workers_month_stats.csv")

In [None]:
selection = alt.selection_point(fields=['name'], bind='legend')

pages_scanned_to_workers_ratio_scatters = alt.Chart("https://raw.githubusercontent.com/ers6/ia_scanning_labor_data/main/csv_files/scans_and_workers_month_stats.csv").mark_circle().encode(
    x= alt.X('month_year:T', axis=alt.Axis(labelAngle=-4), title="Months"),
    y= alt.Y('pages_to_workers:Q', title="Ratio of Pages Scanned to Workers"),
    color=alt.Color('name:N', legend=alt.Legend(columns=8, symbolLimit=0)),
    order=alt.Order('name:N',sort='ascending'),
    opacity=alt.condition(selection, alt.value(1), alt.value(0)),
    tooltip=['name:N', 'pages_to_workers:Q']
).add_params(selection).configure_legend(
  orient='bottom'
).properties(
    # Adjust chart width and height to match size of legend
    width=2000,
    height=400
).interactive()


pages_scanned_to_workers_ratio_scatters.save('/Users/e.schwartz/Documents/GitHub/ia_scanning_labor_data/center_visuals/pages_scanned_to_workers_ratio_scatter.json')
pages_scanned_to_workers_ratio_scatters.save('/Users/e.schwartz/Documents/GitHub/ia_scanning_labor_data/center_visuals/pages_scanned_to_workers_ratio_scatter.html')


In [None]:
pages_scanned_to_workers_ratio_scatters

In [None]:
selector = alt.selection_point(encodings=['color'])
alt.Chart(scans_workers.loc[scans_workers['name']=='Innodata Knowledge Services, Inc.']).mark_circle().encode(
    x='month_year:N',
    y='pages_to_workers:Q'

#     color = alt.condition(selector, 'name:N', alt.value('lightgray'))
).add_params(selector).interactive()

In [None]:
scans_workers.loc[scans_workers['name']=='UIUC']

In [None]:
selector = alt.selection_point(encodings=['color'])
alt.Chart(scans_workers.loc[scans_workers['name']=='UIUC']).mark_circle().encode(
    x='month_year:N',
    y='pages_to_workers:Q',
    color = alt.condition(selector, 'name:N', alt.value('lightgray'))
).add_params(selector).interactive()

In [None]:
import altair as alt
from vega_datasets import data

source = data.barley()

alt.Chart(source).mark_bar().encode(
    x='variety',
    y='sum(yield)',
    color='site'
)

limear regression 

plotting a line for each center 

x=time 

y= books scanned 


this is a significant difference 

trivariate regression 


1. throw out bulk uploaders partner
2. get daily scans and workers'
3. send to dust 




In [None]:
pd.read_csv("https://raw.githubusercontent.com/ers6/ia_scanning_labor_data/main/csv_files/scans_and_workers_month_stats.csv")

In [None]:
scans_month_data = pd.read_csv("https://raw.githubusercontent.com/ers6/ia_scanning_labor_data/main/csv_files/scans_and_workers_month_stats.csv")
allen_county = scans_month_data.loc[scans_month_data['name']== 'Allen County Public Library Geneaology Center']



alt.Chart(allen_county).mark_circle(size=60).encode(
    x='month_year',
    y='pages_to_workers'
  
    
).interactive()

In [None]:


input_dropdown = alt.binding_select(options=['Europe','Japan','USA'], name='Region ')
selection = alt.selection_point(fields=['Scanning Centers'], bind=input_dropdown)
color = alt.condition(
    selection,
    alt.Color('Origin:N').legend(None),
    alt.value('lightgray')
)

source = data.cars()

alt.Chart(source).mark_circle(size=60).encode(
    x='Horsepower',
    y='Miles_per_Gallon',
    color=color,
 
).add_params(
    selection
)

In [None]:
scans_month_data['name'].unique().tolist()




input_dropdown = alt.binding_select(options=scans_month_data['name'].unique().tolist(), name='scanning centers')
# selection = alt.selection_point(fields=['name'], bind=input_dropdown)
selection = "Yiddish Book Center"
color = alt.condition(
    selection,
    alt.Color('name:N').legend(None),
    alt.value('lightgray')
)



chart = alt.Chart(scans_month_data).mark_circle(size=60).encode(
    x='month_year:N',
    y='pages_to_workers:Q',
    color=color
).add_params(selection)



chart.save('/Users/e.schwartz/Documents/GitHub/ia_scanning_labor_data/troubleshoot_viz.json')

chart


In [None]:
input_dropdown = alt.binding_select(options=scans_month_data['name'].unique().tolist(), name='scanning centers')
selection = alt.selection_point(fields=['name'], bind=input_dropdown)
color = alt.condition(
    selection,
    alt.Color('name:N').legend(None),
    alt.value('lightgray')
)

alt.Chart(scans_month_data).mark_line().encode(
    x='month_year:N',
    y='pages_to_workers:Q'
  
    
).add_params(selection)

In [None]:
# putting the bar chart and the map together into a dash
from vega_datasets import data

selector=alt.selection_single(fields=['long', 'lat'], on='click', nearest=True)

# import point data
scan_center_url = "https://raw.githubusercontent.com/scanninglabor/IAScanningLabor/main/code/scans_per_center_per_year.csv"
scan_center_url = "https://raw.githubusercontent.com/ers6/ia_scanning_labor_data/main/csv_files/scans_per_center_per_month.csv"

# globe background

world = alt.topo_feature(data.world_110m.url, 'countries')

# US states background
background = alt.Chart(world, title='Internet Archive Scanning Centers').mark_geoshape(
    fill='lightgray',
    stroke='white'
).properties(
    width=800,
    height=600
).project('equalEarth')


# scan center locations

points = alt.Chart(scan_center_url).mark_circle(
    size=20,
    color='blue'
).encode(
    longitude='long:Q',
    latitude='lat:Q', 
    tooltip=['name:N']
).add_params(selector)

this_map = background + points


# making a bar chart for scans over time 

scans = alt.Chart(scan_center_url).mark_bar(
    color='blue', 
#     interpolate='step-after', 
    line=True).encode(
    x=alt.X('yearmonth(date):T', axis=alt.Axis(title="Months")),
    y=alt.Y('count:Q', axis=alt.Axis(title="Books Scanned")),
    tooltip = ['date:T', 'count:Q']
).transform_filter(selector).interactive()

geodash = this_map | scans

geodash

In [None]:
# cebu pages to workers analysis
stats = pd.read_csv("https://raw.githubusercontent.com/ers6/ia_scanning_labor_data/main/csv_files/scans_and_workers_month_stats.csv")
cebu_stats = stats.loc[stats['name'] == 'Innodata Knowledge Services, Inc.']
selection = alt.selection_point(fields=['name'], bind='legend')

cebu_scatters = alt.Chart(cebu_stats).mark_circle().encode(
    x= alt.X('month_year:T', axis=alt.Axis(labelAngle=-4), title="Months"),
    y= alt.Y('pages_to_workers:Q', title="Ratio of Pages Scanned to Workers"),
    color=alt.Color('name:N', legend=alt.Legend(columns=8, symbolLimit=0)),
    order=alt.Order('name:N',sort='ascending'),
    opacity=alt.condition(selection, alt.value(1), alt.value(0)),
    tooltip=['name:N', 'pages_to_workers:Q', 'month_year:T']
).add_params(selection).configure_legend(
  orient='bottom'
).properties(
    # Adjust chart width and height to match size of legend
    width=2000,
    height=400
).interactive()

In [None]:
cebu_scatters

In [None]:
# alberta center 
stats = pd.read_csv("https://raw.githubusercontent.com/ers6/ia_scanning_labor_data/main/csv_files/scans_and_workers_month_stats.csv")
alberta_stats = stats.loc[stats['name'] == 'University of Alberta']
selection = alt.selection_point(fields=['name'], bind='legend')

alberta_scatters = alt.Chart(alberta_stats).mark_circle().encode(
    x= alt.X('month_year:T', axis=alt.Axis(labelAngle=-4), title="Months"),
    y= alt.Y('pages_to_workers:Q', title="Ratio of Pages Scanned to Workers"),
    color=alt.Color('name:N', legend=alt.Legend(columns=8, symbolLimit=0)),
    order=alt.Order('name:N',sort='ascending'),
    opacity=alt.condition(selection, alt.value(1), alt.value(0)),
    tooltip=['name:N', 'pages_to_workers:Q', 'month_year:T']
).add_params(selection).configure_legend(
  orient='bottom'
).properties(
    # Adjust chart width and height to match size of legend
    width=2000,
    height=400
).interactive()

In [None]:
alberta_scatters