In [1]:
import pandas as pd
import numpy as np
import altair as alt
from vega_datasets import data

# stats to calculate
- ## outsourcing:
  - percentage of books scanned at BPOs per year - done!
  - total weight of shipments
- ## pay
    - calculations on maximum wages workers at cebu could have made based on number of workers there per month and the contract IA had with Innodata at the time (at 1.5million per year / 12 months with x workers working per month, that leaves what max monthly wage).
- ## rates of work:
  - as measure of pages scanned to workers working per center per month - done! 
  - avergage pages scanned to workers working per center + standard deviation 
- ## turn over rates- how long do operators stay at their jobs across centers
  - avg. days worked by workers at each center + standard deviation - will need to throw out lots of junk

In [16]:
# accessing the geocoded texts dataset from my external hard drive. You can get it from this box link: https://wustl.account.box.com/login?redirect_url=%2Ffolder%2F271951204450&logout=true
texts = pd.read_csv('/Volumes/Samsung_T5/scanning_labor_in_IA/geocoded-texts-data.csv', low_memory=False)
texts = texts.drop(columns=['Unnamed: 0.3', 'Unnamed: 0.2', 'Unnamed: 0.1', 'Unnamed: 0'])


In [43]:
len(texts)

7235237

In [2]:
dir = "/Users/elizabethschwartz/Documents/GitHub/ia_scanning_labor_data/metadata-analysis/metadata-records-analysis-csvs/"

In [3]:
img_dir = "/Users/elizabethschwartz/Documents/GitHub/ia_scanning_labor_data/debates_in_dh_chapter/chapter_visuals/"

In [2]:
# need to remove texts that are microfilm or microfiche as these are scanned using different machinery/their scanning is more automated
# drop records if scanner contains "microfilm", "microfiche", "Internet Archive Python Library", "-mf" 
# there are other ambiguous scanner fields, but based on spot checking about 10 of them, they are often scribe machines with
# very few scans attached to them. so they won't substantially muddy the data
# some of the records don't have scanner info attached to them, but they do have operator names that indicate they're microfilm/microfiche 
# for example, microfilm_processor@archive.org. We exclude these as well. 
# removed at total of 1794175 scans from the dataset for these measures because they're scanned by microfilm 

def weed_microfilm(ia_df):
    search_strs = ["microfilm", "microfiche", "-mf", "Internet Archive"]
    scanners = ia_df['scanner'].drop_duplicates().tolist()
    scribes = []
    data = []
    for scanner in scanners: 
        hit = True
        for search_str in search_strs: 
            if search_str in str(scanner): 
                hit = False
            else: pass
        if hit: 
            scribes.append(scanner)
        else: pass
    for i in range(len(ia_df)): 
        if str(ia_df.at[i,'scanner']) in scribes: 
                if "microfilm" in str(ia_df.at[i,'operator']):
                    pass
                else:
                    data.append(ia_df.iloc[i].to_dict())
        else: pass

    return pd.DataFrame(data)

# Outsourcing

In [48]:
# scans per center per month
texts = weed_microfilm(texts)
texts['month_year'] = pd.to_datetime(texts['scandate']).dt.to_period('M').astype('str')
texts['year'] = pd.to_datetime(texts['scandate']).dt.to_period('Y').astype('str')
scans_per_center = texts[['name', 'month_year', 'identifier']]
scans_per_center = scans_per_center.groupby(['name', 'month_year']).count().reset_index()
scans_per_center_month = scans_per_center.loc[scans_per_center['month_year'] != 'NaT']
scans_per_center_month = scans_per_center_month.rename(columns={'name':'name', 'month_year': 'month_year', 'identifier':'books_scanned'})
scans_per_center_month.to_csv(dir + 'scans_per_center_per_month.csv')

In [50]:
yearly_scans = texts[['name', 'year', 'identifier']]

In [22]:
selection = alt.selection_point(fields=['name'], bind='legend')

scans_per_month_chart = alt.Chart("https://raw.githubusercontent.com/ers6/ia_scanning_labor_data/main/metadata-analysis/metadata-records-analysis-csvs/scans_per_center_per_month.csv").mark_bar().encode(
    x= alt.X('month_year:T', axis=alt.Axis(labelAngle=-4), title="Months"),
    y= alt.Y('books_scanned:Q', title="Books Scanned"),
    # color=alt.Color('name:N', legend=alt.Legend(columns=8, symbolLimit=0)),
    order=alt.Order('name:N',sort='ascending'),
#     opacity=alt.condition(selection, alt.value(1), alt.value(0.15)),
    color=alt.condition(selection, alt.Color('name:N', legend=alt.Legend(columns=8, symbolLimit=0)), alt.value("black"))
    
).add_params(selection).configure_legend(
  orient='bottom'
).properties(
    # Adjust chart width and height to match size of legend
    width=600,
    height=400
)

scans_per_month_chart

In [23]:
# pages per center per month - excluding microfilm + microfiche 
pages_per_center = texts[['name', 'month_year', 'imagecount', 'operator', 'scanner']]
# pages_per_center = weed_microfilm(pages_per_center)
pages_per_center = pages_per_center.loc[pages_per_center['month_year'] != 'NaT']
pages_per_center = pages_per_center[['name', 'month_year', 'imagecount']] 
pages_per_center_month = pages_per_center.groupby(['name', 'month_year']).sum().reset_index()
pages_per_center_month = pages_per_center_month.rename(columns={'name':'name', 'month_year': 'month_year', 'imagecount':'pages_scanned'})
pages_per_center_month.to_csv(dir+ 'pages_per_center_per_month')


In [24]:
pages_per_center_month

Unnamed: 0,name,month_year,pages_scanned
0,Allen County Public Library Geneaology Center,2002-01,1366.0
1,Allen County Public Library Geneaology Center,2008-05,37763.0
2,Allen County Public Library Geneaology Center,2008-06,240186.0
3,Allen County Public Library Geneaology Center,2008-07,274770.0
4,Allen County Public Library Geneaology Center,2008-08,257029.0
...,...,...,...
4760,Yiddish Book Center,2013-05,6893.0
4761,Yiddish Book Center,2013-06,8322.0
4762,Yiddish Book Center,2013-07,1957.0
4763,Yiddish Book Center,2013-08,1009.0


In [14]:
centers = centers[['lat', 'long', 'name']]
centers = centers.drop_duplicates()

In [15]:
counts = pd.read_csv("https://raw.githubusercontent.com/ers6/ia_scanning_labor_data/main/metadata-analysis/metadata-records-analysis-csvs/total_scans_per_center.csv")
counts = counts.rename(columns={"scan_center":"name", "total_scans":"total_scans", "percent_of_total":"percent_of_total"})

In [20]:
geocode_counts  = pd.merge(counts, centers, how='inner', on=['name'])

In [21]:
geocode_counts

Unnamed: 0.1,Unnamed: 0,name,total_scans,percent_of_total,lat,long
0,0,1 Dollar Scan,20,0.0002214984%,37.374551,-121.909902
1,1,Allen County Public Library Geneaology Center,214706,2.3778514038%,41.077732,-85.142248
2,2,American Museum of Natural History,29,0.0003211726%,40.781925,-73.972465
3,3,American Numismatic Society,15554,0.1722592789%,40.724013,-74.006444
4,4,American Printing House for the Blind,2326,0.0257602599%,38.257598,-85.714221
...,...,...,...,...,...,...
78,73,University of Victoria,6843,0.0757856658%,48.463681,-123.309665
79,74,University of Warwick,715,0.0079185666%,52.381916,-1.562005
80,75,Washington University in St. Louis,18103,0.2004892456%,38.650147,-90.328389
81,76,Yiddish Book Center,1242,0.0137550485%,42.321888,-72.527668


In [66]:
# geocoding isn't right - there are 2 datum datas and JHU is in antarctica 

centers = pd.read_csv("https://raw.githubusercontent.com/ers6/ia_scanning_labor_data/main/metadata-analysis/metadata-records-analysis-csvs/location_key.csv")
countries = alt.topo_feature(data.world_110m.url, 'countries')

background = alt.Chart(countries).mark_geoshape(
    fill='lightgrey',
    stroke='white'
).project(
    "equirectangular"
).properties(
    width=500,
    height=300
)

points = alt.Chart(geocode_counts).mark_circle().encode(
    longitude='long:Q',
    latitude='lat:Q',
    size='total_scans:Q',
    color = alt.Color('total_scans:Q').scale(scheme="turbo"),
    tooltip=['name:N']

)


geo_chart = background + points
geo_chart.save(img_dir+"center_map.html")
geo_chart.save(img_dir+"center_map.json")
geo_chart.save(img_dir+"center_map.png")
geo_chart

## Annual Counts + Percentage of Scans by Center Type

In [12]:
scans = scans_per_center_month[['name', 'month_year', 'books_scanned']]

In [13]:
scans['center_type'] = ''
scans = scans.reset_index()

In [14]:
type_centers = pd.read_csv("https://raw.githubusercontent.com/ers6/ia_scanning_labor_data/main/metadata-analysis/metadata-records-analysis-csvs/scan-center-type.csv")

In [15]:
for i in range(len(scans)):
    for j in range(len(type_centers)): 
        if type_centers.at[j, 'name'] == scans.at[i, 'name']: 
            scans.at[i, 'center_type'] = type_centers.at[j, 'type']
        else: pass

In [16]:
scans_type_month = scans.groupby(['month_year', 'center_type']).sum().reset_index()

In [17]:
scans_type_month.to_csv(dir+'scans_per_center_type_per_month.csv')

In [3]:

# selection = alt.selection_point(fields=['name'], bind='legend')

scans_per_month_type_chart = alt.Chart("https://raw.githubusercontent.com/ers6/ia_scanning_labor_data/main/metadata-analysis/metadata-records-analysis-csvs/scans_per_center_type_per_month.csv").mark_bar().encode(
    x= alt.X('month_year:T', axis=alt.Axis(labelAngle=-4), title="Months"),
    y= alt.Y('books_scanned:Q', title="Books Scanned"),
    color=alt.Color('center_type:N', title="Center Type", legend=alt.Legend(columns=1, symbolLimit=0)),
    order=alt.Order('center_type:N',sort='ascending'),
#     opacity=alt.condition(selection, alt.value(1), alt.value(0.15)),
    # color=alt.condition(selection, alt.Color('center_type:N', legend=alt.Legend(columns=8, symbolLimit=0)), alt.value("black"))
    
).add_params().configure_legend(
  orient='right'
).properties(
    # Adjust chart width and height to match size of legend
    width=600,
    height=400
)

scans_per_month_type_chart.save('/Users/elizabethschwartz/Documents/GitHub/ia_scanning_labor_data/debates_in_dh_chapter/chapter_visuals/scans_per_center_type_per_month.html')
scans_per_month_type_chart.save('/Users/elizabethschwartz/Documents/GitHub/ia_scanning_labor_data/debates_in_dh_chapter/chapter_visuals/scans_per_center_type_per_month.json')
scans_per_month_type_chart.save('/Users/elizabethschwartz/Documents/GitHub/ia_scanning_labor_data/debates_in_dh_chapter/chapter_visuals/scans_per_center_type_per_month.png')
scans_per_month_type_chart

In [19]:

condition = ['Innodata Knowledge Services, Inc.', 'Datum Data Co. Ltd.',  'Hong Kong']

bpo_scans_vs_total_over_time = alt.Chart("https://raw.githubusercontent.com/ers6/ia_scanning_labor_data/main/metadata-analysis/metadata-records-analysis-csvs/pages_per_center_per_month.csv").mark_bar().encode(
    x= alt.X('month_year:T', axis=alt.Axis(labelAngle=-4), title="Months"),
    y= alt.Y('pages_scanned:Q', title="Books Scanned"),
    order=alt.Order('name:N'),
    color= alt.condition(alt.FieldOneOfPredicate(field='name', oneOf=condition), alt.value('red'), alt.value('black'))
).configure_legend(
  orient='bottom'
).properties(
    # Adjust chart width and height to match size of legend
    width=600,
    height=400
).properties(
    title='Pages Scanned at Outsourced Centers vs. All Books in the Dataset'
)

# bpo_scans_vs_total_over_time.save('/Users/elizabethschwartz/Documents/GitHub/ia_scanning_labor_data/debates_in_dh_chapter/chapter_visuals/bpo_scans_vs_total_over_time.html')
# bpo_scans_vs_total_over_time.save("/Users/elizabethschwartz/Documents/GitHub/ia_scanning_labor_data/debates_in_dh_chapter/chapter_visuals/bpo_scans_vs_total_over_time.json")

bpo_scans_vs_total_over_time.save(img_dir+"bpo_scans_vs_total_over_time.html")
bpo_scans_vs_total_over_time.save(img_dir+"bpo_scans_vs_total_over_time.json")
bpo_scans_vs_total_over_time.save(img_dir+"bpo_scans_vs_total_over_time.png")
bpo_scans_vs_total_over_time

In [20]:
scans_type_year = scans_type_month.groupby(['center_type','month_year']).sum().reset_index()

In [21]:
scans_type_year['year'] = ""

for i in range(len(scans_type_year)):
    scans_type_year.at[i, 'year'] = str(scans_type_year.at[i,'month_year']).split('-')[0]

In [22]:
scans_type_year = scans_type_year.groupby(['center_type','year']).sum().reset_index()[['center_type', 'year', 'books_scanned']]

In [23]:
scans_per_year = scans_type_year.groupby(['year']).sum().reset_index()[['year','books_scanned']]
scans_per_year.loc[scans_per_year['year']== '2010']['books_scanned'].tolist()[0]

248956

In [24]:
scans_type_year['percent'] = ''

for i in range(len(scans_type_year)):
    scans_type_year.at[i, 'percent'] = scans_type_year.at[i, 'books_scanned'] / scans_per_year.loc[scans_per_year['year'] == scans_type_year.at[i, 'year']]['books_scanned'].tolist()[0]

 

In [25]:
scans_type_year['center_type'].tolist()
this_dict = {'year':"",'academic':"", 'bpo':"", 'government':"", 'hq':"", 'museum':"", 'public':"", 'total_books':""}
year_list = []
for i in range(20): 
    year_list.append(this_dict)


In [26]:
def get_center_count(center_type, year): 
    try:
        result = scans_type_year.loc[(scans_type_year['center_type'] == center_type) & (scans_type_year['year'] == year)].reset_index().to_dict()['books_scanned'][0]
    except KeyError: 
        result = 0
    return result


def get_center_percent(center_type, year):
    try:
        result = scans_type_year.loc[(scans_type_year['center_type'] == center_type) & (scans_type_year['year'] == year)].reset_index().to_dict()['percent'][0]
        result = f"{result:.2%}"
    except KeyError: 
        result = 0
    return result


In [27]:
get_center_count('academic', '1998')

0

In [28]:
the_years = scans_per_year['year'].tolist()
yearly_counts = []
for this_year in the_years: 
    yearly_counts.append({'year': this_year,
                         'academic': get_center_count('academic', this_year),
                         'bpo': get_center_count('bpo', this_year), 
                         'government': get_center_count('government', this_year), 
                         'hq': get_center_count('hq', this_year), 
                         'museum': get_center_count('museum', this_year),
                         'public': get_center_count('public', this_year), 
                         'total_books':""})

yearly_percents = []
for this_year in the_years: 
    yearly_percents.append({'year': this_year,
                         'academic': get_center_percent('academic', this_year),
                         'bpo': get_center_percent('bpo', this_year), 
                         'government': get_center_percent('government', this_year), 
                         'hq': get_center_percent('hq', this_year), 
                         'museum': get_center_percent('museum', this_year),
                         'public': get_center_percent('public', this_year)})
  

In [29]:
the_years = scans_per_year['year'].tolist()

In [30]:
scans_type_year.loc[(scans_type_year['center_type'] == 'academic') & (scans_type_year['year'] == '2012')]

Unnamed: 0,center_type,year,books_scanned,percent
17,academic,2012,71500,0.207611


In [31]:
yearly_percents

[{'year': '2001',
  'academic': '100.00%',
  'bpo': 0,
  'government': 0,
  'hq': 0,
  'museum': 0,
  'public': 0},
 {'year': '2002',
  'academic': '97.91%',
  'bpo': 0,
  'government': '0.93%',
  'hq': 0,
  'museum': '0.23%',
  'public': '0.93%'},
 {'year': '2006',
  'academic': '58.94%',
  'bpo': 0,
  'government': 0,
  'hq': '41.06%',
  'museum': 0,
  'public': 0},
 {'year': '2007',
  'academic': '68.21%',
  'bpo': 0,
  'government': '0.25%',
  'hq': '28.47%',
  'museum': '0.57%',
  'public': '2.50%'},
 {'year': '2008',
  'academic': '63.90%',
  'bpo': 0,
  'government': '9.90%',
  'hq': '11.78%',
  'museum': '0.59%',
  'public': '13.83%'},
 {'year': '2009',
  'academic': '52.39%',
  'bpo': '0.00%',
  'government': '19.39%',
  'hq': '9.99%',
  'museum': '0.64%',
  'public': '17.45%'},
 {'year': '2010',
  'academic': '48.57%',
  'bpo': '2.30%',
  'government': '10.30%',
  'hq': '23.76%',
  'museum': '0.31%',
  'public': '14.63%'},
 {'year': '2011',
  'academic': '28.91%',
  'bpo': '4

In [32]:
yearly_counts_df = pd.DataFrame.from_records(yearly_counts) 
yearly_counts_df


Unnamed: 0,year,academic,bpo,government,hq,museum,public,total_books
0,2001,3,0,0,0,0,0,
1,2002,421,0,4,0,1,4,
2,2006,37107,0,0,25845,0,0,
3,2007,108070,0,402,45098,899,3963,
4,2008,162414,0,25163,29933,1510,35165,
5,2009,115643,11,42811,22049,1414,38515,
6,2010,120923,5714,25634,59152,784,36433,
7,2011,85211,130389,30253,24,465,44821,
8,2012,71500,188777,34341,685,282,39506,
9,2013,74593,20919,48684,64,1819,35295,


In [33]:
yearly_percents_df = pd.DataFrame.from_records(yearly_percents) 

In [34]:
yearly_percents_df

Unnamed: 0,year,academic,bpo,government,hq,museum,public
0,2001,100.00%,0,0,0,0,0
1,2002,97.91%,0,0.93%,0,0.23%,0.93%
2,2006,58.94%,0,0,41.06%,0,0
3,2007,68.21%,0,0.25%,28.47%,0.57%,2.50%
4,2008,63.90%,0,9.90%,11.78%,0.59%,13.83%
5,2009,52.39%,0.00%,19.39%,9.99%,0.64%,17.45%
6,2010,48.57%,2.30%,10.30%,23.76%,0.31%,14.63%
7,2011,28.91%,44.23%,10.26%,0.01%,0.16%,15.20%
8,2012,20.76%,54.81%,9.97%,0.20%,0.08%,11.47%
9,2013,40.98%,11.49%,26.75%,0.04%,1.00%,19.39%


In [35]:
yearly_counts_df.to_csv(dir + 'count_books_scanned_per_year_per_type.csv')
yearly_percents_df.to_csv(dir + 'percentage_books_scanned_per_year_per_type.csv')


In [4]:
percents = pd.read_csv("https://raw.githubusercontent.com/ers6/ia_scanning_labor_data/main/metadata-analysis/metadata-records-analysis-csvs/percentage_books_scanned_per_year_per_type.csv")

In [7]:

source = pd.DataFrame({"category": [1, 2, 3, 4, 5, 6], "value": [4, 6, 10, 3, 7, 8]})

source

Unnamed: 0,category,value
0,1,4
1,2,6
2,3,10
3,4,3
4,5,7
5,6,8


## Overall Count + Percent of Books Scanned by Center Type

In [36]:
scans_type_year

total_scans_by_type =  scans_type_year.groupby(['center_type']).sum().reset_index()[['center_type','books_scanned']]

In [37]:
total_scans_by_type

Unnamed: 0,center_type,books_scanned
0,,15540
1,academic,1229836
2,archive,20359
3,bpo,4048396
4,government,679532
5,hq,184076
6,museum,35328
7,public,468129


In [40]:
total_scans_by_type['percent']= total_scans_by_type['books_scanned']/total_scans_by_type['books_scanned'].sum() * 100

In [41]:
total_scans_by_type.to_csv(dir+ 'total_scans_by_center_type.csv')

In [42]:
total_scans_by_type

Unnamed: 0,center_type,books_scanned,percent
0,,15540,0.232593
1,academic,1229836,18.407423
2,archive,20359,0.304721
3,bpo,4048396,60.593882
4,government,679532,10.170814
5,hq,184076,2.755135
6,museum,35328,0.528768
7,public,468129,7.006665


## Bills of Lading
Bills of Lading record shipments of goods from overseas to the US. Internet Archive ships books to BPOs overseas for digitization and then back to the US for storage. 

In [7]:
bols = pd.read_csv("https://raw.githubusercontent.com/ers6/ia_scanning_labor_data/main/ia_bols/combined_ia_bols_manual_dedupe%20-%20deduped_results.csv")

In [26]:
shipped_weight = bols['weight_kg'].sum()

In [27]:
datum_data = bols.loc[(bols['supplier']=='Datum Data Co Ltd') & ((bols['company_location_id'] == '300funston') | (bols['company_location_id'] == '2512florida'))]

In [28]:
datum_data['weight_kg'].sum()

np.int64(235324)

In [57]:
yearly_scans.loc[texts['name']== 'Datum Data Co. Ltd.']['year'].drop_duplicates().tolist()

datum_scans = yearly_scans.loc[texts['name']== 'Datum Data Co. Ltd.'] 
datum_scans.loc[(datum_scans['year'] != '2017') & (datum_scans['year'] != '2019') & 
(datum_scans['year'] != '2020') & 
(datum_scans['year'] != '2021') &
(datum_scans['year'] != '2022') &
(datum_scans['year'] != '2023') &
(datum_scans['year'] != 'NaT')]


Unnamed: 0,name,year,identifier
0,Datum Data Co. Ltd.,2013,geometrysuccessi00thom
1,Datum Data Co. Ltd.,2012,berdiemobilittde00hebe
2,Datum Data Co. Ltd.,2011,beilsteinshandb28beil
3,Datum Data Co. Ltd.,2012,theoryapplicatio00adva
4,Datum Data Co. Ltd.,2012,specialeditionus00edbo_0
...,...,...,...
402249,Datum Data Co. Ltd.,2012,shadowoftreasonn00goye
402250,Datum Data Co. Ltd.,2014,mirrorsofselfarc00chri
402251,Datum Data Co. Ltd.,2012,dajasbook00pier
402252,Datum Data Co. Ltd.,2011,alternativeheali00burr


In [59]:
hk_scans = texts.loc[texts['name'] == 'Hong Kong']

In [61]:
hk_scans.loc[(hk_scans['donor'] == 'bostonpubliclibrary') | (hk_scans['donor'] == 'Boston Public')]


Unnamed: 0,identifier,scandate,sponsor,scanner,donor,imagecount,operator,shipping_container,mediatype,scanningcenter,name,lat,long,month_year,year
402260,isbn_9785802901045,2017-02-14 15:26:34,Kahle/Austin Foundation,ttScribe9.hongkong.archive.org,bostonpubliclibrary,282.0,associate-chunlai-li@archive.org,SZ0024,texts,hongkong,Hong Kong,22.31836625,114.181248,2017-02,2017
402276,365funniestgolfj00gefe,2017-03-19 07:00:43,Kahle/Austin Foundation,ttscribe7.hongkong.archive.org,bostonpubliclibrary,278.0,associate-hongmei-liang@archive.org,SZ0023,texts,hongkong,Hong Kong,22.31836625,114.181248,2017-03,2017
402283,yogiberrasbaseba00yogi,2017-11-20 08:52:41,Kahle/Austin Foundation,ttscribe17.hongkong.archive.org,bostonpubliclibrary,86.0,associate-chuiyuk-cheung@archive.org,,texts,hongkong,Hong Kong,22.31836625,114.181248,2017-11,2017
402286,wheelestaterised00wall,2017-12-22 03:47:09,Kahle/Austin Foundation,ttscribe25.hongkong.archive.org,bostonpubliclibrary,314.0,associate-manyee-fung@archive.org,,texts,hongkong,Hong Kong,22.31836625,114.181248,2017-12,2017
402294,sevenislands00godd,2018-01-22 05:54:40,Kahle/Austin Foundation,ttscribe20.hongkong.archive.org,bostonpubliclibrary,184.0,associate-manyee-fung@archive.org,,texts,hongkong,Hong Kong,22.31836625,114.181248,2018-01,2018
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695690,dignityessential00hick,2017-03-02 08:55:44,Kahle/Austin Foundation,ttscribe8.hongkong.archive.org,bostonpubliclibrary,250.0,associate-yaqin-wen@archive.org,SZ0023,texts,hongkong,Hong Kong,22.31836625,114.181248,2017-03,2017
695694,mrsjeffriestakes00brig_1,2017-06-15 12:04:56,Kahle/Austin Foundation,ttscribe20.hongkong.archive.org,bostonpubliclibrary,278.0,associate-taobin@archive.org,SZ0025,texts,hongkong,Hong Kong,22.31836625,114.181248,2017-06,2017
695696,gospelaccordingt00gilm,2016-12-13 04:05:22,Kahle/Austin Foundation,ttscribe7.hongkong.archive.org,bostonpubliclibrary,222.0,operator7.hongkong@archive.org,SZ0023,texts,hongkong,Hong Kong,22.31836625,114.181248,2016-12,2016
695697,laescuelademagia00ende,2018-08-11 20:14:04,Kahle/Austin Foundation,ttscribe24.hongkong.archive.org,bostonpubliclibrary,170.0,associate-lansze-wong@archive.org,,texts,hongkong,Hong Kong,22.31836625,114.181248,2018-08,2018


In [63]:
len(hk_scans)

293454

In [67]:
texts.loc[texts['name']== 'Innodata Knowledge Services, Inc.']

Unnamed: 0,identifier,scandate,sponsor,scanner,donor,imagecount,operator,shipping_container,mediatype,scanningcenter,name,lat,long,month_year,year
776307,thankheavenforte0000unse,2023-01-06 08:51:04,Kahle/Austin Foundation,station40.cebu.archive.org,,66.0,associate-jeneil-colonia@archive.org,,texts,cebu,"Innodata Knowledge Services, Inc.",10.31803327,123.920545,2023-01,2023
776308,theoryofelastici0007unse,2022-06-02 15:55:38,Kahle/Austin Foundation,station43.cebu.archive.org,,154.0,associate-irene-getutua@archive.org,,texts,cebu,"Innodata Knowledge Services, Inc.",10.31803327,123.920545,2022-06,2022
776309,sim_detroit-monthly_1984-10_7_10,,Kahle/Austin Foundation,SCAN02.cebu.archive.org,,199.0,,,texts,cebu,"Innodata Knowledge Services, Inc.",10.31803327,123.920545,NaT,NaT
776310,dejosuecronicas0000furs,2022-11-15 13:50:45,Kahle/Austin Foundation,station34.cebu.archive.org,,142.0,associate-markriel-dingcong@archive.org,,texts,cebu,"Innodata Knowledge Services, Inc.",10.31803327,123.920545,2022-11,2022
776311,cheeselandnovel0000rich,2022-06-15 23:25:01,Kahle/Austin Foundation,station42.cebu.archive.org,,216.0,associate-jobert-apor@archive.org,,texts,cebu,"Innodata Knowledge Services, Inc.",10.31803327,123.920545,2022-06,2022
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4591700,henryribsy0000clea_w6l2,2021-03-29 02:10:00,Kahle/Austin Foundation,station44.cebu.archive.org,,214.0,associate-jessanen-becari@archive.org,,texts,cebu,"Innodata Knowledge Services, Inc.",10.31803327,123.920545,2021-03,2021
4591701,sprinthowtosolve0000unse,2022-03-04 13:24:01,Kahle/Austin Foundation,station43.cebu.archive.org,,294.0,associate-jeneil-colonia@archive.org,,texts,cebu,"Innodata Knowledge Services, Inc.",10.31803327,123.920545,2022-03,2022
4591702,moralistsmoderni0000mint,2019-01-26 04:19:55,Kahle/Austin Foundation,station02.cebu.archive.org,,214.0,associate-carla-igot@archive.org,,texts,cebu,"Innodata Knowledge Services, Inc.",10.31803327,123.920545,2019-01,2019
4591703,americanshogunma0000harv_y5p6,2022-03-25 23:48:02,Kahle/Austin Foundation,station65.cebu.archive.org,,502.0,associate-irene-capada@archive.org,,texts,cebu,"Innodata Knowledge Services, Inc.",10.31803327,123.920545,2022-03,2022


In [71]:
bols.loc[bols['supplier'] == 'Innodata Knowledge Services Inc']['weight_kg'].sum()

np.int64(1316172)

In [74]:
bols.loc[ (bols['supplier'] == 'Internet Archive China') | (bols['supplier'] == 'Internet Archive')]

Unnamed: 0.1,Unnamed: 0,source,arrival_date,company,company_location_id,company_address,company_lat,company_lon,supplier,supplier_location_id,...,hs_code_detail,bol,port_entry,port_entry_code,port_entry_lat,port_entry_lon,shipping_port,shipping_port_code,shipping_port_lar,shipping_port_lon
10,10,combo,2015-05-10,Internet Archive,2512florida,2512 Florida Ave Richmond Ca94804 Usa,37.929504,-122.345987,Internet Archive China,18excellence,...,Electrical machinery and equipment and parts t...,CHKMSOA504054103,"Oakland, Ca",2811.0,37.796506,-122.279134,Hong Kong,58201.0,22.33801,114.130381
11,11,combo,2015-08-09,Internet Archive,2512florida,2512 Florida Ave Richmond Ca94804 Usa,37.929504,-122.345987,Internet Archive China,18excellence,...,Electrical machinery and equipment and parts t...,CHKMSOA507013517,"Oakland, Ca",2811.0,37.796506,-122.279134,Hong Kong,58201.0,22.33801,114.130381
13,13,combo,2016-06-26,Internet Archive,2512florida,2512 Florida Ave Richmond Ca94804 Usa,37.929504,-122.345987,Internet Archive China,18excellence,...,Electrical machinery and equipment and parts t...,CHKMSOA605072810,"Oakland, Ca",2811.0,37.796506,-122.279134,Hong Kong,58201.0,22.33801,114.130381
15,15,combo,2017-02-13,Internet Archive,380carlson,380 Carlson Blvd Richmond Ca 94084,37.929504,-122.345987,Internet Archive,veristrong_industrial,...,"Printed books, newspapers, pictures and other ...",CCLTHKGCS027507,"Oakland, Ca",2811.0,37.796506,-122.279134,Hong Kong,58201.0,22.33801,114.130381
16,16,combo,2017-06-02,Internet Archive,380carlson,380 Carlson Bldv Richmond Ca 94084,37.929504,-122.345987,Internet Archive,veristrong_industrial,...,"Printed books, newspapers, pictures and other ...",CCLTHKGCS028345,"Oakland, Ca",2811.0,37.796506,-122.279134,Hong Kong,58201.0,22.33801,114.130381
17,17,combo,2017-09-29,Internet Archive,380carlson,380 Carlson Bldv Richmond Ca 94084,37.929504,-122.345987,Internet Archive,veristrong_industrial,...,"Printed books, newspapers, pictures and other ...",CCLTHKGCS029676,"Oakland, Ca",2811.0,37.796506,-122.279134,Hong Kong,58201.0,22.33801,114.130381
18,18,combo,2018-01-06,Internet Archive,380carlson,380 Carlson Bldv Richmond Ca 94084,37.929504,-122.345987,Internet Archive,veristrong_industrial,...,"Printed books, newspapers, pictures and other ...",CCLTHKGCS030413,"Oakland, Ca",2811.0,37.796506,-122.279134,Hong Kong,58201.0,22.33801,114.130381
20,20,combo,2018-02-23,Internet Archive,380carlson,380 Carlson Blvd Richmond Ca 94084,37.927437,-122.337982,Internet Archive,veristrong_industrial,...,"Printed books, newspapers, pictures and other ...",CCLTHKGCS030902,"Oakland, Ca",2811.0,37.796506,-122.279134,Hong Kong,58201.0,22.33801,114.130381
21,21,combo,2018-05-11,Internet Archive,380carlson,380 Carlson Blvd Richmond Ca 94084,37.927437,-122.337982,Internet Archive,veristrong_industrial,...,"Printed books, newspapers, pictures and other ...",CCLTHKGCS031478,"Oakland, Ca",2811.0,37.796506,-122.279134,Hong Kong,58201.0,22.33801,114.130381
22,22,combo,2018-07-06,Internet Archive,380carlson,380 Carlson Blvd Richmond Ca 94084,37.927437,-122.337982,Internet Archive,veristrong_industrial,...,"Printed books, newspapers, pictures and other ...",CCLTHKGCS031892,"Oakland, Ca",2811.0,37.796506,-122.279134,Hong Kong,58201.0,22.33801,114.130381


IA has shipped 1,813,237 kg of books, equipment, lps, and microfilm/fiche overseas for digitization and back to the US between 2011 and 2023. That's 3,997,503 pounds.  

# Rate of Work

## Rate of Work as a Measure of Pages Scanned to Workers Working per Center per Month

### Books Scanned per Worker per Day 
This isn't a good way of understanding the amount of work being done per worker per day because some books are much shorter than others. David Sutton of Princeton's scanning center consistently scans some of the most books per day using this method because he scanned much of their postcard collection: https://archive.org/details/ar050jtanis 

However, the data is more complete than other measures. Only 4537 records are missing date informaiton or operator information.

In [196]:
texts['scan_day'] = pd.to_datetime(texts['scandate']).dt.to_period('D').astype('str')
texts['operator'] = texts['operator'].astype('str')
texts['operator'] = texts['operator'].map(str.lower)
scans_per_day = texts[['name', 'scan_day', 'identifier', 'operator', 'scanner']]
scans_per_day = weed_microfilm(scans_per_day)

In [58]:
scans_per_day = scans_per_day[['name', 'scan_day', 'identifier', 'operator']]

In [59]:
scans_per_day

Unnamed: 0,name,scan_day,identifier,operator
0,Datum Data Co. Ltd.,2013-10-10,geometrysuccessi00thom,scanner-shenzhen-leo@archive.org
1,Datum Data Co. Ltd.,2012-09-08,berdiemobilittde00hebe,scanner-shenzhen-mary@archive.org
2,Datum Data Co. Ltd.,2011-07-16,beilsteinshandb28beil,scanner-shenzhen-lina@archive.org
3,Datum Data Co. Ltd.,2012-08-14,theoryapplicatio00adva,scanner-shenzhen-dragon@archive.org
4,Datum Data Co. Ltd.,2012-09-21,specialeditionus00edbo_0,scanner-shenzhen-yan@archive.org
...,...,...,...,...
7235232,New York Botanical Garden,2017-12-08,lrobokibotanikaf01agar,operator1.nybg@archive.org
7235233,New York Botanical Garden,2017-04-05,alphonsedecandol00cand,operator1.nybg@archive.org
7235234,New York Botanical Garden,2017-12-20,memoirsoftorreyb2019torr,operator1.nybg@archive.org
7235235,New York Botanical Garden,2017-05-11,bulletinofpopula1191arno,operator1.nybg@archive.org


In [61]:
scans_per_worker_day = scans_per_day.groupby(['name','operator', 'scan_day']).count().reset_index()

In [62]:
missing_scans = scans_per_worker_day.loc[(scans_per_worker_day['scan_day'] == 'NaT')|(scans_per_worker_day['operator'] == 'nan')]

missing_scans

Unnamed: 0,name,operator,scan_day,identifier
1728,Allen County Public Library Geneaology Center,associate-christine-calhoun@archive.org,NaT,1
3097,Allen County Public Library Geneaology Center,associate-janet-breitenwischer@archive.org,NaT,1
6514,Allen County Public Library Geneaology Center,associate-ladonna-hartmann@archive.org,NaT,18
8957,Allen County Public Library Geneaology Center,associate-rosa-guzman@archive.org,NaT,1
10137,Allen County Public Library Geneaology Center,associate-sam-shorter@archive.org,NaT,16
...,...,...,...,...
426094,Yiddish Book Center,scanner-mitt-raj@archive.org,NaT,10
426214,Yiddish Book Center,scanner-zoe-schacht-levine@archive.org,NaT,2
426234,Yiddish Book Center,volunteer-allison-posner@archive.org,NaT,6
426254,Yiddish Book Center,volunteer-jessica-parker@archive.org,NaT,5


In [63]:
# weeding out missing data- we want to throw out rows where operator = nan and/or scanday = nat
scans_per_worker_day = scans_per_worker_day.loc[scans_per_worker_day['scan_day'] != 'NaT']
scans_per_worker_day = scans_per_worker_day.loc[scans_per_worker_day['operator'] != 'nan']

In [64]:
scans_per_worker_day = scans_per_worker_day.rename(columns={'name':'name', 'operator': 'operator','scan_day':'scan_day','identifier':'books_scanned'})

In [65]:
scans_per_worker_day.sort_values(by="books_scanned", ascending=False)
scans_per_worker_day.to_csv(dir+"books_scanned_per_worker_per_day.csv")


## Pages Scanned per Worker per Day 
The imagecount measures how many images make up a scanned item. It corresponds to the actual number of images a worker captured as they pressed the capture button on the scribe machine, lowered the book using a foot pedal, flipped a page, and repeated. As such, it's a much better way to approximate rate of work. Unfortunately, the metadata field is often left blank in an IA metadata record. 2,284,561 records do not have any imagecount information and we exclude them from these analyses. 

Of these, 87,678 are microfilm. So for the pages scanned per worker calculations, we are working from a shrunken dataset of 6,657,173 records.

there are points where one "operator" is logged into all the scribe machines at the same time, and that can't be right. this makes it look like one person is scanning 60,000 pages per day. 


In [5]:
pages = texts[['name', 'scan_day', 'month_year','imagecount', 'operator', 'scanner']]

KeyError: "['scan_day', 'month_year'] not in index"

In [67]:
missing_pages = pages.loc[(pages['scan_day'] == 'NaT') | (pages['operator'] == 'nan') |  (pages['imagecount'] == 'NaN')]

len(missing_pages)

2284561

In [68]:
pages = pages.loc[(pages['scan_day'] != 'NaT') & (pages['operator'] != 'nan') &  (pages['imagecount'] != 'NaN')]

In [69]:
pages = pages.reset_index()
pages = weed_microfilm(pages)

In [70]:
pages_per_worker_per_day = pages[['name', 'scan_day', 'imagecount', 'operator']]
pages_per_worker_per_day  = pages_per_worker_per_day .groupby(['name', 'scan_day', 'operator']).sum()
pages_per_worker_per_day  = pages_per_worker_per_day .reset_index()

pages_per_worker_per_day .sort_values(by="imagecount", ascending=False)


Unnamed: 0,name,scan_day,operator,imagecount
91510,Datum Data Co. Ltd.,2013-10-17,scanner-shenzhen-leo@archive.org,101942.0
82287,Datum Data Co. Ltd.,2011-07-08,scanner-shenzhen-leo@archive.org,92954.0
82241,Datum Data Co. Ltd.,2011-07-01,scanner-shenzhen-leo@archive.org,86450.0
91513,Datum Data Co. Ltd.,2013-10-22,scanner-shenzhen-leo@archive.org,84322.0
91515,Datum Data Co. Ltd.,2013-10-24,scanner-shenzhen-leo@archive.org,78542.0
...,...,...,...,...
105121,Getty Research Institute Valencia Warehouse,2023-05-26,associate-sean-kim@archive.org,0.0
91104,Datum Data Co. Ltd.,2012-12-21,scanner-shenzhen-thomas@archive.org,0.0
91190,Datum Data Co. Ltd.,2013-01-05,scanner-shenzhen-leo@archive.org,0.0
91167,Datum Data Co. Ltd.,2012-12-31,scanner-shenzhen-david@archive.org,0.0


In [71]:
# center level data 
pages_per_day = pages[['name', 'scan_day', 'imagecount']]
pages_per_day = pages_per_day.groupby(['name', 'scan_day']).sum()
pages_per_day = pages_per_day.reset_index()

pages_per_day

Unnamed: 0,name,scan_day,imagecount
0,Allen County Public Library Geneaology Center,2002-01-09,752.0
1,Allen County Public Library Geneaology Center,2002-01-10,108.0
2,Allen County Public Library Geneaology Center,2002-01-15,506.0
3,Allen County Public Library Geneaology Center,2008-05-15,564.0
4,Allen County Public Library Geneaology Center,2008-05-16,454.0
...,...,...,...
71974,Yiddish Book Center,2013-07-19,114.0
71975,Yiddish Book Center,2013-07-26,1663.0
71976,Yiddish Book Center,2013-07-29,180.0
71977,Yiddish Book Center,2013-08-02,1009.0


In [214]:
workers = pages[['name', 'scan_day', 'operator']]
workers = workers.drop_duplicates()
workers = workers.groupby(['name', 'scan_day']).count().reset_index()
workers


Unnamed: 0,name,scan_day,operator
0,1 Dollar Scan,NaT,1
1,Allen County Public Library Geneaology Center,2002-01-09,2
2,Allen County Public Library Geneaology Center,2002-01-10,1
3,Allen County Public Library Geneaology Center,2002-01-15,1
4,Allen County Public Library Geneaology Center,2008-05-15,1
...,...,...,...
74558,Yiddish Book Center,2013-07-26,1
74559,Yiddish Book Center,2013-07-29,1
74560,Yiddish Book Center,2013-08-02,1
74561,Yiddish Book Center,NaT,6


In [208]:

pages_stats = pd.merge(pages_per_day, workers, how='inner', on=['name', 'scan_day'])


In [209]:
pages_stats['pages_to_worker_ratio'] = ''
i = 0
for i in range(len(pages_stats)): 
    pages_stats.at[i,'pages_to_worker_ratio'] = float(pages_stats.at[i, 'imagecount']) / float(pages_stats.at[i, 'operator']) 
    i += 0

In [210]:
pages_stats

Unnamed: 0,name,scan_day,imagecount,operator,pages_to_worker_ratio
0,Allen County Public Library Geneaology Center,2002-01-09,752.0,2,376.0
1,Allen County Public Library Geneaology Center,2002-01-10,108.0,1,108.0
2,Allen County Public Library Geneaology Center,2002-01-15,506.0,1,506.0
3,Allen County Public Library Geneaology Center,2008-05-15,564.0,1,564.0
4,Allen County Public Library Geneaology Center,2008-05-16,454.0,2,227.0
...,...,...,...,...,...
71698,Yiddish Book Center,2013-07-19,114.0,1,114.0
71699,Yiddish Book Center,2013-07-26,1663.0,1,1663.0
71700,Yiddish Book Center,2013-07-29,180.0,1,180.0
71701,Yiddish Book Center,2013-08-02,1009.0,1,1009.0


In [211]:
pages_stats.sort_values(by='pages_to_worker_ratio', ascending=False)
pages_stats = pages_stats[['name', 'scan_day', 'imagecount', 'operator', 'pages_to_worker_ratio']].reset_index()

In [212]:
pages_stats.sort_values(by='pages_to_worker_ratio', ascending=False)

Unnamed: 0,index,name,scan_day,imagecount,operator,pages_to_worker_ratio
21476,21476,Datum Data Co. Ltd.,2015-11-11,66394.0,1,66394.0
21474,21474,Datum Data Co. Ltd.,2015-11-09,65814.0,1,65814.0
21043,21043,Datum Data Co. Ltd.,2013-12-04,64902.0,1,64902.0
21475,21475,Datum Data Co. Ltd.,2015-11-10,63758.0,1,63758.0
50929,50929,UCLA,2007-05-18,60230.0,1,60230.0
...,...,...,...,...,...,...
27882,27882,Getty Research Institute Valencia Warehouse,2023-05-25,0.0,1,0.0
27881,27881,Getty Research Institute Valencia Warehouse,2023-05-24,0.0,1,0.0
27880,27880,Getty Research Institute Valencia Warehouse,2023-05-23,0.0,1,0.0
27878,27878,Getty Research Institute Valencia Warehouse,2023-05-20,0.0,1,0.0


In [77]:
pages_stats.to_csv('/Users/elizabethschwartz/Documents/GitHub/ia_scanning_labor_data/metadata-analysis/metadata-records-analysis-csvs/pages_scanned_per_worker_per_day.csv')
workers.to_csv('/Users/elizabethschwartz/Documents/GitHub/ia_scanning_labor_data/metadata-analysis/metadata-records-analysis-csvs/number_workers_per_day.csv')


In [213]:
# pages to workers ratio visulization 

selection = alt.selection_point(fields=['name'], bind='legend')

pages_scanned_to_workers_ratio_scatters = alt.Chart("https://raw.githubusercontent.com/ers6/ia_scanning_labor_data/main/metadata-analysis/metadata-records-analysis-csvs/pages_scanned_per_worker_per_day.csv").mark_line().encode(
    x= alt.X('scan_day:T', axis=alt.Axis(labelAngle=-4), title="Days"),
    y= alt.Y('pages_to_worker_ratio:Q', title="Ratio of Pages Scanned to Workers"),
    color=alt.Color('name:N', legend=alt.Legend(columns=8, symbolLimit=0)),
    order=alt.Order('name:N',sort='ascending'),
    opacity=alt.condition(selection, alt.value(1), alt.value(0)),
    tooltip=['name:N', 'pages_to_worker_ratio:Q', 'scan_day:T']
).add_params(selection).configure_legend(
  orient='bottom'
).properties(
    # Adjust chart width and height to match size of legend
    width=600,
    height=400
).interactive()

pages_scanned_to_workers_ratio_scatters

In [172]:
pages_stats.loc[pages_stats['name'] == 'Datum Data Co. Ltd.'].sort_values(by='pages_to_worker_ratio', ascending=False)
# texts['month_year'] = pd.to_datetime(texts['scandate']).dt.to_period('M').astype('str')

# texts.loc[texts['name']

texts.loc[(texts['scan_day']=='2013-10-17') &  (texts['name']== 'Datum Data Co. Ltd.')]['scanner'].tolist()

['scribe8.shenzhen.archive.org',
 'scribe7.shenzhen.archive.org',
 'scribe8.shenzhen.archive.org',
 'scribe7.shenzhen.archive.org',
 'scribe5.shenzhen.archive.org',
 'scribe8.shenzhen.archive.org',
 'scribe6.shenzhen.archive.org',
 'scribe10.shenzhen.archive.org',
 'scribe4.shenzhen.archive.org',
 'scribe6.shenzhen.archive.org',
 'scribe5.shenzhen.archive.org',
 'scribe6.shenzhen.archive.org',
 'scribe10.shenzhen.archive.org',
 'scribe7.shenzhen.archive.org',
 'scribe7.shenzhen.archive.org',
 'scribe6.shenzhen.archive.org',
 'scribe5.shenzhen.archive.org',
 'scribe17.shenzhen.archive.org',
 'scribe6.shenzhen.archive.org',
 'scribe4.shenzhen.archive.org',
 'scribe4.shenzhen.archive.org',
 'scribe4.shenzhen.archive.org',
 'scribe6.shenzhen.archive.org',
 'scribe5.shenzhen.archive.org',
 'scribe8.shenzhen.archive.org',
 'scribe13.shenzhen.archive.org',
 'scribe5.shenzhen.archive.org',
 'scribe4.shenzhen.archive.org',
 'scribe17.shenzhen.archive.org',
 'scribe4.shenzhen.archive.org',
 'scr

In [84]:
# need to get avg daily pages to worker ratios for centers + standard deviation 
# page_to_workers_stat_analysis = pd.DataFrame(columns=['name', 'std_pages_to_workers_ratio', 'min', 'max'])
pages_to_workers_stat_analysis = []

centers = pages_stats['name'].unique().tolist()

for center in centers: 
    center_stats = pages_stats.loc[pages_stats['name']==center]
    std = float(center_stats['pages_to_worker_ratio'].std())
    center_min = center_stats['pages_to_worker_ratio'].min()
    center_max = center_stats['pages_to_worker_ratio'].max()
#     min_month = center_stats.loc[center_stats['pages_to_workers'] == center_min]['month_year']
    std = pd.to_numeric(std, errors="coerce")
    pages_to_workers_stat_analysis.append(
        {"name":center,
        "std_pages_to_workers_ratio":std, 
        "min":center_min,
        "max":center_max}
    )
   

In [87]:
pages_to_workers_stat_analysis = pd.DataFrame(pages_to_workers_stat_analysis)

pages_to_workers_stat_analysis.to_csv(dir + "standard_deviation_workers_to_pages_ratio.csv")

### Pages Scanned Per Worker Per Month

In [177]:
month_pages = weed_microfilm(pages)
month_pages = month_pages.loc[(month_pages['month_year'] != 'NaT') & (month_pages['operator'] != 'nan') &  (month_pages['imagecount'] != 'NaN')]


In [178]:
month_pages

Unnamed: 0,name,scan_day,month_year,imagecount,operator,scanner
0,Datum Data Co. Ltd.,2013-10-10,2013-10,200.0,scanner-shenzhen-leo@archive.org,scribe7.shenzhen.archive.org
1,Datum Data Co. Ltd.,2012-09-08,2012-09,250.0,scanner-shenzhen-mary@archive.org,scribe18.shenzhen.archive.org
2,Datum Data Co. Ltd.,2011-07-16,2011-07,962.0,scanner-shenzhen-lina@archive.org,scribe3.shenzhen.archive.org
3,Datum Data Co. Ltd.,2012-08-14,2012-08,590.0,scanner-shenzhen-dragon@archive.org,scribe15.shenzhen.archive.org
4,Datum Data Co. Ltd.,2012-09-21,2012-09,888.0,scanner-shenzhen-yan@archive.org,scribe27.shenzhen.archive.org
...,...,...,...,...,...,...
7235232,New York Botanical Garden,2017-12-08,2017-12,450.0,operator1.nybg@archive.org,ttscribe1.nybg.archive.org
7235233,New York Botanical Garden,2017-04-05,2017-04,4.0,operator1.nybg@archive.org,ttscribe1.nybg.archive.org
7235234,New York Botanical Garden,2017-12-20,2017-12,530.0,operator1.nybg@archive.org,ttscribe1.nybg.archive.org
7235235,New York Botanical Garden,2017-05-11,2017-05,86.0,operator1.nybg@archive.org,ttscribe1.nybg.archive.org


In [185]:
pages_per_month = month_pages[['name', 'month_year', 'imagecount']]
pages_per_month = pages_per_month .groupby(['name', 'month_year']).sum()
pages_per_month = pages_per_month.reset_index()


In [186]:
workers_month = pages[['name', 'month_year', 'operator']]
workers_month = workers_month.drop_duplicates()
workers_month = workers_month.groupby(['name', 'month_year']).count().reset_index()
workers_month= workers_month.reset_index()

In [187]:
pages_per_worker_per_month  = pd.merge(pages_per_month, workers_month, how='inner', on=['name', 'month_year'])
pages_per_worker_per_month['pages_to_worker_ratio'] = ''
i = 0
for i in range(len(pages_per_worker_per_month)): 
    pages_per_worker_per_month.at[i,'pages_to_worker_ratio'] = float(pages_per_worker_per_month.at[i, 'imagecount']) / float(pages_per_worker_per_month.at[i, 'operator']) 
    i += 0

In [189]:
pages_per_worker_per_month = pages_per_worker_per_month[['name', 'month_year', 'imagecount', 'operator', 'pages_to_worker_ratio']]

In [192]:
selection = alt.selection_point(fields=['name'], bind='legend')

test = alt.Chart(pages_per_worker_per_month).mark_circle().encode(
    x= alt.X('month_year:T', axis=alt.Axis(labelAngle=-4), title="months"),
    y= alt.Y('pages_to_worker_ratio:Q', title="Ratio of Pages Scanned to Workers"),
    color=alt.Color('name:N', legend=alt.Legend(columns=8, symbolLimit=0)),
    order=alt.Order('name:N',sort='ascending'),
    opacity=alt.condition(selection, alt.value(1), alt.value(0)),
    tooltip=['name:N', 'pages_to_worker_ratio:Q', 'month_year:T']
).add_params(selection).configure_legend(
  orient='bottom'
).properties(
    # Adjust chart width and height to match size of legend
    width=600,
    height=400
).interactive()

test

In [205]:
pages.loc[(pages['name']=='Datum Data Co. Ltd.')& (pages['month_year']=='2013-09')]


# scans_type_year.loc[(scans_type_year['center_type'] == 'academic') & (scans_type_year['year'] == '2012')]

Unnamed: 0,name,scan_day,month_year,imagecount,operator,scanner
75,Datum Data Co. Ltd.,2013-09-23,2013-09,346.0,scanner-shenzhen-leo@archive.org,scribe6.shenzhen.archive.org
673,Datum Data Co. Ltd.,2013-09-23,2013-09,1022.0,scanner-shenzhen-leo@archive.org,scribe7.shenzhen.archive.org
1175,Datum Data Co. Ltd.,2013-09-11,2013-09,294.0,scanner-shenzhen-leo@archive.org,scribe17.shenzhen.archive.org
1269,Datum Data Co. Ltd.,2013-09-13,2013-09,538.0,scanner-shenzhen-leo@archive.org,scribe4.shenzhen.archive.org
1664,Datum Data Co. Ltd.,2013-09-17,2013-09,74.0,scanner-shenzhen-leo@archive.org,scribe18.shenzhen.archive.org
...,...,...,...,...,...,...
401155,Datum Data Co. Ltd.,2013-09-13,2013-09,290.0,scanner-shenzhen-leo@archive.org,scribe3.shenzhen.archive.org
401221,Datum Data Co. Ltd.,2013-09-24,2013-09,202.0,scanner-shenzhen-leo@archive.org,scribe6.shenzhen.archive.org
401531,Datum Data Co. Ltd.,2013-09-05,2013-09,518.0,scanner-shenzhen-leo@archive.org,scribe4.shenzhen.archive.org
402215,Datum Data Co. Ltd.,2013-09-29,2013-09,554.0,scanner-shenzhen-leo@archive.org,scribe7.shenzhen.archive.org


I think some of these with really high numbers of pages scanned per day are actually microfilm or microfiche. see zack@archive.org: https://archive.org/details/albertahomestead2053cana/page/n647/mode/2up 
the scanner = microfilm something. 
we may need to actually make sure everything is scanned on a SCRIBE MACHINE to be sure of the media type and materiality of the working conditions

we might also be accidentally capturing republishing operators which is not the same thing as a scribe machine operator: https://archive.org/details/futurepresentsit00skee/page/8/mode/2up 

## Pages Scanned Per Scribe Machines in Use
Because we cannot guarantee that the operator signed into the machine corresponds to a single person working, we need to use scribe machine metadata as a proxy for rate of work. 

Because some workers are logged in on multiple scribe machines at the same time, the operator field is not always a good indicator of a unique worker working. For example, the operator scanner-shenzhen-leo@archive.org is logged into roughly 10 scribe machines at the Datum Data center on any given day. Because of this, it seems like he is scanning over 1 million pages a month. 

We can try and approximate rate of work based on the number of scribe machines in use on a given day and comparing that to the pages scanned per workers working measures.

In [110]:
pages_per_center_month 

Unnamed: 0,name,month_year,pages_scanned
0,Allen County Public Library Geneaology Center,2002-01,1366.0
1,Allen County Public Library Geneaology Center,2008-05,37763.0
2,Allen County Public Library Geneaology Center,2008-06,240186.0
3,Allen County Public Library Geneaology Center,2008-07,274770.0
4,Allen County Public Library Geneaology Center,2008-08,257029.0
...,...,...,...
4760,Yiddish Book Center,2013-05,6893.0
4761,Yiddish Book Center,2013-06,8322.0
4762,Yiddish Book Center,2013-07,1957.0
4763,Yiddish Book Center,2013-08,1009.0


In [135]:
scribes = texts[['name', 'month_year', 'scanner', 'operator']]
scribes = weed_microfilm(scribes)

In [136]:
scribes = scribes[['name', 'month_year', 'scanner']]

In [140]:
scribes

Unnamed: 0,name,month_year,scanner
0,Datum Data Co. Ltd.,2013-10,scribe7.shenzhen.archive.org
1,Datum Data Co. Ltd.,2012-09,scribe18.shenzhen.archive.org
2,Datum Data Co. Ltd.,2011-07,scribe3.shenzhen.archive.org
3,Datum Data Co. Ltd.,2012-08,scribe15.shenzhen.archive.org
4,Datum Data Co. Ltd.,2012-09,scribe27.shenzhen.archive.org
...,...,...,...
7235203,New York Botanical Garden,2017-02,ttscribe1.nybg.archive.org
7235206,New York Botanical Garden,2017-09,ttscribe1.nybg.archive.org
7235210,New York Botanical Garden,2017-12,ttscribe1.nybg.archive.org
7235218,New York Botanical Garden,2018-01,ttscribe1.nybg.archive.org


In [137]:
scribes = scribes.drop_duplicates()
scribes_month = scribes.groupby(['name', 'month_year']).count()
scribes_month= scribes_month.reset_index()

In [139]:
scribes_month.sort_values(by="scanner", ascending=False)

Unnamed: 0,name,month_year,scanner
1953,"Innodata Knowledge Services, Inc.",2022-09,63
1951,"Innodata Knowledge Services, Inc.",2022-07,63
1954,"Innodata Knowledge Services, Inc.",2022-10,63
1952,"Innodata Knowledge Services, Inc.",2022-08,62
1922,"Innodata Knowledge Services, Inc.",2020-02,62
...,...,...,...
2530,"Natural History Museum Library, London",2009-12,1
2531,"Natural History Museum Library, London",2010-01,1
2532,"Natural History Museum Library, London",2010-02,1
2533,"Natural History Museum Library, London",2010-03,1


In [162]:
pages_per_scribe_month = pd.merge(pages_per_center_month, scribes_month, how='inner', on=['name', 'month_year'])

In [163]:
pages_per_scribe_month['pages_to_machines'] = ''

In [164]:
pages_per_scribe_month

Unnamed: 0,name,month_year,pages_scanned,scanner,pages_to_machines
0,Allen County Public Library Geneaology Center,2002-01,1366.0,1,
1,Allen County Public Library Geneaology Center,2008-05,37763.0,5,
2,Allen County Public Library Geneaology Center,2008-06,240186.0,10,
3,Allen County Public Library Geneaology Center,2008-07,274770.0,9,
4,Allen County Public Library Geneaology Center,2008-08,257029.0,9,
...,...,...,...,...,...
4760,Yiddish Book Center,2013-05,6893.0,1,
4761,Yiddish Book Center,2013-06,8322.0,1,
4762,Yiddish Book Center,2013-07,1957.0,1,
4763,Yiddish Book Center,2013-08,1009.0,1,


In [165]:
for i in range(len(pages_per_scribe_month)): 
    pages_per_scribe_month.at[i,'pages_to_machines'] = float(pages_per_scribe_month.at[i, 'pages_scanned']) / float(pages_per_scribe_month.at[i, 'scanner']) 
    i += 1

In [166]:
selection = alt.selection_point(fields=['name'], bind='legend')

pages_per_scribe_per_month = alt.Chart(pages_per_scribe_month).mark_circle().encode(
    x= alt.X('month_year:T', axis=alt.Axis(labelAngle=-4), title="months"),
    y= alt.Y('pages_to_machines:Q', title="Ratio of Pages Scanned to Machines in Use"),
    color=alt.Color('name:N', legend=alt.Legend(columns=8, symbolLimit=0)),
    order=alt.Order('name:N',sort='ascending'),
    opacity=alt.condition(selection, alt.value(1), alt.value(0)),
    tooltip=['name:N', 'pages_to_machines:Q', 'month_year:T']
).add_params(selection).configure_legend(
  orient='bottom'
).properties(
    # Adjust chart width and height to match size of legend
    width=600,
    height=400
).interactive()

pages_per_scribe_per_month.save(img_dir + "pages_per_scribe_per_month.html")
pages_per_scribe_per_month.save(img_dir + "pages_per_scribe_per_month.json")
pages_per_scribe_per_month.save(img_dir + "pages_per_scribe_per_month.png")
pages_per_scribe_per_month


The above chart approximates the rate of work at scanning centers as a measure of the number of pages scanned per machines in use per month. We use scribe machines to approximate rates of work instead of machine operator metadata because sometimes the same user account is signed into multiple scribe machines at a scanning center.

The above graph plots the ratio of pages scanned to scribe machines in use at a scanning center each month. This seems reasonable to me as the most productive months correspond with those when the most books were scanned. 

We expect the rate of work to vary dependent on worker experience and the types of books workers are scanning. Brittle, older books are more difficult to flip the pages of than newer books. Likewise, workers become more attuned to using scanning equipment over time. 

## Books Scanned Per Scribe Machines in Use
- doing this to sanity check that nothing bizarre has happened because it's hard to grasp the volume of scanning from pages

In [158]:
scans_per_center_month

Unnamed: 0,name,month_year,books_scanned
1,Allen County Public Library Geneaology Center,2002-01,4
2,Allen County Public Library Geneaology Center,2008-05,78
3,Allen County Public Library Geneaology Center,2008-06,583
4,Allen County Public Library Geneaology Center,2008-07,664
5,Allen County Public Library Geneaology Center,2008-08,641
...,...,...,...
4923,Yiddish Book Center,2013-05,43
4924,Yiddish Book Center,2013-06,56
4925,Yiddish Book Center,2013-07,12
4926,Yiddish Book Center,2013-08,8


In [170]:
scans_per_scribe_month = pd.merge(scans_per_center_month, scribes_month, how='inner', on=['name', 'month_year'])

In [171]:
scans_per_scribe_month['books_to_machines'] = ''

In [106]:
for i in range(len(scans_per_scribe_month)): 
    scans_per_scribe_month.at[i,'books_to_machines'] = float(scans_per_scribe_month.at[i, 'pages_scanned']) / float(scans_per_scribe_month.at[i, 'scanner']) 
    i += 1

In [172]:
for i in range(len(scans_per_scribe_month)): 
    scans_per_scribe_month.at[i,'books_to_machines'] = float(scans_per_scribe_month.at[i, 'books_scanned']) / float(scans_per_scribe_month.at[i, 'scanner']) 
    i += 1

In [173]:
scans_per_scribe_month 

Unnamed: 0,name,month_year,books_scanned,scanner,books_to_machines
0,Allen County Public Library Geneaology Center,2002-01,4,1,4.0
1,Allen County Public Library Geneaology Center,2008-05,78,5,15.6
2,Allen County Public Library Geneaology Center,2008-06,583,10,58.3
3,Allen County Public Library Geneaology Center,2008-07,664,9,73.777778
4,Allen County Public Library Geneaology Center,2008-08,641,9,71.222222
...,...,...,...,...,...
4760,Yiddish Book Center,2013-05,43,1,43.0
4761,Yiddish Book Center,2013-06,56,1,56.0
4762,Yiddish Book Center,2013-07,12,1,12.0
4763,Yiddish Book Center,2013-08,8,1,8.0


In [174]:
selection = alt.selection_point(fields=['name'], bind='legend')

books_per_scribe_per_month = alt.Chart(scans_per_scribe_month).mark_circle().encode(
    x= alt.X('month_year:T', axis=alt.Axis(labelAngle=-4), title="months"),
    y= alt.Y('books_to_machines:Q', title="Ratio of Books Scanned to Machines in Use"),
    color=alt.Color('name:N', legend=alt.Legend(columns=8, symbolLimit=0)),
    order=alt.Order('name:N',sort='ascending'),
    opacity=alt.condition(selection, alt.value(1), alt.value(0)),
    tooltip=['name:N', 'books_to_machines:Q', 'month_year:T']
).add_params(selection).configure_legend(
  orient='bottom'
).properties(
    # Adjust chart width and height to match size of legend
    width=600,
    height=400
).interactive()

books_per_scribe_per_month.save(img_dir + "books_per_scribe_per_month.html")
books_per_scribe_per_month.save(img_dir + "books_per_scribe_per_month.json")
books_per_scribe_per_month.save(img_dir + "books_per_scribe_per_month.png")
books_per_scribe_per_month


# Turnover Rate
We calculate turnover rates as a measure of days worked per worker. To do so, subtract the last day an operator value appears in the text dataset from the first day it appears.
We are working from the whole texts dataset wherever there is an operator value present for this calculation because media type is less relevant. Only 3,211 records are missing an operator field, so this dataset consists of 9026201 text records

In [88]:
operators = texts[['name', 'operator']]
len(operators)

9029412

In [89]:
len(operators.loc[operators['operator'] != 'nan'])

6758857

In [90]:
operators = operators.loc[operators['operator'] != 'nan']
operators = operators.drop_duplicates()

In [91]:
operators ['days_worked'] = ''
operators['first_day'] = ''
operators['last_day'] = ''

In [92]:
operators = operators.reset_index()

In [93]:
workers_days = texts[['name', 'operator', 'scan_day']]
workers_days['scan_day'] = pd.to_datetime(workers_days['scan_day']).dt.to_period('D')

workers_days = workers_days.loc[workers_days['operator'] != 'nan']
workers_days = workers_days.loc[workers_days['scan_day'] != 'NaT']

workers_days = workers_days.drop_duplicates()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  workers_days['scan_day'] = pd.to_datetime(workers_days['scan_day']).dt.to_period('D')


In [94]:
workers_days

Unnamed: 0,name,operator,scan_day
0,Datum Data Co. Ltd.,scanner-shenzhen-leo@archive.org,2013-10-10
1,Datum Data Co. Ltd.,scanner-shenzhen-mary@archive.org,2012-09-08
2,Datum Data Co. Ltd.,scanner-shenzhen-lina@archive.org,2011-07-16
3,Datum Data Co. Ltd.,scanner-shenzhen-dragon@archive.org,2012-08-14
4,Datum Data Co. Ltd.,scanner-shenzhen-yan@archive.org,2012-09-21
...,...,...,...
9028837,New York Botanical Garden,operator1.nybg@archive.org,2017-12-20
9028844,New York Botanical Garden,operator1.nybg@archive.org,2017-12-07
9028846,New York Botanical Garden,operator1.nybg@archive.org,2017-12-08
9028849,New York Botanical Garden,operator1.nybg@archive.org,2017-05-11


In [95]:
def get_days_worked(scan_center, worker): 
    min_date = workers_days.loc[ (workers_days['name'] == scan_center) & (workers_days['operator'] == worker)]['scan_day'].min()
    max_date = workers_days.loc[ (workers_days['name'] == scan_center) & (workers_days['operator'] == worker)]['scan_day'].max()
    return{'days_worked':(max_date - min_date)/ pd.Timedelta(days=1),
          'min_date': min_date,
          'max_date': max_date}
#     return min_date

In [96]:
i = 0 
for i in range(len(operators)):
    days_worked = get_days_worked(operators.iloc[i]['name'], operators.iloc[i]['operator'])
    operators.loc[i, 'days_worked'] = days_worked['days_worked']
    operators.loc[i, 'first_day'] = days_worked['min_date']
    operators.loc[i, 'last_day'] = days_worked['max_date']


In [97]:
operators

Unnamed: 0,index,name,operator,days_worked,first_day,last_day
0,0,Datum Data Co. Ltd.,scanner-shenzhen-leo@archive.org,2045.0,2010-08-26,2016-04-01
1,1,Datum Data Co. Ltd.,scanner-shenzhen-mary@archive.org,886.0,2010-12-23,2013-05-27
2,2,Datum Data Co. Ltd.,scanner-shenzhen-lina@archive.org,666.0,2010-12-24,2012-10-20
3,3,Datum Data Co. Ltd.,scanner-shenzhen-dragon@archive.org,242.0,2012-02-21,2012-10-20
4,4,Datum Data Co. Ltd.,scanner-shenzhen-yan@archive.org,253.0,2012-05-09,2013-01-17
...,...,...,...,...,...,...
3206,9024459,Hong Kong,scanner-shenzhen-wei@archive.org,0.0,2012-03-26,2012-03-26
3207,9024460,Hong Kong,associate-hedawei@archive.org,0.0,2013-11-06,2013-11-06
3208,9024461,Hong Kong,tracey.g@archive.org,0.0,2011-05-19,2011-05-19
3209,9028612,Hamilton Public Library,associate-emily-skewes-donaldson@archive.org,44.0,2023-04-26,2023-06-09


In [111]:
operators.to_csv('/Users/elizabethschwartz/Documents/GitHub/ia_scanning_labor_data/metadata-analysis/metadata-records-analysis-csvs/days_worked_per_worker.csv')


In [117]:
avg_days_worked = operators[['name', 'days_worked']].groupby(['name']).mean()
avg_days_worked = avg_days_worked.rename(columns={"name":"name", "days_worked": "avg_turnover"})


In [118]:
avg_days_worked = avg_days_worked.reset_index()

In [119]:
standard_dev = operators[['name', 'days_worked']].groupby(['name']).std().reset_index()
standard_dev = standard_dev.rename(columns={"name":"name", "days_worked": "standard_deviation"})


# avg_days_worked.to_csv(dir+"avg_turnover_per_center.csv")


In [120]:
avg_days_worked = pd.merge(avg_days_worked, standard_dev, how='inner', on=['name'])


In [124]:
avg_days_worked.to_csv(dir+"avg_turnover_per_center.csv")

NameError: name 'pages_per_day' is not defined