In [1]:
import pandas as pd
import numpy as np

# stats to calculate
- ## outsourcing:
  - percentage of books scanned at BPOs per year - done!
- ## pay
    - calculations on maximum wages workers at cebu could have made based on number of workers there per month and the contract IA had with Innodata at the time (at 1.5million per year / 12 months with x workers working per month, that leaves what max monthly wage).
- ## rates of work:
  - as measure of pages scanned to workers working per center per month - done! 
  - avergage pages scanned to workers working per center + standard deviation 
- ## turn over rates- how long do operators stay at their jobs across centers
  - avg. days worked by workers at each center + standard deviation - will need to throw out lots of junk

In [2]:
# accessing the geocoded texts dataset from my external hard drive. You can get it from this box link: https://wustl.account.box.com/login?redirect_url=%2Ffolder%2F271951204450&logout=true
texts = pd.read_csv('/Volumes/Samsung_T5/scanning_labor_in_IA/geocoded-texts-data.csv', low_memory=False)
texts = texts.drop(columns=['Unnamed: 0.3', 'Unnamed: 0.2', 'Unnamed: 0.1', 'Unnamed: 0'])

In [3]:
dir = "/Users/elizabethschwartz/Documents/GitHub/ia_scanning_labor_data/metadata-analysis/metadata-records-analysis-csvs/"

# Outsourcing

In [4]:
# scans per center per month
texts['month_year'] = pd.to_datetime(texts['scandate']).dt.to_period('M').astype('str')
scans_per_center = texts[['name', 'month_year', 'identifier']]
scans_per_center = scans_per_center.groupby(['name', 'month_year']).count().reset_index()
scans_per_center_month = scans_per_center.loc[scans_per_center['month_year'] != 'NaT']
scans_per_center_month = scans_per_center_month.rename(columns={'name':'name', 'month_year': 'month_year', 'identifier':'books_scanned'})
scans_per_center_month.to_csv(dir + 'scans_per_center_per_month.csv')

In [5]:
# need to remove texts that are microfilm or microfiche as these are scanned using different machinery/their scanning is more automated
# drop records if scanner contains "microfilm", "microfiche", "Internet Archive Python Library", "-mf" 
# there are other ambiguous scanner fields, but based on spot checking about 10 of them, they are often scribe machines with
# very few scans attached to them. so they won't substantially muddy the data
# some of the records don't have scanner info attached to them, but they do have operator names that indicate they're microfilm/microfiche 
# for example, microfilm_processor@archive.org. We exclude these as well. 
# removed at total of 1794175 scans from the dataset for these measures because they're scanned by microfilm 

def weed_microfilm(ia_df):
    search_strs = ["microfilm", "microfiche", "-mf", "Internet Archive"]
    scanners = ia_df['scanner'].drop_duplicates().tolist()
    scribes = []
    data = []
    for scanner in scanners: 
        hit = True
        for search_str in search_strs: 
            if search_str in str(scanner): 
                hit = False
            else: pass
        if hit: 
            scribes.append(scanner)
        else: pass
    for i in range(len(ia_df)): 
        if str(ia_df.at[i,'scanner']) in scribes: 
            if "microfilm" in str(ia_df.at[i,'operator']):
                pass
            else:
                data.append(ia_df.iloc[i].to_dict())
           
        else: pass

    return pd.DataFrame(data)

In [12]:
# pages per center per month - excluding microfilm + microfiche 
pages_per_center = texts[['name', 'month_year', 'imagecount', 'operator', 'scanner']]
pages_per_center = weed_microfilm(pages_per_center)
pages_per_center = pages_per_center.loc[pages_per_center['month_year'] != 'NaT']
pages_per_center = pages_per_center[['name', 'month_year', 'imagecount']] 
pages_per_center_month = pages_per_center.groupby(['name', 'month_year']).sum().reset_index()
pages_per_center_month = pages_per_center_month.rename(columns={'name':'name', 'month_year': 'month_year', 'imagecount':'pages_scanned'})
pages_per_center_month.to_csv(dir+ 'pages_per_center_per_month')


In [13]:
pages_per_center_month

Unnamed: 0,name,month_year,pages_scanned
0,Allen County Public Library Geneaology Center,2002-01,1366.0
1,Allen County Public Library Geneaology Center,2008-05,37763.0
2,Allen County Public Library Geneaology Center,2008-06,240186.0
3,Allen County Public Library Geneaology Center,2008-07,274770.0
4,Allen County Public Library Geneaology Center,2008-08,257029.0
...,...,...,...
4760,Yiddish Book Center,2013-05,6893.0
4761,Yiddish Book Center,2013-06,8322.0
4762,Yiddish Book Center,2013-07,1957.0
4763,Yiddish Book Center,2013-08,1009.0


## Annual Counts + Percentage of Scans by Center Type

In [21]:
scans = scans_per_center_month[['name', 'month_year', 'books_scanned']]

In [22]:
scans['center_type'] = ''
scans = scans.reset_index()

In [23]:
type_centers = pd.read_csv("https://raw.githubusercontent.com/ers6/ia_scanning_labor_data/main/metadata-analysis/metadata-records-analysis-csvs/scan-center-type.csv")

In [28]:
for i in range(len(scans)):
    for j in range(len(type_centers)): 
        if type_centers.at[j, 'name'] == scans.at[i, 'name']: 
            scans.at[i, 'center_type'] = type_centers.at[j, 'type']
        else: pass

In [29]:
scans_type_month = scans.groupby(['month_year', 'center_type']).sum().reset_index()

In [30]:
scans_type_month.to_csv(dir+'scans_per_center_type_per_month.csv')

In [31]:
scans_type_year = scans_type_month.groupby(['center_type','month_year']).sum().reset_index()

In [32]:
scans_type_year['year'] = ""

for i in range(len(scans_type_year)):
    scans_type_year.at[i, 'year'] = str(scans_type_year.at[i,'month_year']).split('-')[0]

In [33]:
scans_type_year = scans_type_year.groupby(['center_type','year']).sum().reset_index()[['center_type', 'year', 'books_scanned']]

In [34]:
scans_type_year

Unnamed: 0,center_type,year,books_scanned
0,,2015,92
1,,2016,3028
2,,2017,1753
3,,2018,1793
4,,2019,2037
...,...,...,...
127,public,2019,21036
128,public,2020,8342
129,public,2021,16017
130,public,2022,29775


In [35]:
scans_per_year = scans_type_year.groupby(['year']).sum().reset_index()[['year','books_scanned']]
scans_per_year.loc[scans_per_year['year']== '2010']['books_scanned'].tolist()[0]

261573

In [36]:
scans_type_year['percent'] = ''

for i in range(len(scans_type_year)):
    scans_type_year.at[i, 'percent'] = scans_type_year.at[i, 'books_scanned'] / scans_per_year.loc[scans_per_year['year'] == scans_type_year.at[i, 'year']]['books_scanned'].tolist()[0]

 

In [9]:
scans_type_year.to_csv('percentage_of_books_scanned_per_center_type_per_year.csv')

In [10]:
scans_type_year

Unnamed: 0,center_type,year,books_scanned,percent
0,academic,2001,3,1.0
1,academic,2002,421,0.97907
2,academic,2006,37248,0.589554
3,academic,2007,108590,0.683085
4,academic,2008,164714,0.639723
...,...,...,...,...
118,public,2019,21036,0.043464
119,public,2020,8342,0.010275
120,public,2021,16017,0.016478
121,public,2022,29775,0.027982


In [13]:
scans_type_year['center_type'].tolist()
this_dict = {'year':"",'academic':"", 'bpo':"", 'government':"", 'hq':"", 'museum':"", 'public':"", 'total_books':""}
year_list = []
for i in range(20): 
    year_list.append(this_dict)


[{'year': '',
  'academic': '',
  'bpo': '',
  'government': '',
  'hq': '',
  'museum': '',
  'public': '',
  'total_books': ''},
 {'year': '',
  'academic': '',
  'bpo': '',
  'government': '',
  'hq': '',
  'museum': '',
  'public': '',
  'total_books': ''},
 {'year': '',
  'academic': '',
  'bpo': '',
  'government': '',
  'hq': '',
  'museum': '',
  'public': '',
  'total_books': ''},
 {'year': '',
  'academic': '',
  'bpo': '',
  'government': '',
  'hq': '',
  'museum': '',
  'public': '',
  'total_books': ''},
 {'year': '',
  'academic': '',
  'bpo': '',
  'government': '',
  'hq': '',
  'museum': '',
  'public': '',
  'total_books': ''},
 {'year': '',
  'academic': '',
  'bpo': '',
  'government': '',
  'hq': '',
  'museum': '',
  'public': '',
  'total_books': ''},
 {'year': '',
  'academic': '',
  'bpo': '',
  'government': '',
  'hq': '',
  'museum': '',
  'public': '',
  'total_books': ''},
 {'year': '',
  'academic': '',
  'bpo': '',
  'government': '',
  'hq': '',
  'mus

In [12]:
def get_center_count(center_type, year): 
    try:
        result = scans_type_year.loc[(scans_type_year['center_type'] == center_type) & (scans_type_year['year'] == year)].reset_index().to_dict()['books_scanned'][0]
    except KeyError: 
        result = 0
    return result


def get_center_percent(center_type, year):
    try:
        result = scans_type_year.loc[(scans_type_year['center_type'] == center_type) & (scans_type_year['year'] == year)].reset_index().to_dict()['percent'][0]
        result = f"{result:.2%}"
    except KeyError: 
        result = 0
    return result


In [13]:
get_center_count('academic', '1998')

0

In [27]:
the_years = scans_per_year['year'].tolist()
yearly_counts = []
for this_year in the_years: 
    yearly_counts.append({'year': this_year,
                         'academic': get_center_count('academic', this_year),
                         'bpo': get_center_count('bpo', this_year), 
                         'government': get_center_count('government', this_year), 
                         'hq': get_center_count('hq', this_year), 
                         'museum': get_center_count('museum', this_year),
                         'public': get_center_count('public', this_year), 
                         'total_books':""})

yearly_percents = []
for this_year in the_years: 
    yearly_percents.append({'year': this_year,
                         'academic': get_center_percent('academic', this_year),
                         'bpo': get_center_percent('bpo', this_year), 
                         'government': get_center_percent('government', this_year), 
                         'hq': get_center_percent('hq', this_year), 
                         'museum': get_center_percent('museum', this_year),
                         'public': get_center_percent('public', this_year)})
  

In [152]:
the_years = scans_per_year['year'].tolist()

In [16]:
scans_type_year.loc[(scans_type_year['center_type'] == 'academic') & (scans_type_year['year'] == '2012')]

Unnamed: 0,center_type,year,books_scanned,percent
8,academic,2012,91679,0.250427


In [28]:
yearly_percents

[{'year': '2001',
  'academic': '100.00%',
  'bpo': 0,
  'government': 0,
  'hq': 0,
  'museum': 0,
  'public': 0},
 {'year': '2002',
  'academic': '97.91%',
  'bpo': 0,
  'government': '0.93%',
  'hq': 0,
  'museum': '0.23%',
  'public': '0.93%'},
 {'year': '2006',
  'academic': '58.96%',
  'bpo': 0,
  'government': 0,
  'hq': '41.04%',
  'museum': '0.00%',
  'public': 0},
 {'year': '2007',
  'academic': '68.31%',
  'bpo': 0,
  'government': '0.25%',
  'hq': '28.38%',
  'museum': '0.57%',
  'public': '2.49%'},
 {'year': '2008',
  'academic': '63.97%',
  'bpo': 0,
  'government': '9.77%',
  'hq': '12.01%',
  'museum': '0.59%',
  'public': '13.66%'},
 {'year': '2009',
  'academic': '56.18%',
  'bpo': '0.00%',
  'government': '17.74%',
  'hq': '9.40%',
  'museum': '0.59%',
  'public': '15.96%'},
 {'year': '2010',
  'academic': '51.05%',
  'bpo': '2.18%',
  'government': '9.80%',
  'hq': '22.61%',
  'museum': '0.30%',
  'public': '13.93%'},
 {'year': '2011',
  'academic': '31.95%',
  'bpo

In [19]:
yearly_counts_df = pd.DataFrame.from_records(yearly_counts) 
yearly_counts_df


Unnamed: 0,year,academic,bpo,government,hq,museum,public,total_books
0,2001,3,0,0,0,0,0,
1,2002,421,0,4,0,1,4,
2,2006,37248,0,0,25931,1,0,
3,2007,108590,0,402,45116,899,3963,
4,2008,164714,0,25163,30925,1510,35165,
5,2009,135579,11,42811,22681,1414,38515,
6,2010,133532,5714,25634,59152,784,36441,
7,2011,98462,130463,30265,25,465,44831,
8,2012,91679,188778,35093,685,282,39508,
9,2013,89153,20919,48684,64,1819,36198,


In [31]:
yearly_percents_df = pd.DataFrame.from_records(yearly_percents) 

In [32]:
yearly_percents_df

Unnamed: 0,year,academic,bpo,government,hq,museum,public
0,2001,100.00%,0,0,0,0,0
1,2002,97.91%,0,0.93%,0,0.23%,0.93%
2,2006,58.96%,0,0,41.04%,0.00%,0
3,2007,68.31%,0,0.25%,28.38%,0.57%,2.49%
4,2008,63.97%,0,9.77%,12.01%,0.59%,13.66%
5,2009,56.18%,0.00%,17.74%,9.40%,0.59%,15.96%
6,2010,51.05%,2.18%,9.80%,22.61%,0.30%,13.93%
7,2011,31.95%,42.34%,9.82%,0.01%,0.15%,14.55%
8,2012,25.04%,51.57%,9.59%,0.19%,0.08%,10.79%
9,2013,45.14%,10.59%,24.65%,0.03%,0.92%,18.33%


In [33]:
yearly_counts_df.to_csv('/Users/elizabethschwartz/Documents/GitHub/ia_scanning_labor_data/metadata-analysis/metadata-records-analysis-csvs/count_books_scanned_per_year_per_type.csv')
yearly_percents_df.to_csv('/Users/elizabethschwartz/Documents/GitHub/ia_scanning_labor_data/metadata-analysis/metadata-records-analysis-csvs/percentage_books_scanned_per_year_per_type.csv')


## Overall Count + Percent of Books Scanned by Center Type

In [47]:
scans_type_year

total_scans_by_type =  scans_type_year.groupby(['center_type']).sum().reset_index()[['center_type','books_scanned']]

In [48]:
total_scans_by_type

Unnamed: 0,center_type,books_scanned
0,academic,1314866
1,archive,21134
2,bpo,4048959
3,government,700435
4,hq,185807
5,museum,35502
6,public,472975


In [51]:
total_scans_by_type['percent']= total_scans_by_type['books_scanned']/total_scans_by_type['books_scanned'].sum() * 100

In [52]:
total_scans_by_type

Unnamed: 0,center_type,books_scanned,percent
0,academic,1314866,19.394225
1,archive,21134,0.311726
2,bpo,4048959,59.721996
3,government,700435,10.33139
4,hq,185807,2.740646
5,museum,35502,0.523653
6,public,472975,6.976364


# Rate of Work

## Rate of Work as a Measure of Pages Scanned to Workers Working per Center per Month

### Books Scanned per Worker per Day 
This isn't a good way of understanding the amount of work being done per worker per day because some books are much shorter than others. David Sutton of Princeton's scanning center consistently scans some of the most books per day using this method because he scanned much of their postcard collection: https://archive.org/details/ar050jtanis 

However, the data is more complete than other measures. Only 4537 records are missing date informaiton or operator information.

In [66]:
texts['scan_day'] = pd.to_datetime(texts['scandate']).dt.to_period('D').astype('str')
texts['operator'] = texts['operator'].astype('str')
texts['operator'] = texts['operator'].map(str.lower)
scans_per_day = texts[['name', 'scan_day', 'identifier', 'operator', 'scanner']]
scans_per_day = weed_microfilm(scans_per_day)

In [9]:
scans_per_day = scans_per_day[['name', 'scan_day', 'identifier', 'operator']]

In [10]:
scans_per_day

Unnamed: 0,name,scan_day,identifier,operator
0,Datum Data Co. Ltd.,2013-10-10,geometrysuccessi00thom,scanner-shenzhen-leo@archive.org
1,Datum Data Co. Ltd.,2012-09-08,berdiemobilittde00hebe,scanner-shenzhen-mary@archive.org
2,Datum Data Co. Ltd.,2011-07-16,beilsteinshandb28beil,scanner-shenzhen-lina@archive.org
3,Datum Data Co. Ltd.,2012-08-14,theoryapplicatio00adva,scanner-shenzhen-dragon@archive.org
4,Datum Data Co. Ltd.,2012-09-21,specialeditionus00edbo_0,scanner-shenzhen-yan@archive.org
...,...,...,...,...
7235232,New York Botanical Garden,2017-12-08,lrobokibotanikaf01agar,operator1.nybg@archive.org
7235233,New York Botanical Garden,2017-04-05,alphonsedecandol00cand,operator1.nybg@archive.org
7235234,New York Botanical Garden,2017-12-20,memoirsoftorreyb2019torr,operator1.nybg@archive.org
7235235,New York Botanical Garden,2017-05-11,bulletinofpopula1191arno,operator1.nybg@archive.org


In [11]:
scans_per_worker_day = scans_per_day.groupby(['name','operator', 'scan_day']).count().reset_index()

In [11]:
missing_scans = scans_per_worker_day.loc[(scans_per_worker_day['scan_day'] == 'NaT')|(scans_per_worker_day['operator'] == 'nan')]

missing_scans

Unnamed: 0,name,operator,scan_day,identifier
1728,Allen County Public Library Geneaology Center,associate-christine-calhoun@archive.org,NaT,1
3097,Allen County Public Library Geneaology Center,associate-janet-breitenwischer@archive.org,NaT,1
6514,Allen County Public Library Geneaology Center,associate-ladonna-hartmann@archive.org,NaT,18
8957,Allen County Public Library Geneaology Center,associate-rosa-guzman@archive.org,NaT,1
10137,Allen County Public Library Geneaology Center,associate-sam-shorter@archive.org,NaT,16
...,...,...,...,...
426094,Yiddish Book Center,scanner-mitt-raj@archive.org,NaT,10
426214,Yiddish Book Center,scanner-zoe-schacht-levine@archive.org,NaT,2
426234,Yiddish Book Center,volunteer-allison-posner@archive.org,NaT,6
426254,Yiddish Book Center,volunteer-jessica-parker@archive.org,NaT,5


In [12]:
# weeding out missing data- we want to throw out rows where operator = nan and/or scanday = nat
scans_per_worker_day = scans_per_worker_day.loc[scans_per_worker_day['scan_day'] != 'NaT']
scans_per_worker_day = scans_per_worker_day.loc[scans_per_worker_day['operator'] != 'nan']

In [13]:
scans_per_worker_day = scans_per_worker_day.rename(columns={'name':'name', 'operator': 'operator','scan_day':'scan_day','identifier':'books_scanned'})

In [17]:
scans_per_worker_day.sort_values(by="books_scanned", ascending=False)
scans_per_worker_day.to_csv(dir+"books_scanned_per_worker_per_day.csv")


## Pages Scanned per Worker per Day 
The imagecount measures how many images make up a scanned item. It corresponds to the actual number of images a worker captured as they pressed the capture button on the scribe machine, lowered the book using a foot pedal, flipped a page, and repeated. As such, it's a much better way to approximate rate of work. Unfortunately, the metadata field is often left blank in an IA metadata record. 2,284,561 records do not have any imagecount information and we exclude them from these analyses. 

Of these, 87,678 are microfilm. So for the pages scanned per worker calculations, we are working from a shrunken dataset of 6,657,173 records.

In [67]:
pages = texts[['name', 'scan_day', 'imagecount', 'operator', 'scanner']]

In [68]:
missing_pages = pages.loc[(pages['scan_day'] == 'NaT') | (pages['operator'] == 'nan') |  (pages['imagecount'] == 'NaN')]

len(missing_pages)

2284561

In [69]:
pages = pages.loc[(pages['scan_day'] != 'NaT') & (pages['operator'] != 'nan') &  (pages['imagecount'] != 'NaN')]

In [70]:
pages = pages.reset_index()
pages = weed_microfilm(pages)

In [27]:
pages_per_worker_per_day = pages[['name', 'scan_day', 'imagecount', 'operator']]
pages_per_worker_per_day  = pages_per_worker_per_day .groupby(['name', 'scan_day', 'operator']).sum()
pages_per_worker_per_day  = pages_per_worker_per_day .reset_index()

pages_per_worker_per_day .sort_values(by="imagecount", ascending=False)


Unnamed: 0,name,scan_day,operator,imagecount
91510,Datum Data Co. Ltd.,2013-10-17,scanner-shenzhen-leo@archive.org,101942.0
82287,Datum Data Co. Ltd.,2011-07-08,scanner-shenzhen-leo@archive.org,92954.0
82241,Datum Data Co. Ltd.,2011-07-01,scanner-shenzhen-leo@archive.org,86450.0
91513,Datum Data Co. Ltd.,2013-10-22,scanner-shenzhen-leo@archive.org,84322.0
91515,Datum Data Co. Ltd.,2013-10-24,scanner-shenzhen-leo@archive.org,78542.0
...,...,...,...,...
105121,Getty Research Institute Valencia Warehouse,2023-05-26,associate-sean-kim@archive.org,0.0
91104,Datum Data Co. Ltd.,2012-12-21,scanner-shenzhen-thomas@archive.org,0.0
91190,Datum Data Co. Ltd.,2013-01-05,scanner-shenzhen-leo@archive.org,0.0
91167,Datum Data Co. Ltd.,2012-12-31,scanner-shenzhen-david@archive.org,0.0


In [71]:
# center level data 
pages_per_day = pages[['name', 'scan_day', 'imagecount']]
pages_per_day = pages_per_day.groupby(['name', 'scan_day']).sum()
pages_per_day = pages_per_day.reset_index()

pages_per_day

Unnamed: 0,name,scan_day,imagecount
0,Allen County Public Library Geneaology Center,2002-01-09,752.0
1,Allen County Public Library Geneaology Center,2002-01-10,108.0
2,Allen County Public Library Geneaology Center,2002-01-15,506.0
3,Allen County Public Library Geneaology Center,2008-05-15,564.0
4,Allen County Public Library Geneaology Center,2008-05-16,454.0
...,...,...,...
71974,Yiddish Book Center,2013-07-19,114.0
71975,Yiddish Book Center,2013-07-26,1663.0
71976,Yiddish Book Center,2013-07-29,180.0
71977,Yiddish Book Center,2013-08-02,1009.0


In [72]:
workers = pages[['name', 'scan_day', 'operator']]
workers = workers.drop_duplicates()
workers = workers.groupby(['name', 'scan_day']).count().reset_index()
workers

Unnamed: 0,name,scan_day,operator
0,Allen County Public Library Geneaology Center,2002-01-09,2
1,Allen County Public Library Geneaology Center,2002-01-10,1
2,Allen County Public Library Geneaology Center,2002-01-15,1
3,Allen County Public Library Geneaology Center,2008-05-15,1
4,Allen County Public Library Geneaology Center,2008-05-16,2
...,...,...,...
71974,Yiddish Book Center,2013-07-19,1
71975,Yiddish Book Center,2013-07-26,1
71976,Yiddish Book Center,2013-07-29,1
71977,Yiddish Book Center,2013-08-02,1


In [32]:

pages_stats = pd.merge(pages_per_day, workers, how='inner', on=['name', 'scan_day'])


In [33]:
pages_stats['pages_to_worker_ratio'] = ''
i = 0
for i in range(len(pages_stats)): 
    pages_stats.at[i,'pages_to_worker_ratio'] = float(pages_stats.at[i, 'imagecount']) / float(pages_stats.at[i, 'operator']) 
    i += 0

In [34]:
pages_stats

Unnamed: 0,name,scan_day,imagecount,operator,pages_to_worker_ratio
0,Allen County Public Library Geneaology Center,2002-01-09,752.0,2,376.0
1,Allen County Public Library Geneaology Center,2002-01-10,108.0,1,108.0
2,Allen County Public Library Geneaology Center,2002-01-15,506.0,1,506.0
3,Allen County Public Library Geneaology Center,2008-05-15,564.0,1,564.0
4,Allen County Public Library Geneaology Center,2008-05-16,454.0,2,227.0
...,...,...,...,...,...
71974,Yiddish Book Center,2013-07-19,114.0,1,114.0
71975,Yiddish Book Center,2013-07-26,1663.0,1,1663.0
71976,Yiddish Book Center,2013-07-29,180.0,1,180.0
71977,Yiddish Book Center,2013-08-02,1009.0,1,1009.0


In [35]:
pages_stats.sort_values(by='pages_to_worker_ratio', ascending=False)
pages_stats = pages_stats[['name', 'scan_day', 'imagecount', 'operator', 'pages_to_worker_ratio']].reset_index()

In [36]:
pages_stats.sort_values(by='pages_to_worker_ratio', ascending=False)

Unnamed: 0,index,name,scan_day,imagecount,operator,pages_to_worker_ratio
21173,21173,Datum Data Co. Ltd.,2013-10-17,101942.0,1,101942.0
21176,21176,Datum Data Co. Ltd.,2013-10-22,84322.0,1,84322.0
21178,21178,Datum Data Co. Ltd.,2013-10-24,78542.0,1,78542.0
21139,21139,Datum Data Co. Ltd.,2013-08-27,77625.0,1,77625.0
21166,21166,Datum Data Co. Ltd.,2013-10-08,66030.0,1,66030.0
...,...,...,...,...,...,...
28105,28105,Getty Research Institute Valencia Warehouse,2023-05-20,0.0,1,0.0
28109,28109,Getty Research Institute Valencia Warehouse,2023-05-25,0.0,1,0.0
28107,28107,Getty Research Institute Valencia Warehouse,2023-05-23,0.0,1,0.0
28108,28108,Getty Research Institute Valencia Warehouse,2023-05-24,0.0,1,0.0


In [37]:
pages_stats.to_csv('/Users/elizabethschwartz/Documents/GitHub/ia_scanning_labor_data/metadata-analysis/metadata-records-analysis-csvs/pages_scanned_per_worker_per_day.csv')
workers.to_csv('/Users/elizabethschwartz/Documents/GitHub/ia_scanning_labor_data/metadata-analysis/metadata-records-analysis-csvs/number_workers_per_day.csv')


I think some of these with really high numbers of pages scanned per day are actually microfilm or microfiche. see zack@archive.org: https://archive.org/details/albertahomestead2053cana/page/n647/mode/2up 
the scanner = microfilm something. 
we may need to actually make sure everything is scanned on a SCRIBE MACHINE to be sure of the media type and materiality of the working conditions

we might also be accidentally capturing republishing operators which is not the same thing as a scribe machine operator: https://archive.org/details/futurepresentsit00skee/page/8/mode/2up 

# Turnover Rate
We calculate turnover rates as a measure of days worked per worker. To do so, subtract the last day an operator value appears in the text dataset from the first day it appears.
We are working from the whole texts dataset wherever there is an operator value present for this calculation because media type is less relevant. Only 3,211 records are missing an operator field, so this dataset consists of 9026201 text records

In [6]:
operators = texts[['name', 'operator']]
len(operators)

9029412

In [7]:
len(operators.loc[operators['operator'] != 'nan'])

9029412

In [8]:
operators = operators.loc[operators['operator'] != 'nan']
operators = operators.drop_duplicates()

In [9]:
operators ['days_worked'] = ''
operators['first_day'] = ''
operators['last_day'] = ''

In [10]:
operators = operators.reset_index()

In [11]:
workers_days = texts[['name', 'operator', 'scan_day']]
workers_days['scan_day'] = pd.to_datetime(workers_days['scan_day']).dt.to_period('D')

workers_days = workers_days.loc[workers_days['operator'] != 'nan']
workers_days = workers_days.loc[workers_days['scan_day'] != 'NaT']

workers_days = workers_days.drop_duplicates()


KeyError: "['scan_day'] not in index"

In [44]:
workers_days

Unnamed: 0,name,operator,scan_day
0,Datum Data Co. Ltd.,scanner-shenzhen-leo@archive.org,2013-10-10
1,Datum Data Co. Ltd.,scanner-shenzhen-mary@archive.org,2012-09-08
2,Datum Data Co. Ltd.,scanner-shenzhen-lina@archive.org,2011-07-16
3,Datum Data Co. Ltd.,scanner-shenzhen-dragon@archive.org,2012-08-14
4,Datum Data Co. Ltd.,scanner-shenzhen-yan@archive.org,2012-09-21
...,...,...,...
9028837,New York Botanical Garden,operator1.nybg@archive.org,2017-12-20
9028844,New York Botanical Garden,operator1.nybg@archive.org,2017-12-07
9028846,New York Botanical Garden,operator1.nybg@archive.org,2017-12-08
9028849,New York Botanical Garden,operator1.nybg@archive.org,2017-05-11


In [45]:
def get_days_worked(scan_center, worker): 
    min_date = workers_days.loc[ (workers_days['name'] == scan_center) & (workers_days['operator'] == worker)]['scan_day'].min()
    max_date = workers_days.loc[ (workers_days['name'] == scan_center) & (workers_days['operator'] == worker)]['scan_day'].max()
    return{'days_worked':(max_date - min_date)/ pd.Timedelta(days=1),
          'min_date': min_date,
          'max_date': max_date}
#     return min_date

In [46]:
i = 0 
for i in range(len(operators)):
    days_worked = get_days_worked(operators.iloc[i]['name'], operators.iloc[i]['operator'])
    operators.loc[i, 'days_worked'] = days_worked['days_worked']
    operators.loc[i, 'first_day'] = days_worked['min_date']
    operators.loc[i, 'last_day'] = days_worked['max_date']


In [47]:
operators

Unnamed: 0,index,name,operator,days_worked,first_day,last_day
0,0,Datum Data Co. Ltd.,scanner-shenzhen-leo@archive.org,2045.0,2010-08-26,2016-04-01
1,1,Datum Data Co. Ltd.,scanner-shenzhen-mary@archive.org,886.0,2010-12-23,2013-05-27
2,2,Datum Data Co. Ltd.,scanner-shenzhen-lina@archive.org,666.0,2010-12-24,2012-10-20
3,3,Datum Data Co. Ltd.,scanner-shenzhen-dragon@archive.org,242.0,2012-02-21,2012-10-20
4,4,Datum Data Co. Ltd.,scanner-shenzhen-yan@archive.org,253.0,2012-05-09,2013-01-17
...,...,...,...,...,...,...
3206,9024459,Hong Kong,scanner-shenzhen-wei@archive.org,0.0,2012-03-26,2012-03-26
3207,9024460,Hong Kong,associate-hedawei@archive.org,0.0,2013-11-06,2013-11-06
3208,9024461,Hong Kong,tracey.g@archive.org,0.0,2011-05-19,2011-05-19
3209,9028612,Hamilton Public Library,associate-emily-skewes-donaldson@archive.org,44.0,2023-04-26,2023-06-09


In [48]:
operators.to_csv('/Users/elizabethschwartz/Documents/GitHub/ia_scanning_labor_data/metadata-analysis/metadata-records-analysis-csvs/days_worked_per_worker.csv')


In [52]:
avg_days_worked = operators[['name', 'days_worked']].groupby(['name']).mean()
avg_days_worked.to_csv(dir+"avg_turnover_per_center.csv")

In [50]:
avg_days_worked.sort_values(by="days_worked", ascending = False)

Unnamed: 0_level_0,days_worked
name,Unnamed: 1_level_1
"BYU, Provo",2027.864407
Brown University,1478.857143
New York Botanical Garden,1193.0
American Numismatic Society,1169.333333
American Printing House for the Blind,1114.5
...,...
Servants of knowledge,82.5
Hamilton Public Library,44.0
Research Institute of Korean Studies,35.0
Zhejiang University,0.0


In [85]:
operators[['name', 'days_worked']].groupby(['name']).std().sort_values(by="days_worked", ascending = False)

Unnamed: 0_level_0,days_worked
name,Unnamed: 1_level_1
"BYU, Provo",1957.236594
Duke University,1320.185067
Brown University,1319.334356
National Agricultural Library,1164.229154
University of Toronto,1030.907080
...,...
New York Botanical Garden,
Peabody Essex Museum,
Perkins School for the Blind,
Trent University,
