# Recalculate Stats for Manifest
Because manifest can't display times on a month to month rolling basis, we need to recalculate stats for the whole dataset. 
We can do this by getting full data (not reflective of time) and getting data for each center per year so that we can see the evolution of the supply chain overtime. 

In [7]:
import pandas as pd
import os

In [3]:
texts = pd.read_csv('/Volumes/Samsung_T5/scanning_labor_in_IA/geocoded-texts-data.csv', low_memory=False).drop(columns=['Unnamed: 0.3', 'Unnamed: 0.2', 'Unnamed: 0.1', 'Unnamed: 0'])

## Create Subdirectories for Centers

In [6]:
center_names = texts['name'].unique()

In [8]:
file_path = "/Users/elizabethschwartz/Documents/GitHub/ia_scanning_labor_data/metadata-analysis/metadata-records-analysis-csvs/annual_stats/"

In [11]:
subdirs = []
for name in center_names: 
    subdirs.append(name.lower().replace(" ", "_").replace('.','').replace(',',''))

In [12]:
for this_dir in subdirs: 
    

['datum_data_co_ltd',
 'hong_kong',
 'innodata_knowledge_services_inc',
 'university_of_alberta',
 'internet_archive_headquarters',
 'ucla',
 'allen_county_public_library_geneaology_center',
 'british_library',
 'university_of_toronto',
 'princeton_university',
 'boston_public_library',
 'national_agricultural_library',
 'library_of_congress',
 'columbia_university',
 'unc_chapel_hill',
 'internt_archive_physical_archive',
 'uiuc',
 'national_library_of_scotland',
 'san_francisco_public_library',
 'university_of_maryland_college_park',
 'north_carolina_state_university',
 'byu_provo',
 'smithsonian_libraries_and_archives',
 'georgetown_university',
 'internet_archive_sheridan_headquarters',
 'duke_university',
 'brown_university',
 'natural_history_museum_library_london',
 'the_archive_of_contemporary_music',
 'byu_hawaii',
 'byu_idaho_family_history_library',
 'getty_research_institute_valencia_warehouse',
 'university_of_florida',
 'the_ohio_state_university',
 'american_numismatic_s

## Yearly Stats

In [18]:
# starting with just one center, datum data! 
datum_data = texts.where(texts['name'] == "Datum Data Co. Ltd.").dropna(how="all")

In [25]:
# number of workers ever working 
len(datum_data['operator'].unique().tolist())

159

### Scans per year

In [43]:
texts['year'] = pd.to_datetime(texts['scandate']).dt.to_period('Y').astype('str')

In [44]:
annual_scans = texts[['name', 'year', 'identifier']]


In [45]:
annual_scans

Unnamed: 0,name,year,identifier
0,Datum Data Co. Ltd.,2013,geometrysuccessi00thom
1,Datum Data Co. Ltd.,2012,berdiemobilittde00hebe
2,Datum Data Co. Ltd.,2011,beilsteinshandb28beil
3,Datum Data Co. Ltd.,2012,theoryapplicatio00adva
4,Datum Data Co. Ltd.,2012,specialeditionus00edbo_0
...,...,...,...
9029407,Perpustakaan Provinsi Bali,2012,pratekaning-wong-mara-mandeg
9029408,Perpustakaan Provinsi Bali,2012,kramaning-caru-bang-bonggalan
9029409,Perpustakaan Provinsi Bali,2012,puja-parikrama-panigang-sasih
9029410,Perpustakaan Provinsi Bali,2012,puja-pancasanak-buda


In [46]:
annual_scans = annual_scans.groupby(['name', 'year']).count().reset_index()
annual_scans = annual_scans.loc[annual_scans['year'] != 'NaT']



In [47]:
annual_scans = annual_scans.rename(columns={'name':'name', 'year': 'year', 'identifier':'books_scanned'})

In [50]:
annual_scans.to_csv("/Users/elizabethschwartz/Documents/GitHub/ia_scanning_labor_data/metadata-analysis/metadata-records-analysis-csvs/annual_scans_per_center.csv")

### Pages per Year

In [64]:
pages_scan = texts[['name', 'year', 'imagecount']].dropna(subset=['imagecount'])



In [65]:
pages_scan.groupby(['name', 'year']).sum().reset_index()



Unnamed: 0,name,year,imagecount
0,1 Dollar Scan,NaT,627.0
1,Allen County Public Library Geneaology Center,2002,1366.0
2,Allen County Public Library Geneaology Center,2008,1933243.0
3,Allen County Public Library Geneaology Center,2009,3488177.0
4,Allen County Public Library Geneaology Center,2010,3919992.0
...,...,...,...
599,Yiddish Book Center,2011,12854.0
600,Yiddish Book Center,2012,117137.0
601,Yiddish Book Center,2013,58942.0
602,Yiddish Book Center,NaT,29945.0


In [68]:
pages_scan = pages_scan.drop(pages_scan[pages_scan.year == "NaT"].index)

Unnamed: 0,name,year,imagecount
0,Datum Data Co. Ltd.,2013,200.0
1,Datum Data Co. Ltd.,2012,250.0
2,Datum Data Co. Ltd.,2011,962.0
3,Datum Data Co. Ltd.,2012,590.0
4,Datum Data Co. Ltd.,2012,888.0
...,...,...,...
9029156,Perpustakaan Provinsi Bali,2012,58.0
9029180,Perpustakaan Provinsi Bali,2012,71.0
9029255,Perpustakaan Provinsi Bali,2012,25.0
9029264,Perpustakaan Provinsi Bali,2012,114.0


In [69]:
pages_scan.to_csv("/Users/elizabethschwartz/Documents/GitHub/ia_scanning_labor_data/metadata-analysis/metadata-records-analysis-csvs/annual_pages_scanned_per_center.csv")
