# Recalculate Stats for Manifest
Because Manifest can't display times on a month-to-month rolling basis, we need to recalculate stats for the whole dataset.
We can do this by computing statistics over the full data (not broken down by time) and by computing statistics for each center per year, so that we can see the evolution of the supply chain over time.

In [7]:
import pandas as pd
import os

In [3]:
# Load the geocoded texts export, then discard the 'Unnamed: 0*' columns
# (they look like index columns left over from earlier CSV round-trips -
# TODO confirm).
texts = (
    pd.read_csv(
        '/Volumes/Samsung_T5/scanning_labor_in_IA/geocoded-texts-data.csv',
        low_memory=False,
    )
    .drop(
        ['Unnamed: 0.3', 'Unnamed: 0.2', 'Unnamed: 0.1', 'Unnamed: 0'],
        axis='columns',
    )
)

## Create Subdirectories for Centers

In [6]:
# Distinct values of the 'name' column, in order of first appearance.
center_names = pd.unique(texts['name'])

In [8]:
# Root folder for the per-center annual statistics. The trailing slash is
# significant: directory names are appended to this string directly.
file_path = (
    "/Users/elizabethschwartz/Documents/GitHub/ia_scanning_labor_data/"
    "metadata-analysis/metadata-records-analysis-csvs/annual_stats/"
)

In [11]:
# Derive a directory name for every center: lowercase, spaces turned into
# underscores, periods and commas stripped.
subdirs = [
    center.lower().replace(" ", "_").replace('.', '').replace(',', '')
    for center in center_names
]

In [13]:
# Create one output directory per center underneath `file_path`.
# os.makedirs(..., exist_ok=True) keeps this cell re-runnable: os.mkdir raised
# FileExistsError on a second run (or when two center names normalise to the
# same directory name) and FileNotFoundError when the parent folder was missing.
for this_dir in subdirs:
    os.makedirs(file_path + this_dir, exist_ok=True)
    

In [14]:
def get_path(center_name):
    """Return the output directory path for a center.

    The center name is lowercased, spaces become underscores, and periods and
    commas are removed; the result is appended to the module-level
    ``file_path`` prefix.
    """
    # One translation table covers all three substitutions in a single pass.
    slug_table = str.maketrans(" ", "_", ".,")
    return file_path + center_name.lower().translate(slug_table)

## Yearly Stats

In [18]:
# Starting with just one center: Datum Data.
# Select the rows with a boolean mask rather than where(...).dropna(how="all"):
# `where` first blanks every non-matching row to NaN across the whole frame,
# which upcasts integer columns to float and builds a full-size temporary.
# .copy() keeps the result independent of `texts`, as the original was.
datum_data = texts.loc[texts['name'] == "Datum Data Co. Ltd."].copy()

In [25]:
# Number of distinct operators ever recorded for this center (a missing
# operator value, if present, is counted as one entry).
datum_data['operator'].nunique(dropna=False)

159

### Scans per year

In [17]:
# Tag every record with the year of its scan date, stored as a string.
# Rows without a scan date end up with the literal string 'NaT', which the
# aggregation steps below filter out.
texts['year'] = (
    pd.to_datetime(texts['scandate'])
    .dt.to_period('Y')
    .astype(str)
)

In [18]:
# Narrow the frame to the columns needed to count scanned items per center
# per year.
annual_scans = texts.loc[:, ['name', 'year', 'identifier']]


In [19]:
# Preview the selected columns before aggregating.
annual_scans

Unnamed: 0,name,year,identifier
0,Datum Data Co. Ltd.,2013,geometrysuccessi00thom
1,Datum Data Co. Ltd.,2012,berdiemobilittde00hebe
2,Datum Data Co. Ltd.,2011,beilsteinshandb28beil
3,Datum Data Co. Ltd.,2012,theoryapplicatio00adva
4,Datum Data Co. Ltd.,2012,specialeditionus00edbo_0
...,...,...,...
9029407,Perpustakaan Provinsi Bali,2012,pratekaning-wong-mara-mandeg
9029408,Perpustakaan Provinsi Bali,2012,kramaning-caru-bang-bonggalan
9029409,Perpustakaan Provinsi Bali,2012,puja-parikrama-panigang-sasih
9029410,Perpustakaan Provinsi Bali,2012,puja-pancasanak-buda


In [20]:
# Count identifiers for every (name, year) pair, turn the group keys back
# into ordinary columns, and discard the rows whose year is the string 'NaT'.
annual_scans = (
    annual_scans
    .groupby(['name', 'year'])
    .count()
    .reset_index()
    .loc[lambda counts: counts['year'] != 'NaT']
)



In [21]:
# After the count, 'identifier' holds a per-group total, so give it a
# descriptive label. 'name' and 'year' keep their labels.
annual_scans = annual_scans.rename(columns={'identifier': 'books_scanned'})

In [30]:
# Inspect the per-center, per-year scan counts.
annual_scans

Unnamed: 0,name,year,books_scanned
1,Allen County Public Library Geneaology Center,2002,4
2,Allen County Public Library Geneaology Center,2008,4255
3,Allen County Public Library Geneaology Center,2009,9908
4,Allen County Public Library Geneaology Center,2010,16870
5,Allen County Public Library Geneaology Center,2011,22833
...,...,...,...
618,Washington University in St. Louis,2023,841
620,Yiddish Book Center,2011,80
621,Yiddish Book Center,2012,672
622,Yiddish Book Center,2013,344


In [22]:
# annual_scans.to_csv("/Users/elizabethschwartz/Documents/GitHub/ia_scanning_labor_data/metadata-analysis/metadata-records-analysis-csvs/annual_scans_per_center.csv")

In [25]:
# Write each center's yearly scan counts into that center's own folder.
for center in center_names:
    rows_for_center = annual_scans[annual_scans['name'] == center]
    rows_for_center.to_csv(f"{get_path(center)}/annual_scans.csv")

### Pages per Year

In [36]:
# Keep only the columns needed for page totals and drop records that have
# no image count.
pages_scan = texts[['name', 'year', 'imagecount']].dropna(subset=['imagecount'])

# Long-format preview. id_vars must be a flat list of column labels: passing
# [('name', 'year')] made pandas look for a single column labelled with that
# tuple, which raised
# KeyError: "The following id_vars or value_vars are not present in the DataFrame".
pd.melt(pages_scan, id_vars=['name', 'year'], value_vars=['imagecount'])

In [37]:
# Total the image counts for every (name, year) pair and restore the group
# keys as ordinary columns.
pages_scan = (
    pages_scan
    .groupby(['name', 'year'])
    .sum()
    .reset_index()
)

# Spot-check a single year.
pages_scan[pages_scan['year'] == "2011"]

Unnamed: 0,name,year,imagecount
5,Allen County Public Library Geneaology Center,2011,4564108.0
39,"BYU, Hawaii",2011,28887.0
44,"BYU, Idaho Family History Library",2011,35905.0
53,"BYU, Provo",2011,1853797.0
72,Boston Public Library,2011,5613958.0
99,Brown University,2011,241975.0
116,California State Library,2011,85096.0
126,Church History Library,2011,170733.0
144,Columbia University,2011,1152491.0
148,Datum Data Co. Ltd.,2011,34429269.0


In [38]:
# Remove the aggregate rows whose year is the placeholder string "NaT".
pages_scan = pages_scan[pages_scan['year'] != "NaT"]

In [39]:
# pages_scan.to_csv("/Users/elizabethschwartz/Documents/GitHub/ia_scanning_labor_data/metadata-analysis/metadata-records-analysis-csvs/annual_pages_scanned_per_center.csv")
# Write each center's yearly page totals into that center's own folder.
for center in center_names:
    is_this_center = pages_scan['name'] == center
    pages_scan[is_this_center].to_csv(get_path(center) + '/annual_pages_scanned.csv')