# Goals

# Libraries

In [1]:
import glob 
import csv
import pandas as pd
import os

# Parameters & Directories

In [2]:
%config InlineBackend.figure_format='retina'
pd.set_option('mode.chained_assignment', None)

In [3]:
DIR = os.path.dirname(os.getcwd()) + "/"
DATA = DIR + "data/"

# Process RIS files
Read in and parse each RIS file.

In [4]:
# Records are separated by 200 x '-'
record_split = "-"*200

# Grab all files RIS files
files = glob.glob(DATA + "raw/worldcat_ris_files/" + "DirectExport*")

# Initialize a csv to write RIS data into
with open(DATA + "interim/" + "19c_journals.csv", 'w') as csvf:
    writer = csv.writer(csvf)
    
    # Initialize our columns
    writer.writerow(['title', 'auth', 'lang', 'year', 'start', 'end', 'soc', 'stats'])
    
    # Loop thru, open, read, and parse each file
    for file in files:
        with open(file, 'r') as f:
            records = f.read()
            records = records.split(record_split)
            records.pop() # delete last empty record
            
            # In each file, loop thru each hit
            record_n = -1
            for record in records: 
                record_n += 1
                line_n = -1
                lines = record.split('\n')
                title = ""
                year = ""
                auth = ""
                lang = ""
                
                # In each record, loop thru each line
                for line in lines:
                    line_n += 1
                    
                    # Is there a title? If so, clean it up.
                    if "Title:" in line:
                        title = line.split("Title:")[-1]
                        title = title.strip()
                        title = title.replace("         ", " ")
                        if ":" in title: 
                            title += lines[line_n + 1]
                            
                    # Is there a year? If so, clean it up.
                    if "Year:" in line: 
                        year = line.split(":")[-1].strip()
                        
                        # data errors; mark as missing
                        if ("s" or "?") in year: 
                            start = -99
                            end = - 99
                            
                        # ranges; grab start and end
                        elif "-" in year: 
                            start = year.split("-")[0]
                            if len(year.split("-")[1]) == 4:
                                end = year.split("-")[1]
                            else: 
                                end = 1914
                                
                        # if no range, then one-year pub
                        else: 
                            start = year
                            end = year
                            
                    # Is there an auth, lang, and subject code? 
                    # If so, clean them up.
                    if "Corp Author(s):" in line:
                        auth = line.split(":")[-1].strip()
                    if "Language:" in line: 
                        lang = line.split(":")[-1].strip()
                    if "Descriptor:" in line: 
                        if "Social science" in line:
                            soc_sci = 1
                        elif "Statistics" in line:
                            stats = 1
                        else: 
                            soc_sci = 0
                            stats = 0
                            
                        # lines sometimes get broken; so, peak to next line
                        # this line wouldn't get detected in the above bc
                        # the above looks for 'descriptor'
                        if soc_sci == 0 and stats == 0:
                            if "Social sciences" in lines[line_n + 1]:
                                soc_sci = 1
                            elif "Statistics" in lines[line_n + 1]:
                                stats = 1
                                
                # write row in csv
                writer.writerow([title, auth, lang, year, start, end, soc_sci, stats])

In [5]:
df = pd.read_csv(DATA + "interim/" + "19c_journals.csv")
df.sample(5)

Unnamed: 0,title,auth,lang,year,start,end,soc,stats
7892,Monthly bulletin of the Missouri Weather Servi...,"Missouri Weather Service, ; issuing body.; Uni...",English,1800s-1896,-99,-99,0,1
4805,Monthly statement /,Illinois State Reformatory.,English,1800s-,-99,-99,1,1
905,Revue catholique sociale et juridique,,French,1897-1920,1897,1920,1,0
6744,Utah. Bureau of Statistics.; Report of the Bur...,Utah. Bureau of Statistics.,English,1901-1906,1901,1906,0,0
674,United States. Weather Bureau.; Climatological...,United States.; Weather Bureau.,English,1914-1948,1914,1948,0,1


In [6]:
df.shape

(8928, 8)

# Clean journal-level data
Filter out unknown. years and duplicates

In [7]:
df_filtered = df[df['start'] != -99]
df_filtered.shape

(6170, 8)

In [8]:
df_filtered.drop_duplicates(subset=['title', 'auth', 'start'], 
                            keep='first', 
                            inplace=True, 
                            ignore_index=True)
df_filtered.shape

(5196, 8)

Replace csv

In [9]:
df_filtered.to_csv(DATA + "interim/" + "19c_journals.csv")

# Create world level annual time series

Count how many stats and soc pubs per year

In [10]:
stats_journals = {}
soc_journals = {}
for year in range(1803, 1915):
    if year not in (stats_journals or soc_journals): 
        stats_journals[year] = 0
        soc_journals[year] = 0
    for index, journal in df_filtered.iterrows():
        if journal['soc'] == 1: 
            if year in range(journal['start'], journal['end']):
                soc_journals[year] += 1
        if journal['stats'] == 1:
            if year in range(journal['start'], journal['end']):
                stats_journals[year] += 1

Create a data frame with count dicts

In [11]:
years_df = pd.DataFrame.from_dict(stats_journals, 
                                  orient="index", 
                                  columns=["wy_stats_journals"])

years_df['year'] = years_df.index
years_df.reset_index(drop=True, inplace=True)
years_df['wy_soc_journals'] = years_df['year'].map(soc_journals)
years_df.sample(5)

Unnamed: 0,wy_stats_journals,year,wy_soc_journals
35,87,1838,22
21,31,1824,7
55,312,1858,120
3,13,1806,5
109,2138,1912,831


# Export world-level time series

In [12]:
years_df.to_csv(DATA + "interim/" + "19c_journals_wy.csv", index=False)