In [1]:
import os
import re
import numpy as np

In [2]:
from textbook_analysis.helpers import *
import nltk

def get_sentences(book, remove_stopwords=False, remove_numeric=False, remove_short=False):
    sents = nltk.sent_tokenize(book)
    return [' '.join(clean_text(s, remove_stopwords=remove_stopwords, remove_numeric=remove_numeric, remove_short=remove_short)) for s in sents]

In [3]:
chapters_directory = 'final_textbook_chapters/'
years_directory = 'final_textbook_years/'
directory = os.fsencode(chapters_directory)

In [4]:
year_range_regex = re.compile(r'[12][0-9][0-9][0-9][-–][12][0-9][0-9][0-9]')
present_range_regex = re.compile(r'[12][0-9][0-9][0-9][-–]\bPresent\b')
year_regex = re.compile(r'[12][0-9][0-9][0-9]')

In [5]:
year_to_chapters = dict()
for year in range(1300, 2020, 50):
    year_to_chapters[year] = []
    os.makedirs(years_directory + str(year), exist_ok=True)

In [6]:
year_to_chapters

{1300: [],
 1350: [],
 1400: [],
 1450: [],
 1500: [],
 1550: [],
 1600: [],
 1650: [],
 1700: [],
 1750: [],
 1800: [],
 1850: [],
 1900: [],
 1950: [],
 2000: []}

In [7]:
chapter_start_lines = 5

for title in os.listdir(directory):
    title_name = os.fsdecode(title)
    subdir_name = chapters_directory + title_name + '/'
    subdir = os.fsencode(subdir_name)
    for file in os.listdir(subdir):
        filename = os.fsdecode(file)
        with open(subdir_name + filename, 'r') as textbook_reader:
            # 1. Find the average year mentioned in the chapter
            years = []
            avg_year = None
            line_count = 1
            all_lines = textbook_reader.readlines()
            for line in all_lines:
                if line_count <= chapter_start_lines: # check for year range in start of chapter
                    present_range = present_range_regex.search(line) # set avg_year to 2000 if <year>-Present is found
                    if present_range:
                        avg_year = 2000
                        break
                    year_range = year_range_regex.search(line)
                    if year_range: # set avg_year to average of year range found
                        start_year = int(year_range.group()[:4])
                        end_year = int(year_range.group()[5:])
                        avg_year = (start_year + end_year)/2
                        break
                else: # otherwise, add any year to list to be averaged
                    year_range = year_range_regex.search(line)
                    if year_range:
                        start_year = int(year_range.group()[:4])
                        end_year = int(year_range.group()[5:])
                        years.append((start_year + end_year)/2)
                    else:
                        year = year_regex.search(line)
                        # avoid adding page numbers
                        if year and line.split()[0] != year.group() and 'chapter' not in line.lower():
                            years.append(int(year.group()))
                line_count += 1
            if not avg_year:
                avg_year = np.median(years)
                
            # 2. Determine which bucket the average year falls in
            century = int(avg_year // 100 * 100)
            half_century = 50 if avg_year % 100 > 50 else 0
            bucket_year = century + half_century
            
            # 3. Write chapter text into years_directory --> bucket_year --> textbook_name
            with open(years_directory + str(bucket_year) + '/' + title_name + ".txt", 'a') as textbook_writer:
                lines = []
                text = '\n'.join(all_lines)

                for sent in get_sentences(text):
                    if len(sent) >= 10:
                        lines.extend([sent + '.\n'])

                textbook_writer.writelines(lines)
                    
            year_to_chapters[bucket_year].append('\n'.join(all_lines))

In [8]:
for bucket_year, chapters in year_to_chapters.items():
    print(bucket_year, len(chapters))

1300 1
1350 0
1400 2
1450 1
1500 9
1550 1
1600 9
1650 11
1700 12
1750 43
1800 82
1850 83
1900 77
1950 50
2000 8


In [9]:
os.makedirs(years_directory + 'all_textbooks', exist_ok=True)
for bucket_year, chapters in year_to_chapters.items():
    file = open(years_directory + 'all_textbooks/' + str(bucket_year) + ".txt", "w")
    lines = []
    text = '\n'.join(chapters)
    
    for sent in get_sentences(text):
        if len(sent) >= 10:
            lines.extend([sent + '.\n'])
        
    file.writelines(lines)
    file.close()