# Chronicling America API Assignment
In this assignment, you are tasked with:
* searching Chronicling America's API for a key word of your choice
* parsing your results from a dictionary to a `DataFrame` with headings "title", "city", 'date", and "raw_text"
* processing the raw text by removing "\n" characters, stopwords, and then lemmatizing the raw text before adding it to a new column called "lemmas."
* saving your `DataFrame` to a csv file

If you need any help with this assignment please email micah.saxton@tufts.edu


In [1]:
# imports
import requests
import json
import math
import pandas as pd
import spacy

In [2]:
# initial search
url = 'https://chroniclingamerica.loc.gov/search/pages/results/?state=&date1=1960&date2=1963&proxtext=second+amendment&x=20&y=20&dateFilterType=yearRange&rows=20&searchType=basic&format=json'
response = requests.get(url)
raw = response.text
results = json.loads(raw)

In [3]:
# find total amount of pages
total_pages = math.ceil(results['totalItems'] / results['itemsPerPage'])
print(total_pages)

7


In [None]:
# query the api and save to dict 
data = []
start_date = '1960'
end_date = '1963'
search_term = 'second amendment'
for i in range(1, 8): 
    url = (f'https://chroniclingamerica.loc.gov/search/pages/results/?state=&date1={start_date}'
           f'&date2={end_date}&proxtext={search_term}&x=20&y=20&dateFilterType=yearRange&rows=20'
           f'&searchType=basic&format=json&page={i}')  # f-string
    response = requests.get(url)
    raw = response.text
    print(response.status_code)
    results = json.loads(raw)
    items_ = results['items']
    for item_ in items_:
        temp_dict = {}
        temp_dict['title'] = item_['title_normal']
        temp_dict['city'] = item_['city']
        temp_dict['date'] = item_['date']
        temp_dict['raw_text'] = item_['ocr_eng']
        data.append(temp_dict)

In [4]:
# I tried reducing the amount of data as much as possible, and it still timed out. I even made it so that only one article would appear, but it still timed out. I am just going to do it manually for now until I meet with Peter.
with open('../_exercises/second-amendment.json', 'r') as f:
   data = json.load(f)

JSONDecodeError: Extra data: line 1 column 358715 (char 358714)

In [None]:
# convert dict to dataframe
df = pd.DataFrame.from_dict(data)
df.head()

In [None]:
# convert date column from string to date-time object
df['date'] = pd.to_datetime(df['date'])

In [None]:
# sort by date
sorted_df = df.sort_values(by='date')

In [None]:
# write fuction to process text
nlp = spacy.load("en_core_web_sm")
nlp.disable_pipes('ner', 'parser')

def process_text(text):
    text = text.replace('\n', ' ')
    doc = nlp(text)
    tokens = [token for token in doc]
    no_stops = [token for token in tokens if not token.is_stop]
    no_punct = [token for token in no_stops if token.is_alpha]
    lemmas = [token.lemma_ for token in no_punct]
    lemmas_lower = [lemma.lower() for lemma in lemmas]
    lemmas_string = ' '.join(lemmas_lower)
    return lemmas_string

In [None]:
# apply process_text function to raw_text column
sorted_df['lemmas'] = sorted_df['raw_text'].apply(process_text)

In [None]:
# save to csv
sorted_df.to_csv(f'../_exercises/{search_term}{start_date}-{end_date}.csv', index=False)