In [1]:
import csv
import os
from urllib.parse import urljoin

import pandas as pd
import requests
import spacy
from bs4 import BeautifulSoup

# Listing page of presidential speeches; the query string restricts the list
# to a single president via field_president_target_id.
url = 'https://millercenter.org/the-presidency/presidential-speeches?field_president_target_id[8396]=8396'

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of parsing an error page.
page = requests.get(url, timeout=30)
page.raise_for_status()
soup = BeautifulSoup(page.text, 'html.parser')

# Collected transcript texts; filled by the scraping loop further down.
speeches = []

# Each "views-field-title" div holds the title and link of one speech.
# find_all is the current name of the deprecated findAll alias.
transcripts = soup.find_all("div", attrs={"class": "views-field-title"})

Scrape the first 24 speech transcripts and save each one as a txt file in the output directory

In [None]:
output_dir = 'Trump_speech_txt_files'
os.makedirs(output_dir, exist_ok=True)

MAX_SPEECHES = 24  # number of listing entries to download


def _first_text(parsed_page, tag, css_class):
    """Return the stripped text of the first `tag` element with class `css_class`.

    Returns None when the page contains no such element, so the caller can
    decide how to handle a page with an unexpected layout instead of crashing
    on `None.text`.
    """
    node = parsed_page.find(tag, attrs={'class': css_class})
    return node.text.strip() if node is not None else None


for transcript in transcripts[:MAX_SPEECHES]:
    anchor = transcript.find('a')
    if anchor is None or not anchor.get('href'):
        print(f"Skipping listing entry without a link: {transcript.text.strip()!r}")
        continue

    # urljoin leaves absolute links untouched and resolves relative ones
    # against the listing URL, so both href styles can be fetched.
    link_url = urljoin(url, anchor['href'])
    link_response = requests.get(link_url, timeout=30)
    if not link_response.ok:
        print(f"Skipping {link_url}: HTTP {link_response.status_code}")
        continue
    link_html_content = link_response.content
    link_soup = BeautifulSoup(link_html_content, 'html.parser')

    # The transcript and the date are required to write the file; skip the
    # speech (with a message) rather than abort the whole run if either is missing.
    link_text = _first_text(link_soup, "div", "transcript-inner")
    date = _first_text(link_soup, "p", "episode-date")
    if link_text is None or date is None:
        print(f"Skipping {link_url}: transcript text or date not found")
        continue
    speeches.append(link_text)

    # Metadata of the current speech; not used for the txt file itself but kept
    # available as notebook variables (None when the element is absent).
    SpeechTitle = transcript.text.strip()
    URL = link_url
    president = _first_text(link_soup, "p", "president-name")
    summary = _first_text(link_soup, "div", "about-sidebar--intro")

    # Constructing the filename: "<title>_<date with underscores>.txt".
    filename = f"{SpeechTitle}_{date.replace(' ', '_')}.txt"
    # A path separator inside the title would turn the name into a sub-path.
    # NOTE(review): ':' is kept because the annotation cell splits the filename
    # on ':' to recover the title; such names are not valid on Windows.
    filename = filename.replace('/', '-').replace('\\', '-')
    filepath = os.path.join(output_dir, filename)

    # Writing the transcript to the txt file
    with open(filepath, 'w', encoding='utf-8') as txt_file:
        txt_file.write(link_text)


Load spaCy English model

In [None]:
# Small English pipeline; it must already be installed in the environment.
nlp = spacy.load(name="en_core_web_sm")

Input and output directories

In [None]:
# Destination of the annotated corpus, and the folder of txt transcripts it is built from.
output_csv_path = 'annotated_corpus.csv'
input_dir = 'Trump_speech_txt_files'

Create a list to store data

In [6]:
# One dict per annotated document is collected here.
corpus_data = list()

Iterate through the txt files and annotate each one with spaCy (tokens, lemmas, and part-of-speech tags)

In [5]:
# Rebuild the corpus from scratch so that re-running this cell never
# appends the same documents a second time.
corpus_data = []

# os.listdir() returns entries in arbitrary order; sorting makes the row
# order of the resulting corpus reproducible.
for filename in sorted(os.listdir(input_dir)):
    if not filename.endswith(".txt"):
        continue
    filepath = os.path.join(input_dir, filename)

    # Read the content of the txt file
    with open(filepath, 'r', encoding='utf-8') as txt_file:
        document_text = txt_file.read()

    # Use spaCy for tokenization, lemmatization, and POS tagging
    doc = nlp(document_text)

    # Collect tokens, lemmas, POS tags and noun lemmas in a single pass
    tokens, lemmas, pos_tags, noun_lemmas = [], [], [], []
    for token in doc:
        tokens.append(token.text)
        lemmas.append(token.lemma_)
        pos_tags.append(token.pos_)
        if token.pos_ == 'NOUN':
            noun_lemmas.append(token.lemma_)

    # Extract title: the text after the last ':' and before the first '_'.
    # strip() removes the space that follows the ':' in the filename.
    title = filename.split(':')[-1].split('_')[0].strip()

    # Store the data in a dictionary
    document_data = {
        'Filename': filename,
        'Title': title,
        'Document': document_text,
        'Noun': ' '.join(noun_lemmas),
        'Tokens': tokens,
        'Lemmas': lemmas,
        'Parts-of-speech': pos_tags,
    }

    # Append the dictionary to the corpus_data list
    corpus_data.append(document_data)



Convert the list of dictionaries to a DataFrame

In [7]:
# One row per document; the columns come from the keys of each record dict.
corpus_df = pd.DataFrame(data=corpus_data)

Save the annotated corpus as a CSV file

In [None]:
# index=False leaves the DataFrame's row index out of the file.
corpus_df.to_csv(path_or_buf=output_csv_path, index=False)