In [1]:
import os
import re
import csv
import nltk
import string
import PyPDF2
import requests
import builtins
import warnings
import wikipedia
import numpy as np
import unicodedata
import pandas as pd
from tqdm import tqdm
import language_tool_python
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
warnings.filterwarnings("ignore")

### The following are helper methods used for Data Preprocessing

In [2]:
lang_tool = language_tool_python.LanguageTool('en-US')

In [3]:
def remove_non_utf8_chars(text):
    pattern = r'[^\x00-\x7F]+'  # Pattern to match non-UTF-8 characters
    return re.sub(pattern, ' ', text)

def grammar_check(text):
    matches = lang_tool.check(text)
    return len(matches)

def remove_encoded_chars(text):
    clean_text = ""
    for c in text:
        if not unicodedata.category(c) == 'Co':
            clean_text+=c
    return clean_text

def check_and_remove(text, tags):
    for tag in tags:
        if tag in text:
            text = text.replace(tag, " ")
    return text

In [4]:
def clean_text(text):

    tags_removed = ['caroline barrière', 'csi5180', 'winter 2024', 'csi4106' , 'fall 2023']
    
    translator = str.maketrans(" ", " ", string.punctuation)
    text = text.translate(translator)
    text = re.sub(r"\s+", " ", text).strip() # Remove extra spaces

    text = text.lower().strip()
    words = word_tokenize(text) 
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    text = " ".join(word for word in words)
    
    text = check_and_remove(text, tags_removed) # REMOVES COMMON OCCURRING TAGS
    text = remove_non_utf8_chars(text) # REMOVES NON UTF-8 CHRACTERS
    text = remove_encoded_chars(text)

    return text

In [5]:
def chunk_text(text, chunk_size=300):

    words = text.split()  # Split the text into a list of words
    chunk = []

    for word in words:
        chunk.append(word)
        if len(chunk) >= chunk_size:
            yield ' '.join(chunk)  # Yield a chunk as a joined string
            chunk = []  # Reset the chunk

    # Yield the final chunk (if any words are left)
    if chunk:
        yield ' '.join(chunk)

In [6]:
folder_path = './'
pdf_files = [f for f in os.listdir(folder_path) if f.endswith('.pdf')]
text_results = {}

### Extract textual data from the course PDFs.

In [7]:
def extract_pdf_text(pdf_file):
    
    with open(pdf_file, 'rb') as file:
        reader = PyPDF2.PdfReader(file)

        # Extract Text (page-by-page)
        text_data = []
        num_pages = len(reader.pages)
        for page_num in range(num_pages):
            page = reader.pages[page_num]
            page_text = page.extract_text()
            page_text = clean_text(page_text)
            
            if len(page_text.split()) > 20:
                text_data.append(page_text)
            else:
                text_data.append("")

        return text_data

In [8]:
for file_name in pdf_files:
    
    file_path = os.path.join(folder_path, file_name)
    try:
        text_data = extract_pdf_text(file_path)
        combined_text = " ".join(text_data)
        text_results[file_name] = combined_text
        
    except PyPDF2.errors.PdfReadError:
        print(f"Error processing file (pdferror): {file_name}")

In [9]:
pdf_text_dataframe = pd.DataFrame(columns=['text', 'text_length','gram_score'])


for texts in text_results.values():
    chunks = chunk_text(texts)
    
    for t in chunks:
        row = [t, len(t.split()) ,grammar_check(t)]
        pdf_text_dataframe.loc[len(pdf_text_dataframe)] = row
        

pdf_text_dataframe.to_csv("PDF_file_text_300.csv", index= False)

### Extract textual data from URLs referenced in the course PDFs.

In [10]:
pdf_urls = [] # Stores the URLs that point to a PDF file
forbidden_urls = [] # Keeps track of the URLs that could not be accessed

In [11]:
# Returns a list of all the unique URLs
def extract_urls(pdf_file):
    
    urls = []
    with open(pdf_file, 'rb') as file:
        
        reader = PyPDF2.PdfReader(file)
        num_pages = len(reader.pages)

        for page_num in range(num_pages):
            page = reader.pages[page_num]
            objects = page.get('/Annots', {})
            if not objects:
                continue
                
            for obj in objects:
                annot = obj.get_object()
                
                if annot['/Subtype'] == '/Link' and '/A' in annot:
                    if '/URI' in annot['/A']:
                        url = annot['/A'].get_object()['/URI']
                        urls.append(url)
                    elif '/S' in annot['/A'] and annot['/A']['/S'] == '/URI':    
                        url = annot['/A']['/URI']
                        urls.append(url)
                
    urls = list(set(urls)) # Avoids repetation of URLs
    return urls

In [12]:
# Gets texts from webpages and cleans it for processing and updates the the PDF URLs.
def extract_text_from_url(url):
    
    if is_pdf_url_headers(url):
        return ""
    else:
        text = extract_text_from_webpage(url)

    lines = (line.strip() for line in text.splitlines())
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    text = " ".join(chunk for chunk in chunks if chunk)
    
    text = clean_text(text)
    return text

In [13]:
# Extracts and returns text from a webpage
def extract_text_from_webpage(url):
    
    extracted_text = " "
    try:
        response = requests.get(url, timeout=5)
        response.raise_for_status()  # Raise an exception for error status codes

        soup = BeautifulSoup(response.content, 'html.parser', from_encoding="iso-8859-1")
        for element in soup(["script", "style"]):
            element.extract() # Remove script and style elements as they often contain non-content text

        content_elements = soup.find_all(['p']) 
        for element in content_elements:
            extracted_text += element.get_text()
        
    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL: {e}")
        forbidden_urls.append(url)
        extracted_text+= " "

    except requests.exceptions.Timeout as e:
        print(f"Error fetching URL: {e}")
        forbidden_urls.append(url)
        extracted_text+= " "
        
    return extracted_text

In [14]:
# Checks if the given URL points to a PDF or not
def is_pdf_url_headers(pdf_url):
    try:
        response = requests.head(pdf_url, timeout=5)
        content_type = response.headers.get('content-type')
        return content_type == 'application/pdf'
    
    except requests.exceptions.RequestException:
        return False 

In [15]:
# GET LIST OF ALL URLs
url_list = []

for file_name in tqdm(pdf_files):

    file_path = os.path.join(folder_path, file_name)
    try:
        urls = extract_urls(file_path)
        for i in urls:
            url_list.append(i)
    except PyPDF2.errors.PdfReadError:
        print(f"Error processing file (pdferror): {file_name}")

100%|███████████████████████████████████████████| 17/17 [00:00<00:00, 72.99it/s]


In [16]:
# GET TEXT FROM URLS
url_results = {}
url_dataframe = pd.DataFrame(columns=['text', 'text_length','gram_score'])


for url in tqdm(url_list):
    
    if is_pdf_url_headers(url):
        pdf_urls.append(url)
        continue
    
    article_texts = extract_text_from_url(url) # Input: URL; Output: Text retrieved from the link 
    chunks = chunk_text(article_texts)
    
    for t in chunks:
        row = [t, len(t.split()) ,grammar_check(t)]
        url_dataframe.loc[len(url_dataframe)] = row
            
        
url_dataframe.to_csv("URL_text_300.csv", index= False)

  1%|▎                                          | 2/249 [00:04<07:55,  1.92s/it]

Error fetching URL: 403 Client Error: Forbidden for url: https://www.datacamp.com/tutorial


  4%|█▌                                         | 9/249 [00:23<07:05,  1.77s/it]

Error fetching URL: 403 Client Error: Forbidden for url: https://www.researchgate.net/figure/Illustration-of-the-BERT-model-for-joint-intent-detection-and-slot-filling_fig3_359661835


  4%|█▋                                        | 10/249 [00:24<05:28,  1.38s/it]

Error fetching URL: 403 Client Error: Forbidden for url: https://www.researchgate.net/figure/Basic-models-for-fingerprint-verification-and-identification-processes_fig1_215585741


  4%|█▊                                        | 11/249 [00:24<04:17,  1.08s/it]

Error fetching URL: 403 Client Error: Forbidden for url: https://www.researchgate.net/figure/The-general-pipeline-for-face-verification-in-this-paper-where-classifier-loss-function_fig6_323025952


  9%|███▋                                      | 22/249 [00:49<08:33,  2.26s/it]

Error fetching URL: 403 Client Error: Forbidden for url: https://learnopencv.com/face-recognition-an-introduction-for-beginners/


  9%|███▉                                      | 23/249 [00:49<06:28,  1.72s/it]

Error fetching URL: 403 Client Error: Forbidden for url: https://www.researchgate.net/figure/Basic-models-for-fingerprint-verification-and-identification-processes_fig1_215585741


 11%|████▌                                     | 27/249 [00:57<06:25,  1.74s/it]

Error fetching URL: 403 Client Error: Forbidden for url: https://www.researchgate.net/figure/The-general-pipeline-for-face-verification-in-this-paper-where-classifier-loss-function_fig6_323025952


 12%|████▉                                     | 29/249 [01:04<08:18,  2.27s/it]

Error fetching URL: 403 Client Error: Forbidden for url: https://www.researchgate.net/figure/Singular-points-and-minutiae-types-in-a-fingerprint_fig5_306328339


 12%|█████                                     | 30/249 [01:05<06:49,  1.87s/it]

Error fetching URL: 403 Client Error: Forbidden for url: https://www.iqt.org/voices-at-speech-odyssey-2020-advances-in-speaker-embeddings/


 13%|█████▍                                    | 32/249 [01:21<20:46,  5.74s/it]

Error fetching URL: HTTPSConnectionPool(host='www.embedded.com', port=443): Read timed out. (read timeout=5)


 15%|██████▏                                   | 37/249 [01:36<10:11,  2.89s/it]

Error fetching URL: 403 Client Error: Forbidden for url: https://voicebot.ai/2018/03/21/data-breakdown-consumers-use-smart-speakers-today/


 21%|████████▊                                 | 52/249 [02:38<19:13,  5.85s/it]

Error fetching URL: 410 Client Error: Gone for url: https://medium.com/ibm-data-ai/ibm-watson-text-to-speech-neural-voices-added-to-service-e562106ff9c7


 22%|█████████▍                                | 56/249 [02:52<11:40,  3.63s/it]

Error fetching URL: 403 Client Error: Forbidden for url: https://www.explainthatstuff.com/how-speech-synthesis-works.html
Error fetching URL: HTTPSConnectionPool(host='text-to-speech-demo.ng.bluemix.net', port=443): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x16adc2e30>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))


 24%|██████████                                | 60/249 [02:59<07:36,  2.41s/it]

Error fetching URL: 403 Client Error: Forbidden for url: https://venturebeat.com/2019/04/16/how-to-prevent-alexa-cortana-siri-google-assistant-and-bixby-from-recording-you/


 27%|███████████▍                              | 68/249 [03:31<11:21,  3.77s/it]Certificate did not match expected hostname: lionbridge.ai. Certificate: {'subject': ((('countryName', 'US'),), (('stateOrProvinceName', 'Massachusetts'),), (('localityName', 'Waltham'),), (('organizationName', 'Lionbridge Technologies, LLC'),), (('commonName', '*.lionbridge.com'),)), 'issuer': ((('countryName', 'US'),), (('organizationName', 'DigiCert Inc'),), (('commonName', 'DigiCert Global G2 TLS RSA SHA256 2020 CA1'),)), 'version': 3, 'serialNumber': '0C6F36D65B0D8DA95AF5457D07A78D06', 'notBefore': 'Oct 10 00:00:00 2023 GMT', 'notAfter': 'Oct 23 23:59:59 2024 GMT', 'subjectAltName': (('DNS', '*.lionbridge.com'), ('DNS', 'lionbridge.com')), 'OCSP': ('http://ocsp.digicert.com',), 'caIssuers': ('http://cacerts.digicert.com/DigiCertGlobalG2TLSRSASHA2562020CA1-1.crt',), 'crlDistributionPoints': ('http://crl3.digicert.com/DigiCertGlobalG2TLSRSASHA2562020CA1-1.crl', 'http://crl4.digicert.com/DigiCertGlobalG2T

Error fetching URL: HTTPSConnectionPool(host='lionbridge.ai', port=443): Max retries exceeded with url: /datasets/best-speech-recognition-datasets-for-machine-learning/ (Caused by SSLError(CertificateError("hostname 'lionbridge.ai' doesn't match either of '*.lionbridge.com', 'lionbridge.com'")))


 41%|████████████████▋                        | 101/249 [05:12<08:19,  3.37s/it]

Error fetching URL: 404 Client Error: Not Found for url: https://www.kormax.co.kr/en/sound-and-ultrasound/mechanism-for-generating-the-human-voice/?ckattempt=1


 46%|██████████████████▊                      | 114/249 [06:24<12:56,  5.75s/it]

Error fetching URL: HTTPSConnectionPool(host='text-to-speech-demo.ng.bluemix.net', port=443): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x16ac43010>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))


 48%|███████████████████▌                     | 119/249 [06:44<10:12,  4.71s/it]

Error fetching URL: 403 Client Error: Forbidden for url: https://www.voices.com/blog/accessible-technology/


 48%|███████████████████▊                     | 120/249 [06:45<07:33,  3.52s/it]

Error fetching URL: 403 Client Error: Forbidden for url: https://venturebeat.com/2019/05/21/amazons-ai-improves-emotion-detection-in-voices/


 49%|████████████████████▎                    | 123/249 [06:53<06:14,  2.97s/it]

Error fetching URL: 403 Client Error: Forbidden for url: https://www.researchgate.net/publication/335948444_Speech_Synthesis_Evaluation_-_State-of-the-Art_Assessment_and_Suggestion_for_a_Novel_Research_Program


 52%|█████████████████████▍                   | 130/249 [07:13<04:44,  2.39s/it]

Error fetching URL: 404 Client Error: Not Found for url: https://opendatahub.io/news/2019-09-04/sentiment-analysis-blog.html


 53%|█████████████████████▌                   | 131/249 [07:13<03:33,  1.81s/it]

Error fetching URL: 403 Client Error: Forbidden for url: https://www.voices.com/blog/text-to-speech-software-use-cases/


 59%|████████████████████████▏                | 147/249 [08:25<05:23,  3.17s/it]

Error fetching URL: 403 Client Error: Forbidden for url: https://www.explainthatstuff.com/how-speech-synthesis-works.html


 59%|████████████████████████▎                | 148/249 [08:25<03:58,  2.36s/it]

Error fetching URL: 403 Client Error: Forbidden for url: https://www.researchgate.net/figure/An-illustration-of-the-phoneme-mapping-procedure-used-for-DNN-training-a-The-speech_fig4_341084647


 60%|████████████████████████▌                | 149/249 [08:26<02:57,  1.78s/it]

Error fetching URL: 403 Client Error: Forbidden for url: https://www.researchgate.net/figure/Workflow-of-BERT-Sentiment-Analysis-detailing-Self-Attention_fig3_352081817


 63%|█████████████████████████▋               | 156/249 [09:11<05:44,  3.70s/it]

Error fetching URL: 403 Client Error: Forbidden for url: https://www.researchgate.net/figure/Concatenative-synthesis-of-an-activation-command-The-MFCC-feature-for-each-segment-in-a_fig5_319415640


 63%|██████████████████████████               | 158/249 [09:27<09:58,  6.57s/it]

Error fetching URL: HTTPSConnectionPool(host='wiki.inf.ed.ac.uk', port=443): Max retries exceeded with url: /twiki/pub/CSTR/Speak14To15/evaluation.pdf (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x16abaa9b0>, 'Connection to wiki.inf.ed.ac.uk timed out. (connect timeout=5)'))


 65%|██████████████████████████▋              | 162/249 [09:46<07:57,  5.49s/it]

Error fetching URL: 410 Client Error: Gone for url: https://medium.com/ibm-data-ai/ibm-watson-text-to-speech-neural-voices-added-to-service-e562106ff9c7


 66%|███████████████████████████              | 164/249 [09:51<05:20,  3.77s/it]

Error fetching URL: 403 Client Error: Forbidden for url: https://www.researchgate.net/figure/Waveforms-and-pitch-contours-for-examples-of-the-happy-left-and-sad-right-pitch_fig3_281403058


 68%|███████████████████████████▊             | 169/249 [10:11<05:00,  3.75s/it]

Error fetching URL: 403 Client Error: Forbidden for url: https://voicebot.ai/2018/05/14/google-home-beats-amazon-echo-in-two-audio-recognition-performance-tests-but-alexa-delivers-highest-composite-score/


 73%|█████████████████████████████▉           | 182/249 [10:49<02:22,  2.13s/it]

Error fetching URL: 406 Client Error: Not Acceptable for url: https://www.aclweb.org/anthology/W18-6529.pdf


 74%|██████████████████████████████▎          | 184/249 [10:53<02:13,  2.05s/it]

Error fetching URL: 406 Client Error: Not Acceptable for url: https://www.aclweb.org/anthology/venues/inlg/


 75%|██████████████████████████████▋          | 186/249 [10:54<01:18,  1.25s/it]

Error fetching URL: 403 Client Error: Forbidden for url: https://www.datacamp.com/tutorial


 75%|██████████████████████████████▊          | 187/249 [10:56<01:21,  1.31s/it]

Error fetching URL: 404 Client Error: Not Found for url: https://sites.google.com/site/hwinteractionlab//E2E/


 78%|███████████████████████████████▊         | 193/249 [11:14<02:35,  2.77s/it]

Error fetching URL: HTTPSConnectionPool(host='webnlg-challenge.loria.fr', port=443): Max retries exceeded with url: /challenge_2017/ (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: certificate has expired (_ssl.c:1007)')))


 80%|████████████████████████████████▌        | 198/249 [11:24<01:37,  1.92s/it]

Error fetching URL: HTTPSConnectionPool(host='blog.sdl.com', port=443): Max retries exceeded with url: /blog/understanding-mt-quality-bleu-scores.html (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x16a50d750>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))


 82%|█████████████████████████████████▊       | 205/249 [11:41<01:49,  2.50s/it]

Error fetching URL: 403 Client Error: Forbidden for url: https://www.chatbotpack.com/design-conversational-interfaces


 89%|████████████████████████████████████▌    | 222/249 [12:35<01:21,  3.03s/it]

Error fetching URL: 403 Client Error: Forbidden for url: https://machinelearningmastery.com/what-is-deep-learning/


 91%|█████████████████████████████████████▏   | 226/249 [12:46<00:52,  2.30s/it]

Error fetching URL: HTTPSConnectionPool(host='www.nrybbs.top', port=443): Max retries exceeded with url: /products.aspx?cname=reinforcement+learning+machine+learning&cid=95 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x169b6c430>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))


 94%|██████████████████████████████████████▎  | 233/249 [13:05<00:41,  2.57s/it]

Error fetching URL: 403 Client Error: Forbidden for url: https://www.researchgate.net/figure/Illustration-of-the-BERT-model-for-joint-intent-detection-and-slot-filling_fig3_359661835


 96%|███████████████████████████████████████▎ | 239/249 [13:15<00:14,  1.49s/it]

Error fetching URL: 403 Client Error: Forbidden for url: https://machinelearningmastery.com/a-gentle-introduction-to-positional-encoding-in-transformer-models-part-1/


 99%|████████████████████████████████████████▋| 247/249 [13:36<00:03,  1.99s/it]

Error fetching URL: 403 Client Error: Forbidden for url: https://deepchecks.com/5-approaches-to-solve-llm-token-limits/


100%|█████████████████████████████████████████| 249/249 [13:40<00:00,  3.30s/it]


### Extract textual data from URLs pointing to PDF files.

In [17]:
pdf_urls

['https://arxiv.org/pdf/2205.06573.pdf',
 'https://arxiv.org/pdf/2106.07447.pdf',
 'https://www.speech.kth.se/~rolf/NGSLT/presentations/ASV.pdf',
 'https://s3.amazonaws.com/assets.datacamp.com/production/course_3631/slides/chapter2.pdf',
 'https://arxiv.org/pdf/2106.07447.pdf',
 'https://cs.stanford.edu/~acoates/ba_dls_speech2016.pdf',
 'https://people.kth.se/~ghe/pubs/pdf/wagner2019speech.pdf',
 'https://nld.ict.usc.edu/cs644-spring2020/discussions/novikova-etal-emnlp2017.pdf']

In [18]:
# Extracts and returns text from URLs pointing towards PDF files. (Most likely research articles or reviews)
def extract_text_from_pdf_url(pdf_url):

    extracted_text = " "
    
    try:
        response = requests.get(pdf_url, timeout=5)
        pdf_file = open('temp.pdf', 'wb')  # Save PDF temporarily
        pdf_file.write(response.content)
        pdf_file.close()

        pdf_reader = PyPDF2.PdfReader('temp.pdf')
        num_pages = len(pdf_reader.pages)

        for page_num in range(num_pages):
            page = pdf_reader.pages[page_num]
            text = page.extract_text()
            extracted_text += text
            
    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL: {e}")
        forbidden_urls.append(url)
        extracted_text+= " "

    except requests.exceptions.Timeout as e:
        print(f"Error fetching URL: {e}")
        forbidden_urls.append(url)
        extracted_text+= " "
    
    finally:
        if os.path.exists('temp.pdf'):  # Check if the file exists before deleting
            os.remove('temp.pdf')
    
    return extracted_text

In [19]:
PDF_url_dataframe = pd.DataFrame(columns=['text', 'text_length','gram_score'])

for url in tqdm(pdf_urls):
    
    text = extract_text_from_pdf_url(url)
    text = clean_text(text)
    
    chunks = chunk_text(text)
    
    for t in chunks:
        row = [t, len(t.split()) ,grammar_check(t)]
        PDF_url_dataframe.loc[len(PDF_url_dataframe)] = row
        
        
PDF_url_dataframe.to_csv("PDF_urls_text_300.csv", index= False, quoting=csv.QUOTE_NONE, escapechar=' ')        

100%|█████████████████████████████████████████████| 8/8 [01:59<00:00, 14.88s/it]


### Additional Data Extraction

#### 1. Extract textual data from Wikipedia articles corresponding to keywords of the course

In [20]:
# Manually gathered keywords that have an article related to it in correlation with Speech Processing
# This process can be further automated using keyword extraction.

queries = ['Virtual Assistant', 'Natural language generation' ,'Keyword spotting', 'Speech Recognition', 
           'Question answering', 'Speech Processing', 'Speech Synthesis', 'Voice activity detection',
           'utterance', 'paraphrase', 'Prosody (linguistics)' , 'Connectionist temporal classification', 
           'Language model', 'representation learning', 'word embedding', 'large language model',
          'Transformer (deep learning architecture)', 'Voice Analysis']

In [21]:
def get_wikipedia_content(search_query):

    try:
        result = wikipedia.search(search_query)[0]

        # Fetch the full article content
        page = wikipedia.page(result)
        return page.content

    except wikipedia.exceptions.DisambiguationError as e:
        print(f"Multiple results found: {e.options}")
        return None
    except wikipedia.exceptions.PageError:
        print(f"Page not found: {search_query}")
        return None

In [22]:
texts_keywords_wiki = {}

for keyword in queries:
    
    content = get_wikipedia_content(keyword)
    if content:
        texts_keywords_wiki[keyword] = clean_text(content)
    else:
        print("Wikipedia article not found or an error occurred.")

In [23]:

wiki_dataframe = pd.DataFrame(columns=['text', 'text_length','gram_score'])

for KW, content in tqdm(texts_keywords_wiki.items()):
    
    chunks = chunk_text(content)
    
    for t in chunks:
        row = [t, len(t.split()) ,grammar_check(t)]
        wiki_dataframe.loc[len(wiki_dataframe)] = row
        
        
wiki_dataframe.to_csv("Wiki_text_300.csv", index=False)

100%|███████████████████████████████████████████| 18/18 [01:01<00:00,  3.44s/it]


##### 2. Extract text from relevant chapters of the book, "Speech and Language Processing." by Daniel Jurafsky & James H. Martin.

In [24]:
book_folder = "./Book Chapters/"
chapters_pdf_files = [f for f in os.listdir(book_folder) if f.endswith('.pdf')]

In [25]:
results = {}

for file_name in chapters_pdf_files:
    
    file_path = os.path.join(book_folder, file_name)
    try:
        text_data = extract_pdf_text(file_path)
        combined_text = " ".join(text_data)
        results[file_name] = combined_text
    except PyPDF2.errors.PdfReadError:
        print(f"Error processing file (pdferror): {file_name}")

In [26]:
books_DF = pd.DataFrame(columns=['text', 'text_length','gram_score'])


for texts in results.values():
    chunks = chunk_text(texts)
    
    for t in chunks:
        t = clean_text(t)
        row = [t, len(t.split()) ,grammar_check(t)]
        books_DF.loc[len(books_DF)] = row
        

##### 3. Using the online book provided by Aalto Univeristy titled, "Introduction to Speech Processing"

In [27]:
extra_urls = ['https://speechprocessingbook.aalto.fi/Recognition_tasks_in_speech_processing.html',
              'https://speechprocessingbook.aalto.fi/Recognition/Voice_activity_detection.html',
             'https://speechprocessingbook.aalto.fi/Recognition/Speech_Recognition.html',
             'https://speechprocessingbook.aalto.fi/Recognition/Speaker_Recognition_and_Verification.html',
             'https://speechprocessingbook.aalto.fi/Recognition/Speaker_Diarization.html',
             'https://speechprocessingbook.aalto.fi/Recognition/Paralinguistic_speech_processing.html',
             'https://speechprocessingbook.aalto.fi/Speech_Synthesis.html',
             'https://speechprocessingbook.aalto.fi/Synthesis/Concatenative_speech_synthesis.html',
             'https://speechprocessingbook.aalto.fi/Synthesis/Statistical_parametric_speech_synthesis.html',
             'https://speechprocessingbook.aalto.fi/Analysis/Forensic_speaker_recognition.html']

In [28]:
# GET TEXT FROM URLS
extra_url_results = {}
extra_url_dataframe = pd.DataFrame(columns=['text', 'text_length','gram_score'])

for url in tqdm(extra_urls):
    
    article_texts = extract_text_from_url(url) # Input: URL; Output: Text retrieved from the link 
    chunks = chunk_text(article_texts)
    
    for t in chunks:
        row = [t, len(t.split()) ,grammar_check(t)]
        extra_url_dataframe.loc[len(extra_url_dataframe)] = row
        

100%|███████████████████████████████████████████| 10/10 [00:53<00:00,  5.37s/it]


In [29]:
new_DF = pd.concat([books_DF, extra_url_dataframe], ignore_index=True)
new_DF.to_csv("Book_text_300.csv", index=False, quoting=csv.QUOTE_NONE, escapechar=' ')

In [34]:
wiki_dataframe.iloc[0,:]

text           a virtual assistant va is a software agent tha...
text_length                                                  300
gram_score                                                    20
Name: 0, dtype: object