# Getting cleaned text from the following Yale website URLs:
        urls = ['https://medicine.yale.edu/intmed/cardio/news-article/integrating-chagas-disease-screening-into-primary-health-care/', 
    'https://library.yale.edu/news/library-staff-supports-young-historians-connecticut-history-day-regional-contests',
    'https://news.yale.edu/2023/04/10/dr-jason-fish-named-next-ceo-yale-health',
    'https://ysph.yale.edu/news-article/for-hurricane-katrina-survivors-covid-19-brings-a-new-mental-health-toll/',
    'https://egc.yale.edu/events/podcasts/voices-development-episode-5']
    
## Since a page on the Yale website contains only around 40 sentences, I chose 5 different pages to prepare the dataset, which brings the total to 128+ sentence samples.

In [1]:
# This cell contains the functions that take in the URLs, extract the paragraphs from
# the HTML, and then convert them into cleaned sentences.
import requests
from bs4 import BeautifulSoup
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tag import pos_tag
import re

# Extracts the paragraphs from each URL and joins all the paragraphs into one string with the separator '$'
def ready_to_use_text(urls, timeout=30):
    """Download each page in ``urls`` and return its cleaned sentences.

    The text of every ``<p>`` element on every page is collected, the
    paragraphs are joined with ``'$'`` as a separator, and the joined string
    is handed to ``filter_on_pos``.

    Args:
        urls: Iterable of page URLs to fetch.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        Whatever ``filter_on_pos`` returns for the joined paragraph text.

    Raises:
        requests.RequestException: If a page cannot be fetched, times out,
            or answers with an HTTP error status.
    """
    all_paras = []
    for url in urls:
        # Without a timeout a stalled server would hang the scrape forever.
        response = requests.get(url, timeout=timeout)
        # Fail loudly instead of scraping an error page (404, 500, ...) into
        # the dataset.
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')
        all_paras.extend(p.text for p in soup.find_all('p'))

    separator = '$'
    return filter_on_pos(separator.join(all_paras))

# Filters the text by part of speech.
# Assumption: a sentence that contains no verb is not a desired sentence.
def filter_on_pos(text):
    """Split ``text`` into sentences and keep the well-formed ones with a verb.

    Args:
        text: Paragraph texts joined with ``'$'`` as the separator.

    Returns:
        List of sentences, in their original order, that satisfy all of:
          * at least one token has a POS tag starting with ``'VB'``;
          * the sentence starts with an uppercase ASCII letter followed by a
            lowercase one;
          * the sentence contains a ``'.'``.
        ``'\\ufeff'`` is removed and ``'\\xa0'`` is replaced by a plain space
        in every returned sentence.
    """
    sentence_start = re.compile(r'^[A-Z][a-z]')
    clean_sentences = []

    # Split on the paragraph separator *before* sentence tokenization.
    # Tokenizing the joined string lets one "sentence" span a separator; the
    # verb check would then be applied to the merged span, and a verb-less
    # fragment could slip through once the span was split apart again.
    # NOTE: a literal '$' in the page text is also treated as a separator.
    for paragraph in text.split('$'):
        for sentence in sent_tokenize(paragraph):
            tagged_words = pos_tag(word_tokenize(sentence))
            if not any(tag.startswith('VB') for _, tag in tagged_words):
                continue

            # Drop byte-order marks and normalise non-breaking spaces.
            sentence = sentence.replace('\ufeff', '').replace('\xa0', ' ')
            if sentence_start.search(sentence) and '.' in sentence:
                clean_sentences.append(sentence)

    return clean_sentences

In [2]:
# The five yale.edu pages to scrape.
urls = ['https://medicine.yale.edu/intmed/cardio/news-article/integrating-chagas-disease-screening-into-primary-health-care/', 
    'https://library.yale.edu/news/library-staff-supports-young-historians-connecticut-history-day-regional-contests',
    'https://news.yale.edu/2023/04/10/dr-jason-fish-named-next-ceo-yale-health',
    'https://ysph.yale.edu/news-article/for-hurricane-katrina-survivors-covid-19-brings-a-new-mental-health-toll/',
    'https://egc.yale.edu/events/podcasts/voices-development-episode-5']
# Fetch every page and keep the cleaned sentences (this performs network requests).
yale_text = ready_to_use_text(urls)

In [3]:
# Preview the first four cleaned sentences.
print(f'Few samples of text: {yale_text[:4]}')

Few samples of text: ['April 14 is World Chagas Disease Day.', 'Since its discovery in 1909 by Brazilian researcher Carlos Ribeiro Justiniano Chagas, a Chagas disease epidemic has spread from the rural areas of Latin America to the United States.', 'When untreated, patients are at risk for disorders including arrythmias, dilated cardiomyopathy, or heart failure, so raising awareness among healthcare providers is a priority.', 'Bernardo Lombo, MD, is an assistant professor of clinical medicine at the Yale School of Medicine and an expert in Chagas disease at Yale.']


# English text is translated into Spanish using Google Translate (via the googletrans package)
### The translation of yale_text is done in this cell

In [4]:
from googletrans import Translator
import time

def get_translations(yale_text, dest='es', translator=None, max_retries=None):
    """Translate each sentence in ``yale_text`` and return the translated texts.

    A failed request is retried for the same sentence, so the returned list
    lines up index-for-index with ``yale_text``.

    Args:
        yale_text: Iterable of sentences to translate.
        dest: Target language code passed to the translator (``'es'`` by
            default).
        translator: Object with a ``translate(sentence, dest=...)`` method
            whose result has a ``.text`` attribute. A new ``Translator()`` is
            created when omitted.
        max_retries: Maximum number of retries per sentence after the first
            attempt. ``None`` (the default) keeps retrying indefinitely.

    Returns:
        List of translated strings, one per input sentence, in input order.

    Raises:
        RuntimeError: If a sentence still fails after ``max_retries`` retries.
    """
    if translator is None:
        translator = Translator()
    translations = []

    for sentence in yale_text:
        failures = 0
        while True:
            try:
                translated = translator.translate(sentence, dest=dest).text
            except AttributeError as exc:
                # Retried quietly after a short pause.
                error, delay = exc, 1
            except Exception as exc:
                # Any other failure is reported and retried after a longer pause.
                print(exc)
                error, delay = exc, 10
            else:
                translations.append(translated)
                break

            failures += 1
            if max_retries is not None and failures > max_retries:
                raise RuntimeError(
                    f'Translation failed after {failures} attempt(s): {sentence!r}'
                ) from error
            time.sleep(delay)

    return translations
# Translate the cleaned sentences held in yale_text.
translations = get_translations(yale_text)

the JSON object must be str, bytes or bytearray, not NoneType
the JSON object must be str, bytes or bytearray, not NoneType
the JSON object must be str, bytes or bytearray, not NoneType
the JSON object must be str, bytes or bytearray, not NoneType
the JSON object must be str, bytes or bytearray, not NoneType
the JSON object must be str, bytes or bytearray, not NoneType
the JSON object must be str, bytes or bytearray, not NoneType
the JSON object must be str, bytes or bytearray, not NoneType


In [8]:
# Preview the translations at indices 125 through 129.
print(f'Few samples of translated text: {translations[125:130]}')

Few samples of translated text: ['Las reuniones de primavera 2023 de esta semana del Grupo del Banco Mundial y el FMI con un enfoque en "remodelar el desarrollo para una nueva era" son un recordatorio importante de que el cambio sistémico no ocurre de la noche a la mañana.', 'El desarrollo económico fuerte requiere una infraestructura adecuada y el acceso individual a los mercados y oportunidades.', 'Sin embargo, muchas comunidades enfrentan barreras sociales, ambientales y económicas que obstaculizan el desarrollo.', 'Los paisajes duros y los entornos remotos pueden limitar el acceso a los mercados, y las crisis como la pandemia Covid-19, la inseguridad alimentaria y el cambio climático pueden retrasar ganancias ganadas con tanto esfuerzo en el desarrollo.', 'En este episodio de Voices in Development, Kevin Donovan, profesor asistente de economía y asuntos globales y afiliados de EGC, discute su diverso cuerpo de trabajo reciente desempacando las condiciones y herramientas necesarias 

# Writing the base text and translated text to text files for future use

In [19]:
# Write one sentence per line. Line breaks inside a sentence are flattened to
# spaces so that every sentence occupies exactly one line; otherwise a reader
# that pairs the two text files line by line would fall out of step.
with open('yale_base_text.txt', 'w') as f:
    f.writelines(' '.join(item.splitlines()) + '\n' for item in yale_text)


In [20]:
# Write one translation per line. Line breaks inside a translation are
# flattened to spaces so that every translation occupies exactly one line;
# otherwise a reader that pairs the two text files line by line would fall
# out of step.
with open('yale_translated_text.txt', 'w') as f:
    f.writelines(' '.join(item.splitlines()) + '\n' for item in translations)


# CustomDataset class definition:

In [53]:
import torch
import random
from torch.utils.data import Dataset, DataLoader

class CustomDataset(Dataset):
    """Sentence-pair dataset backed by two line-aligned text files.

    Line ``i`` of the base file is paired with line ``i`` of the translated
    file. Both files are read fully into memory on construction.
    """

    def __init__(self, base_text_file, translated_text_file):
        """Load both files.

        Args:
            base_text_file: Path to the file with one source sentence per line.
            translated_text_file: Path to the file with one translated
                sentence per line.

        Raises:
            ValueError: If the two files do not contain the same number of
                lines, since the pairs could then not be matched by index.
        """
        self.base_text = self._read_lines(base_text_file)
        self.translated_text = self._read_lines(translated_text_file)

        if len(self.base_text) != len(self.translated_text):
            raise ValueError(
                f'Line count mismatch: {base_text_file!r} has '
                f'{len(self.base_text)} lines but {translated_text_file!r} '
                f'has {len(self.translated_text)}'
            )

    @staticmethod
    def _read_lines(path):
        """Return the lines of ``path`` without their trailing newline."""
        with open(path, 'r') as f:
            # Strip only the line terminator so samples do not end in '\n'.
            return [line.rstrip('\n') for line in f]

    def __len__(self):
        """Return the number of sentence pairs."""
        return len(self.base_text)

    def __getitem__(self, idx):
        """Return the ``(base, translated)`` sentence pair at ``idx``."""
        return self.base_text[idx], self.translated_text[idx]

# Load the sentence pairs from the two text files written by the earlier cells.
dataset = CustomDataset('yale_base_text.txt', 'yale_translated_text.txt')



# Split the dataset into training, validation, and test sets (80% / 10% / rest).
# The test size is computed as the remainder so the three sizes always sum to
# len(dataset) even after the int() truncation above it.
train_size = int(0.8 * len(dataset))
val_size = int(0.1 * len(dataset))
test_size = len(dataset) - train_size - val_size
# NOTE(review): no generator/seed is passed here, so the split presumably
# differs between runs unless a seed is set elsewhere — confirm.
train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, val_size, test_size])

# Create data loaders
# NOTE(review): the validation and test loaders also shuffle — confirm that is intended.
batch_size = 8
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

# Show the first sample of each minibatch of size 8
# (the base sentence followed by its translation; batch numbers are printed 1-based).
for i, (base_text_batch, translated_text_batch) in enumerate(train_loader):
        print(f'The first sample in batch {i+1}')
        print('==================================')
        print(base_text_batch[0])
        print(translated_text_batch[0])

The first sample in batch 1
There are researchers in the U.S. tackling these kinds of questions in regard to Chagas disease and triatomine bugs, but it's an area that can use more attention as our climate continues to be altered by human activities.

Hay investigadores en los Estados Unidos que abordan este tipo de preguntas con respecto a la enfermedad de Chagas y los insectos triatominos, pero es un área que puede usar más atención a medida que nuestro clima continúa siendo alterado por las actividades humanas.

The first sample in batch 2
People emigrate and travel, and someone may move from a country like Bolivia, where screening programs for Chagas disease are relatively robust, to a country like the US, where almost no health care providers were taught to even consider screening their at-risk patients.

Las personas emigran y viajan, y alguien puede mudarse de un país como Bolivia, donde los programas de detección de la enfermedad de Chagas son relativamente robustos, a un país c