<center>
<img src="https://laelgelcpublic.s3.sa-east-1.amazonaws.com/lael_50_years_narrow_white.png.no_years.400px_96dpi.png" width="300" alt="LAEL 50 years logo">
<h3>APPLIED LINGUISTICS GRADUATE PROGRAMME (LAEL)</h3>
</center>
<hr>

# Corpus Linguistics - Study 2 - Phase 1_1 - eyamrog

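The aim of this phase is to check whether the full text of the articles listed in Phase 1 can be scraped from their web pages, using the *Annual Review of Plant Biology* as a test case.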

## Required Python packages

- beautifulsoup4
- lxml
- pandas
- requests
- selenium
- tqdm

## Importing the required libraries

In [None]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import os
import sys
from tqdm import tqdm
from selenium import webdriver
from selenium.webdriver.edge.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

## Defining input variables

In [None]:
# Directory the input JSONL file is read from (relative to the working directory) - presumably the output of Phase 1
input_directory = 'cl_st2_ph1_eyamrog'
# Directory the files produced by this notebook are written to (created further below if it is missing)
output_directory = 'cl_st2_ph11_eyamrog'

## Creating output directory

In [None]:
# Make sure the output directory exists before any file is written to it.
# 'os.path.isdir' is used instead of 'os.path.exists' so that a regular file occupying the same path
# is not mistaken for the directory; in that case 'os.makedirs' raises an OSError, which is reported below.
if os.path.isdir(output_directory):
    print('Output directory already exists.')
else:
    try:
        # 'exist_ok=True' tolerates the directory being created by another process between the check and this call
        os.makedirs(output_directory, exist_ok=True)
        print('Output directory successfully created.')
    except OSError as e:
        print('Failed to create the directory:', e)
        # Stop here: every later cell that writes into the output directory would fail anyway
        sys.exit(1)

## Web Scraping [Annual Review of Plant Biology](https://www.annualreviews.org/content/journals/arplant)

### Importing the data into a DataFrame

In [None]:
# Reading the JSON Lines file (one JSON record per line) found in the input directory
ar_plant_biology_jsonl = f'{input_directory}/ar_plant_biology.jsonl'
df_ar_plant_biology = pd.read_json(ar_plant_biology_jsonl, lines=True)

In [None]:
# Listing the column labels of the imported DataFrame
df_ar_plant_biology.columns

In [None]:
# Listing the distinct values of 'Vol/Year/Page Range' to see the text layout the year is extracted from
df_ar_plant_biology['Vol/Year/Page Range'].unique()

#### Extracting the `Posted` dates from the column `Vol/Year/Page Range`

In [None]:
# Capturing the four-digit year written between parentheses after the 'Vol' prefix and a line break.
# 'expand=False' returns the single capture group as a Series; rows that do not match the pattern yield NaN.
# NOTE(review): the '.' after 'Vol' is not escaped, so it matches any character, not only a literal dot.
year_pattern = r'^Vol. .+ \n\((\d{4})\).+'
vol_year_page_range = df_ar_plant_biology['Vol/Year/Page Range']
df_ar_plant_biology['Posted'] = vol_year_page_range.str.extract(year_pattern, expand=False)

In [None]:
# Converting the extracted four-digit years into datetimes.
# The format is stated explicitly so that the parsing does not depend on pandas guessing it from the data;
# each year becomes January 1st of that year and missing values become NaT.
df_ar_plant_biology['Posted'] = pd.to_datetime(df_ar_plant_biology['Posted'], format='%Y')

In [None]:
# Checking the dtype of each column, in particular 'Posted' after its conversion to datetime
df_ar_plant_biology.dtypes

In [None]:
# Displaying the DataFrame to inspect the values of the new 'Posted' column
df_ar_plant_biology

#### Creating the column `Text ID`

In [None]:
# Building one identifier per row: the prefix 'ar_plant_biology' followed by the row label left-padded with zeros to 6 characters
zero_padded_labels = df_ar_plant_biology.index.astype(str).str.zfill(6)
df_ar_plant_biology['Text ID'] = 'ar_plant_biology' + zero_padded_labels

#### Inspecting a few samples

In [None]:
# Taking the URL of the row labelled 87 as the sample article ('.at' raises a KeyError if that label is not in the index)
# NOTE(review): 87 looks like an arbitrary pick - confirm, or change it to inspect another article
url_sample = df_ar_plant_biology.at[87, 'URL']

In [None]:
# Setting up the WebDriver (make sure you have downloaded the Microsoft Edge WebDriver executable)
# https://developer.microsoft.com/en-us/microsoft-edge/tools/webdriver/
# NOTE(review): the path below is specific to one machine - adjust it to where 'msedgedriver.exe' is installed
service = Service(r'C:\Users\eyamr\OneDrive\Documentos\0-Technology\laelgelc\edgedriver_win64\msedgedriver.exe')
driver = webdriver.Edge(service=service)

try:
    # Navigating to the sample URL and capturing its rendered web page
    driver.get(url_sample)
    # Waiting up to 10 seconds for the element with ID 'google_esf' to be present,
    # which is used here as the signal that the page has finished loading
    wait = WebDriverWait(driver, 10)
    wait.until(EC.presence_of_element_located((By.ID, 'google_esf')))
    document_page_sample = driver.page_source
finally:
    # Closing the WebDriver even if the navigation or the wait fails, so that no browser process is left behind
    driver.quit()

# Saving the captured page for offline inspection
with open(f'{output_directory}/ar_plant_biology_sample.html', 'w', encoding='utf8', newline='\n') as file:
    file.write(document_page_sample)

#### Scraping the paragraphs of the articles into TXT format

In [None]:
# Scraping the paragraphs of each article listed in the column 'URL' and saving them as '<Text ID>.txt'
# in the output directory.
# A single WebDriver session is shared by all the articles: launching a new browser for every row is slow
# and does not change what is scraped.
# NOTE(review): the path below is specific to one machine - adjust it to where 'msedgedriver.exe' is installed
service = Service(r'C:\Users\eyamr\OneDrive\Documentos\0-Technology\laelgelc\edgedriver_win64\msedgedriver.exe')
driver = webdriver.Edge(service=service)

try:
    # The same wait object (10-second timeout) is reused for every page
    wait = WebDriverWait(driver, 10)

    # 'tqdm' shows a progress bar; 'total' is required because 'iterrows()' is a generator without a length
    for _, row in tqdm(df_ar_plant_biology.iterrows(), total=len(df_ar_plant_biology), desc='Scraping articles'):
        url = row['URL']
        text_id = row['Text ID']

        driver.get(url)

        # Waiting for the element with ID 'google_esf' to be present,
        # which is used here as the signal that the page has finished loading
        wait.until(EC.presence_of_element_located((By.ID, 'google_esf')))

        # Parsing the rendered page and collecting every <p> element
        soup = BeautifulSoup(driver.page_source, 'lxml')
        paragraphs = soup.find_all('p')

        # Extracting the text of the paragraphs, one paragraph per line
        article_content = '\n'.join(p.get_text(strip=True) for p in paragraphs)

        # Saving each article's content to a text file
        with open(f"{output_directory}/{text_id}.txt", 'w', encoding='utf-8') as file:
            file.write(article_content)
finally:
    # Closing the WebDriver even if an article fails (e.g. the wait times out),
    # so that no browser process is left behind
    driver.quit()