## eplore.ipynb

This notebook contains example code for transformations you may want to apply to the article data, as well as some feature engineering you can do to derive additional information from it.

In [None]:
import pandas as pd
# Load the article dataset from the parquet file in the working directory.
df = pd.read_parquet('articles.parquet')
# Show the loaded frame as the cell output.
df

In [None]:
# Pull the /YYYY/MM/DD/ segment out of each URL; the three capture groups
# become the new year, month and day columns (still strings at this point).
date_parts = ['year', 'month', 'day']
date_pattern = r'/(\d{4})/(\d{2})/(\d{2})/'
df[date_parts] = df['URL'].str.extract(date_pattern)

# Cast each extracted component to an integer column, one at a time.
for part in date_parts:
    df[part] = df[part].astype(int)

df

In [None]:
# Collapse each row's generated text fragments into one space-separated string.
# Rows that already hold a single string are left untouched: this notebook saves
# the frame back to the same parquet file it loads, so on a re-run the column is
# already joined and ' '.join would put a space between every character.
df['generated_content'] = df['generated_content'].apply(
    lambda x: x if isinstance(x, str) else ' '.join(x)
)
df

## Rerun all of the web scraping with updated parameters to get the proper text

In [None]:
import requests
from bs4 import BeautifulSoup, Comment, Tag
def scrape_article(url, timeout=10):
    """Fetch an article page and return its title and the first 200 words of its body.

    The headline is read from the ``<h1>`` inside ``<div class="storytitle">``
    and the body from the ``<p>`` tags inside ``<div id="storytext">``.
    ``<div class="credit-caption">`` elements are removed from the body before
    the paragraph text is collected, and HTML comments are skipped.

    Args:
        url: Address of the article page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single stalled connection would block the caller
            indefinitely.

    Returns:
        A ``(title, text)`` tuple, where ``text`` is at most the first 200
        whitespace-separated words of the body. ``(None, None)`` is returned
        when the page cannot be fetched or parsed; the reason is printed.
    """
    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            print(f"Failed to fetch URL: {url}, Status code: {response.status_code}")
            return None, None

        soup = BeautifulSoup(response.content, 'html.parser')

        # Locate the title and body containers, and report missing markup
        # explicitly instead of relying on an AttributeError from None.
        title_container = soup.find("div", class_="storytitle")
        heading = title_container.find("h1") if title_container is not None else None
        article_body = soup.find("div", id="storytext")
        if heading is None or article_body is None:
            print(f"Error occurred while scraping URL: {url}, Error: expected title/body markup not found")
            return None, None
        article_title = heading.text.strip()

        # Exclude divs with class 'credit-caption'
        for div in article_body.find_all("div", class_="credit-caption"):
            div.extract()

        # Collect the text of every paragraph: bare strings are stripped,
        # nested tags contribute their text preceded by a space, and HTML
        # comments are ignored.
        paragraph_texts = []
        for p in article_body.find_all("p"):
            pieces = [
                child.strip() if isinstance(child, str) else ' ' + child.text.strip()
                for child in p.contents
                if not isinstance(child, Comment)
            ]
            paragraph_texts.append(''.join(pieces).strip())

        # Keep only the first 200 words of the article
        words = ' '.join(paragraph_texts).split()
        return article_title, ' '.join(words[:200])
    except Exception as e:
        # Best-effort scraping: one bad page must not abort a whole batch run,
        # so the failure is reported and the caller gets (None, None).
        print(f"Error occurred while scraping URL: {url}, Error: {e}")
    return None, None

# Scrape every URL and keep only the article text (element 1 of the tuple
# returned by scrape_article); the title is discarded.
df['Content'] = df['URL'].apply(lambda x:scrape_article(x)[1])


In [None]:
# Character and word counts for the scraped and the generated text.
# scrape_article returns None for pages that could not be fetched or parsed,
# so the .str accessor is used here: it propagates missing values as NaN
# instead of raising the way len(None) / None.split() would.
df['content_len'] = df['Content'].str.len()
df['content_word'] = df['Content'].str.split().str.len()
df['generated_content_len'] = df['generated_content'].str.len()
df['generated_content_word'] = df['generated_content'].str.split().str.len()
df

## Compare real vs generated

In [None]:
# Print the scraped text, the generated text and the source URL of the row
# labelled 0, in that order, for a side-by-side look.
for column in ('Content', 'generated_content', 'URL'):
    print(df[column][0])

## Save the Updated df

In [None]:
# Write the updated frame to articles.parquet in the working directory.
# NOTE: this is the same path the notebook loads at the top, so the original
# file is overwritten.
df.to_parquet('./articles.parquet')

In [None]:
# Read the file back into a separate frame to inspect what was written.
check_df = pd.read_parquet('./articles.parquet')
# Show the reloaded frame as the cell output.
check_df