# Imports and Setting Up

In [None]:
import json
import csv
import pandas as pd
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor

# Location of the source dataset file (ncd.json) on the local machine.
ncd_file = '/Users/dianakazarian/Desktop/ClassifierData/ncd.json'

In [None]:
# Parse the file as line-delimited JSON (one record per line).
data = pd.read_json(path_or_buf=ncd_file, lines=True)

# Original Data from HuffPost (News Category Dataset, NCD)

The dataset already provides everything we need except the body text of each article, which is scraped from the article links below.

In [None]:
# Preview the first five records.
data.head(n=5)

In [None]:
# (number of rows, number of columns) of the loaded dataset.
data.shape

# Web Scraping Functions

In [None]:
def scrape_article_text(url, timeout=10):
    """Download an article page and return its body text.

    Args:
        url: Address of the article page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely, which would
            stall the calling thread for the rest of the run.

    Returns:
        The text of every ``<div class="primary-cli cli cli-text">`` element,
        one per line, with surrounding whitespace stripped. An empty string
        when the page contains no such element. ``None`` when the request or
        the parsing fails (the error is printed, not raised).
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Treat 4xx/5xx answers as failures instead of parsing an error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        # A multi-word class_ string is matched against the exact value of
        # the class attribute, so only divs with precisely this class list
        # are collected.
        paragraphs = soup.find_all('div', class_='primary-cli cli cli-text')
        return '\n'.join(p.get_text() for p in paragraphs).strip()
    except Exception as e:
        # Best-effort scraping: one bad page must not abort a long batch run.
        print("Error occurred while scraping:", e)
        return None

In [None]:
# Shared thread pool used to run the scraping requests concurrently.
MAX_SCRAPE_WORKERS = 10
executor = ThreadPoolExecutor(max_workers=MAX_SCRAPE_WORKERS)

In [None]:
def scrape_single_url(row):
    """Scrape the article behind ``row.link`` and pair the text with its link.

    Args:
        row: Any object exposing a ``link`` attribute (for example a tuple
            produced by ``DataFrame.itertuples()``).

    Returns:
        A dict with the keys ``'link'`` and ``'article_text'``. The text is
        ``None`` when scraping raised an exception (the error is printed).
    """
    link = row.link
    body = None
    try:
        body = scrape_article_text(link)
    except Exception as err:
        print("Error occurred while scraping:", err)
    return {'link': link, 'article_text': body}

In [None]:
# Work on an independent copy so that adding a column does not write into a
# slice of `data` (avoids pandas' SettingWithCopyWarning).
subset_data = data.iloc[:11842].copy()

# Scrape each link once, concurrently. Executor.map yields results in the
# same order as its input, so results_list lines up row-for-row with
# subset_data.
results = executor.map(scrape_single_url, subset_data.itertuples())

# Convert the results to a list of dictionaries and create new df
results_list = list(results)
result_df = pd.DataFrame(results_list, columns=['link', 'article_text'])

# Reuse the texts that were just scraped instead of fetching every link a
# second time, sequentially, through .apply(scrape_article_text).
subset_data['article_text'] = [r['article_text'] for r in results_list]

# Merge result with original. Both frames carry 'article_text', so the merged
# frame exposes them as 'article_text_x' (from subset_data) and
# 'article_text_y' (from result_df). Duplicate links are dropped from the
# right-hand side first so a repeated link cannot multiply rows in the join.
df = pd.merge(
    subset_data,
    result_df.drop_duplicates(subset='link'),
    on='link',
    how='left',
)

In [None]:
# Scrape every row of `data` (not only the subset) on the shared thread pool.
# NOTE(review): in the code visible here, the resulting result_df is not
# merged into df afterwards — confirm whether this full run is still needed.
results = executor.map(scrape_single_url, data.itertuples())
results_list = [*results]  # drain the iterator; waits for all tasks
result_df = pd.DataFrame(data=results_list)

In [None]:
# Keep only the article metadata and the scraped text column.
df = df.loc[
    :,
    ['link', 'headline', 'category', 'short_description', 'authors', 'date', 'article_text_x'],
]

# A Sample Article

In [None]:
# Show the scraped text of the row labelled 1024.
df.loc[1024, 'article_text_x']

In [None]:
# Discard rows whose scraped text is missing or an empty string.
df = df.loc[df['article_text_x'].notna() & df['article_text_x'].ne('')]

# New dataframe with `article_text` column

In [None]:
# Take an independent copy: a plain assignment would only alias df, so any
# column added to classification_df would also be written into df.
classification_df = df.copy()

In [None]:
# Binary target: 1 where the category is exactly 'POLITICS', otherwise 0.
# A vectorised comparison replaces the per-row Python lambda.
classification_df['class'] = (df['category'] == 'POLITICS').astype('int64')

In [None]:
# Write the labelled dataset to disk without the index column.
output_csv = '/Users/dianakazarian/Desktop/ClassifierData/classifierdata.csv'
classification_df.to_csv(output_csv, index=False)