## Import packages

In [1]:
# !pip install newspaper3k
from newspaper import Article
import pandas as pd
import glob

## Combine raw CNN news data

In [2]:
# Locate every raw CNN export. glob returns matches in an arbitrary,
# filesystem-dependent order, so sort them to keep the row order of
# `combined_df` reproducible between runs and machines.
path = r'../raw_data/cnn_news'
all_files = sorted(glob.glob(path + "/*.csv"))

# Fail with an explicit message instead of pandas' generic
# "No objects to concatenate" when the directory is missing or empty.
if not all_files:
    raise FileNotFoundError(f"No CSV files found in {path!r}")

# Lazily read each file; 'utf-8-sig' drops a leading BOM if one is present.
df_list = (pd.read_csv(file, encoding='utf-8-sig') for file in all_files)

# Stack all files into one frame with a fresh 0..n-1 index, then keep only
# the `url` column (as a one-column DataFrame).
combined_df = pd.concat(df_list, ignore_index=True)
combined_df = combined_df.loc[:, ['url']]
combined_df

Unnamed: 0,url
0,https://edition.cnn.com/2023/02/28/sport/los-a...
1,https://edition.cnn.com/2023/02/16/sport/lesle...
2,https://edition.cnn.com/2023/02/27/sport/damia...
3,https://edition.cnn.com/2023/02/27/football/su...
4,https://edition.cnn.com/2023/02/22/football/pa...
...,...
207,https://edition.cnn.com/2023/02/20/sport/ander...
208,https://edition.cnn.com/2023/02/27/golf/chris-...
209,https://edition.cnn.com/2023/03/06/sport/ja-mo...
210,https://edition.cnn.com/2023/03/06/football/wo...


## Collecting url, title, text, authors and date for each CNN article

In [3]:
# Parallel result lists: one entry per article that was downloaded and parsed.
url = []
title = []
text = []
authors = []
date = []
# (url, error) pairs for articles that were skipped, kept for inspection/retry.
failed_urls = []

for link in combined_df.url:
    try:
        article = Article(link)
        article.download()
        article.parse()
        # Read every field before touching the result lists, so a failure
        # part-way through can never leave the lists with different lengths.
        record = (link, article.title, article.text, article.authors, article.publish_date)
    except Exception as exc:
        # Best-effort scrape: skip this article but remember why. Catching
        # Exception rather than using a bare `except` lets KeyboardInterrupt
        # (kernel interrupt) and SystemExit propagate.
        failed_urls.append((link, repr(exc)))
        continue

    url.append(record[0])
    title.append(record[1])
    text.append(record[2])
    authors.append(record[3])
    date.append(record[4])

if failed_urls:
    print(f"Skipped {len(failed_urls)} of {len(combined_df)} articles; see `failed_urls` for details.")

# One row per scraped article; note the `authors` list lands in the 'author' column.
df = pd.DataFrame(list(zip(url, title, text, authors, date)), columns=['url', 'title', 'text', 'author', 'date'])

In [4]:
# Write the scraped CNN articles to a CSV file (path is relative to the
# current working directory). The DataFrame index is written as well.
df.to_csv(path_or_buf="cnn_raw_final.csv")

## Combine raw CNA news data

In [7]:
# Locate every raw CNA export. glob returns matches in an arbitrary,
# filesystem-dependent order, so sort them to keep the row order of
# `combined_df` reproducible between runs and machines.
path = r'../raw_data/cna_news'
all_files = sorted(glob.glob(path + "/*.csv"))

# Fail with an explicit message instead of pandas' generic
# "No objects to concatenate" when the directory is missing or empty.
if not all_files:
    raise FileNotFoundError(f"No CSV files found in {path!r}")

# Lazily read each file; 'utf-8-sig' drops a leading BOM if one is present.
df_list = (pd.read_csv(file, encoding='utf-8-sig') for file in all_files)

# Stack all files into one frame with a fresh 0..n-1 index, then keep only
# the `url` column (as a one-column DataFrame).
combined_df = pd.concat(df_list, ignore_index=True)
combined_df = combined_df.loc[:, ['url']]
combined_df

Unnamed: 0,url
0,https://www.channelnewsasia.com/sport/steely-m...
1,https://www.channelnewsasia.com/sport/choupo-m...
2,https://www.channelnewsasia.com/sport/chelsea-...
3,https://www.channelnewsasia.com/sport/nothing-...
4,https://www.channelnewsasia.com/sport/contes-c...
...,...
140,https://www.channelnewsasia.com/sport/fa-cup-q...
141,https://www.channelnewsasia.com/sport/man-unit...
142,https://www.channelnewsasia.com/sport/formula-...
143,https://www.channelnewsasia.com/sport/prop-hao...


## Collecting url, title, text, authors and date for each CNA article

In [8]:
# Parallel result lists: one entry per article that was downloaded and parsed.
url = []
title = []
text = []
authors = []
date = []
# (url, error) pairs for articles that were skipped, kept for inspection/retry.
failed_urls = []

for link in combined_df.url:
    try:
        article = Article(link)
        article.download()
        article.parse()
        # Read every field before touching the result lists, so a failure
        # part-way through can never leave the lists with different lengths.
        record = (link, article.title, article.text, article.authors, article.publish_date)
    except Exception as exc:
        # Best-effort scrape: skip this article but remember why. Catching
        # Exception rather than using a bare `except` lets KeyboardInterrupt
        # (kernel interrupt) and SystemExit propagate.
        failed_urls.append((link, repr(exc)))
        continue

    url.append(record[0])
    title.append(record[1])
    text.append(record[2])
    authors.append(record[3])
    date.append(record[4])

if failed_urls:
    print(f"Skipped {len(failed_urls)} of {len(combined_df)} articles; see `failed_urls` for details.")

# One row per scraped article; note the `authors` list lands in the 'author' column.
df = pd.DataFrame(list(zip(url, title, text, authors, date)), columns=['url', 'title', 'text', 'author', 'date'])

In [9]:
# Write the scraped CNA articles to a CSV file (path is relative to the
# current working directory). The DataFrame index is written as well.
df.to_csv(path_or_buf="cna_raw_final.csv")