# Code to download The Guardian UK data and clean data for text analysis
@Jorge de Leon 

This script allows you to download news articles that match your parameters from the Guardian newspaper, https://www.theguardian.com/us.

## Set-up

In [None]:
# Set-up cell: imports, working-directory change and NLTK resource downloads.
import os
import re   
import glob
import json
import requests
import pandas as pd 

# NOTE: this rebinds the name `glob` to the function, shadowing the module
# imported by `import glob` above.
from glob import glob
from os import makedirs
from textblob import TextBlob
from os.path import join, exists
from datetime import date, timedelta

# Move the working directory one level up; every relative path used below
# is resolved from there.
os.chdir("..")

import nltk
# Fetch the NLTK resources by name (presumably needed by the tokenizer,
# lemmatizer and stop-word list imported below - TODO confirm).
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
from nltk import sent_tokenize, word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords

## API and news articles requests

This section contains the code used to download articles from the Guardian website.
The initial variables below are user-defined parameters.

In [None]:
# Enter API and parameters - these parameters can be obtained by playing around with the Guardian API tool:
# https://open-platform.theguardian.com/explore/

# First and last publication dates (both inclusive) of the download window
start_date_global = date(2000, 1, 1)
end_date_global = date(2020, 5, 17)
# Free-text search sent to the API as the `q` parameter
query = "3M"
# Regular expression whose matches are counted per article further down
term = 'stock'

# Read the API key from the credentials file. The context manager closes the
# file handle, and os.path.join keeps the path portable across platforms.
with open(join("..", "input files", "creds_guardian.txt")) as key_file:
    my_api_key = key_file.read().strip()

# HTTPS so the API key in the query string is not sent in clear text
api_endpoint = "https://content.guardianapis.com/search?"

# Query parameters shared by every request; the two date entries are filled
# in for each day by the download loop.
my_params = {
    'from-date': '',
    'to-date': '',
    'show-fields': 'bodyText',
    'q': query,
    'page-size': 200,
    'api-key': my_api_key,
}

In [None]:
# Folder that receives the downloaded JSON files; created (with parents)
# when missing, left untouched when it already exists.
articles_dir = os.path.join('theguardian', '3m')
os.makedirs(articles_dir, exist_ok=True)

In [None]:
# Download the search results one calendar day at a time, writing one JSON
# file per day. Days whose file already exists are skipped, so the cell can
# be re-run to resume an interrupted download.
# day iteration from here:
# http://stackoverflow.com/questions/7274267/print-all-day-dates-between-two-dates
start_date = start_date_global
end_date = end_date_global
dayrange = range((end_date - start_date).days + 1)
for daycount in dayrange:
    dt = start_date + timedelta(days=daycount)
    datestr = dt.strftime('%Y-%m-%d')
    fname = join(articles_dir, datestr + '.json')
    if exists(fname):
        # already downloaded on a previous run
        continue

    print("Downloading", datestr)
    all_results = []
    my_params['from-date'] = datestr
    my_params['to-date'] = datestr
    current_page = 1
    total_pages = 1
    while current_page <= total_pages:
        print("...page", current_page)
        my_params['page'] = current_page
        # The timeout keeps a stalled connection from hanging the loop forever
        resp = requests.get(api_endpoint, params=my_params, timeout=30)
        # Fail loudly on HTTP errors (bad key, rate limit, ...) instead of
        # hitting an opaque KeyError on the missing 'response' entry below
        resp.raise_for_status()
        data = resp.json()
        all_results.extend(data['response']['results'])
        # the page count is only known once the first response has arrived
        current_page += 1
        total_pages = data['response']['pages']

    # The file is written only after every page was fetched, so a failed day
    # leaves no partial file behind and is retried on the next run.
    with open(fname, 'w', encoding='utf-8') as f:
        print("Writing to", fname)

        # re-serialize it for pretty indentation
        json.dump(all_results, f, indent=2)

In [None]:
# Collect the paths of every downloaded JSON file, in ascending name order
json_pattern = 'theguardian/3m/*.json'
test_files = glob(json_pattern)
test_files.sort()

In [None]:
# Load every downloaded JSON file into a DataFrame and stack them into one
# raw dataset.

# list that collects one DataFrame per non-empty file
all_files = []

for path in test_files:
    try:
        articles = pd.read_json(path)
    except ValueError as err:
        # pd.read_json reports empty or malformed files as ValueError
        # (pd.errors.EmptyDataError is a subclass of it); name the file
        # and move on to the next one
        print(f'Note: {path} could not be read ({err}). Skipping')
        continue
    if articles.empty:
        # days without matching articles are stored as an empty list
        continue
    all_files.append(articles)

if not all_files:
    raise ValueError('No articles found in the downloaded JSON files')

# create dataframe with data from json files
theguardian_rawdata = pd.concat(all_files, axis=0, ignore_index=True)

## Text Analysis

In [None]:
# Keep only the first twelve columns (by position); any columns after them
# are discarded
leading_columns = slice(0, 12)
theguardian_rawdata = theguardian_rawdata.iloc[:, leading_columns]

In [None]:
# Display the distinct values of the 'type' column, i.e. the kinds of media
# that were downloaded
theguardian_rawdata.loc[:, 'type'].unique()

In [None]:
# Keep the rows whose 'type' starts with 'article'; rows with a missing
# type are treated as non-matching
is_article = theguardian_rawdata['type'].str.match('article', na=False)
theguardian_rawdata = theguardian_rawdata.loc[is_article]

In [None]:
# Build the analysis dataset by dropping the columns that carry no
# information relevant to the text analysis
unused_columns = [
    'apiUrl', 'id', 'isHosted', 'pillarId', 'pillarName',
    'sectionId', 'sectionName', 'type', 'webTitle', 'webUrl',
]
theguardian_dataset = theguardian_rawdata.drop(columns=unused_columns)

In [None]:
# Derive a YYYY-MM-DD 'date' column from webPublicationDate and turn the
# 'fields' column into the lower-cased article text.


def _body_text(fields):
    """Return the article text held in one 'fields' cell as a string.

    The API is asked for `show-fields=bodyText`, so a cell is expected to be
    a dict with a 'bodyText' entry. Casting the whole dict to str would leak
    the key name and the dict punctuation into the text (adding a spurious
    'bodytext' token to every article), so only the value is extracted. Any
    other cell value is cast to str as before.
    """
    if isinstance(fields, dict):
        return str(fields.get('bodyText', ''))
    return str(fields)


theguardian_dataset["date"] = pd.to_datetime(theguardian_dataset["webPublicationDate"]).dt.strftime('%Y-%m-%d')
theguardian_dataset['fields'] = theguardian_dataset['fields'].apply(_body_text).str.lower()

In [None]:
# Strip HTML tags and punctuation from the article text.
# regex=True is passed explicitly: since pandas 2.0 str.replace treats the
# pattern as a literal string by default, which would silently turn both
# calls into no-ops. Raw strings avoid invalid-escape warnings.
theguardian_dataset['fields'] = theguardian_dataset['fields'].str.replace(r'<.*?>', '', regex=True)  # remove HTML tags
theguardian_dataset['fields'] = theguardian_dataset['fields'].str.replace(r'[^\w\s]', '', regex=True)  # remove punc.

In [None]:
# Generate sentiment analysis for each article with TextBlob.
# Each article is analysed once and both scores are read from that single
# result, instead of building and analysing a TextBlob twice per article.
article_sentiments = [TextBlob(text).sentiment for text in theguardian_dataset['fields']]
# Polarity score reported by TextBlob
theguardian_dataset['sentiment_polarity'] = [s.polarity for s in article_sentiments]
# Subjectivity score reported by TextBlob
theguardian_dataset['sentiment_subjectivity'] = [s.subjectivity for s in article_sentiments]

In [None]:
# Remove numbers from text.
# regex=True is required on pandas >= 2.0, where the pattern is otherwise
# matched literally and no digit would ever be removed.
theguardian_dataset['fields'] = theguardian_dataset['fields'].str.replace(r'\d+', '', regex=True)  # remove numbers

# Tokenize each article into words (stop words are removed in a later step).
# Applying on the column itself avoids building a row object per article and
# also works when the dataset has no rows.
theguardian_dataset['tokenized_fields'] = theguardian_dataset['fields'].apply(nltk.word_tokenize)

In [None]:
# English stop words from the NLTK corpus, held in a set for fast lookups
english_stop_words = stopwords.words("english")
stop_words = set(english_stop_words)

In [None]:
# Drop the stop words from every article's token list


def _without_stop_words(tokens):
    """Return the tokens of one article that are not in stop_words."""
    return [token for token in tokens if token not in stop_words]


theguardian_dataset['tokenized_fields'] = theguardian_dataset['tokenized_fields'].apply(_without_stop_words)

In [None]:
# For every article, store its five most frequent tokens
from collections import Counter


def _top_five(tokens):
    """Return the (up to) five most common tokens, most frequent first."""
    return [word for word, _count in Counter(tokens).most_common(5)]


theguardian_dataset['high_recurrence'] = theguardian_dataset['tokenized_fields'].apply(_top_five)

In [None]:
# Collect, per article, the tokens that match the `term` pattern and count them


def _tokens_matching_term(tokens):
    """Return the tokens in which the `term` regular expression is found."""
    return [token for token in tokens if re.search(term, token)]


theguardian_dataset['word_ocurrence'] = theguardian_dataset['tokenized_fields'].apply(_tokens_matching_term)
theguardian_dataset['word_count'] = theguardian_dataset['word_ocurrence'].apply(len)

In [None]:
# Number of tokens left in each article after stop-word removal
token_lists = theguardian_dataset['tokenized_fields']
theguardian_dataset['total_words'] = token_lists.map(len)

In [None]:
# Create a new table, indexed by date, with the daily average of polarity,
# subjectivity, `term` matches and total words.
# The columns are selected with a list: selecting several GroupBy columns
# with a bare tuple raises an error on pandas >= 2.0.
daily_metrics = ['sentiment_polarity', 'sentiment_subjectivity', 'word_count', 'total_words']
guardian_microsoft = theguardian_dataset.groupby('date')[daily_metrics].agg('mean')

In [None]:
# Number of articles published per day.
# NOTE: the count is written onto theguardian_dataset itself (no copy is
# taken), so the 'no_articles' column also ends up in that frame.
theguardian_dataset['no_articles'] = (
    theguardian_dataset.groupby(['date'])['fields'].transform('count')
)
count_articles = theguardian_dataset[["date", "no_articles"]]
# one row per date, keeping the first occurrence
count_articles_df = count_articles.drop_duplicates(subset="date", keep="first")

In [None]:
# Attach the daily article count to the daily averages (left join on date)
guardian_microsoft = pd.merge(
    guardian_microsoft,
    count_articles_df,
    how='left',
    on='date',
)

In [None]:
# Write the article-level dataset and the daily summary to CSV files
csv_outputs = (
    (theguardian_dataset, 'theguardian/3m/theguardian_3m_text.csv'),
    (guardian_microsoft, 'theguardian/3m/theguardian_3m_data.csv'),
)
for frame, csv_path in csv_outputs:
    frame.to_csv(csv_path, encoding='utf-8')