In [None]:
import requests
import pandas as pd
import random
import os
from datetime import datetime, timedelta

# Guardian API key and base URL.
# The key is read from the environment so it is never stored in the notebook.
# The key that used to be hardcoded here should be treated as leaked and rotated.
API_KEY = os.environ.get("GUARDIAN_API_KEY")
if not API_KEY:
    raise RuntimeError(
        "Set the GUARDIAN_API_KEY environment variable before running this cell."
    )
BASE_URL = "https://content.guardianapis.com/search"

# Output file
FILE_NAME = "guardian_articles.csv"

# Number of weeks to scrape
NUM_WEEKS = 600
# NOTE: not referenced anywhere in this cell; the loop below keeps exactly one
# randomly chosen article per sampled date.
ARTICLES_PER_WEEK = 5

# Seconds to wait for the API before giving up on a single request, so one
# stalled connection cannot hang the whole run.
REQUEST_TIMEOUT = 30

COLUMNS = ["headline", "publication_date", "url"]

# One date per week, stepping backward from today (dates[0] is today,
# dates[-1] is NUM_WEEKS - 1 weeks ago).
start_date = datetime.today()
dates = [(start_date - timedelta(weeks=i)).strftime("%Y-%m-%d") for i in range(NUM_WEEKS)]

# Load existing data if the file exists
if os.path.exists(FILE_NAME):
    df = pd.read_csv(FILE_NAME)
else:
    df = pd.DataFrame(columns=COLUMNS)

# Rows are collected in a plain list and merged into `df` once, instead of
# re-concatenating and rewriting the CSV on every iteration.
new_records = []

try:
    # Fetch articles for each week
    for week_date in dates:
        params = {
            "api-key": API_KEY,
            "from-date": week_date,
            "to-date": week_date,
            "show-fields": "headline",
            "page-size": 10,  # Get up to 10 articles from the date
        }

        try:
            response = requests.get(BASE_URL, params=params, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as exc:
            # A network error on one date should not abort the remaining dates.
            print(f"Failed to fetch data for {week_date}: {exc}")
            continue

        if response.status_code != 200:
            print(f"Failed to fetch data for {week_date}: {response.status_code}")
            continue

        articles = response.json().get("response", {}).get("results", [])
        if not articles:
            print(f"No articles found for {week_date}.")
            continue

        selected_article = random.choice(articles)  # Pick one randomly
        article_data = {
            "headline": selected_article["webTitle"],
            "publication_date": selected_article["webPublicationDate"],
            "url": selected_article["webUrl"],
        }
        new_records.append(article_data)

        print(f"Saved article from {week_date}: {article_data['headline']}")
finally:
    # Runs even if the loop is interrupted, so rows fetched so far still
    # reach the CSV.
    if new_records:
        new_rows = pd.DataFrame(new_records, columns=COLUMNS)
        df = new_rows if df.empty else pd.concat([df, new_rows], ignore_index=True)
        # Re-running the cell appends to the existing file; drop rows whose
        # URL is already present so repeated runs do not inflate the counts.
        df = df.drop_duplicates(subset="url").reset_index(drop=True)
        df.to_csv(FILE_NAME, index=False)

print(" Data collection complete!")


In [None]:
# Sanity check on the scrape: rows are appended from the most recent date
# backward, so the last rows should correspond to the oldest requested weeks
# (NUM_WEEKS weeks back) and show that the columns were written as expected.
df.tail(n=5)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.colors as mcolors

# Bar chart of the number of scraped articles per calendar month, with the
# months that have the fewest articles highlighted in red.
# Expects `df` to exist with a 'publication_date' column (strings or datetimes).

# How many of the lowest-count months to highlight.
N_SMALLEST = 10

# Make sure 'publication_date' is in datetime format
df['publication_date'] = pd.to_datetime(df['publication_date'])

# Remove timezone info if present (a no-op for timezone-naive values)
df['publication_date'] = df['publication_date'].dt.tz_localize(None)

# Extract Year-Month from publication_date
df['year_month'] = df['publication_date'].dt.to_period('M')

# Count how many articles were published in each month; groupby returns the
# months in chronological order.
monthly_data = df.groupby('year_month').size().reset_index(name='count')

# Format the 'year_month' to show as 'Jan 18', 'Feb 18', etc.
monthly_data['year_month_str'] = monthly_data['year_month'].dt.strftime('%b %y')

# Identify the months with the smallest counts. nsmallest keeps the first
# occurrences on ties, so the selection is deterministic (earliest months win).
smallest_10 = monthly_data.nsmallest(N_SMALLEST, 'count')

# Red for the smallest months, blue for the rest.
is_smallest = monthly_data['year_month'].isin(smallest_10['year_month'])
monthly_data['color'] = is_smallest.map({True: 'red', False: 'blue'})

# Create a bar chart for the count of articles per month
fig, ax = plt.subplots(figsize=(14, 6))
bars = ax.bar(monthly_data['year_month_str'], monthly_data['count'], color=monthly_data['color'])

# Add labels and title
ax.set_xlabel('Month')
ax.set_ylabel('Article Count')
ax.set_title('Monthly Data of Articles (Top 10 smallest counts in red)')

# Vertical x-axis labels for better readability
ax.tick_params(axis='x', labelrotation=90)

# Display the bar chart
fig.tight_layout()  # Adjust layout to prevent overlap
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.patches import Patch

# Box plots of the number of headlines per calendar month: one box for all
# months combined, and one box per month name.
# Expects `df` to exist with a 'publication_date' column.

# Prepare the data
df['publication_date'] = pd.to_datetime(df['publication_date'])
# Drop timezone info if present so that to_period() does not warn about
# discarding it, and so this cell does not depend on an earlier cell having
# already stripped it. This is a no-op for timezone-naive values.
df['publication_date'] = df['publication_date'].dt.tz_localize(None)
df['year_month'] = df['publication_date'].dt.to_period('M')
df['month'] = df['publication_date'].dt.month_name()
# Not used below in this cell; kept because later cells may read it.
df['year'] = df['publication_date'].dt.year

# Count headlines per month
monthly_counts = df.groupby(['year_month', 'month']).size().reset_index(name='headline_count')

# Define a colour palette for months (insertion order is calendar order)
month_palette = {
    'January': '#1f77b4', 'February': '#ff7f0e', 'March': '#2ca02c',
    'April': '#d62728', 'May': '#9467bd', 'June': '#8c564b',
    'July': '#e377c2', 'August': '#7f7f7f', 'September': '#bcbd22',
    'October': '#17becf', 'November': '#aec7e8', 'December': '#ffbb78'
}
# Calendar order for the x axis, taken from the palette so the two cannot drift.
month_order = list(month_palette)

# Create figure with two subplots
fig, (ax_overall, ax_by_month) = plt.subplots(1, 2, figsize=(16, 6))

# Subplot 1: Overall distribution (single box)
sns.boxplot(data=monthly_counts, y='headline_count', color='lightblue', ax=ax_overall)
ax_overall.set_title('Overall Distribution of Monthly Headline Counts')
ax_overall.set_ylabel('Number of Headlines')
ax_overall.set_xlabel('All Months Combined')
ax_overall.grid(axis='y', linestyle='--', alpha=0.7)

# Subplot 2: Monthly distribution (coloured by month).
# `palette` is tied to an explicit `hue` because passing a palette without a
# hue variable is deprecated in recent seaborn releases. dodge=False keeps
# each box centred on its month instead of splitting the slot between hue levels.
sns.boxplot(
    data=monthly_counts,
    x='month',
    y='headline_count',
    order=month_order,
    hue='month',
    hue_order=month_order,
    dodge=False,
    palette=month_palette,
    ax=ax_by_month,
)
ax_by_month.set_title('Monthly Distribution of Headline Counts')
ax_by_month.set_ylabel('Number of Headlines')
ax_by_month.set_xlabel('Month')
ax_by_month.tick_params(axis='x', labelrotation=45)
ax_by_month.grid(axis='y', linestyle='--', alpha=0.7)

# Add a custom legend for the colours (replaces any legend seaborn created)
legend_elements = [Patch(facecolor=colour, label=month)
                   for month, colour in month_palette.items()]
ax_by_month.legend(handles=legend_elements, bbox_to_anchor=(1.05, 1), loc='upper left')

fig.tight_layout()
plt.show()

In [None]:


# Location of the GDP CSV. Can be overridden with the GDP_CSV_PATH environment
# variable so the notebook is not tied to one machine's directory layout; the
# default is the path the notebook originally used.
GDP_CSV_PATH = os.environ.get("GDP_CSV_PATH", "/Users/georgewalsh/Desktop/API_NY/GDP.csv")

# Year columns to keep (2014 through 2023 inclusive; the column names are strings).
GDP_YEARS = [str(year) for year in range(2014, 2024)]

# Read the CSV file into a pandas DataFrame
df_gdp = pd.read_csv(GDP_CSV_PATH, skiprows=4)  # Skip the first 4 rows which contain metadata

# Show the first few rows to verify the load. display() is needed because only
# the last expression in a cell is rendered automatically.
display(df_gdp.head())

# .copy() makes the subset an independent frame, so later column assignments
# on it do not trigger SettingWithCopyWarning.
df_gdp_cleaned = df_gdp[['Country Name', *GDP_YEARS]].copy()
df_gdp_cleaned.head(20)
