In [None]:
import requests
import pandas as pd
import random
import os
from datetime import datetime, timedelta

# Guardian API key. Read it from the environment so the credential is never
# stored in the notebook; a key that was previously committed here should be
# treated as compromised and rotated.
API_KEY = os.environ.get("GUARDIAN_API_KEY")
if not API_KEY:
    raise RuntimeError(
        "Set the GUARDIAN_API_KEY environment variable to your Guardian API key."
    )
BASE_URL = "https://content.guardianapis.com/search"

# Output file
FILE_NAME = "guardian_articles.csv"

# Number of weekly sample dates, and how many articles to keep per sample date
NUM_WEEKS = 500
ARTICLES_PER_WEEK = 2

# Seconds to wait for a single API response before giving up on that date
REQUEST_TIMEOUT = 30

# One sample date per week, stepping backward from today (a fixed weekly grid;
# the randomness is only in which articles are kept for each date).
start_date = datetime.today()
dates = [(start_date - timedelta(weeks=i)).strftime("%Y-%m-%d") for i in range(NUM_WEEKS)]

# Load existing data if the file exists
if os.path.exists(FILE_NAME):
    df = pd.read_csv(FILE_NAME)
else:
    df = pd.DataFrame(columns=["headline", "publication_date", "url"])

# Rows are collected in a list and merged once: concatenating inside the loop
# re-copies the whole frame (and rewrites the CSV) on every iteration.
new_rows = []
try:
    for week_date in dates:
        params = {
            "api-key": API_KEY,
            "from-date": week_date,
            "to-date": week_date,
            "show-fields": "headline",
            "page-size": 10,  # Get up to 10 articles from the date
        }

        try:
            response = requests.get(BASE_URL, params=params, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as exc:
            print(f"Failed to fetch data for {week_date}: {exc}")
            continue

        if response.status_code != 200:
            print(f"Failed to fetch data for {week_date}: {response.status_code}")
            continue

        try:
            articles = response.json().get("response", {}).get("results", [])
        except ValueError as exc:  # body was not valid JSON
            print(f"Failed to fetch data for {week_date}: {exc}")
            continue

        if not articles:
            print(f"No articles found for {week_date}.")
            continue

        # Keep up to ARTICLES_PER_WEEK distinct articles, chosen at random.
        sample_size = min(ARTICLES_PER_WEEK, len(articles))
        for selected_article in random.sample(articles, sample_size):
            article_data = {
                "headline": selected_article["webTitle"],
                "publication_date": selected_article["webPublicationDate"],
                "url": selected_article["webUrl"],
            }
            new_rows.append(article_data)
            print(f"Collected article from {week_date}: {article_data['headline']}")
finally:
    # Runs even if the loop is interrupted, so collected rows are not lost.
    if new_rows:
        new_df = pd.DataFrame(new_rows)
        df = new_df if df.empty else pd.concat([df, new_df], ignore_index=True)
        # Re-running the cell revisits the same dates; keep each article once.
        df = df.drop_duplicates(subset="url", ignore_index=True)
        df.to_csv(FILE_NAME, index=False)

print("✅ Data collection complete!")


In [None]:
# Preview the last rows of the collected articles as a quick sanity check.
df.tail()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.colors as mcolors

# Bar chart of articles per calendar month, coloured by month of the year,
# with a linear trend line. Expects `df` (with a 'publication_date' column)
# from the data-collection cell.

# Make sure 'publication_date' is in datetime format
df['publication_date'] = pd.to_datetime(df['publication_date'])

# Remove timezone info if present
df['publication_date'] = df['publication_date'].dt.tz_localize(None)

# Extract Year-Month from publication_date
df['year_month'] = df['publication_date'].dt.to_period('M')

# Group by 'year_month' to count how many articles were published in each month
monthly_data = df.groupby('year_month').size().reset_index(name='count')

# Format the 'year_month' to show as 'Jan 18', 'Feb 18', etc.
monthly_data['year_month_str'] = monthly_data['year_month'].dt.strftime('%b %y')

# Twelve distinct colours, one per calendar month. TABLEAU_COLORS only has ten
# entries, so cycling through it gave November/December the same colours as
# January/February; these are the same hex values the boxplot cell uses.
month_colors = [
    '#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b',
    '#e377c2', '#7f7f7f', '#bcbd22', '#17becf', '#aec7e8', '#ffbb78',
]

# Map each month number (1-12) to its colour so a month looks the same every year
month_to_color = dict(zip(range(1, 13), month_colors))

# Assign colors based on the month part of the 'year_month'
monthly_data['color'] = monthly_data['year_month'].dt.month.map(month_to_color)

# Create a bar chart for the count of articles per month
plt.figure(figsize=(15, 6))

# Create the bars with the assigned colors
bars = plt.bar(
    monthly_data['year_month_str'],
    monthly_data['count'],
    color=monthly_data['color'].tolist(),
)

# Add labels and title
plt.xlabel('Month')
plt.ylabel('Article Count')
plt.title('Monthly Data of Articles')

# Rotate x-axis labels for better readability (vertical labels)
plt.xticks(rotation=90)

# Line of best fit over the month index. A straight line needs at least two
# points, so the fit (and its legend) is skipped for smaller data sets.
x = np.arange(len(monthly_data))  # The x-axis (month indices)
y = monthly_data['count']  # The article counts

if len(monthly_data) >= 2:
    # Degree-1 polynomial fit == ordinary least-squares line
    slope, intercept = np.polyfit(x, y, 1)
    line = slope * x + intercept

    plt.plot(monthly_data['year_month_str'], line, color='red', linewidth=2, label='Trend Line')
    plt.legend()

# Display the bar chart
plt.tight_layout()  # Adjust layout to prevent overlap
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.colors as mcolors

# Bar chart of articles per calendar month with the ten lowest-count months
# highlighted in red. Expects `df` (with a 'publication_date' column) from the
# data-collection cell.

# Make sure 'publication_date' is in datetime format
df['publication_date'] = pd.to_datetime(df['publication_date'])

# Remove timezone info if present
df['publication_date'] = df['publication_date'].dt.tz_localize(None)

# Extract Year-Month from publication_date
df['year_month'] = df['publication_date'].dt.to_period('M')

# Group by 'year_month' to count how many articles were published in each month
monthly_data = df.groupby('year_month').size().reset_index(name='count')

# Format the 'year_month' to show as 'Jan 18', 'Feb 18', etc.
monthly_data['year_month_str'] = monthly_data['year_month'].dt.strftime('%b %y')

# Sort by count; a stable sort makes the choice among tied months
# deterministic (earlier months win).
sorted_data = monthly_data.sort_values(by='count', kind='stable')

# Identify the 10 smallest values
smallest_10 = sorted_data.head(10)

# Red for the 10 smallest months, blue for the rest (vectorised membership
# test instead of a row-by-row apply). The per-month colour mapping that used
# to be computed here was overwritten by this step and has been removed.
is_smallest = monthly_data['year_month'].isin(smallest_10['year_month'])
monthly_data['color'] = np.where(is_smallest, 'red', 'blue')

# Create a bar chart for the count of articles per month
plt.figure(figsize=(14, 6))

# Create the bars with the assigned colors
bars = plt.bar(
    monthly_data['year_month_str'],
    monthly_data['count'],
    color=monthly_data['color'].tolist(),
)

# Add labels and title
plt.xlabel('Month')
plt.ylabel('Article Count')
plt.title('Monthly Data of Articles (Top 10 smallest counts in red)')

# Rotate x-axis labels for better readability (vertical labels)
plt.xticks(rotation=90)

# Display the bar chart
plt.tight_layout()  # Adjust layout to prevent overlap
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.patches import Patch

# Derive the calendar fields used for grouping from the publication timestamp.
df['publication_date'] = pd.to_datetime(df['publication_date'])
published = df['publication_date'].dt
df['year_month'] = published.to_period('M')
df['month'] = published.month_name()
df['year'] = published.year

# Number of headlines in each (year-month, month-name) group.
monthly_counts = (
    df.groupby(['year_month', 'month'])
    .size()
    .reset_index(name='headline_count')
)

# Fixed colour per month name; the dict is written in calendar order, so its
# key order doubles as the x-axis order of the per-month boxplot.
month_palette = {
    'January': '#1f77b4', 'February': '#ff7f0e', 'March': '#2ca02c',
    'April': '#d62728', 'May': '#9467bd', 'June': '#8c564b',
    'July': '#e377c2', 'August': '#7f7f7f', 'September': '#bcbd22',
    'October': '#17becf', 'November': '#aec7e8', 'December': '#ffbb78'
}
month_order = list(month_palette)

# One figure, two side-by-side panels.
box_fig, (overall_ax, per_month_ax) = plt.subplots(1, 2, figsize=(16, 6))

# Left panel: a single box over every month's headline count.
sns.boxplot(data=monthly_counts, y='headline_count', color='lightblue', ax=overall_ax)
overall_ax.set_title('Overall Distribution of Monthly Headline Counts')
overall_ax.set_ylabel('Number of Headlines')
overall_ax.set_xlabel('All Months Combined')
overall_ax.grid(axis='y', linestyle='--', alpha=0.7)

# Right panel: one box per month of the year, coloured from the palette.
sns.boxplot(
    data=monthly_counts,
    x='month',
    y='headline_count',
    order=month_order,
    palette=month_palette,
    ax=per_month_ax,
)
per_month_ax.set_title('Monthly Distribution of Headline Counts')
per_month_ax.set_ylabel('Number of Headlines')
per_month_ax.set_xlabel('Month')
plt.setp(per_month_ax.get_xticklabels(), rotation=45)
per_month_ax.grid(axis='y', linestyle='--', alpha=0.7)

# Colour key for the months, placed just outside the right panel.
legend_elements = [
    Patch(facecolor=shade, label=month_name)
    for month_name, shade in month_palette.items()
]
per_month_ax.legend(handles=legend_elements, bbox_to_anchor=(1.05, 1), loc='upper left')

plt.tight_layout()
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import re

# `df` comes from the data-collection cell. Parse 'publication_date' into
# datetimes and strip any timezone in a single chained step, so the plotting
# helper below receives timezone-naive timestamps.
df['publication_date'] = (
    pd.to_datetime(df['publication_date'])
    .dt.tz_localize(None)
)

# Function to filter data and plot the graph for a given keyword (including variations)
def plot_keyword_mentions(df, keyword, ax, color):
    """Plot how many headlines mention ``keyword`` in each calendar quarter.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'headline' column (text) and a 'publication_date'
        column (datetimes or values ``pd.to_datetime`` can parse).
    keyword : str
        Literal word or phrase to look for. It is matched case-insensitively
        and only as a whole word; regex metacharacters in it are escaped.
    ax : matplotlib Axes
        Axes to draw on.
    color : str
        Line colour.

    Every quarter between the earliest and latest publication date in ``df``
    is shown; quarters with no matching headline are plotted as 0 instead of
    being skipped, so the x-axis is an unbroken timeline.
    """
    # Escape the keyword so characters such as '.', '(' or '+' are literal.
    keyword_pattern = r'\b' + re.escape(keyword) + r'\b'

    # True for every headline that mentions the keyword (missing headlines -> False)
    is_mention = df['headline'].str.contains(keyword_pattern, case=False, na=False, regex=True)

    # Timezone-naive timestamps -> quarterly periods
    quarters = pd.to_datetime(df['publication_date']).dt.tz_localize(None).dt.to_period('Q')

    # Full, gap-free range of quarters covered by the data set
    valid_quarters = quarters.dropna()
    if valid_quarters.empty:
        quarter_labels = []
    else:
        quarter_labels = [
            str(q) for q in pd.period_range(valid_quarters.min(), valid_quarters.max())
        ]

    # Mentions per quarter, looked up by label so absent quarters become 0
    counts_by_label = quarters[is_mention].astype(str).value_counts()
    mentions = [int(counts_by_label.get(label, 0)) for label in quarter_labels]

    # Plot the frequency over time
    ax.plot(quarter_labels, mentions, marker='o', label=keyword, color=color)
    ax.set_title(f'Persistence of the word "{keyword}" in Titles Over Time (Quarterly)')
    ax.set_xlabel('Time (Year-Quarter)')
    ax.set_ylabel('Number of Mentions')

    # String x-values sit at positions 0..n-1; label roughly a dozen of them
    # at most so the rotated tick labels stay readable on long timelines.
    tick_step = max(1, len(quarter_labels) // 12)
    tick_positions = list(range(0, len(quarter_labels), tick_step))
    ax.set_xticks(tick_positions)
    ax.set_xticklabels([quarter_labels[i] for i in tick_positions], rotation=45)

    ax.grid(True)
    ax.legend()

# Create a figure with subplots (2 rows, 2 columns), one panel per keyword
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Line colour for each panel, in the same order as the calls below
colors = ['blue', 'red', 'green', 'orange']
# Plot for each keyword
plot_keyword_mentions(df, 'China', axes[0, 0], color=colors[0])  # Top-left panel: "China"
plot_keyword_mentions(df, 'Europe', axes[0, 1], color=colors[1])  # Top-right panel: "Europe"
plot_keyword_mentions(df, 'USA', axes[1, 0],color=colors[2])  # Bottom-left panel: "USA"
plot_keyword_mentions(df, 'UK', axes[1, 1],color=colors[3])  # Bottom-right panel: "UK"

# Adjust layout for better spacing
plt.tight_layout()
plt.show()



In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import re

# Headline search terms for 12 large economies; the USA and the UK are
# searched under several variants.
countries = [
    'United States', 'USA', 'US', 'America',  # USA variants
    'United Kingdom', 'UK', 'Britain',        # UK variants
    'China', 'Japan', 'Germany',
    'India', 'France', 'Italy',
    'Canada', 'Brazil', 'Russia', 'South Korea'
]

# Combine variants - we'll count these as single countries
country_mapping = {
    'United States': 'USA', 'US': 'USA', 'America': 'USA',
    'United Kingdom': 'UK', 'Britain': 'UK',
    # Others can be added if you have variants
}

# Group the search terms by the country they stand for, so that a headline
# containing two variants of the same country is counted once, not twice.
variants_by_country = {}
for country in countries:
    standardized = country_mapping.get(country, country)
    variants_by_country.setdefault(standardized, []).append(country)

# Count headlines mentioning each country
headlines = df['headline'].fillna('').astype(str)
country_counts = {}
for standardized, variants in variants_by_country.items():
    mentioned = pd.Series(False, index=headlines.index)
    for variant in variants:
        # Whole-word match. Acronyms (US, USA, UK) are matched case-sensitively
        # so that 'US' does not pick up the ordinary word "us".
        pattern = r'\b' + re.escape(variant) + r'\b'
        mentioned |= headlines.str.contains(pattern, case=not variant.isupper(), regex=True)
    country_counts[standardized] = int(mentioned.sum())

# Select top 12 countries (already standardized)
top_countries = sorted(country_counts.items(), key=lambda x: x[1], reverse=True)[:12]
countries, counts = zip(*top_countries)  # Unzip the tuples

# Create bar chart
plt.figure(figsize=(12, 6))
bars = plt.bar(countries, counts, color=plt.cm.tab20.colors[:12])

# Add value labels on each bar
for bar in bars:
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2., height,
             f'{int(height)}', ha='center', va='bottom')

# Formatting
plt.title('Country Mentions in Headlines (Top 12 Economies)')
plt.xlabel('Country')
plt.ylabel('Number of Mentions')
plt.xticks(rotation=45, ha='right')
plt.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
import pandas as pd

import os

# Location of the GDP CSV. The default is the original author's local path;
# set the GDP_CSV_PATH environment variable to point at your own copy instead
# of editing the notebook.
GDP_CSV_PATH = os.environ.get('GDP_CSV_PATH', '/Users/georgewalsh/Desktop/API_NY/GDP.csv')

# Read the CSV file into a pandas DataFrame
df_gdp = pd.read_csv(GDP_CSV_PATH, skiprows=4)  # Skip the first 4 rows which contain metadata

# Keep the country name and the five most recent yearly columns. The explicit
# copy makes this an independent frame, so later cells can add or convert
# columns without a SettingWithCopyWarning.
df_gdp_cleaned = df_gdp[['Country Name', '2019', '2020', '2021', '2022', '2023']].copy()

# Display the first rows to verify the load (only a cell's last expression is rendered)
df_gdp_cleaned.head(20)



In [None]:
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.nonparametric.smoothers_lowess import lowess

# Yearly GDP columns to clean and aggregate
gdp_columns = ['2019', '2020', '2021', '2022', '2023']

# Work on an explicit copy: `df_gdp_cleaned` was created as a column subset of
# `df_gdp`, and assigning into such a subset triggers SettingWithCopyWarning.
df_gdp_cleaned = df_gdp_cleaned.copy()

# Convert all GDP values (2019-2023) to numeric; unparseable entries become NaN
df_gdp_cleaned[gdp_columns] = df_gdp_cleaned[gdp_columns].apply(pd.to_numeric, errors='coerce')

# Total GDP over all years. min_count=1 leaves the total as NaN when a country
# has no data for any year, instead of reporting a misleading total of 0.
df_gdp_cleaned['Total GDP'] = df_gdp_cleaned[gdp_columns].sum(axis=1, min_count=1)

# Convert column names (years) to strings before melting
df_gdp_cleaned.columns = df_gdp_cleaned.columns.astype(str)

# Reshape to long format: one row per (country, year)
df_gdp_melted = df_gdp_cleaned.melt(
    id_vars=['Country Name', 'Total GDP'],
    value_vars=gdp_columns,
    var_name='Year',
    value_name='GDP',
)

# Use the short label 'UK' in the long-format table, then show its rows as a check
df_gdp_melted['Country Name'] = df_gdp_melted['Country Name'].replace('United Kingdom', 'UK')
df_gdp_melted[df_gdp_melted["Country Name"] == "UK"]




In [None]:
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

# Bar chart of total GDP for the 12 economies analysed in the headline charts.
# Expects `df_gdp_cleaned` (with a 'Total GDP' column) from the previous cells.

# The 12 economies to plot (a fixed list, not a random sample)
df_random_countries = ['United States',  # USA variants
    'United Kingdom',        # UK variants
    'China', 'Japan', 'Germany',
    'India', 'France', 'Italy',
    'Canada', 'Brazil', 'Russia', 'South Korea']

# NOTE(review): presumably the GDP file uses World Bank naming, where these two
# countries appear under longer official names - confirm against the CSV. The
# rename is a no-op if the short names are already used.
gdp_name_aliases = {'Russian Federation': 'Russia', 'Korea, Rep.': 'South Korea'}
gdp_named = df_gdp_cleaned.assign(
    **{'Country Name': df_gdp_cleaned['Country Name'].replace(gdp_name_aliases)}
)

# Filter the dataframe for the selected countries
df_filtered = gdp_named[gdp_named['Country Name'].isin(df_random_countries)]

# Report requested countries that are absent instead of silently dropping them
missing_countries = sorted(set(df_random_countries) - set(df_filtered['Country Name']))
if missing_countries:
    print(f"Warning: no GDP rows found for: {', '.join(missing_countries)}")

# Create a bar chart of Total GDP for these countries
plt.figure(figsize=(10, 6))
sns.barplot(x='Country Name', y='Total GDP', data=df_filtered, palette='viridis')

# Customize the plot
plt.xlabel('Country')
plt.ylabel('Total GDP (in USD)')
plt.title('Total GDP of Top 12 Economies')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()

# Display the plot
plt.show()



In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from statsmodels.nonparametric.smoothers_lowess import lowess
import re

# Side-by-side comparison of headline mentions and total GDP for 12 large
# economies, each with a LOWESS-smoothed line. Expects `df` (headlines) and
# `df_gdp_cleaned` (with 'Total GDP') from earlier cells.

# The 12 economies, named as they are looked up in the GDP table
df_countries = ['United States', 'United Kingdom', 'China', 'Japan', 'Germany',
    'India', 'France', 'Italy', 'Canada', 'Brazil', 'Russia', 'South Korea']

# Headline search terms; the USA and the UK are searched under several variants
countries = [
    'United States', 'USA', 'US', 'America',  # USA variants
    'United Kingdom', 'UK', 'Britain',        # UK variants
    'China', 'Japan', 'Germany',
    'India', 'France', 'Italy',
    'Canada', 'Brazil', 'Russia', 'South Korea'
]

# Combine variants - we'll count these as single countries
country_mapping = {
    'United States': 'USA', 'US': 'USA', 'America': 'USA',
    'United Kingdom': 'UK', 'Britain': 'UK',
    # Others can be added if you have variants
}

# Group the search terms by country so a headline containing two variants of
# the same country is counted once.
variants_by_country = {}
for country in countries:
    standardized = country_mapping.get(country, country)
    variants_by_country.setdefault(standardized, []).append(country)

# Count headlines mentioning each country
headlines = df['headline'].fillna('').astype(str)
country_counts = {}
for standardized, variants in variants_by_country.items():
    mentioned = pd.Series(False, index=headlines.index)
    for variant in variants:
        # Whole-word match; acronyms are case-sensitive so 'US' does not match "us"
        pattern = r'\b' + re.escape(variant) + r'\b'
        mentioned |= headlines.str.contains(pattern, case=not variant.isupper(), regex=True)
    country_counts[standardized] = int(mentioned.sum())

# Select top 12 countries (already standardized), most-mentioned first
top_countries = sorted(country_counts.items(), key=lambda x: x[1], reverse=True)[:12]
countries_mentions, counts_mentions = zip(*top_countries)  # Unzip the tuples

# NOTE(review): presumably the GDP file uses World Bank naming for these two
# countries - confirm against the CSV. The rename is a no-op otherwise.
gdp_name_aliases = {'Russian Federation': 'Russia', 'Korea, Rep.': 'South Korea'}
gdp_named = df_gdp_cleaned.assign(
    **{'Country Name': df_gdp_cleaned['Country Name'].replace(gdp_name_aliases)}
)

# Filter GDP data for the selected countries and relabel them with the same
# standardized names ('USA', 'UK') used on the mentions panel
df_filtered = gdp_named[gdp_named['Country Name'].isin(df_countries)].copy()
df_filtered['Country Name'] = df_filtered['Country Name'].replace(country_mapping)

# Put the GDP bars in the same left-to-right order as the mentions bars and
# report any country without GDP data instead of silently dropping it
available = set(df_filtered['Country Name'])
missing_gdp = [name for name in countries_mentions if name not in available]
if missing_gdp:
    print(f"Warning: no GDP rows found for: {', '.join(missing_gdp)}")
gdp_order = [name for name in countries_mentions if name in available]
df_filtered = df_filtered.set_index('Country Name').loc[gdp_order].reset_index()

# Create the two panels
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Bar chart for mentions
sns.barplot(x=list(countries_mentions), y=list(counts_mentions), ax=axes[0], palette='tab20')
axes[0].set_title('Country Mentions in Headlines (Top 12 Economies)')
axes[0].set_xlabel('Country')
axes[0].set_ylabel('Number of Mentions')
axes[0].tick_params(axis='x', rotation=45)

# LOWESS-smoothed line over the bar positions of the mentions chart
lowess_result_mentions = lowess(counts_mentions, list(range(len(countries_mentions))), frac=0.3)
axes[0].plot(countries_mentions, lowess_result_mentions[:, 1], color='red', lw=2, label='Smoothed Line')
axes[0].legend()

# Bar chart for Total GDP. The x tick labels are kept (they used to be
# removed), otherwise the bars cannot be attributed to a country.
sns.barplot(x='Country Name', y='Total GDP', data=df_filtered, ax=axes[1], palette='tab20')
axes[1].set_title('Total GDP of Top 12 Economies')
axes[1].set_xlabel('Country')
axes[1].set_ylabel('Total GDP (in USD)')
axes[1].tick_params(axis='x', rotation=45)

# LOWESS-smoothed line over the bar positions of the GDP chart
lowess_result_gdp = lowess(df_filtered['Total GDP'], list(range(len(df_filtered))), frac=0.3)
axes[1].plot(df_filtered['Country Name'], lowess_result_gdp[:, 1], color='red', lw=2, label='Smoothed Line')
axes[1].legend()

# Layout adjustments
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import re
import seaborn as sns

# Stacked comparison of headline mentions (top) and total GDP (bottom) for 12
# large economies, both in the same fixed country order. Expects `df`
# (headlines) and `df_gdp_cleaned` (with 'Total GDP') from earlier cells.

# ====== First Plot Data Preparation ======
# Headline search terms; the USA and the UK are searched under several variants
countries = [
    'United States', 'USA', 'US', 'America',  # USA variants
    'United Kingdom', 'UK', 'Britain',        # UK variants
    'China', 'Japan', 'Germany',
    'India', 'France', 'Italy',
    'Canada', 'Brazil', 'Russia', 'South Korea'
]

# Combine variants under the labels used in `gdp_order_countries` below.
# 'US' and 'America' are mapped too; previously they were tallied under their
# own keys and never reached the 'United States' bar.
country_mapping = {
    'USA': 'United States', 'US': 'United States', 'America': 'United States',
    'United Kingdom': 'UK', 'Britain': 'UK',
    # Others can be added if you have variants
}

# Group the search terms by country so a headline containing two variants of
# the same country is counted once.
variants_by_country = {}
for country in countries:
    standardized = country_mapping.get(country, country)
    variants_by_country.setdefault(standardized, []).append(country)

# Count headlines mentioning each country. Each country is tallied exactly
# once (the accumulation step used to be duplicated, doubling every count).
headlines = df['headline'].fillna('').astype(str)
country_counts = {}
for standardized, variants in variants_by_country.items():
    mentioned = pd.Series(False, index=headlines.index)
    for variant in variants:
        # Whole-word match; acronyms are case-sensitive so 'US' does not match "us"
        pattern = r'\b' + re.escape(variant) + r'\b'
        mentioned |= headlines.str.contains(pattern, case=not variant.isupper(), regex=True)
    country_counts[standardized] = int(mentioned.sum())

# Fixed display order shared by both plots
gdp_order_countries = ['United States', 'China', 'Japan', 'Germany', 'India',
                      'UK', 'France', 'Italy', 'Canada', 'Brazil', 'Russia', 'South Korea']

# Filter and sort the country counts to match GDP order
sorted_country_counts = [(country, country_counts.get(country, 0))
                        for country in gdp_order_countries]
countries_mentions, counts = zip(*sorted_country_counts)

# ====== Second Plot Data Preparation ======
# Rename GDP-table country names to the chart labels before filtering. The
# table holds 'United Kingdom' (only the melted copy was renamed to 'UK'), so
# filtering on 'UK' used to drop that bar.
# NOTE(review): the other two aliases assume World Bank naming - confirm
# against the CSV; they are no-ops otherwise.
gdp_name_aliases = {
    'United Kingdom': 'UK',
    'Russian Federation': 'Russia',
    'Korea, Rep.': 'South Korea',
}
gdp_named = df_gdp_cleaned.assign(
    **{'Country Name': df_gdp_cleaned['Country Name'].replace(gdp_name_aliases)}
)

# Copy the filtered rows so the categorical conversion below does not write
# into a view of `df_gdp_cleaned`
df_filtered = gdp_named[gdp_named['Country Name'].isin(gdp_order_countries)].copy()

# Report countries without GDP data instead of silently leaving a gap
missing_gdp = [name for name in gdp_order_countries
               if name not in set(df_filtered['Country Name'])]
if missing_gdp:
    print(f"Warning: no GDP rows found for: {', '.join(missing_gdp)}")

# Convert to categorical to preserve order
df_filtered['Country Name'] = pd.Categorical(df_filtered['Country Name'],
                                           categories=gdp_order_countries,
                                           ordered=True)
df_filtered = df_filtered.sort_values('Country Name')

# ====== Create Combined Figure ======
plt.figure(figsize=(14, 10))

# First subplot - Mentions
plt.subplot(2, 1, 1)
bars = plt.bar(countries_mentions, counts, color=plt.cm.tab20.colors[:12])
for bar in bars:
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2., height,
             f'{int(height)}', ha='center', va='bottom')
plt.title('Country Mentions in Headlines (Top 12 Economies)')
plt.ylabel('Number of Mentions')
plt.xticks(rotation=45, ha='right')
plt.grid(axis='y', alpha=0.3)

# Second subplot - GDP
plt.subplot(2, 1, 2)
sns.barplot(x='Country Name', y='Total GDP', data=df_filtered, palette='viridis')
plt.xlabel('Country')
plt.ylabel('Total GDP (in USD)')
plt.title('Total GDP of Top 12 Economies')
plt.xticks(rotation=45, ha='right')
plt.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()