In [None]:
import requests
import pandas as pd
import random
import os
from datetime import datetime, timedelta

# Guardian API credentials and endpoint.
# The key is read from the environment so the secret is never stored in the
# notebook; export GUARDIAN_API_KEY before running this cell.
API_KEY = os.environ.get("GUARDIAN_API_KEY")
if not API_KEY:
    raise RuntimeError(
        "Set the GUARDIAN_API_KEY environment variable before running this cell."
    )
BASE_URL = "https://content.guardianapis.com/search"

# Output file
FILE_NAME = "guardian_articles.csv"

# Number of weeks to scrape (one request per week).
# NOTE(review): check NUM_WEEKS against the request quota of the API key in use — TODO confirm.
NUM_WEEKS = 600
ARTICLES_PER_WEEK = 8  # NOTE(review): not used below; exactly one article is sampled per date
REQUEST_TIMEOUT = 30  # seconds to wait for the API before giving up on a date

# One date per week, stepping backward from today (the dates themselves are not random;
# only the article picked for each date is)
start_date = datetime.today()
dates = [(start_date - timedelta(weeks=i)).strftime("%Y-%m-%d") for i in range(NUM_WEEKS)]

# Load existing data if the file exists
if os.path.exists(FILE_NAME):
    df = pd.read_csv(FILE_NAME)
else:
    df = pd.DataFrame(columns=["headline", "publication_date", "url"])

# Collect the new rows in a list and build the frame once, instead of
# concatenating and rewriting the CSV on every iteration.
new_records = []
try:
    for week_date in dates:
        params = {
            "api-key": API_KEY,
            "from-date": week_date,
            "to-date": week_date,
            "show-fields": "headline",
            "page-size": 10,  # Get up to 10 articles from the date
        }

        try:
            response = requests.get(BASE_URL, params=params, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as exc:
            # A network hiccup on one date should not abort the whole run
            print(f"Failed to fetch data for {week_date}: {exc}")
            continue

        if response.status_code != 200:
            print(f"Failed to fetch data for {week_date}: {response.status_code}")
            continue

        articles = response.json().get("response", {}).get("results", [])
        if not articles:
            print(f"No articles found for {week_date}.")
            continue

        selected_article = random.choice(articles)  # Pick one randomly
        article_data = {
            "headline": selected_article["webTitle"],
            "publication_date": selected_article["webPublicationDate"],
            "url": selected_article["webUrl"],
        }
        new_records.append(article_data)
        print(f"Saved article from {week_date}: {article_data['headline']}")
finally:
    # Runs even if the loop is interrupted, so everything fetched so far is kept.
    if new_records:
        df = pd.concat([df, pd.DataFrame(new_records)], ignore_index=True)
    # Re-running the cell can pick the same article again; keep each URL once.
    df = df.drop_duplicates(subset="url", ignore_index=True)
    df.to_csv(FILE_NAME, index=False)

print(" Data collection complete!")


In [None]:
# Sanity check on the scrape: the last rows should be well formatted and reach back to the
# oldest requested week (the number of weeks is set by NUM_WEEKS in the scraping cell).
df.tail()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.colors as mcolors

# Requires `df` (the scraped articles) with a 'publication_date' column.

# Parse the publication dates and drop any timezone info so they convert cleanly to periods
df['publication_date'] = pd.to_datetime(df['publication_date']).dt.tz_localize(None)

# Year-month period of each article
df['year_month'] = df['publication_date'].dt.to_period('M')

# Number of articles published in each month
monthly_data = df.groupby('year_month').size().reset_index(name='count')

# Axis label for each month, e.g. 'Jan 18', 'Feb 18'
monthly_data['year_month_str'] = monthly_data['year_month'].dt.strftime('%b %y')

# Highlight the months with the fewest articles.
# (The earlier per-calendar-month colour mapping was removed: it was overwritten by this
# red/blue colouring before anything was drawn, so it never affected the chart.)
N_HIGHLIGHT = 10
smallest_10 = monthly_data.nsmallest(N_HIGHLIGHT, 'count')  # ties resolved by first occurrence
is_smallest = monthly_data.index.isin(smallest_10.index)
monthly_data['color'] = np.where(is_smallest, 'red', 'blue')

# Bar chart of the article count per month
fig, ax = plt.subplots(figsize=(14, 6))
bars = ax.bar(monthly_data['year_month_str'], monthly_data['count'], color=monthly_data['color'])

# Labels and title
ax.set_xlabel('Month')
ax.set_ylabel('Article Count')
ax.set_title('Monthly Data of Articles (Top 10 smallest counts in red)')

# Vertical x-axis labels so the month names do not overlap
ax.tick_params(axis='x', rotation=90)

fig.tight_layout()  # Adjust layout to prevent overlap
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.patches import Patch

# Derive the calendar fields used below (and by later cells) from the publication timestamp
pub_dates = pd.to_datetime(df['publication_date'])
df['publication_date'] = pub_dates
df['year_month'] = pub_dates.dt.to_period('M')
df['month'] = pub_dates.dt.month_name()
df['year'] = pub_dates.dt.year

# Headlines per (year-month, month name) pair
monthly_counts = (
    df.groupby(['year_month', 'month'])
      .size()
      .reset_index(name='headline_count')
)

# One fixed colour per calendar month; the keys are listed in calendar order
month_palette = {
    'January': '#1f77b4', 'February': '#ff7f0e', 'March': '#2ca02c',
    'April': '#d62728', 'May': '#9467bd', 'June': '#8c564b',
    'July': '#e377c2', 'August': '#7f7f7f', 'September': '#bcbd22',
    'October': '#17becf', 'November': '#aec7e8', 'December': '#ffbb78'
}
month_order = list(month_palette)  # January ... December

# Two panels side by side
fig, (ax_overall, ax_by_month) = plt.subplots(1, 2, figsize=(16, 6))

# Left panel: a single box over every month's count
sns.boxplot(data=monthly_counts, y='headline_count', color='lightblue', ax=ax_overall)
ax_overall.set_title('Overall Distribution of Monthly Headline Counts')
ax_overall.set_ylabel('Number of Headlines')
ax_overall.set_xlabel('All Months Combined')
ax_overall.grid(axis='y', linestyle='--', alpha=0.7)

# Right panel: one box per calendar month, coloured from month_palette
sns.boxplot(
    data=monthly_counts,
    x='month',
    y='headline_count',
    order=month_order,
    palette=month_palette,
    ax=ax_by_month,
)
ax_by_month.set_title('Monthly Distribution of Headline Counts')
ax_by_month.set_ylabel('Number of Headlines')
ax_by_month.set_xlabel('Month')
plt.setp(ax_by_month.get_xticklabels(), rotation=45)
ax_by_month.grid(axis='y', linestyle='--', alpha=0.7)

# Colour key for the months, placed outside the right panel
legend_elements = [
    Patch(facecolor=colour, label=month_name)
    for month_name, colour in month_palette.items()
]
ax_by_month.legend(handles=legend_elements, bbox_to_anchor=(1.05, 1), loc='upper left')

fig.tight_layout()
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Requires a DataFrame called 'df' with a 'headline' column.
# If it is not in memory, load the data first:
# df = pd.read_csv('your_data.csv')

# Keywords to look for in the headlines
keywords = [
    'economy',
    'inflation',
    'recession',
    'pandemic',
    'war',
    'conflict',
    'trade',
    'death',
    'aid',
    'attack',
    'election',
    'debt',
    'crime',
    'healthcare',
    'education',
]

# Create a function to count keyword occurrences
def count_keywords(df, column_name, keywords, whole_word=True):
    """Count how many rows of a text column mention each keyword.

    Matching is case-insensitive and every keyword is treated as literal text
    (regex metacharacters in a keyword are escaped).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text column.
    column_name : str
        Name of the column to search.
    keywords : iterable of str
        Keywords to look for.
    whole_word : bool, default True
        If True, a keyword only matches as a whole word, so 'war' does not
        match 'award' and 'aid' does not match 'said'. Pass False to get
        plain substring matching.

    Returns
    -------
    dict
        Maps each keyword (as given) to the number of rows containing it.
        Rows with a missing value never match.
    """
    import re  # local import: the notebook has no top-level `import re`

    # Lower-case once so the search is case-insensitive
    titles = df[column_name].str.lower()

    keyword_counts = {}
    for keyword in keywords:
        # Lower-case the keyword too, otherwise 'War' could never match the lowered titles
        pattern = re.escape(keyword.lower())
        if whole_word:
            pattern = r'\b' + pattern + r'\b'
        # na=False: missing headlines count as "no match" instead of propagating NaN
        matches = titles.str.contains(pattern, regex=True, na=False)
        keyword_counts[keyword] = int(matches.sum())

    return keyword_counts

# Tally keyword mentions across all headlines
keyword_counts = count_keywords(df, 'headline', keywords)

# One-column frame indexed by keyword, most frequent first
plot_data = (
    pd.DataFrame.from_dict(keyword_counts, orient='index', columns=['count'])
      .sort_values('count', ascending=False)
)

# Bar chart of the tallies
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=plot_data.index, y=plot_data['count'], palette='viridis', ax=ax)

# Titles and axis labels
ax.set_title('Frequency of Keywords in Titles', fontsize=16)
ax.set_xlabel('Keywords', fontsize=12)
ax.set_ylabel('Count', fontsize=12)
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')

# Write each bar's value just above it
for bar in ax.patches:
    bar_top = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width() / 2.
    ax.annotate(
        f'{int(bar_top)}',
        (bar_centre, bar_top),
        ha='center', va='center',
        xytext=(0, 5),
        textcoords='offset points',
    )

fig.tight_layout()
plt.show()

In [None]:


import os

# Location of the GDP CSV. Override with the GDP_CSV_PATH environment variable so the
# notebook also runs on machines where the original absolute path does not exist.
GDP_CSV_PATH = os.environ.get('GDP_CSV_PATH', '/Users/georgewalsh/Desktop/API_NY/GDP.csv')

# Yearly columns kept for the analysis: '2014' ... '2023'
GDP_YEAR_COLUMNS = [str(year) for year in range(2014, 2024)]

# Read the CSV file into a pandas DataFrame; the first 4 rows contain metadata
df_gdp = pd.read_csv(GDP_CSV_PATH, skiprows=4)

# Keep the country name and the yearly columns. .copy() makes this an independent frame,
# so later cells can convert/add columns without a SettingWithCopyWarning.
# (A bare `df_gdp.head()` used to sit here; mid-cell expressions are not displayed, so it was dropped.)
df_gdp_cleaned = df_gdp[['Country Name'] + GDP_YEAR_COLUMNS].copy()
df_gdp_cleaned.head(20)


In [None]:
# Yearly GDP columns (2014-2023)
gdp_columns = ['2014','2015','2016','2017','2018','2019', '2020', '2021', '2022', '2023']

# Work on an independent copy: df_gdp_cleaned was created as a column slice of df_gdp,
# and assigning into a slice triggers SettingWithCopyWarning.
df_gdp_cleaned = df_gdp_cleaned.copy()

# Make sure the column names (years) are strings before selecting and melting
df_gdp_cleaned.columns = df_gdp_cleaned.columns.astype(str)

# Convert the yearly values to numbers; anything unparseable becomes NaN
df_gdp_cleaned[gdp_columns] = df_gdp_cleaned[gdp_columns].apply(pd.to_numeric, errors='coerce')

# Total GDP over all years. min_count=1 leaves the total as NaN for a country with no
# data at all, instead of reporting a misleading 0.
df_gdp_cleaned['Total GDP'] = df_gdp_cleaned[gdp_columns].sum(axis=1, min_count=1)

# Reshape to long form (one row per country and year) for visualization
df_gdp_melted = df_gdp_cleaned.melt(id_vars=['Country Name', 'Total GDP'], var_name='Year', value_name='GDP')

# Use the short name for the United Kingdom in the long-form table, then show its rows
df_gdp_melted['Country Name'] = df_gdp_melted['Country Name'].replace('United Kingdom', 'UK')
df_gdp_melted[df_gdp_melted["Country Name"] == "UK"]


In [None]:

import re  # re.escape is used below; `re` is not imported anywhere else in the notebook

# LOWESS smoother used for the line drawn over the mention counts
from statsmodels.nonparametric.smoothers_lowess import lowess

# Target countries, in the order they are displayed
target_countries = [
    'United States', 'China', 'Japan', 'Germany',
    'India', 'United Kingdom', 'France', 'Italy',
    'Canada', 'Brazil', 'Russia', 'South Korea'
]

# Spellings of each country to search for in headlines
country_variants = {
    'United States': ['United States', 'USA', 'US', 'America'],
    'United Kingdom': ['United Kingdom', 'UK', 'Britain'],
    'China': ['China'],
    'Japan': ['Japan'],
    'Germany': ['Germany'],
    'India': ['India'],
    'France': ['France'],
    'Italy': ['Italy'],
    'Canada': ['Canada'],
    'Brazil': ['Brazil'],
    'Russia': ['Russia'],
    'South Korea': ['South Korea']
}

# Variants that must be matched case-sensitively: ignoring case would make 'US'
# match the ordinary word "us".
case_sensitive_variants = {'US'}

# Count the headlines that mention each country under any of its variants.
# A headline is counted once per country even if it uses several variants
# (e.g. both "UK" and "Britain"); summing per-variant counts would double-count it.
headlines = df['headline']
country_mentions = {}
for country, variants in country_variants.items():
    mentioned = pd.Series(False, index=headlines.index)
    for variant in variants:
        pattern = r'\b' + re.escape(variant) + r'\b'
        mentioned |= headlines.str.contains(
            pattern,
            case=variant in case_sensitive_variants,
            regex=True,
            na=False,  # missing headlines never match
        )
    country_mentions[country] = int(mentioned.sum())

# Mention counts in display order
mentions_counts = [country_mentions[country] for country in target_countries]

# NOTE(review): the GDP file appears to be a World Bank export, which usually labels
# these two countries as below — TODO confirm against the file. The replace is a no-op
# if the file already uses the short names.
gdp_name_aliases = {'Russian Federation': 'Russia', 'Korea, Rep.': 'South Korea'}

# Filter the GDP data to the target countries and keep the display order.
# Work on copies so df_gdp_cleaned is untouched and no SettingWithCopyWarning is raised.
df_filtered = df_gdp_cleaned.copy()
df_filtered['Country Name'] = df_filtered['Country Name'].replace(gdp_name_aliases)
df_filtered = df_filtered[df_filtered['Country Name'].isin(target_countries)].copy()
df_filtered['Country Name'] = pd.Categorical(
    df_filtered['Country Name'],
    categories=target_countries,
    ordered=True
)
df_filtered = df_filtered.sort_values('Country Name')

# Combined figure: mentions on top, GDP below
fig, (ax_mentions, ax_gdp) = plt.subplots(2, 1, figsize=(14, 10))

# Top panel: headline mentions per country, with the count written above each bar
bars = ax_mentions.bar(target_countries, mentions_counts, color=plt.cm.tab20.colors[:12])
for bar in bars:
    height = bar.get_height()
    ax_mentions.text(bar.get_x() + bar.get_width()/2., height,
                     f'{int(height)}', ha='center', va='bottom')

# Smoothed line over the bars. The x-values are just the bar positions in
# target_countries order, so the curve smooths across neighbouring bars.
mentions_smoothed = lowess(mentions_counts, np.arange(len(target_countries)), frac=0.3)
ax_mentions.plot(target_countries, mentions_smoothed[:, 1], color='red', lw=2, label='Trend Line')

ax_mentions.set_title('Country Mentions in Headlines (Top 12 Economies)')
ax_mentions.set_ylabel('Number of Mentions')
plt.setp(ax_mentions.get_xticklabels(), rotation=45, ha='right')
ax_mentions.grid(axis='y', alpha=0.3)
ax_mentions.legend()

# Bottom panel: total GDP per country.
# NOTE(review): the plotted values are the raw 'Total GDP' sums; confirm that the unit
# of the source data matches the "Trillions" wording in the axis label.
sns.barplot(x='Country Name', y='Total GDP', data=df_filtered, palette='viridis',
            order=target_countries, ax=ax_gdp)
ax_gdp.set_xlabel('Country')
ax_gdp.set_ylabel('Total GDP in Trillions(in USD)')
ax_gdp.set_title('Total GDP of Top 12 Economies')
plt.setp(ax_gdp.get_xticklabels(), rotation=45, ha='right')
ax_gdp.grid(axis='y', alpha=0.3)
# No legend on this panel: none of its artists carry a label, so the former
# plt.legend() call only produced a "no artists with labels" warning.

fig.tight_layout()
plt.show()

In [None]:
import os

# Location of the battle data CSV. Override with the WAR_CSV_PATH environment variable so
# the notebook also runs on machines where the original absolute path does not exist.
WAR_CSV_PATH = os.environ.get('WAR_CSV_PATH', '/Users/georgewalsh/Documents/battle/battledata.csv')

# Yearly columns kept for the analysis: '2014' ... '2023'
WAR_YEAR_COLUMNS = [str(year) for year in range(2014, 2024)]

# Read the CSV file into a pandas DataFrame; the first 4 rows contain metadata
df_war = pd.read_csv(WAR_CSV_PATH, skiprows=4)

# Keep the country name and the yearly columns as an independent copy, so later cells
# can modify it without touching df_war
df_war_cleaned = df_war[['Country Name'] + WAR_YEAR_COLUMNS].copy()

# Display the first few rows of the DataFrame to verify
df_war_cleaned.head()

In [None]:

from scipy.interpolate import make_interp_spline
# Ensure column names are strings
df_war_cleaned.columns = df_war_cleaned.columns.astype(str)

# Year columns (2014-2023) that actually exist in the dataframe
valid_years = [str(year) for year in range(2014, 2024) if str(year) in df_war_cleaned.columns]

# Number of non-null values (countries with data) for each year
yearly_counts = [df_war_cleaned[year].notnull().sum() for year in valid_years]

# Convert to DataFrame for plotting
data_counts_df = pd.DataFrame({"Year": valid_years, "Count": yearly_counts})
data_counts_df["Year"] = data_counts_df["Year"].astype(int)  # Ensure Year is numeric

# Create the figure and axis
fig, ax = plt.subplots(figsize=(10, 6))

# Bar chart of the yearly counts
bars = ax.bar(data_counts_df['Year'], data_counts_df['Count'], color='skyblue', label='Count')

# Smooth curve through the yearly counts
x = data_counts_df['Year'].values
y = data_counts_df['Count'].values

# A cubic spline needs at least four points; skip the curve when fewer years are
# available instead of raising (same guard as the later "war" charts).
if len(x) > 3:
    x_smooth = np.linspace(x.min(), x.max(), 300)  # dense grid for a smooth curve
    spl = make_interp_spline(x, y, k=3)  # interpolating spline: passes through every bar top
    y_smooth = spl(x_smooth)
    ax.plot(x_smooth, y_smooth, color='red', linewidth=2, label='Trend Line')

# Labels and title
ax.set_xlabel('Year', fontsize=12)
# One tick per plotted year (previously a hard-coded 10 ticks, which only lines up
# with the bars when all ten year columns are present)
ax.set_xticks(x)
ax.set_ylabel('Number of countries with data available', fontsize=12)
ax.set_title('Countries with available statistics', fontsize=14)

# Legend and grid lines
ax.legend()
ax.grid(True, linestyle='--', alpha=0.7)

# Rotate x-axis labels for better readability
ax.tick_params(axis='x', rotation=45)

# Show the plot
fig.tight_layout()
plt.show()




In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.interpolate import make_interp_spline

# Flag headlines that mention "war" as a whole word. A plain substring search also
# matches words such as "award" or "toward"; na=False treats missing headlines as
# "no match" so the integer conversion cannot fail on NaN.
df['war_count'] = (
    df['headline']
    .str.contains(r'\bwar\b', case=False, regex=True, na=False)
    .astype(int)
)

# Derive the year here rather than relying on an earlier cell having created df['year']
df['year'] = pd.to_datetime(df['publication_date']).dt.year

# Group by year and sum war counts
war_by_year = df.groupby('year')['war_count'].sum().reset_index()

# Bar chart of the yearly counts with a smoothed line
fig, ax = plt.subplots(figsize=(12, 6))
bars = ax.bar(war_by_year['year'], war_by_year['war_count'],
              color='#8B0000', alpha=0.7, label='"War" in Titles')

# Smooth curve through the yearly counts (a cubic spline needs at least four points)
x = war_by_year['year'].values
y = war_by_year['war_count'].values
if len(x) > 3:
    x_smooth = np.linspace(x.min(), x.max(), 300)
    spl = make_interp_spline(x, y, k=3)
    y_smooth = spl(x_smooth)
    ax.plot(x_smooth, y_smooth, color='black', linewidth=2, linestyle='--', label='Trend')

# Formatting and labels
ax.set_title('Frequency of "War" in Titles by Year', fontsize=14, pad=20)
ax.set_xlabel('Year', fontsize=12)
ax.set_ylabel('Number of Titles Containing "War"', fontsize=12)
ax.grid(True, linestyle=':', alpha=0.4)
ax.set_xticks(war_by_year['year'])
ax.tick_params(axis='x', rotation=45)

# Value labels on top of the non-empty bars
for bar in bars:
    height = bar.get_height()
    if height > 0:
        ax.text(bar.get_x() + bar.get_width()/2., height, f'{int(height)}', ha='center', va='bottom')

ax.legend()
fig.tight_layout()
plt.show()

# Second chart: yearly totals taken from df_war_cleaned.
# Column names are forced to strings, then only the 2014-2023 columns that exist are used.
df_war_cleaned.columns = df_war_cleaned.columns.astype(str)
valid_years = [col for col in map(str, range(2014, 2024)) if col in df_war_cleaned.columns]

# Total of each year column
yearly_sums = [df_war_cleaned[col].sum() for col in valid_years]

# Tabulate the totals with a numeric year
data_sums_df = pd.DataFrame({"Year": valid_years, "Sum": yearly_sums})
data_sums_df["Year"] = data_sums_df["Year"].astype(int)

# Bars for the yearly totals
fig, ax = plt.subplots(figsize=(10, 6))
bars = ax.bar(data_sums_df['Year'], data_sums_df['Sum'], color='skyblue', label='Total Value')

# Smooth curve through the totals, drawn only when there are more than three years
x = data_sums_df['Year'].values
y = data_sums_df['Sum'].values
if len(x) > 3:
    spline_order = min(3, len(x) - 1)
    x_smooth = np.linspace(x.min(), x.max(), 300)
    spl = make_interp_spline(x, y, k=spline_order)
    y_smooth = spl(x_smooth)
    ax.plot(x_smooth, y_smooth, color='red', linewidth=2, label='Trend Line')

# Axis text, ticks, grid and legend
ax.set_title('Yearly Sum of Values (2014-2023)', fontsize=14)
ax.set_xlabel('Year', fontsize=12)
ax.set_ylabel('Sum of Values', fontsize=12)
ax.set_xticks(np.linspace(x.min(), x.max(), len(x)))
ax.grid(True, linestyle='--', alpha=0.7)
ax.legend()

# Print each total above its bar
for rect in bars:
    bar_top = rect.get_height()
    bar_centre = rect.get_x() + rect.get_width()/2.
    ax.text(bar_centre, bar_top, f'{bar_top:,.0f}', ha='center', va='bottom')

plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
plt.show()