In [None]:
import os
import random
import re
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
import requests
from scipy.interpolate import make_interp_spline

# Guardian API key and base URL.
# The key is read from the GUARDIAN_API_KEY environment variable when it is set.
# NOTE(review): the literal fallback is a credential committed to the notebook;
# rotate it and remove the fallback once the environment variable is configured.
API_KEY = os.environ.get("GUARDIAN_API_KEY", "998344a2-04a1-4410-9d53-1490cfa2e9d2")
BASE_URL = "https://content.guardianapis.com/search"

# Output file
FILE_NAME = "guardian_articles.csv"

# Number of weeks to scrape
NUM_WEEKS = 580
ARTICLES_PER_WEEK = 100  # NOTE(review): not used by this cell
PAGE_SIZE = 10           # Articles requested per date; one of them is kept
REQUEST_TIMEOUT = 30     # Seconds before a stalled request is abandoned

# One date per week, stepping backward from today (the dates are evenly spaced,
# not random; only the article picked for each date is random)
start_date = datetime.today()
dates = [(start_date - timedelta(weeks=i)).strftime("%Y-%m-%d") for i in range(NUM_WEEKS)]

# Load existing data if the file exists
if os.path.exists(FILE_NAME):
    df = pd.read_csv(FILE_NAME)
else:
    df = pd.DataFrame(columns=["headline", "publication_date", "url"])

# URLs already collected, so re-running the cell does not store the same article twice
seen_urls = set(df["url"].dropna()) if "url" in df.columns else set()

# Fetch articles for each week
for week_date in dates:
    params = {
        "api-key": API_KEY,
        "from-date": week_date,
        "to-date": week_date,
        "show-fields": "headline",
        "page-size": PAGE_SIZE,  # Get up to PAGE_SIZE articles from the date
    }

    # A network failure on one date must not abort the remaining requests
    try:
        response = requests.get(BASE_URL, params=params, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        print(f"Failed to fetch data for {week_date}: {exc}")
        continue

    if response.status_code != 200:
        print(f"Failed to fetch data for {week_date}: {response.status_code}")
        continue

    try:
        articles = response.json()["response"]["results"]
    except (ValueError, KeyError) as exc:
        print(f"Unexpected response for {week_date}: {exc!r}")
        continue

    if not articles:
        print(f"No articles found for {week_date}.")
        continue

    candidates = [article for article in articles if article["webUrl"] not in seen_urls]
    if not candidates:
        print(f"No new articles found for {week_date}.")
        continue

    selected_article = random.choice(candidates)  # Pick one randomly
    article_data = {
        "headline": selected_article["webTitle"],
        "publication_date": selected_article["webPublicationDate"],
        "url": selected_article["webUrl"],
    }
    seen_urls.add(article_data["url"])

    # Append the new row and save straight away so an interrupted run keeps its progress
    new_row = pd.DataFrame([article_data])
    df = new_row if df.empty else pd.concat([df, new_row], ignore_index=True)
    df.to_csv(FILE_NAME, index=False)

    print(f"Saved article from {week_date}: {article_data['headline']}")

print(" Data collection complete!")


In [None]:
# Sanity check on the scrape: display the collected headlines to confirm the
# dates reach back the requested number of weeks (NUM_WEEKS) and that the
# columns are formatted as expected.
# NOTE(review): this displays the frame itself; use df.shape if only the row and column counts are needed.
df

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Expects the DataFrame 'df' from the scraping cell, which stores the article
# titles in its 'headline' column.
# If it is not in memory, load the saved data first:
# df = pd.read_csv('your_data.csv')

# Keywords to look for in the headlines, grouped loosely by theme
keywords = [
    # economy
    'economy', 'inflation', 'recession',
    # crises and conflict
    'pandemic', 'war', 'conflict',
    'trade', 'death', 'aid', 'attack',
    # politics and public finance
    'election', 'debt',
    # society
    'crime', 'healthcare',
    'education',
]

# Create a function to count keyword occurrences
def count_keywords(df, column_name, keywords, whole_word=True):
    """Count how many rows of a text column mention each keyword.

    Matching is case-insensitive. Each row is counted at most once per keyword,
    however many times the keyword appears in it. Missing values never match.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text to search.
    column_name : str
        Name of the text column in ``df``.
    keywords : iterable of str
        Keywords to look for; each is treated as literal text, not a regex.
    whole_word : bool, default True
        When True the keyword must appear as a whole word (an optional trailing
        "s" is accepted), so 'war' matches "war" and "wars" but not "award" or
        "warning". Pass False for plain substring matching.

    Returns
    -------
    dict
        Maps each keyword, as given, to the number of matching rows (int).
    """
    keyword_counts = {}

    # Convert titles to lowercase for case-insensitive search
    titles = df[column_name].str.lower()

    for keyword in keywords:
        # Escape the keyword so punctuation in it is matched literally
        pattern = re.escape(keyword.lower())
        if whole_word:
            pattern = r'\b' + pattern + r's?\b'
        # na=False: rows with a missing title count as "no match"
        matches = titles.str.contains(pattern, regex=True, na=False)
        keyword_counts[keyword] = int(matches.sum())

    return keyword_counts

# Tally every keyword across the scraped headlines
keyword_counts = count_keywords(df, 'headline', keywords)

# One-column frame (index = keyword), ordered from most to least frequent
plot_data = (
    pd.DataFrame.from_dict(keyword_counts, orient='index', columns=['count'])
    .sort_values('count', ascending=False)
)

# Draw the bars on a 10x6 inch figure
plt.figure(figsize=(10, 6))
ax = sns.barplot(x=plot_data.index, y=plot_data['count'], palette='viridis')

# Title, axis labels and slanted tick labels
ax.set_title('Frequency of Keywords in Titles', fontsize=16)
ax.set_xlabel('Keywords', fontsize=12)
ax.set_ylabel('Count', fontsize=12)
plt.xticks(rotation=45, ha='right')

# Write each bar's value just above its top edge
for bar in ax.patches:
    bar_top = bar.get_height()
    bar_middle = bar.get_x() + bar.get_width() / 2.
    ax.annotate(
        f'{int(bar_top)}',
        (bar_middle, bar_top),
        ha='center', va='center',
        xytext=(0, 5),
        textcoords='offset points',
    )

plt.tight_layout()
plt.show()

In [None]:


# Read the GDP CSV file into a pandas DataFrame
# NOTE(review): absolute, machine-specific path; the notebook only runs where this file exists.
df_gdp = pd.read_csv('/Users/georgewalsh/Desktop/API_NY/GDP.csv', skiprows=4)  # Skip the first 4 rows which contain metadata

# Keep the country name and the 2014-2023 yearly columns.
# .copy() makes this an independent frame, so the column assignments in the
# next cell modify it directly instead of a view of df_gdp (SettingWithCopyWarning).
df_gdp_cleaned = df_gdp[['Country Name','2014','2015','2016','2017','2018','2019', '2020', '2021', '2022','2023']].copy()

# Display the first few rows to verify
df_gdp_cleaned.head()


In [None]:
# Convert all GDP values (2014-2023) to numeric; unparseable entries become NaN
gdp_columns = ['2014','2015','2016','2017','2018','2019', '2020', '2021', '2022', '2023']

# Work on an independent copy so the assignments below never write into a slice
# of df_gdp (avoids SettingWithCopyWarning) and re-running the cell is safe
df_gdp_cleaned = df_gdp_cleaned.copy()
df_gdp_cleaned[gdp_columns] = df_gdp_cleaned[gdp_columns].apply(pd.to_numeric, errors='coerce')  # Convert values

# Create a new column summarizing the total GDP over all years (NaN years are skipped)
df_gdp_cleaned['Total GDP'] = df_gdp_cleaned[gdp_columns].sum(axis=1)

# Convert column names (years) to strings before melting
df_gdp_cleaned.columns = df_gdp_cleaned.columns.astype(str)

# Reshape to long format for visualization: one row per country and year
df_gdp_melted = df_gdp_cleaned.melt(id_vars=['Country Name', 'Total GDP'], var_name='Year', value_name='GDP')

# Shorten the UK's name in the long-format frame only, then show its rows as a check
df_gdp_melted['Country Name'] = df_gdp_melted['Country Name'].replace('United Kingdom', 'UK')
df_gdp_melted[df_gdp_melted["Country Name"] == "UK"]


In [None]:

#this allows us to incorporate the smoothed line of best fit
from statsmodels.nonparametric.smoothers_lowess import lowess

# Define our target countries (in the order we want them displayed)
target_countries = [
    'United States', 'China', 'Japan', 'Germany',
    'India', 'United Kingdom', 'France', 'Italy',
    'Canada', 'Brazil', 'Russia', 'South Korea'
]
# Country variants to search for in headlines
country_variants = {
    'United States': ['United States', 'USA', 'US', 'America'],
    'United Kingdom': ['United Kingdom', 'UK', 'Britain'],
    'China': ['China'],
    'Japan': ['Japan'],
    'Germany': ['Germany'],
    'India': ['India'],
    'France': ['France'],
    'Italy': ['Italy'],
    'Canada': ['Canada'],
    'Brazil': ['Brazil'],
    'Russia': ['Russia'],
    'South Korea': ['South Korea']
}
# Names the GDP file may use for countries we label differently.
# NOTE(review): assumes World Bank naming in GDP.csv - confirm against the file.
# Without this mapping those countries are dropped by the isin() filter below.
gdp_name_aliases = {
    'Russian Federation': 'Russia',
    'Korea, Rep.': 'South Korea',
}

# Count headlines mentioning each country under any of its variants.
# A headline is counted once per country even if it contains several variants.
country_mentions = {}
for country, variants in country_variants.items():
    mentioned = pd.Series(False, index=df.index)
    for variant in variants:
        pattern = r'\b' + re.escape(variant) + r'\b'
        # All-caps acronyms (US, USA, UK) are matched case-sensitively so that
        # 'US' does not match the pronoun "us"; full names ignore case
        mentioned = mentioned | df['headline'].str.contains(
            pattern, case=not variant.isupper(), regex=True, na=False
        )
    country_mentions[country] = int(mentioned.sum())

# Prepare data for plotting
mentions_counts = [country_mentions[country] for country in target_countries]

# Filter GDP data for our target countries and maintain order.
# Built as a new frame (assign + copy) so df_gdp_cleaned is left untouched.
df_filtered = df_gdp_cleaned.assign(
    **{'Country Name': df_gdp_cleaned['Country Name'].replace(gdp_name_aliases)}
)
df_filtered = df_filtered[df_filtered['Country Name'].isin(target_countries)].copy()
df_filtered['Country Name'] = pd.Categorical(
    df_filtered['Country Name'],
    categories=target_countries,
    ordered=True
)
df_filtered = df_filtered.sort_values('Country Name')

#combined figure
plt.figure(figsize=(14, 10))

# First subplot: headline mentions per country
plt.subplot(2, 1, 1)
bars = plt.bar(target_countries, mentions_counts, color=plt.cm.tab20.colors[:12])
for bar in bars:
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2., height,
             f'{int(height)}', ha='center', va='bottom')

# Add smoothed line (LOWESS over the bar positions 0..11)
mentions_smoothed = lowess(mentions_counts, np.arange(len(target_countries)), frac=0.3)
plt.plot(target_countries, mentions_smoothed[:, 1], color='red', lw=2, label='Trend Line')

plt.title('Country Mentions in Headlines (Top 12 Economies)')
plt.ylabel('Number of Mentions')
plt.xticks(rotation=45, ha='right')
plt.grid(axis='y', alpha=0.3)
plt.legend()

# Second subplot: summed GDP per country
# NOTE(review): the y label says trillions, but 'Total GDP' is plotted in the
# file's raw units with no rescaling - confirm the units and relabel or rescale.
plt.subplot(2, 1, 2)
sns.barplot(x='Country Name', y='Total GDP', data=df_filtered, palette='viridis', order=target_countries)
plt.xlabel('Country')
plt.ylabel('Total GDP in Trillions(in USD)')
plt.title('Total GDP of Top 12 Economies')
plt.xticks(rotation=45, ha='right')
plt.grid(axis='y', alpha=0.3)

#Formatting (no legend here: this subplot has no labelled artists)
plt.tight_layout()
plt.show()

In [None]:
# Read the conflict CSV file into a pandas DataFrame
# NOTE(review): absolute, machine-specific path; the notebook only runs where this file exists.
df_war = pd.read_csv('/Users/georgewalsh/Documents/battle/battledata.csv',skiprows=4)  # Skip the first 4 rows which contain metadata

# Keep the country name and the 2014-2023 yearly columns.
# .copy() makes this an independent frame, so adding the 'Total' column in the
# next cell does not write into a slice of df_war (SettingWithCopyWarning).
df_war_cleaned = df_war[['Country Name','2014','2015','2016','2017','2018','2019', '2020', '2021', '2022','2023']].copy()

# Display the first few rows of the DataFrame to verify
df_war_cleaned.head()

In [None]:


# Then do the summing: add a 'Total' column holding each row's sum over 2014-2023
year_columns = [str(year) for year in range(2014, 2024)]

# .assign returns a new frame, so this works without a SettingWithCopyWarning
# even when df_war_cleaned is still a slice of df_war
df_war_cleaned = df_war_cleaned.assign(Total=df_war_cleaned[year_columns].sum(axis=1))


df_war_cleaned.head()


In [None]:


# Define countries and colours
countries = {
    'Yemen': '#8B0000',
    'Afghanistan': '#4169E1',
    'Ethiopia': '#228B22',
    'Syria': '#FF8C00',
    'Sudan': '#9400D3'
}

# Years shown on the chart; matches the 2014-2023 range in the title
plot_years = list(range(2014, 2024))

# The yearly grouping needs a 'year' column; derive it from the publication
# timestamp when no earlier cell has created it
if 'year' not in df.columns:
    df['year'] = pd.to_datetime(df['publication_date'], errors='coerce', utc=True).dt.year

# Headlines containing "war"/"wars" as a whole word, so "award", "toward" or
# "warning" are not counted; missing headlines count as no match
war_in_headline = df['headline'].str.contains(r'\bwars?\b', case=False, regex=True, na=False)

bar_width = 0.15
plt.figure(figsize=(15, 8))

for position, (country, color) in enumerate(countries.items()):
    count_column = f'{country.lower()}_war_count'

    # Create war count data for headlines: 1 when both "war" and the country appear
    df[count_column] = (war_in_headline &
                        df['headline'].str.contains(country.lower(), case=False, na=False)).astype(int)

    # Group by year and sum counts, keeping only the years being charted
    war_by_year = df.groupby('year')[count_column].sum().reset_index()
    war_by_year = war_by_year[war_by_year['year'].isin(plot_years)]

    # Plot bars, centring the group of country bars on each year's tick
    offset = (position - (len(countries) - 1) / 2) * bar_width
    bars = plt.bar(war_by_year['year'] + offset,
                   war_by_year[count_column],
                   width=bar_width, color=color, alpha=0.7,
                   label=f'"War" and "{country}" Mentions')

    # Add value labels
    for bar in bars:
        height = bar.get_height()
        if height > 0:
            plt.text(bar.get_x() + bar.get_width()/2., height, f'{int(height)}',
                     ha='center', va='bottom', fontsize=8)

# Formatting
plt.title('Frequency of "War" Mentions in Headlines by Country (2014–2023)', fontsize=16, pad=20)
plt.xlabel('Year', fontsize=12)
plt.ylabel('Count of Headlines', fontsize=12)
plt.yticks(range(1, 7))  # NOTE(review): fixed ticks 1-6; revisit if counts exceed 6
plt.grid(True, linestyle=':', alpha=0.4)
plt.xticks(plot_years, rotation=45)
plt.legend()
plt.tight_layout()
plt.show()



valid_years = [str(year) for year in range(2014, 2024)]

# Select each country's rows once, before the per-year loop.
# Names starting with the country are preferred so that 'Sudan' does not also
# pull in 'South Sudan'; if nothing starts with the name, fall back to a
# substring match.
country_names = df_war_cleaned['Country Name'].str.lower()
country_rows = {}
for country in countries:
    needle = country.lower()
    row_mask = country_names.str.startswith(needle, na=False)
    if not row_mask.any():
        row_mask = country_names.str.contains(needle, regex=False, na=False)
    country_rows[country] = df_war_cleaned[row_mask]

# Sum the selected countries' figures for every year
total_deaths_per_year = {}
for year in valid_years:
    total = 0
    for country_data in country_rows.values():
        if year in country_data.columns:
            total += country_data[year].sum()
    total_deaths_per_year[int(year)] = total

# Convert to DataFrame
combined_df = pd.DataFrame({
    "Year": list(total_deaths_per_year.keys()),
    "Total_Deaths": list(total_deaths_per_year.values())
})

# Plotting
plt.figure(figsize=(12, 6))
bars = plt.bar(combined_df['Year'], combined_df['Total_Deaths'],
               color='teal', alpha=0.7, label='Total War Deaths (All 5 Countries)')

# Trend line: a spline through the yearly totals (needs more than 3 points)
x = combined_df['Year'].values
y = combined_df['Total_Deaths'].values
if len(x) > 3:
    x_smooth = np.linspace(x.min(), x.max(), 300)
    spl = make_interp_spline(x, y, k=min(3, len(x)-1))
    y_smooth = spl(x_smooth)
    plt.plot(x_smooth, y_smooth, color='black', linestyle='--', linewidth=2, label='Trend')

# Value labels
for bar in bars:
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2., height, f'{height:,.0f}',
             ha='center', va='bottom', fontsize=9)

# Formatting
plt.title('Total War Deaths (All 5 Countries Combined, 2014–2023)', fontsize=14)
plt.xlabel('Year', fontsize=12)
plt.ylabel('Total Deaths', fontsize=12)
plt.grid(True, linestyle='--', alpha=0.5)
plt.xticks(combined_df['Year'], rotation=45)
plt.legend()
plt.tight_layout()
plt.show()


In [None]:


# Define countries and colours
countries = {
    'Russia': '#8B0000',
    'Ukraine': '#4169E1'
}

# Define year range
valid_years = list(range(2014, 2024))  # 2014–2023

# Create figure and subplots
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(14, 10), gridspec_kw={'height_ratios': [1, 1.1]})
fig.suptitle('Russia & Ukraine: War Mentions and Estimated Deaths (2014–2023)', fontsize=16, y=0.95)

# The yearly grouping needs a 'year' column; derive it from the publication
# timestamp when no earlier cell has created it
if 'year' not in df.columns:
    df['year'] = pd.to_datetime(df['publication_date'], errors='coerce', utc=True).dt.year

# Headlines containing "war"/"wars" as a whole word, so "award", "toward" or
# "warning" are not counted; missing headlines count as no match
war_in_headline = df['headline'].str.contains(r'\bwars?\b', case=False, regex=True, na=False)

for position, (country, color) in enumerate(countries.items()):
    count_column = f'{country.lower()}_war_count'

    # 1 when the headline mentions both "war" and the country, else 0
    df[count_column] = (
        war_in_headline &
        df['headline'].str.contains(country.lower(), case=False, na=False)
    ).astype(int)

    # Yearly totals, restricted to the charted years
    war_by_year = df.groupby('year')[count_column].sum().reset_index()
    war_by_year = war_by_year[war_by_year['year'].isin(valid_years)]

    # Place the two countries' bars side by side around each year tick
    offset = (position * 0.2) - 0.1
    bars = ax1.bar(war_by_year['year'] + offset,
                   war_by_year[count_column],
                   width=0.2, color=color, alpha=0.8,
                   label=f'"War" & "{country}" Mentions')

    # Label every non-empty bar with its count
    for bar in bars:
        height = bar.get_height()
        if height > 0:
            ax1.text(bar.get_x() + bar.get_width()/2., height, f'{int(height)}',
                     ha='center', va='bottom', fontsize=8)

ax1.set_ylabel('Headline Mentions', fontsize=12)
ax1.set_title('Headline Frequency of "War" and Country Name', fontsize=14)
ax1.grid(True, linestyle=':', alpha=0.4)
ax1.legend()
ax1.set_xticks(valid_years)
ax1.set_xticklabels(valid_years, rotation=45)




# Rows of the conflict table whose country name contains each listed country
# (case-insensitive), in the same order as `countries`
matched_frames = [
    df_war_cleaned[df_war_cleaned['Country Name'].str.contains(country, case=False, na=False)]
    for country in countries
]

# Combined figure per year: add up every matched frame's column for that year
total_deaths_per_year = {
    year: sum(
        frame[str(year)].sum()
        for frame in matched_frames
        if str(year) in frame.columns
    )
    for year in valid_years
}

combined_df = pd.DataFrame({
    "Year": list(total_deaths_per_year.keys()),
    "Total_Deaths": list(total_deaths_per_year.values())
})

# Yearly bars on the lower panel
bars = ax2.bar(combined_df['Year'], combined_df['Total_Deaths'],
               color='darkorange', alpha=0.75, label='Russia & Ukraine Combined')

# Dashed spline through the yearly totals, drawn only with more than 3 points
x = combined_df['Year'].values
y = combined_df['Total_Deaths'].values
if len(x) > 3:
    spline_degree = min(3, len(x)-1)
    x_smooth = np.linspace(x.min(), x.max(), 300)
    spl = make_interp_spline(x, y, k=spline_degree)
    y_smooth = spl(x_smooth)
    ax2.plot(x_smooth, y_smooth, color='black', linestyle='--', linewidth=2, label='Trend')

# Print each bar's value above it
for bar in bars:
    bar_top = bar.get_height()
    bar_middle = bar.get_x() + bar.get_width()/2.
    ax2.text(bar_middle, bar_top, f'{bar_top:,.0f}',
             ha='center', va='bottom', fontsize=9)

# Axis labels, title, grid, ticks and legend for the lower panel
ax2.set_xlabel('Year', fontsize=12)
ax2.set_ylabel('Total Deaths', fontsize=12)
ax2.set_title('Total Estimated War Deaths (Russia + Ukraine)', fontsize=14)
ax2.grid(True, linestyle='--', alpha=0.5)
ax2.set_xticks(valid_years)
ax2.set_xticklabels(valid_years, rotation=45)
ax2.legend()

# Leave room at the top of the figure for the suptitle
plt.tight_layout(rect=[0, 0, 1, 0.96])
plt.show()


In [None]:
# Define countries and colours
countries = {
    'Russia': '#8B0000',
    'Ukraine': '#4169E1'
}

# Define year range
valid_years = list(range(2014, 2024))  # 2014–2023

# Create figure and subplots
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(14, 10), gridspec_kw={'height_ratios': [1, 1.1]})
fig.suptitle('Russia & Ukraine: War Mentions and Estimated Deaths (2014–2023)', fontsize=16, y=0.95)

# The year filter needs a 'year' column; derive it from the publication
# timestamp when no earlier cell has created it
if 'year' not in df.columns:
    df['year'] = pd.to_datetime(df['publication_date'], errors='coerce', utc=True).dt.year

# Headlines containing "war"/"wars" as a whole word, so "award", "toward" or
# "warning" are not counted; missing headlines count as no match
war_in_headline = df['headline'].str.contains(r'\bwars?\b', case=False, regex=True, na=False)
in_valid_years = df['year'].isin(valid_years)

# Calculate total war mentions for each country across all years
war_mentions = {}
for country, color in countries.items():
    count_column = f'{country.lower()}_war_count'
    df[count_column] = (
        war_in_headline &
        df['headline'].str.contains(country.lower(), case=False, na=False)
    ).astype(int)
    # Plain int so the thousands-separator format below always applies
    war_mentions[country] = int(df.loc[in_valid_years, count_column].sum())

# Plot single bars for each country's total war mentions
x_pos = np.arange(len(countries))
for i, (country, color) in enumerate(countries.items()):
    ax1.bar(x_pos[i], war_mentions[country], color=color, alpha=0.8, width=0.6,
            label=f'Total "War" & "{country}" Mentions')
    ax1.text(x_pos[i], war_mentions[country], f'{war_mentions[country]:,}',
             ha='center', va='bottom', fontsize=10)

ax1.set_ylabel('Total Headline Mentions', fontsize=12)
ax1.set_title('Total "War" and Country Name Mentions (2014-2023)', fontsize=14)
ax1.grid(True, linestyle=':', alpha=0.4, axis='y')
ax1.legend()
ax1.set_xticks(x_pos)
ax1.set_xticklabels(countries.keys())

# Rows of the conflict table whose country name contains each listed country
# (case-insensitive), in the same order as `countries`
matched_frames = [
    df_war_cleaned[df_war_cleaned['Country Name'].str.contains(country, case=False, na=False)]
    for country in countries
]

# Grand total: every matched frame's figure for every year in range
total_deaths = sum(
    frame[str(year)].sum()
    for year in valid_years
    for frame in matched_frames
    if str(year) in frame.columns
)

# Single bar for the combined total, with its value printed above it
ax2.bar(0, total_deaths, color='darkorange', alpha=0.75, width=0.6,
        label='Total Russia & Ukraine Deaths')
ax2.text(0, total_deaths, f'{total_deaths:,.0f}', ha='center', va='bottom', fontsize=10)

# Labels, title, horizontal grid lines; no x ticks because there is only one bar
ax2.set_xlabel('', fontsize=12)
ax2.set_ylabel('Total Deaths', fontsize=12)
ax2.set_title('Total Estimated War Deaths (Russia + Ukraine, 2014-2023)', fontsize=14)
ax2.grid(True, linestyle='--', alpha=0.5, axis='y')
ax2.set_xticks([])
ax2.legend()

# Leave room at the top of the figure for the suptitle
plt.tight_layout(rect=[0, 0, 1, 0.96])
plt.show()

In [None]:
# Final look at the headline frame. By this point the plotting cells above have
# added one '<country>_war_count' column per country they charted.
df