In [None]:
import requests
import pandas as pd
import random
import os
from datetime import datetime, timedelta

# Guardian API key and base URL.
# SECURITY: credentials should not be committed in a notebook. The key is taken
# from the GUARDIAN_API_KEY environment variable when it is set; the literal is
# kept only as a backward-compatible fallback and should be rotated and removed.
API_KEY = os.environ.get("GUARDIAN_API_KEY") or "998344a2-04a1-4410-9d53-1490cfa2e9d2"
BASE_URL = "https://content.guardianapis.com/search"

# Output file. If it already exists its rows are loaded below and new rows are
# appended to them.
FILE_NAME = "guardian_articles.csv"

# Number of weeks to scrape
NUM_WEEKS = 580
ARTICLES_PER_WEEK = 100  # NOTE(review): not referenced anywhere in this cell

# One date per week, stepping backward from today. The dates themselves are
# deterministic; only the article picked for each date is random.
start_date = datetime.today()
dates = [
    (start_date - timedelta(weeks=week_offset)).strftime("%Y-%m-%d")
    for week_offset in range(NUM_WEEKS)
]

# Load existing data if the file exists, otherwise start from an empty frame
if os.path.exists(FILE_NAME):
    df = pd.read_csv(FILE_NAME)
else:
    df = pd.DataFrame(columns=["headline", "publication_date", "url"])

# Fetch one randomly chosen article for each weekly date.
#
# New rows are collected in a list and appended to `df` (and written to
# FILE_NAME) once, inside `finally`, instead of re-concatenating the frame and
# rewriting the CSV on every iteration. Because the write is in `finally`,
# rows gathered so far are still persisted if the loop is interrupted.
REQUEST_TIMEOUT_SECONDS = 30  # without a timeout a stalled connection hangs forever
new_rows = []

try:
    for week_date in dates:
        params = {
            "api-key": API_KEY,
            "from-date": week_date,
            "to-date": week_date,
            "show-fields": "headline",
            "page-size": 10,  # Get up to 10 articles from the date
        }

        try:
            response = requests.get(BASE_URL, params=params, timeout=REQUEST_TIMEOUT_SECONDS)
        except requests.RequestException as exc:
            # One failed request must not abort the remaining weeks.
            print(f"Failed to fetch data for {week_date}: {exc}")
            continue

        if response.status_code != 200:
            print(f"Failed to fetch data for {week_date}: {response.status_code}")
            continue

        try:
            articles = response.json()["response"]["results"]
        except (ValueError, KeyError, TypeError) as exc:
            # Body was not JSON or did not have the expected response/results shape.
            print(f"Failed to fetch data for {week_date}: {exc!r}")
            continue

        if not articles:
            print(f"No articles found for {week_date}.")
            continue

        selected_article = random.choice(articles)  # Pick one randomly
        article_data = {
            "headline": selected_article["webTitle"],
            "publication_date": selected_article["webPublicationDate"],
            "url": selected_article["webUrl"],
        }
        new_rows.append(article_data)

        print(f"Saved article from {week_date}: {article_data['headline']}")
finally:
    # Append everything gathered in this run and save once.
    if new_rows:
        df = pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True)
        df.to_csv(FILE_NAME, index=False)

print(" Data collection complete!")


In [None]:
# Sanity check on the scrape: display the DataFrame to confirm that it reaches back
# over the requested weeks (NUM_WEEKS = 580) and that the columns are correctly formatted.
# The frame itself is shown so the rows can be inspected; use df.shape for just the counts.
df

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# `df` must already be in memory at this point; the 'headline' column is the one
# searched further down. To start from a file instead, load it first, e.g.:
# df = pd.read_csv('your_data.csv')

# Terms whose frequency in the headlines is counted and plotted
keywords = [
    'economy',
    'inflation',
    'recession',
    'pandemic',
    'war',
    'conflict',
    'trade',
    'death',
    'aid',
    'attack',
    'election',
    'debt',
    'crime',
    'healthcare',
    'education',
]

# Create a function to count keyword occurrences
def count_keywords(df, column_name, keywords, whole_word=True):
    """Count how many rows of ``df[column_name]`` mention each keyword.

    Matching is case-insensitive and each keyword is treated as literal text
    (regex metacharacters are escaped).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text column.
    column_name : str
        Name of the text column to search.
    keywords : iterable of str
        Keywords to look for.
    whole_word : bool, default True
        When True a keyword only matches if it is not embedded in a longer
        word, so 'aid' no longer matches "said" and 'war' no longer matches
        "award". Inflected forms such as "wars" are not matched either. Pass
        False for plain substring matching.

    Returns
    -------
    dict
        ``{keyword: number_of_matching_rows}`` with plain ``int`` values, in
        the order the keywords were given. Missing values never match.
    """
    import re  # local import: the surrounding cell does not import `re`

    # Lower-case once so the search is case-insensitive
    titles = df[column_name].str.lower()

    keyword_counts = {}
    for keyword in keywords:
        needle = re.escape(keyword.lower())
        # Lookarounds rather than \b so keywords that begin or end with a
        # non-word character (e.g. "c++") still match correctly.
        pattern = rf"(?<!\w){needle}(?!\w)" if whole_word else needle
        matches = titles.str.contains(pattern, regex=True, na=False)
        keyword_counts[keyword] = int(matches.sum())

    return keyword_counts

# Tally keyword hits across the 'headline' column
keyword_counts = count_keywords(df, 'headline', keywords)

# One-column frame (index = keyword), ordered from most to least frequent
plot_data = (
    pd.DataFrame.from_dict(keyword_counts, orient='index', columns=['count'])
    .sort_values('count', ascending=False)
)

# Draw the bars on a fresh 10x6 figure
plt.figure(figsize=(10, 6))
ax = sns.barplot(x=plot_data.index, y=plot_data['count'], palette='viridis')

# Title, axis labels and slanted tick labels
ax.set_title('Frequency of Keywords in Titles', fontsize=16)
ax.set_xlabel('Keywords', fontsize=12)
ax.set_ylabel('Count', fontsize=12)
plt.xticks(rotation=45, ha='right')

# Write each bar's value just above its top edge
for bar in ax.patches:
    bar_height = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width() / 2.
    ax.annotate(
        f'{int(bar_height)}',
        (bar_centre, bar_height),
        ha='center', va='center',
        xytext=(0, 5),
        textcoords='offset points',
    )

plt.tight_layout()
plt.show()

In [None]:


# Read the GDP CSV into a DataFrame; the first 4 rows are skipped because they
# contain metadata.
# NOTE(review): absolute, machine-specific path - consider a configurable data directory.
df_gdp = pd.read_csv('/Users/georgewalsh/Desktop/API_NY/GDP.csv', skiprows=4)

# Keep only the country name and the 2014-2023 yearly columns.
# .copy() makes this an independent frame, so the column assignments made in
# later cells do not write into a slice of df_gdp (SettingWithCopyWarning).
df_gdp_cleaned = df_gdp[
    ['Country Name', '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022', '2023']
].copy()

# Preview the result (only the last expression of a cell is displayed)
df_gdp_cleaned.head()


In [None]:
# Year columns holding the GDP values (2014-2023)
gdp_columns = ['2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022', '2023']

# Work on an independent copy so the assignments below never write into a slice
# of the raw GDP frame (avoids SettingWithCopyWarning).
df_gdp_cleaned = df_gdp_cleaned.copy()

# Convert every GDP value to a number; entries that cannot be parsed become NaN
df_gdp_cleaned[gdp_columns] = df_gdp_cleaned[gdp_columns].apply(pd.to_numeric, errors='coerce')

# Create a new column summarizing the total GDP over all years
df_gdp_cleaned['Total GDP'] = df_gdp_cleaned[gdp_columns].sum(axis=1)

# Convert column names (years) to strings before melting
df_gdp_cleaned.columns = df_gdp_cleaned.columns.astype(str)

# Reshape to long format (one row per country and year) for visualization.
# value_vars is explicit so only the year columns are ever melted.
df_gdp_melted = df_gdp_cleaned.melt(
    id_vars=['Country Name', 'Total GDP'],
    value_vars=gdp_columns,
    var_name='Year',
    value_name='GDP',
)

# Shorten the UK's name in the long frame only, then display its rows as a check
df_gdp_melted['Country Name'] = df_gdp_melted['Country Name'].replace('United Kingdom', 'UK')
df_gdp_melted[df_gdp_melted["Country Name"] == "UK"]


In [None]:

# `re` builds the whole-word patterns below; it was used here without being imported
import re

#this allows us to incorporate the smoothed line of best fit
from statsmodels.nonparametric.smoothers_lowess import lowess

# Define our target countries (in the order we want them displayed)
target_countries = [
    'United States', 'China', 'Japan', 'Germany',
    'India', 'United Kingdom', 'France', 'Italy',
    'Canada', 'Brazil', 'Russia', 'South Korea'
]
# Country variants to search for in headlines
country_variants = {
    'United States': ['United States', 'USA', 'US', 'America'],
    'United Kingdom': ['United Kingdom', 'UK', 'Britain'],
    'China': ['China'],
    'Japan': ['Japan'],
    'Germany': ['Germany'],
    'India': ['India'],
    'France': ['France'],
    'Italy': ['Italy'],
    'Canada': ['Canada'],
    'Brazil': ['Brazil'],
    'Russia': ['Russia'],
    'South Korea': ['South Korea']
}


def _variant_pattern(variants):
    """Build one whole-word regex that matches any of a country's variants.

    All-caps acronyms (US, USA, UK) are matched case-sensitively so that the
    ordinary word "us" is not counted as the United States; every other
    variant is matched case-insensitively through a scoped ``(?i:...)`` group.
    """
    parts = []
    for variant in variants:
        escaped = re.escape(variant)
        parts.append(escaped if variant.isupper() else f'(?i:{escaped})')
    return r'\b(?:' + '|'.join(parts) + r')\b'


# Count the headlines that mention each country, keyed by the original country
# name. All variants go into a single pattern, so a headline containing two
# variants of the same country (e.g. "US" and "America") is counted once, not
# once per variant.
country_mentions = {}
for country, variants in country_variants.items():
    pattern = _variant_pattern(variants)
    hits = df['headline'].str.contains(pattern, regex=True, na=False)
    country_mentions[country] = int(hits.sum())

# numpy supplies the x positions for the smoother; it was used here without being imported
import numpy as np

# Prepare data for plotting: mention counts in display order
mentions_counts = [country_mentions[country] for country in target_countries]

# Filter GDP data for our target countries and maintain order.
# .copy() so the categorical assignment below does not write into a slice.
# NOTE(review): this is an exact match on 'Country Name'; a country spelled
# differently in the GDP file is silently dropped from the second chart -
# verify that all of target_countries are present.
df_filtered = df_gdp_cleaned[df_gdp_cleaned['Country Name'].isin(target_countries)].copy()
df_filtered['Country Name'] = pd.Categorical(
    df_filtered['Country Name'],
    categories=target_countries,
    ordered=True
)
df_filtered = df_filtered.sort_values('Country Name')

#combined figure
plt.figure(figsize=(14, 10))

# First subplot: headline mentions per country
plt.subplot(2, 1, 1)
bars = plt.bar(
    target_countries,
    mentions_counts,
    color=plt.cm.tab20.colors[:len(target_countries)],  # one colour per country
)
for bar in bars:
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2., height,
             f'{int(height)}', ha='center', va='bottom')

# Add smoothed line (LOWESS over the bar positions 0..n-1)
mentions_smoothed = lowess(mentions_counts, np.arange(len(target_countries)), frac=0.3)
plt.plot(target_countries, mentions_smoothed[:, 1], color='red', lw=2, label='Trend Line')

plt.title('Country Mentions in Headlines (Top 12 Economies)')
plt.ylabel('Number of Mentions')
plt.xticks(rotation=45, ha='right')
plt.grid(axis='y', alpha=0.3)
plt.legend()

# Second subplot: summed GDP per country
plt.subplot(2, 1, 2)
sns.barplot(x='Country Name', y='Total GDP', data=df_filtered, palette='viridis', order=target_countries)
plt.xlabel('Country')
# NOTE(review): 'Total GDP' is plotted as summed, with no rescaling in this
# notebook - confirm the units really are trillions of USD.
plt.ylabel('Total GDP in Trillions(in USD)')
plt.title('Total GDP of Top 12 Economies')
plt.xticks(rotation=45, ha='right')
plt.grid(axis='y', alpha=0.3)

#Formatting
# No legend on this subplot: nothing on it carries a label, so plt.legend()
# would only emit a "no artists with labels" warning.
plt.tight_layout()
plt.show()

In [None]:
# Read the battle-deaths CSV into a DataFrame; the first 4 rows are skipped
# because they contain metadata.
# NOTE(review): absolute, machine-specific path - consider a configurable data directory.
df_war = pd.read_csv('/Users/georgewalsh/Documents/battle/battledata.csv', skiprows=4)

# Keep the country name and the 2014-2023 yearly columns. .copy() makes this an
# independent frame so that adding columns later does not write into a slice
# of df_war (SettingWithCopyWarning).
df_war_cleaned = df_war[
    ['Country Name', '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022', '2023']
].copy()

# Display the first few rows of the DataFrame to verify
df_war_cleaned.head()

In [None]:


# Sum the yearly columns (2014-2023) into a per-country 'Total'.
# .assign returns a new frame, so nothing is written into a slice of the raw
# data and re-running the cell gives the same result.
year_columns = [str(year) for year in range(2014, 2024)]
df_war_cleaned = df_war_cleaned.assign(Total=df_war_cleaned[year_columns].sum(axis=1))

# Spot-check a single country. The filter selects "United States"; the variable
# name `df_russia` is kept unchanged even though it does not hold Russia's row.
df_russia = df_war_cleaned[df_war_cleaned['Country Name'] == "United States"]

# Display the filtered DataFrame
df_russia


In [None]:



# `re` builds the whole-word country patterns below
import re

# Define key countries and variations
key_countries = ['Ukraine', 'Russia', 'United States', 'Sudan', 'United Kingdom',
                 'Afghanistan', 'Ethiopia', 'Iraq']

country_variations = {
    'Ukraine': ['ukraine'],
    'Russia': ['russia'],
    'Iraq': ['iraq'],
    'United Kingdom': ['united kingdom', 'uk'],
    'United States': ['united states', 'us', 'usa'],
    'Ethiopia': ['ethiopia'],
    'Afghanistan': ['afghanistan'],
    'Sudan': ['sudan']
}

# Variants that are acronyms. They are searched for in upper case only, against
# the original headline, so the pronoun "us" is not counted as the United States.
ACRONYM_VARIANTS = {'uk', 'us', 'usa'}


def _country_pattern(variations):
    """Build one whole-word regex matching any of a country's variations.

    Whole-word matching matters here: as plain substrings 'uk' matches
    "ukraine" and 'us' matches "russia" or "business", which inflates the UK
    and US counts.
    """
    parts = []
    for variant in variations:
        if variant in ACRONYM_VARIANTS:
            parts.append(re.escape(variant.upper()))      # case-sensitive acronym
        else:
            parts.append(f'(?i:{re.escape(variant)})')    # case-insensitive name
    return r'\b(?:' + '|'.join(parts) + r')\b'


# Ensure lowercase version of headlines
df['headline_lower'] = df['headline'].str.lower()

# Headlines mentioning "war" or "conflict" as whole words. This does not depend
# on the country, so it is computed once; the group is non-capturing so pandas
# does not warn about match groups.
mask = df['headline_lower'].str.contains(r'\b(?:war|conflict)\b', regex=True, na=False)

# Count mentions: headlines that are about war/conflict AND name the country
mention_counts = {}
for country, variations in country_variations.items():
    country_mask = mask & df['headline'].str.contains(
        _country_pattern(variations), regex=True, na=False
    )
    mention_counts[country] = int(country_mask.sum())

mentions_df = pd.DataFrame(list(mention_counts.items()), columns=['Country', 'Mentions'])


#  War Deaths
# numpy and the spline helper were used in this cell without being imported
import numpy as np
from scipy.interpolate import make_interp_spline

# Filter the DataFrame to include only key countries, largest total first
df_war_subset = df_war_cleaned[df_war_cleaned['Country Name'].isin(key_countries)]
df_war_subset = df_war_subset.sort_values('Total', ascending=False)

# plotting: deaths on top, headline mentions below
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 12))

# plot total deaths
bars1 = ax1.bar(df_war_subset['Country Name'], df_war_subset['Total'], color='skyblue')
ax1.set_title('Total Deaths (2014–2023)', fontsize=14)
ax1.set_ylabel('Total Deaths')
ax1.grid(axis='y', linestyle=':', alpha=0.5)

for bar in bars1:
    height = bar.get_height()
    ax1.text(bar.get_x() + bar.get_width()/2., height, f'{int(height):,}',
             ha='center', va='bottom', fontsize=10)

# plot total mentions
# Match order of countries to the deaths chart above
mentions_df = mentions_df[mentions_df['Country'].isin(key_countries)]
mentions_df = mentions_df.set_index('Country').loc[df_war_subset['Country Name']].reset_index()

# Numeric bar positions are shared by the bars and the trend line, so both are
# drawn against the same x axis and the tick labels can be set explicitly.
x_vals = np.arange(len(mentions_df))
y_vals = mentions_df['Mentions'].values

bars2 = ax2.bar(x_vals, y_vals, color='cornflowerblue')
ax2.set_title('Mentions of "War" or "Conflict" in Headlines', fontsize=14)
ax2.set_ylabel('Number of Mentions')
# Fix the tick positions before labelling them; set_xticklabels on its own
# warns when the ticks are not fixed.
ax2.set_xticks(x_vals)
ax2.set_xticklabels(mentions_df['Country'], rotation=30)
ax2.grid(axis='y', linestyle=':', alpha=0.5)

# Add a smooth trend line — makes it easier to spot the general pattern.
# A quadratic spline (k=2) needs at least three points.
if len(x_vals) > 2:
    x_smooth = np.linspace(x_vals.min(), x_vals.max(), 300)
    spline = make_interp_spline(x_vals, y_vals, k=2)  # you can adjust 'k' for curve smoothness
    y_smooth = spline(x_smooth)
    ax2.plot(x_smooth, y_smooth, color='darkred', linestyle='--', linewidth=2, label='Smoothed Trend')
    ax2.legend()  # without this the 'Smoothed Trend' label is never shown

for bar in bars2:
    height = bar.get_height()
    # Bar heights come back as floats; show the counts as whole numbers
    ax2.text(bar.get_x() + bar.get_width()/2., height, f'{int(height)}',
             ha='center', va='bottom', fontsize=10)

plt.tight_layout()
plt.show()


In [None]:
import pandas as pd

# Read the Excel pivot table. header=8 takes the column names from row index 8,
# skipping the rows above it.
# NOTE(review): absolute, machine-specific path - consider a configurable data directory.
df_pol = pd.read_excel('/Users/georgewalsh/Documents/pivottablefull.xlsx', sheet_name='Sheet1', header=8)

# Keep the party, the measure name and the three year columns
df_pol_cleaned = df_pol[['Party', "Data", 2015, 2017, 2019]]

# Keep only the 'Sum of Vote' rows. .copy() makes df_vote an independent frame
# so the assignment below does not write into a slice (SettingWithCopyWarning).
df_vote = df_pol_cleaned[df_pol_cleaned['Data'] == 'Sum of Vote'].copy()

#replace party abbreviations with full names
# NOTE(review): confirm these expansions against the data source's code book,
# in particular 'NAT' and 'MIN'.
df_vote['Party'] = df_vote['Party'].replace({'CON': 'Conservative', 'LAB': 'Labour', 'LIB': 'Liberal Democrats','MIN': 'Minority Parties','NAT': 'British National Party','OTH':"Other"})

df_vote
