In [None]:
#import requests
import requests
import pandas as pd
import random
import os
from datetime import datetime, timedelta

# --- Set-up ------------------------------------------------------------------
# Guardian API key and base URL. The key is taken from the GUARDIAN_API_KEY
# environment variable when it is set; the literal is only a fallback so that
# existing runs keep working.
# NOTE(review): rotate this key and remove the literal once the environment
# variable is configured -- credentials should not be stored in a notebook.
API_KEY = os.environ.get("GUARDIAN_API_KEY", "998344a2-04a1-4410-9d53-1490cfa2e9d2")
BASE_URL = "https://content.guardianapis.com/search"
# Output file
FILE_NAME = "guardian_articles.csv"
# Number of weeks to scrape
NUM_WEEKS = 554
# Seconds to wait for a single API response before giving up on that date
REQUEST_TIMEOUT = 30


# One date per week, stepping backwards from today (the dates themselves are
# deterministic; only the article picked for each date is random).
start_date = datetime.today()
dates = [(start_date - timedelta(weeks=i)).strftime("%Y-%m-%d") for i in range(NUM_WEEKS)]

# Load existing data if the file exists so that re-runs extend the dataset
if os.path.exists(FILE_NAME):
    df = pd.read_csv(FILE_NAME)
else:
    df = pd.DataFrame(columns=["headline", "publication_date", "url"])

# Rows collected during this run; they are merged into `df` once at the end
# instead of re-concatenating and rewriting the CSV on every iteration.
new_records = []

try:
    # Fetch articles for each week
    for week_date in dates:
        params = {
            "api-key": API_KEY,
            "from-date": week_date,
            "to-date": week_date,
            "show-fields": "headline",
            "page-size": 10,  # Get up to 10 articles from the date
        }

        try:
            response = requests.get(BASE_URL, params=params, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as exc:
            # A network hiccup on one date should not abort the whole crawl.
            print(f"Failed to fetch data for {week_date}: {exc}")
            continue

        if response.status_code != 200:
            print(f"Failed to fetch data for {week_date}: {response.status_code}")
            continue

        articles = response.json().get("response", {}).get("results", [])
        if not articles:
            print(f"No articles found for {week_date}.")
            continue

        selected_article = random.choice(articles)  # Pick one randomly
        article_data = {
            "headline": selected_article["webTitle"],
            "publication_date": selected_article["webPublicationDate"],
            "url": selected_article["webUrl"],
        }
        new_records.append(article_data)

        print(f"Saved article from {week_date}: {article_data['headline']}")
finally:
    # Runs on normal completion and on interruption alike, so whatever was
    # collected before a Ctrl-C or an unexpected error is still persisted.
    if new_records:
        new_rows = pd.DataFrame(new_records)
        df = new_rows if df.empty else pd.concat([df, new_rows], ignore_index=True)
        # Re-running the cell revisits the same dates; keep each article once.
        df = df.drop_duplicates(subset="url").reset_index(drop=True)
        df.to_csv(FILE_NAME, index=False)

print(" Data collection complete!")
df.tail(20)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Parse the publication timestamps, then derive calendar year and month name.
df['publication_date'] = pd.to_datetime(df['publication_date'])
df['year'] = df['publication_date'].dt.year
df['month'] = df['publication_date'].dt.month_name()

# One row per (year, month) pair holding the number of entries in that month.
monthly_distribution = (
    df.groupby(['year', 'month'])
    .size()
    .reset_index(name='count')
)

# Months in calendar order; used both for the categorical dtype and for the
# annotation loop below.
month_order = [
    'January', 'February', 'March', 'April',
    'May', 'June', 'July', 'August',
    'September', 'October', 'November', 'December'
]
monthly_distribution['month'] = pd.Categorical(
    monthly_distribution['month'],
    categories=month_order,
    ordered=True,
)

# Box plot of the per-year monthly counts, one box per month.
plt.figure(figsize=(16, 7))
ax = sns.boxplot(
    x='month',
    y='count',
    data=monthly_distribution,
    palette='pastel',
    linewidth=1.2,
    fliersize=3,
)

# Annotate every box with its median, quartiles and whisker ends.
for i, month in enumerate(month_order):
    in_month = monthly_distribution['month'] == month
    month_data = monthly_distribution.loc[in_month, 'count'].dropna()
    if len(month_data) == 0:
        continue

    q1 = np.percentile(month_data, 25)
    q3 = np.percentile(month_data, 75)
    median = np.median(month_data)
    # Whisker ends: the most extreme observations within 1.5 * IQR of the box.
    whisker_low = month_data[month_data >= q1 - 1.5 * (q3 - q1)].min()
    whisker_high = month_data[month_data <= q3 + 1.5 * (q3 - q1)].max()

    # (y position, label text, text styling) for each annotation, in draw order.
    annotations = [
        (median, f'Median: {int(median)}', dict(va='center', fontsize=9, color='black', weight='bold')),
        (q1, f'Q1: {int(q1)}', dict(va='top', fontsize=8, color='darkblue')),
        (q3, f'Q3: {int(q3)}', dict(va='bottom', fontsize=8, color='darkgreen')),
        (whisker_low, f'Min: {int(whisker_low)}', dict(va='top', fontsize=8, color='gray')),
        (whisker_high, f'Max: {int(whisker_high)}', dict(va='bottom', fontsize=8, color='gray')),
    ]
    for y_pos, label, text_style in annotations:
        ax.text(i, y_pos, label, ha='center', **text_style)

# Titles, axis labels and general styling.
plt.title(
    "Box Plot of Monthly Entry Counts Across Years\nwith Median, Quartiles and Whiskers",
    fontsize=16,
    weight='bold',
    pad=20,
)
plt.xlabel("Month", fontsize=12)
plt.ylabel("Number of Entries per Year", fontsize=12)
plt.xticks(rotation=45, fontsize=10)
plt.yticks(fontsize=10)
plt.grid(axis='y', linestyle='--', alpha=0.4)
sns.despine(top=True, right=True)
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Make sure the timestamp column holds datetimes, then derive the year.
df['publication_date'] = pd.to_datetime(df['publication_date'])
df['year'] = df['publication_date'].dt.year

# Entries per calendar year, oldest first. The years are converted to strings
# for use as x-axis labels.
year_counts = df['year'].value_counts().sort_index()
year_index = year_counts.index.astype(str)
year_values = year_counts.values

# One colour per bar, taken from the tab10 palette.
colors = sns.color_palette("tab10", len(year_counts))

plt.figure(figsize=(10, 6))

# Bars: number of entries per year.
bars = plt.bar(
    year_index,
    year_values,
    width=0.6,
    color=colors,
    edgecolor='white',
    label='Entries',
)

# Line through the yearly counts, labelled as the trend in the legend.
sns.lineplot(x=year_index, y=year_values, color='black', linewidth=1, label='Trend')

# Write each bar's value just above it, centred horizontally on the bar.
for bar in bars:
    height = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width() / 2
    plt.text(bar_centre, height + 1, f'{height}', ha='center', va='bottom', fontsize=10)

# Titles, labels and general styling.
plt.title("Entries Per Year with Trend Line", fontsize=16, weight='bold', pad=15)
plt.xlabel("")
plt.ylabel("Number of Entries", fontsize=12)
plt.xticks(rotation=0, fontsize=10)
plt.yticks(fontsize=10)
plt.grid(axis='y', linestyle='--', alpha=0.4)
sns.despine(top=True, right=True)
plt.legend()
plt.tight_layout()
plt.show()


In [None]:


# Read the GDP CSV into a DataFrame; the first 4 rows hold metadata and are
# skipped.
# NOTE(review): absolute, user-specific path -- consider a configurable data
# directory so the notebook runs on other machines.
df_gdp = pd.read_csv('/Users/georgewalsh/Desktop/API_NY/GDP.csv', skiprows=4)

# Yearly GDP columns used in the analysis (2014-2023 inclusive).
gdp_columns = ['2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022', '2023']

# .copy() makes this an independent frame. Assigning columns into a bare
# column selection of df_gdp raises SettingWithCopyWarning and is not
# guaranteed to write through.
df_gdp_cleaned = df_gdp[['Country Name'] + gdp_columns].copy()

# Convert all yearly GDP values to numeric; unparseable cells become NaN.
df_gdp_cleaned[gdp_columns] = df_gdp_cleaned[gdp_columns].apply(pd.to_numeric, errors='coerce')

# Total GDP over all selected years (NaN cells are ignored by sum).
df_gdp_cleaned['Total GDP'] = df_gdp_cleaned[gdp_columns].sum(axis=1)

df_gdp_cleaned.head()

In [None]:

# `re` is needed to build the headline search patterns; it was not imported
# anywhere in the notebook, so the cell failed on a fresh kernel.
import re

# LOWESS smoother used for the trend line drawn over the mention counts
from statsmodels.nonparametric.smoothers_lowess import lowess

# Define our target countries (in the order we want them displayed)
target_countries = [
    'United States', 'China', 'Japan', 'Germany',
    'India', 'United Kingdom', 'France', 'Italy',
    'Canada', 'Brazil', 'Russia', 'South Korea'
]
# Country variants to search for in headlines. Variants written entirely in
# upper case (US, USA, UK) are treated as acronyms and matched
# case-sensitively; everything else is matched case-insensitively.
country_variants = {
    'United States': ['United States', 'USA', 'US', 'America'],
    'United Kingdom': ['United Kingdom', 'UK', 'Britain'],
    'China': ['China'],
    'Japan': ['Japan'],
    'Germany': ['Germany'],
    'India': ['India'],
    'France': ['France'],
    'Italy': ['Italy'],
    'Canada': ['Canada'],
    'Brazil': ['Brazil'],
    'Russia': ['Russia'],
    'South Korea': ['South Korea']
}
# NOTE(review): World Bank exports usually label these two countries as
# below -- confirm against GDP.csv. The mapping is a no-op when the file
# already uses the short names.
gdp_name_aliases = {
    'Russian Federation': 'Russia',
    'Korea, Rep.': 'South Korea',
}

# Count the headlines that mention each country. All variants of a country are
# combined into one pattern so a headline containing two variants (e.g. "US"
# and "America") is counted once, and acronyms keep their case so that "US"
# does not match the pronoun "us".
country_mentions = {}
for country, variants in country_variants.items():
    alternatives = [
        re.escape(variant) if variant.isupper() else f'(?i:{re.escape(variant)})'
        for variant in variants
    ]
    pattern = r'\b(?:' + '|'.join(alternatives) + r')\b'
    # na=False: missing headlines count as "no mention".
    matches = df['headline'].str.contains(pattern, regex=True, na=False)
    country_mentions[country] = int(matches.sum())

# Prepare data for plotting
mentions_counts = [country_mentions[country] for country in target_countries]

# Filter GDP data for our target countries and maintain the display order.
# .copy() avoids SettingWithCopyWarning when the column is replaced below.
gdp_names = df_gdp_cleaned['Country Name'].replace(gdp_name_aliases)
df_filtered = df_gdp_cleaned[gdp_names.isin(target_countries)].copy()
df_filtered['Country Name'] = pd.Categorical(
    gdp_names.loc[df_filtered.index],
    categories=target_countries,
    ordered=True
)
df_filtered = df_filtered.sort_values('Country Name')

#combined figure
plt.figure(figsize=(14, 10))

# First subplot: headline mentions per country
plt.subplot(2, 1, 1)
bars = plt.bar(target_countries, mentions_counts, color=plt.cm.tab20.colors[:12])
for bar in bars:
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2., height,
             f'{int(height)}', ha='center', va='bottom')

# Add smoothed line (LOWESS over the bar positions 0..n-1)
mentions_smoothed = lowess(mentions_counts, np.arange(len(target_countries)), frac=0.3)
plt.plot(target_countries, mentions_smoothed[:, 1], color='red', lw=2, label='Trend Line')

plt.title('Country Mentions in Headlines (Top 12 Economies)')
plt.ylabel('Number of Mentions')
plt.xticks(rotation=45, ha='right')
plt.grid(axis='y', alpha=0.3)
plt.legend()

# Second subplot: total GDP per country
plt.subplot(2, 1, 2)
sns.barplot(x='Country Name', y='Total GDP', data=df_filtered, palette='viridis', order=target_countries)
plt.xlabel('Country')
plt.ylabel('Total GDP in Trillions(in USD)')
plt.title('Total GDP of Top 12 Economies')
plt.xticks(rotation=45, ha='right')
plt.grid(axis='y', alpha=0.3)

#Formatting
# No legend here: the second subplot has no labelled artists, so the original
# plt.legend() call only produced a warning.
plt.tight_layout()
plt.show()

In [None]:
# Read the battle data CSV into a DataFrame; the first 4 rows hold metadata
# and are skipped.
# NOTE(review): absolute, user-specific path -- consider a configurable data
# directory so the notebook runs on other machines.
df_war = pd.read_csv('/Users/georgewalsh/Documents/battle/battledata.csv', skiprows=4)

# Yearly columns used in the analysis (2014-2023 inclusive).
year_columns = [str(year) for year in range(2014, 2024)]

# .copy() makes this an independent frame so adding the 'Total' column below
# does not raise SettingWithCopyWarning.
df_war_cleaned = df_war[['Country Name'] + year_columns].copy()

# Coerce the yearly values to numeric (unparseable cells become NaN), the same
# way the GDP table is prepared, so the row sum is always arithmetic.
df_war_cleaned[year_columns] = df_war_cleaned[year_columns].apply(pd.to_numeric, errors='coerce')

# Then do the summing
df_war_cleaned['Total'] = df_war_cleaned[year_columns].sum(axis=1)

# Display the first few rows of the DataFrame to verify
df_war_cleaned.head()

In [None]:



# `re` and `make_interp_spline` were not imported anywhere in the notebook, so
# the cell failed on a fresh kernel.
import re
from scipy.interpolate import make_interp_spline

# Define key countries and variations
key_countries = ['Ukraine', 'Russia', 'United States', 'Sudan', 'United Kingdom',
                 'Afghanistan', 'Ethiopia', 'Iraq']

country_variations = {
    'Ukraine': ['ukraine'],
    'Russia': ['russia'],
    'Iraq': ['iraq'],
    'United Kingdom': ['united kingdom', 'uk'],
    'United States': ['united states', 'us', 'usa'],
    'Ethiopia': ['ethiopia'],
    'Afghanistan': ['afghanistan'],
    'Sudan': ['sudan']
}

# Ensure lowercase version of headlines
df['headline_lower'] = df['headline'].str.lower()

# Headlines that mention war or conflict. Computed once (it does not depend on
# the country); the non-capturing group avoids pandas' match-group warning and
# na=False treats missing headlines as non-matching.
war_mask = df['headline_lower'].str.contains(r'\b(?:war|conflict)\b', na=False)

# Count mentions. Variants are matched as whole words: plain substring checks
# made 'us' match "russia" and 'uk' match "ukraine". Variants of up to three
# letters are treated as acronyms and matched in upper case against the
# original headline, so "US" is not confused with the pronoun "us".
mention_counts = {}
for country, variations in country_variations.items():
    alternatives = [
        re.escape(variant.upper()) if len(variant) <= 3 else f'(?i:{re.escape(variant)})'
        for variant in variations
    ]
    country_pattern = r'\b(?:' + '|'.join(alternatives) + r')\b'
    country_mask = df['headline'].str.contains(country_pattern, regex=True, na=False)
    mention_counts[country] = int((war_mask & country_mask).sum())

mentions_df = pd.DataFrame(list(mention_counts.items()), columns=['Country', 'Mentions'])


#  War Deaths

# Filter the DataFrame to include only key countries, largest total first
df_war_subset = df_war_cleaned[df_war_cleaned['Country Name'].isin(key_countries)]
df_war_subset = df_war_subset.sort_values('Total', ascending=False)

# plotting
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 12))

# plot total deaths
bars1 = ax1.bar(df_war_subset['Country Name'], df_war_subset['Total'], color='skyblue')
ax1.set_title('Total Deaths (2014–2023)', fontsize=14)
ax1.set_ylabel('Total Deaths')
ax1.grid(axis='y', linestyle=':', alpha=0.5)

for bar in bars1:
    height = bar.get_height()
    ax1.text(bar.get_x() + bar.get_width()/2., height, f'{int(height):,}',
             ha='center', va='bottom', fontsize=10)

# plot total mentions
# Match the order of countries used in the deaths plot
mentions_df = mentions_df[mentions_df['Country'].isin(key_countries)]
mentions_df = (
    mentions_df.set_index('Country')
    .loc[df_war_subset['Country Name'].tolist()]
    .reset_index()
)

# Bars first, so the categorical x axis (positions 0..n-1) exists before the
# trend line is drawn on the same positions.
bars2 = ax2.bar(mentions_df['Country'], mentions_df['Mentions'], color='cornflowerblue')

# Add a smooth trend line — makes it easier to spot the general pattern
x_vals = np.arange(len(mentions_df))
y_vals = mentions_df['Mentions'].values

# A quadratic spline (k=2) needs at least three points.
if len(x_vals) > 2:
    x_smooth = np.linspace(x_vals.min(), x_vals.max(), 300)
    spline = make_interp_spline(x_vals, y_vals, k=2)  # you can adjust 'k' for curve smoothness
    y_smooth = spline(x_smooth)
    ax2.plot(x_smooth, y_smooth, color='darkred', linestyle='--', linewidth=2, label='Smoothed Trend')
    ax2.legend()

ax2.set_title('Mentions of "War" or "Conflict" in Headlines', fontsize=14)
ax2.set_ylabel('Number of Mentions')
# Rotate the existing tick labels instead of replacing them with
# set_xticklabels, which warns when the tick positions are not fixed.
ax2.tick_params(axis='x', labelrotation=30)
ax2.grid(axis='y', linestyle=':', alpha=0.5)

for bar in bars2:
    height = bar.get_height()
    ax2.text(bar.get_x() + bar.get_width()/2., height, f'{int(height)}',
             ha='center', va='bottom', fontsize=10)

plt.tight_layout()
plt.show()


In [None]:

#  Vote Share by Party
# Load and process data (the table header is on row index 8 of the sheet)
df_pol = pd.read_excel('/Users/georgewalsh/Documents/pivottablefull.xlsx', header=8)

# Election years whose vote totals are combined
election_years = [2015, 2017, 2019]

# Keep only the "Sum of Vote" rows. .copy() makes df_vote an independent
# frame; the original chained selection raised SettingWithCopyWarning on the
# column assignments below.
df_vote = df_pol.loc[df_pol['Data'] == 'Sum of Vote', ['Party'] + election_years].copy()

# Calculate totals and percentages (keep as float). Percentages are relative
# to the votes of all parties, including the ones filtered out further down.
df_vote['Total Votes'] = df_vote[election_years].sum(axis=1)
total_all_parties = df_vote['Total Votes'].sum()
df_vote['Percentage'] = (df_vote['Total Votes'] / total_all_parties) * 100

# Clean party names
df_vote['Party'] = df_vote['Party'].replace({
    'CON': 'Conservative',
    'LAB': 'Labour',
    'LIB': 'Lib Dem',
    'NAT': 'Scotish National Party'
})

# Filter out minor parties and others
df_vote = df_vote[~df_vote['Party'].isin(['MIN', 'OTH'])]

# Sort by numeric percentage before plotting
df_vote = df_vote.sort_values('Percentage', ascending=False)

In [None]:



# `re` is needed for the party patterns defined below.
import re

# --- Plot 1: vote share by party ---------------------------------------------
plt.figure(figsize=(14, 7))
# Colours are applied by bar position, i.e. in the sorted order of df_vote.
bars_vote = plt.bar(df_vote['Party'], df_vote['Percentage'], color=[
    '#0087DC', '#E4003B', '#FAA61A', '#3F8428','#6D3177','#999999'
])

for bar in bars_vote:
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2., height,
             f'{height:.1f}%',
             ha='center', va='bottom')

plt.title('UK General Elections Total Vote Share by Party', pad=20)
plt.ylabel('Percentage of Votes (%)')
plt.ylim(0, df_vote['Percentage'].max() + 5)
plt.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.show()


# --- Plot 2: party mentions in headlines -------------------------------------
# Bar colour per party; the keys are also the party labels on the x axis.
colors = {
    'Conservative': '#0087DC',
    'Labour': '#E4003B',
    'Liberal Democrat': '#FAA61A',
    'Scotish National Party': '#3F8428'
}

# NOTE(review): `patterns` was referenced but never defined in the notebook
# (NameError on a fresh kernel). The expressions below are a reconstruction --
# confirm they match the intended search terms. They are case-sensitive so
# that the adjectives "conservative" / "labour" are not counted as parties.
patterns = {
    'Conservative': re.compile(r'\b(?:Conservatives?|Tory|Tories)\b'),
    'Labour': re.compile(r'\bLabour\b'),
    'Liberal Democrat': re.compile(r'\b(?:Liberal Democrats?|Lib Dems?)\b'),
    'Scotish National Party': re.compile(r'\b(?:SNP|Scottish National Party)\b'),
}

# Count mentions: each headline contributes at most one mention per party.
# Every party starts at zero so it still appears in the chart when unmentioned.
mention_counts = {party: 0 for party in patterns}

for headline in df['headline'].dropna():  # skip missing headlines
    for party, pattern in patterns.items():
        if pattern.search(str(headline)):
            mention_counts[party] += 1

# Convert to percentages of all party mentions
count_df = pd.DataFrame.from_dict(mention_counts, orient='index', columns=['Count'])
total_mentions = count_df['Count'].sum()
# Guard against 0/0 (NaN percentages would break plt.ylim below).
count_df['Percentage'] = (count_df['Count'] / total_mentions * 100) if total_mentions else 0.0
count_df = count_df.sort_values('Percentage', ascending=False)

# Create visualization
plt.figure(figsize=(14, 7))
bars_mentions = plt.bar(count_df.index, count_df['Percentage'],
                       color=[colors[p] for p in count_df.index])

for bar in bars_mentions:
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2., height,
             f'{height:.1f}%',
             ha='center', va='bottom', fontsize=10)

plt.title('Political Party Mentions in Headlines (% of Total Mentions)', pad=20)
plt.ylabel('Percentage of Mentions (%)')
plt.ylim(0, count_df['Percentage'].max() + 5)
plt.grid(axis='y', alpha=0.2)
plt.tight_layout()
plt.show()

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

from wordcloud import STOPWORDS

# Join every non-missing headline into one space-separated string.
text = " ".join(df['headline'].dropna())

# Stop words excluded from the cloud: the wordcloud package's STOPWORDS
# collection, copied into a fresh set.
stopwords = set(STOPWORDS)

# Build the word cloud from the combined headline text.
cloud_options = dict(
    width=800,
    height=400,
    background_color='white',
    stopwords=stopwords,
    colormap='viridis',
)
wordcloud = WordCloud(**cloud_options).generate(text)

# Render the cloud as an image without axes.
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.title('Most Frequent Words in Headlines')
plt.axis('off')
plt.show()


In [None]:
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt

# Join every non-missing headline into one space-separated string.
text = " ".join(df['headline'].dropna())

# Stop words: the wordcloud package's STOPWORDS collection plus
# headline-specific filler words. Add more custom stop words here if needed.
custom_stopwords = {
    "s", "said", "mr", "mrs",
    "says", "will", "happened", "review",
    "quick", "U", "new", "crossword", "Cryptic",
    "day", "call", "year",
}
stopwords = set(STOPWORDS) | custom_stopwords

# Build the word cloud from the combined headline text.
cloud_options = dict(
    width=800,
    height=400,
    background_color='white',
    stopwords=stopwords,
    colormap='viridis',
)
wordcloud = WordCloud(**cloud_options).generate(text)

# Render the cloud as an image without axes.
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.title('Most Frequent Words in Headlines')
plt.axis('off')
plt.show()


In [None]:
import kagglehub

# Kaggle dataset handle passed to kagglehub
dataset_handle = "evangower/premier-league-matches-19922022"

# Download the latest version and keep whatever kagglehub returns for it
path = kagglehub.dataset_download(dataset_handle)

# Show the returned value as the cell output
path

In [None]:
# Load the match results and keep only seasons ending in 2016 or later.
df_footy = pd.read_csv("/Users/georgewalsh/Documents/premier-league-matches.csv")
is_recent_season = df_footy['Season_End_Year'].ge(2016)
df_footy_filtered = df_footy.loc[is_recent_season]

# Preview the first ten remaining matches
df_footy_filtered.head(10)


In [None]:

# Goals scored by each side, pulled out once for the two comparisons below.
home_goals = df_footy_filtered['HomeGoals']
away_goals = df_footy_filtered['AwayGoals']

# Matches won by the home side, and the number of such wins per team.
home_wins = df_footy_filtered[home_goals > away_goals]
home_win_counts = home_wins['Home'].value_counts()

# Matches won by the away side, and the number of such wins per team.
away_wins = df_footy_filtered[away_goals > home_goals]
away_win_counts = away_wins['Away'].value_counts()

# Total wins per team; fill_value=0 covers teams that appear in only one of
# the two tallies.
total_wins = home_win_counts.add(away_win_counts, fill_value=0).astype(int)

# Two-column table (team, wins), ordered from most to fewest wins.
df_team_wins = total_wins.reset_index()
df_team_wins.columns = ['team', 'wins']
df_team_wins = (
    df_team_wins
    .sort_values(by='wins', ascending=False)
    .reset_index(drop=True)
)

df_team_wins.head()


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# `re` is needed to build the whole-word team-name patterns below.
import re


def plot_lollipop(names, values, color_dict, xlabel, title, figsize,
                  label_offset=3, tick_step=10, title_fontsize=None):
    """Draw a horizontal lollipop chart with one line, dot and value label per name.

    Parameters:
        names: labels placed on the y axis, one per lollipop.
        values: numeric value for each name (length of the line).
        color_dict: mapping from name to the colour used for its line and dot.
        xlabel: label of the x axis.
        title: figure title.
        figsize: (width, height) passed to plt.figure.
        label_offset: horizontal gap between the dot and its value label.
        tick_step: spacing of the x-axis ticks.
        title_fontsize: optional font size for the title.
    """
    plt.figure(figsize=figsize)

    for name, value in zip(names, values):
        plt.hlines(y=name, xmin=0, xmax=value, color=color_dict[name], linewidth=2)
        plt.plot(value, name, "o", color=color_dict[name], markersize=8)
        # Value label sits just to the right of the dot, in black
        plt.text(value + label_offset, name, str(value),
                 va='center', ha='left', fontsize=10, color='black')

    plt.xlabel(xlabel)
    if title_fontsize is None:
        plt.title(title)
    else:
        plt.title(title, fontsize=title_fontsize)
    plt.grid(axis='x', linestyle='--', alpha=0.5)
    # Tick range follows the data instead of a hard-coded upper bound, with
    # one extra step of head-room for the value labels.
    upper = int(max(values, default=0))
    plt.xticks(np.arange(0, upper + 2 * tick_step, tick_step))
    plt.tight_layout()
    plt.show()


# NOTE(review): `df_team_wins_sorted`, `prem_teams` and `df_mentions` were used
# here but never defined in the notebook (NameError on a fresh kernel). They
# are rebuilt below from `df_team_wins` and the headlines -- confirm that the
# sort order and the name matching reflect the original intent.

# Ascending order, so the rows are plotted from fewest to most wins.
df_team_wins_sorted = df_team_wins.sort_values('wins', ascending=True).reset_index(drop=True)
prem_teams = df_team_wins_sorted['team'].tolist()

# Number of headlines mentioning each team name as a whole word
# (case-insensitive; missing headlines are ignored).
headlines = df['headline'].dropna().astype(str)
df_mentions = pd.DataFrame({
    'team': prem_teams,
    'mentions': [
        int(headlines.str.contains(r'\b' + re.escape(team) + r'\b', case=False, regex=True).sum())
        for team in prem_teams
    ],
}).sort_values('mentions', ascending=True).reset_index(drop=True)

# Create a combined list of all unique teams from both datasets
all_teams = sorted(list(set(df_team_wins_sorted['team'].unique()).union(set(prem_teams))))

# Create a consistent color palette for all teams, shared by both plots
team_colors = sns.color_palette("hsv", len(all_teams))
color_dict = {team: color for team, color in zip(all_teams, team_colors)}

# Define a small offset for the value labels
offset = 3

# First plot: Total Wins by Team
plot_lollipop(
    df_team_wins_sorted['team'],
    df_team_wins_sorted['wins'],
    color_dict,
    xlabel="Total Wins",
    title="Total Wins by Team",
    figsize=(12, 8),
    label_offset=offset,
)

# Second plot: Team Mentions in Headlines
plot_lollipop(
    df_mentions['team'],
    df_mentions['mentions'],
    color_dict,
    xlabel="Number of Mentions in Headlines",
    title="Premier League Team Mentions in Headlines",
    figsize=(12, 10),
    label_offset=offset,
    title_fontsize=14,
)
