In [None]:
# imports
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# Read both raw CSV files (paths are relative to the notebook's working directory).
city_daily_temperature_df = pd.read_csv("Datasets/city-daily-temperature.csv")
global_country_info_df = pd.read_csv("Datasets/global-country-information.csv")

# Print each frame's (rows, columns) as a quick check on what was loaded.
for loaded_frame in (city_daily_temperature_df, global_country_info_df):
    print(loaded_frame.shape)

In [None]:
# Preprocessing City Daily Temperature
# We only want to look at data from 2000 onward, and we drop readings above 140.
# Both conditions are combined into a single mask, and .copy() makes the result an
# independent frame so that adding the 'Date' column below does not write into a
# slice of the raw data (which triggers pandas' SettingWithCopyWarning).
rows_to_keep = (
    (city_daily_temperature_df['Year'] >= 2000)
    & (city_daily_temperature_df['AvgTemperature'] <= 140)
)
city_daily_temperature_df = city_daily_temperature_df.loc[rows_to_keep].copy()

# Formatting the date
# Assemble a datetime from the Year/Month/Day columns; errors='coerce' turns any
# combination that is not a valid calendar date into NaT instead of raising.
city_daily_temperature_df['Date'] = pd.to_datetime(
    city_daily_temperature_df[['Year', 'Month', 'Day']], errors='coerce'
)

# Checking for missing days for specific countries
# For each country, build every calendar day from Jan 1 of its earliest recorded
# year through Dec 31 of its latest recorded year.
countries = city_daily_temperature_df['Country'].unique()
full_date_ranges = []

for country in countries:
    country_data = city_daily_temperature_df.loc[city_daily_temperature_df['Country'].eq(country)]
    start, end = country_data['Year'].min(), country_data['Year'].max()
    country_date_range = pd.date_range(f'{start}-01-01', f'{end}-12-31', freq='D')
    full_date_ranges.append(
        pd.DataFrame({'Country': country, 'Date': country_date_range})
    )

full_dates_df = pd.concat(full_date_ranges)

# Left-join the expected calendar onto the observed (Country, Date) pairs. The
# indicator column '_merge' is 'left_only' for calendar days with no observation.
observed_dates = city_daily_temperature_df[['Country', 'Date']]
merged_df = full_dates_df.merge(observed_dates, how='left', on=['Country', 'Date'], indicator=True)
missing_dates_df = merged_df.loc[merged_df['_merge'] == 'left_only']
print(missing_dates_df[['Country', 'Date']])

# More data cleaning
# Keep only rows whose AvgTemperature is at least -90.
at_least_minus_90 = city_daily_temperature_df['AvgTemperature'].ge(-90)
city_daily_temperature_df = city_daily_temperature_df.loc[at_least_minus_90]

# Checking if every country has data from same years
# all_years: every distinct Year present anywhere in the frame.
# country_years: for each Country, the array of distinct Years it has rows for.
all_years = set(city_daily_temperature_df['Year'].unique())
country_years = city_daily_temperature_df.groupby('Country')['Year'].unique()

# A country is flagged when its set of years differs from all_years, i.e. the
# symmetric difference between the two sets is non-empty.
incomplete_country_years = [
    country_name
    for country_name, seen_years in country_years.items()
    if set(seen_years) ^ all_years
]
print("Countries with missing years:", incomplete_country_years)

# Organizing data
def _mean_avg_temperature_by(group_keys):
    """Return the mean of 'AvgTemperature' per group as a flat DataFrame.

    group_keys is passed straight to DataFrame.groupby (a column name or a list
    of column names); the grouping keys come back as ordinary columns.
    """
    grouped_temperature = city_daily_temperature_df.groupby(group_keys)['AvgTemperature']
    return grouped_temperature.mean().reset_index()


# Mean temperature at four levels of detail: overall per year, per region and year,
# per country and year, and per country, year and month.
avg_temp_per_year = _mean_avg_temperature_by('Year')
avg_temp_per_region_per_year = _mean_avg_temperature_by(['Region', 'Year'])
avg_temp_per_country_per_year = _mean_avg_temperature_by(['Country', 'Year'])
avg_temp_per_country_per_month = _mean_avg_temperature_by(['Country', 'Year', 'Month'])

# Number of countries
# Distinct values of the 'Country' column in the current frame, and how many there are.
country_column = city_daily_temperature_df['Country']
unique_countries = country_column.unique()
num_countries = len(unique_countries)

# print("Unique", unique_countries)
# print("Number of countries", num_countries)
# print("Average Temperature Per Year")
# print(avg_temp_per_year)
# print("Average Temperature Per Region by Year")
# print(avg_temp_per_region_per_year)
# print("Average Temperature Per Country by Year") 
# print(avg_temp_per_country_per_year)
# print("Average Temperature Per Country by Month")
# print(avg_temp_per_country_per_month)

In [60]:
# Preprocessing Global Country Information