# 20-Year Temperature Increases for International Cities

In [1]:
import pandas as pd

# Location of the raw temperature CSV
file_path = 'city_temperature.csv'

# Column schema as name -> dtype; the insertion order is the column order
# passed to the reader
data_types = {
    'region': str,
    'country': str,
    'state': str,
    'city': str,
    'month': int,
    'day': int,
    'year': int,
    'avgtemperature': float,
}
column_names = list(data_types)

# Load the file with our own column names, skipping its first row
# (presumably the original header row)
df = pd.read_csv(file_path, names=column_names, skiprows=1, dtype=data_types)

# Cleaning: discard rows whose day is 0 or whose temperature is -99
# (presumably placeholders for missing readings)
is_valid_row = df['day'].ne(0) & df['avgtemperature'].ne(-99)
df = df[is_valid_row]

In [4]:
# Keep only non-US rows that fall in one of the two comparison windows:
# years before 2001 (the start years) or years after 2014 (the years
# 20 years later). Everything from 2001 through 2014 is discarded.
outside_us = df.country != 'US'
in_comparison_window = ~df.year.between(2001, 2014)

# Region, country, state and day are not needed once the rows are selected
df_filt = df[outside_us & in_comparison_window].drop(columns=['region', 'country', 'state', 'day'])

df_filt.head()

Unnamed: 0,city,month,year,avgtemperature
0,Algiers,1,1995,64.2
1,Algiers,1,1995,49.4
2,Algiers,1,1995,48.8
3,Algiers,1,1995,46.4
4,Algiers,1,1995,47.9


In [5]:
# For later use: mean June-August temperature per city over the years after 2014
is_recent_summer = (df_filt.year > 2014) & df_filt.month.between(6, 8)

# month and year are no longer needed once the rows are selected
df_filt_summer_temp = df_filt[is_recent_summer].drop(columns=['month', 'year'])

# one row per city, indexed by city name
last_5_year_summer_temp = df_filt_summer_temp.groupby('city').mean()
last_5_year_summer_temp.head()

Unnamed: 0_level_0,avgtemperature
city,Unnamed: 1_level_1
Abidjan,78.464783
Abu Dhabi,97.347174
Addis Ababa,61.528708
Algiers,78.141087
Almaty,75.505217


In [6]:
# Back to the main analysis: collapse the rows into one mean temperature
# per city, year and month, with the grouping keys kept as ordinary columns.
group_keys = ['city', 'year', 'month']
df_grouped = df_filt.groupby(group_keys).mean().reset_index()

df_grouped.head()

Unnamed: 0,city,year,month,avgtemperature
0,Abidjan,1995,1,79.916129
1,Abidjan,1995,2,82.614286
2,Abidjan,1995,3,82.545161
3,Abidjan,1995,4,83.32
4,Abidjan,1995,5,82.403226


In [7]:
# For every city, compare each month's mean temperature in a start year
# (1995 through 2000) with the same month 20 years later (i.e. 1995 vs 2015,
# 1996 vs 2016, ...). Each entry appended to `result` is
# [city, start year, end year, month, temperature difference].

first_start_year = 1995
last_start_year = 2000
window_years = 20

result = []

# iterate the per-city sub-frames directly instead of re-filtering the whole
# frame once per city; sort=False keeps the cities in order of first appearance
for city, df_city in df_grouped.groupby('city', sort=False):

    for year in range(first_start_year, last_start_year + 1):

        df_past = df_city[df_city.year == year].drop(columns=['year', 'city']).set_index('month')
        df_future = df_city[df_city.year == (year + window_years)].drop(columns=['year', 'city']).set_index('month')

        # inner join: only months present in both years are compared, so a
        # city with no data for either year simply contributes no rows
        df_diff = df_past.join(df_future, how='inner', rsuffix='_20yrs')
        df_diff['difference'] = df_diff.avgtemperature_20yrs - df_diff.avgtemperature

        for month, temp_increase in df_diff['difference'].items():
            result.append([city, year, year + window_years, month, temp_increase])
       

In [8]:
# Turn the collected rows into a DataFrame with named columns
increase_columns = ['city', 'year_start', 'year_end', 'month', 'avg_temp_increase']
df_avg_increases = pd.DataFrame(data=result, columns=increase_columns)
df_avg_increases.head()

Unnamed: 0,city,year_start,year_end,month,avg_temp_increase
0,Abidjan,1995,2015,1,0.9
1,Abidjan,1995,2015,2,0.125
2,Abidjan,1995,2015,3,0.258065
3,Abidjan,1995,2015,4,0.476667
4,Abidjan,1995,2015,5,0.306452


In [10]:
# Label each month with a season so the rows can be grouped by season

season_by_month = {
    3: 'spring', 4: 'spring', 5: 'spring',
    6: 'summer', 7: 'summer', 8: 'summer',
    9: 'fall', 10: 'fall', 11: 'fall',
}

# any month not listed above (December, January, February) is labelled winter
df_avg_increases['season'] = df_avg_increases.month.map(lambda month: season_by_month.get(month, 'winter'))

df_avg_increases.head()

Unnamed: 0,city,year_start,year_end,month,avg_temp_increase,season
0,Abidjan,1995,2015,1,0.9,winter
1,Abidjan,1995,2015,2,0.125,winter
2,Abidjan,1995,2015,3,0.258065,spring
3,Abidjan,1995,2015,4,0.476667,spring
4,Abidjan,1995,2015,5,0.306452,spring


In [11]:
# Average temperature increase per city over the (northern hemisphere) summer
# months, sorted from the largest increase to the smallest.

df_avg_summer_increases = df_avg_increases[df_avg_increases.season=='summer'].drop(columns=['year_start', 'year_end','month'])

# Average only the numeric avg_temp_increase column. The frame still carries
# the text 'season' column: a bare .mean() over it merely warns on pandas 1.5
# (silently dropping the column) but raises TypeError on pandas >= 2.0.
df_summer_grouped = (
    df_avg_summer_increases
    .groupby(['city'], as_index=False)['avg_temp_increase']
    .mean()
    .sort_values(by='avg_temp_increase', ascending=False)
)

df_summer_grouped.head()

  df_summer_grouped = df_avg_summer_increases.groupby(['city'], as_index=False).mean().sort_values(by='avg_temp_increase', ascending=False)


Unnamed: 0,city,avg_temp_increase
76,Lima,5.338136
138,Zagreb,4.725871
104,Prague,4.688545
134,Vienna,4.474903
87,Milan,4.301412


In [12]:
# Attach each city's recent mean summer temperature (looked up by city name in
# the index of last_5_year_summer_temp) as an extra column next to its
# average summer increase.

df_summer_increase_avg_temp = df_summer_grouped.merge(
    last_5_year_summer_temp, how='left', left_on='city', right_index=True
)

df_summer_increase_avg_temp.head()

Unnamed: 0,city,avg_temp_increase,avgtemperature
76,Lima,5.338136,66.811957
138,Zagreb,4.725871,74.232826
104,Prague,4.688545,67.074348
134,Vienna,4.474903,71.329783
87,Milan,4.301412,74.761522


In [225]:
# Save the combined table as a CSV file for the visualization step
output_csv = 'global_avg_summer_temp_increases_and_5_year_avg_summer_temp.csv'
df_summer_increase_avg_temp.to_csv(output_csv)