# Modeling

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.neighbors import KNeighborsRegressor

## Heat Deaths

In [2]:
heat_deaths = pd.read_csv('../data/extreme_temp/heat_deaths_reported.csv').drop(columns= ['Unnamed: 0'])

In [3]:
heat_deaths.head()

Unnamed: 0,disaster_subtype,country,subregion,region,location,temperature,magnitude_scale,start_year,start_month,total_deaths,start_date,end_date,duration_days
0,Heat wave,India,Southern Asia,Asia,"Madhya Pradesh, Andhra Pradesh, Uttar Pradesh,...",50.0,°C,2002,5,1030.0,2002-05-10,2002-05-22,12 days
1,Heat wave,Bangladesh,Southern Asia,Asia,"Dhaka, Khulna, Rajshahi provinces",40.0,°C,2003,5,62.0,2003-05-10,2003-06-12,33 days
2,Heat wave,India,Southern Asia,Asia,"Andhra Pradesh, Orissa, Tamil Nadu, Vidarbha (...",49.0,°C,2003,5,1210.0,2003-05-14,2003-06-06,23 days
3,Heat wave,Pakistan,Southern Asia,Asia,Punjab province,50.0,°C,2003,5,200.0,2003-05-01,2003-06-06,36 days
4,Heat wave,Belgium,Western Europe,Europe,Region de Bruxelles-Capitale/Brussels Hoofdste...,,°C,2003,8,1175.0,2003-08-01,2003-08-15,14 days


In [4]:
heat_deaths.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 111 entries, 0 to 110
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   disaster_subtype  111 non-null    object 
 1   country           111 non-null    object 
 2   subregion         111 non-null    object 
 3   region            111 non-null    object 
 4   location          62 non-null     object 
 5   temperature       82 non-null     float64
 6   magnitude_scale   111 non-null    object 
 7   start_year        111 non-null    int64  
 8   start_month       111 non-null    int64  
 9   total_deaths      111 non-null    float64
 10  start_date        111 non-null    object 
 11  end_date          111 non-null    object 
 12  duration_days     111 non-null    object 
dtypes: float64(2), int64(2), object(9)
memory usage: 11.4+ KB


In [5]:
for i in heat_deaths.index:
    heat_deaths.loc[i, 'duration'] = (heat_deaths.loc[i,'duration_days'].split(' ')[0])

In [6]:
heat_deaths['country'].unique()

array(['India', 'Bangladesh', 'Pakistan', 'Belgium', 'Czechia', 'Spain',
       'France', 'Italy', 'Netherlands (Kingdom of the)', 'Portugal',
       'Algeria', 'Canary Islands', 'United States of America', 'Romania',
       'Germany', 'Slovakia', 'Australia', 'Japan', 'China', 'Sudan',
       'South Africa', 'Canada',
       'United Kingdom of Great Britain and Northern Ireland', 'Austria',
       'Albania', 'Bulgaria', 'Switzerland', 'Cyprus', 'Denmark',
       'Estonia', 'Finland', 'Greece', 'Croatia', 'Hungary', 'Ireland',
       'Lithuania', 'Luxembourg', 'Latvia', 'Malta', 'Montenegro',
       'Norway', 'Poland', 'Serbia', 'Slovenia', 'Sweden', 'Myanmar',
       'Mexico', 'Saudi Arabia', 'Morocco', 'Republic of Korea'],
      dtype=object)

In [7]:
heat_deaths[heat_deaths['country']=='Australia']

Unnamed: 0,disaster_subtype,country,subregion,region,location,temperature,magnitude_scale,start_year,start_month,total_deaths,start_date,end_date,duration_days,duration
26,Heat wave,Australia,Australia and New Zealand,Oceania,"Victoria, New South wales (South), Tasmania (N...",48.8,°C,2009,1,347.0,2009-01-27,2009-02-08,12 days,12
33,Heat wave,Australia,Australia and New Zealand,Oceania,"Western Australia, Victoria provinces",52.0,°C,2014,1,139.0,2014-01-13,2014-01-18,5 days,5


In [13]:
heat_deaths = heat_deaths.dropna(subset=['temperature'])

In [14]:
heat_deaths_no_usa = heat_deaths[heat_deaths['country']!='United States of America']

In [47]:
df_hot = pd.merge(left= heat_deaths_no_usa.drop(columns=['disaster_subtype', 'magnitude_scale']), right= dataframe_no_usa.drop(columns=['location','exact_latitude', 'exact_longitude']), how= 'left', on='country')

In [55]:
df_hot = df_hot.reset_index(drop=True)

In [54]:
delta_temps = []

for i in df_hot.index:
    try:
        delta_C, delta_std = temp_change(df_hot,i)
        delta_temps.append({'delta_temp_C': delta_C, 'delta_temp_std': delta_std})
    except:
        pass

In [56]:
hot_df = pd.merge(left= df_hot, right= pd.DataFrame(delta_temps), left_index=True, right_index=True)

In [58]:
hot_df.to_csv('../data/hot_final_dataset.csv')

In [None]:
{'country': 'United States of America', 'location': 'Connecticut, Delaware, Maryland, Maine, New Ha...', 'exact_latitude': 40, 'exact_longitude': -80, 'latitude': 40, 'longitude':-80},
    {'country': 'United States of America', 'location': 'New York, Minnesota, Georgia, Maryland, Alaska...', 'latitude': 40, 'longitude':-90},
    {'country': 'United States of America', 'location': 'Nebraska, Kansas',  'latitude': 40, 'longitude': -100},
    {'country': 'United States of America', 'location': 'Utah',  'latitude': 40, 'longitude':-110},
    {'country': 'United States of America', 'location': 'California, Nevada', 'latitude': 40, 'longitude':-120},
    {'country': 'United States of America', 'location': 'Louisiana', 'latitude': 30, 'longitude':-90},
    {'country': 'United States of America', 'location': 'Texas, Pennsylvania, Oklahoma, Kentucky, Misso...',  'latitude': 30, 'longitude':-100},
    {'country': 'United States of America', 'location': 'Alaska', 'latitude': 60, 'longitude':-160},
    {'country': 'United States of America', 'location': 'Alaska',  'latitude': 70, 'longitude':-160},
    {'country': 'United States of America', 'location': 'Alaska',  'latitude': 60, 'longitude':-150},
    {'country': 'United States of America', 'location': 'Alaska',  'latitude': 70, 'longitude':-150},
    {'country': 'United States of America', 'location': 'Alaska',  'latitude': 60, 'longitude':-140},

In [21]:
data = [
    {'country': 'Afghanistan', 'location': 'Kabul', 'exact_latitude': 34.555, 'exact_longitude':69.2, 'latitude': 30, 'longitude': 70}, 
    {'country': 'Poland', 'location': 'Warsaw', 'exact_latitude': 52.2, 'exact_longitude': 21.0, 'latitude': 50, 'longitude': 20}, 
    {'country': 'Peru', 'exact_latitude': -9.2, 'exact_longitude': -75, 'latitude': -10, 'longitude':-70}, 
    {'country': 'Germany',  'exact_latitude': 51, 'exact_longitude': 10.5, 'latitude': 50, 'longitude': 10}, 
    {'country': 'France', 'exact_latitude': 46.2, 'exact_longitude': 2.2, 'latitude': 50, 'longitude':0}, 
    {'country': 'South Africa', 'latitude': -30, 'longitude': 30}, 
    {'country': 'India',  'latitude': 20, 'longitude': 80}, 
    {'country': 'Romania',  'exact_latitude': 45, 'exact_longitude': 25, 'latitude': 50, 'longitude':20}, 
    {'country': 'United Kingdom of Great Britain and Northern Ireland',  'latitude': 51, 'longitude':0}, 
    {'country': 'Bangladesh',  'exact_latitude': 25, 'exact_longitude': 90, 'latitude': 30, 'longitude': 90}, 
    {'country': 'Russian Federation', 'latitude': 60, 'longitude': 30}, 
    {'country': 'Lithuania',  'exact_latitude': 55, 'exact_longitude': 24, 'latitude': 60, 'longitude': 30}, 
    {'country': 'China',  'exact_latitude': 30, 'exact_longitude': 110, 'latitude': 30, 'longitude': 110}, 
    {'country': 'Mexico', 'latitude': 20, 'longitude':-100}, 
    {'country': 'Nepal', 'exact_latitude': 28.5, 'exact_longitude': 84, 'latitude': 30, 'longitude': 80}, 
    {'country': 'Bosnia and Herzegovina',  'exact_latitude': 44, 'exact_longitude': 18, 'latitude': 40, 'longitude': 20}, 
    {'country': 'Republic of Moldova', 'exact_latitude': 47, 'exact_longitude': 29, 'latitude': 50, 'longitude':30}, 
    {'country': 'Bulgaria', 'exact_latitude': 43, 'exact_longitude': 26, 'latitude': 40, 'longitude':30}, 
    {'country': 'Switzerland', 'exact_latitude': 47, 'exact_longitude': 8, 'latitude': 50, 'longitude':10}, 
    {'country': 'Croatia', 'exact_latitude': 45, 'exact_longitude': 15, 'latitude': 40, 'longitude': 20}, 
    {'country': 'Serbia', 'exact_latitude': 44, 'exact_longitude': 21, 'latitude': 40, 'longitude': 20},
    {'country': 'Ukraine', 'exact_latitude': 50, 'exact_longitude': 30, 'latitude': 50, 'longitude': 30},
    {'country': 'Kyrgyzstan', 'exact_latitude': 41, 'exact_longitude': 75, 'latitude': 40, 'longitude': 70},
    {'country': 'Japan', 'exact_latitude': 36, 'exact_longitude': 138, 'latitude': 40, 'longitude': 140},
    {'country': 'Republic of Korea', 'exact_latitude': 36, 'exact_longitude': 127, 'latitude': 30, 'longitude':120},
    {'country': 'Thailand', 'exact_latitude': 16, 'exact_longitude': 101, 'latitude': 20, 'longitude': 100},
    {'country': 'Taiwan (Province of China)', 'exact_latitude': 24, 'exact_longitude': 121, 'latitude': 30, 'longitude': 120},
    {'country': 'Czechia', 'exact_latitude': 50, 'exact_longitude': 15, 'latitude': 50, 'longitude': 20},
    {'country': 'Estonia', 'exact_latitude': 59, 'exact_longitude': 25, 'latitude': 60, 'longitude': 30},
    {'country': 'Italy', 'exact_latitude': 42, 'exact_longitude': 13, 'latitude': 40, 'longitude': 20},
    {'country': 'Algeria', 'exact_latitude': 30, 'exact_longitude': 0, 'latitude': 30, 'longitude':0},
    {'country': 'State of Palestine',  'exact_latitude': 32, 'exact_longitude': 35, 'latitude': 30, 'longitude': 40},
    {'country': 'Mongolia', 'exact_latitude': 47, 'exact_longitude': 105, 'latitude': 50, 'longitude':100},
    {'country': 'Pakistan', 'exact_latitude': 30.4, 'exact_longitude': 69.3, 'latitude': 30, 'longitude': 70},
    {'country': 'Belgium', 'exact_latitude': 50.5, 'exact_longitude': 4.47, 'latitude': 50, 'longitude': 10},
    {'country': 'Spain', 'exact_latitude': 40.5, 'exact_longitude': 3.7, 'latitude': 40, 'longitude': 0},
    {'country': 'Netherlands (Kingdom of the)', 'exact_latitude': 52.1, 'exact_longitude': 5.3, 'latitude': 50, 'longitude': 10},
    {'country': 'Portugal', 'exact_latitude': 39.4, 'exact_longitude': -8.22, 'latitude': 40, 'longitude': 0},
    {'country': 'Canary Islands', 'exact_latitude': 28.3, 'exact_longitude': -16.6, 'latitude': 20, 'longitude': -10},
    {'country': 'Slovakia', 'exact_latitude': 48.7, 'exact_longitude': 19.7, 'latitude': 50, 'longitude': 20},
    {'country': 'Sudan', 'exact_latitude': 12.8, 'exact_longitude': 30, 'latitude': 20, 'longitude': 30},
    {'country': 'Austria', 'exact_latitude': 47, 'exact_longitude': 134, 'latitude': 50, 'longitude': 10},
    {'country': 'Albania', 'exact_latitude': 41, 'exact_longitude': 20, 'latitude': 40, 'longitude': 20},
    {'country': 'Switzerland', 'exact_latitude': 46.8, 'exact_longitude': 8, 'latitude': 50, 'longitude': 10},
    {'country': 'Cyprus', 'exact_latitude': 35, 'exact_longitude': 33, 'latitude': 40, 'longitude': 30},
    {'country': 'Denmark', 'exact_latitude': 56, 'exact_longitude': 9.5, 'latitude': 50, 'longitude': 10},
    {'country': 'Estonia', 'exact_latitude': 58.6, 'exact_longitude': 25, 'latitude': 60, 'longitude': 30},
    {'country': 'Finland', 'exact_latitude': 61.9, 'exact_longitude': 25.7, 'latitude': 60, 'longitude': 30},
    {'country': 'Greece', 'exact_latitude': 39.1, 'exact_longitude': 21.8, 'latitude': 40, 'longitude': 20},
    {'country': 'Hungary', 'exact_latitude': 47.2, 'exact_longitude': 19.5, 'latitude': 50, 'longitude': 20},
    {'country': 'Ireland', 'exact_latitude': 53.8, 'exact_longitude': -7.3, 'latitude': 51, 'longitude': 0},
    {'country': 'Lithuania', 'exact_latitude': 55.2, 'exact_longitude': 23.9, 'latitude': 50, 'longitude': 20},
    {'country': 'Luxembourg', 'exact_latitude': 49.8, 'exact_longitude': 6.1, 'latitude': 50, 'longitude': 10},
    {'country': 'Latvia', 'exact_latitude': 56.9, 'exact_longitude': 24.6, 'latitude': 50, 'longitude': 20},
    {'country': 'Malta', 'exact_latitude': 35.9, 'exact_longitude': 14.4, 'latitude': 40, 'longitude': 20},
    {'country': 'Montenegro', 'exact_latitude': 42.7, 'exact_longitude': 19.4, 'latitude': 40, 'longitude': 20},
    {'country': 'Norway', 'exact_latitude': 60.5, 'exact_longitude': 8.5, 'latitude': 60, 'longitude': 10},
    {'country': 'Poland', 'exact_latitude': 51.9, 'exact_longitude': 19.1, 'latitude': 50, 'longitude': 20},
    {'country': 'Serbia', 'exact_latitude': 44, 'exact_longitude': 21, 'latitude': 40, 'longitude': 20},
    {'country': 'Slovenia', 'exact_latitude': 46.2, 'exact_longitude': 15, 'latitude': 50, 'longitude': 20},
    {'country': 'Sweden', 'exact_latitude': 60.1, 'exact_longitude': 18.6, 'latitude': 60, 'longitude': 20},
    {'country': 'Myanmar', 'exact_latitude': 21.9, 'exact_longitude': 95.95, 'latitude': 20, 'longitude': 100},
    {'country': 'Saudi Arabia', 'exact_latitude': 23.9, 'exact_longitude': 45.1, 'latitude': 20, 'longitude': 50},
    {'country': 'Morocco', 'exact_latitude': 31.8, 'exact_longitude': -7.1, 'latitude': 30, 'longitude': 0}
]

In [None]:
['Pakistan', 'Belgium',  'Spain',
       'Netherlands (Kingdom of the)', 'Portugal',
        'Canary Islands', 'United States of America', 'Romania',
        'Slovakia', 'Australia', 'China', 'Sudan',
        'Canada',
       'United Kingdom of Great Britain and Northern Ireland', 'Austria',
       'Albania', 'Bulgaria', 'Switzerland', 'Cyprus', 'Denmark',
       'Estonia', 'Finland', 'Greece', 'Croatia', 'Hungary', 'Ireland',
       'Lithuania', 'Luxembourg', 'Latvia', 'Malta', 'Montenegro',
       'Norway', 'Poland', 'Serbia', 'Slovenia', 'Sweden', 'Myanmar',
       'Mexico', 'Saudi Arabia', 'Morocco', 'Republic of Korea']

## Cold Deaths

In [17]:
cold_deaths = pd.read_csv('../data/extreme_temp/cold_deaths_reported.csv').drop(columns= ['Unnamed: 0'])

In [None]:
cold_deaths.head()

In [4]:
cold_deaths.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75 entries, 0 to 74
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   disaster_subtype  75 non-null     object 
 1   country           75 non-null     object 
 2   subregion         75 non-null     object 
 3   region            75 non-null     object 
 4   location          67 non-null     object 
 5   temperature       36 non-null     float64
 6   magnitude_scale   75 non-null     object 
 7   start_year        75 non-null     int64  
 8   start_month       75 non-null     int64  
 9   total_deaths      75 non-null     float64
 10  start_date        75 non-null     object 
 11  end_date          75 non-null     object 
 12  duration_days     75 non-null     object 
dtypes: float64(2), int64(2), object(9)
memory usage: 7.7+ KB


In [18]:
for i in cold_deaths.index:
    cold_deaths.loc[i, 'duration'] = (cold_deaths.loc[i,'duration_days'].split(' ')[0])

In [22]:
dataframe = pd.DataFrame(data)

In [23]:
dataframe.to_csv('../data/extreme_temp/locations_lat_long.csv', index=False)

In [30]:
cold_deaths[cold_deaths['country']=='United States of America']['location']

3     Connecticut, Delaware, Maryland, Maine, New Ha...
11    Texas, Pennsylvania, Oklahoma, Kentucky, Misso...
48    New York, Minnesota, Georgia, Maryland, Alaska...
Name: location, dtype: object

USA 48: covers longitude: -125 to -70, 30 to 48 latitude
Alaska includes 60 to 70 latitude and -140 to -165 longitude

In [20]:
cold_deaths[cold_deaths['country']=='Russian Federation']['location']

19    Moskva, Moskovskaya Oblast, Sankt-peterburg, L...
41    Khabarovsk city (Khabarovskiy Kray province), ...
Name: location, dtype: object

Moskva is Moscow at about 55.5 latitude and 37.5 longitude.  
St. Petersburg is 60 latitude, 30 longitude.

In [24]:
cold_deaths_no_usa = cold_deaths[cold_deaths['country']!='United States of America']
dataframe_no_usa = dataframe[dataframe['country']!='United States of America']

In [25]:
cold_deaths

Unnamed: 0,disaster_subtype,country,subregion,region,location,temperature,magnitude_scale,start_year,start_month,total_deaths,start_date,end_date,duration_days,duration
0,Cold wave,Afghanistan,Southern Asia,Asia,"Hirat, Faryab, Jawzjan, Balkh, Samangan, Sar-e...",-25.0,°C,2001,1,150.0,2001-01-31,2001-02-20,20 days,20
1,Cold wave,Poland,Eastern Europe,Europe,"Dolnoslaskie, Kujawsko-Pomorskie, Lodzkie, Lub...",-25.0,°C,2001,10,270.0,2001-10-10,2002-01-20,102 days,102
2,Cold wave,Peru,Latin America and the Caribbean,Americas,"Ayacucho, Huancavelica, Tacna, Moquegua, Arequ...",-28.0,°C,2003,7,339.0,2003-07-07,2003-08-15,39 days,39
3,Cold wave,United States of America,Northern America,Americas,"Connecticut, Delaware, Maryland, Maine, New Ha...",,°C,2004,1,3.0,2004-01-09,2004-01-12,3 days,3
4,Cold wave,Germany,Western Europe,Europe,Nordrhein-Westfalen province,,°C,2005,11,1.0,2005-11-25,2005-11-27,2 days,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70,Cold wave,Romania,Eastern Europe,Europe,,,°C,2018,2,2.0,2018-02-23,2018-02-28,5 days,5
71,Cold wave,Algeria,Northern Africa,Africa,"Tizi Ouzou, Medea, Setif, Bourdj Bouriridj, Ba...",,°C,2019,1,8.0,2019-01-11,2019-03-22,70 days,70
72,Cold wave,State of Palestine,Western Asia,Asia,West Bank and Gaza Strip,0.0,°C,2020,1,3.0,2020-01-18,2020-02-05,18 days,18
73,Cold wave,Afghanistan,Southern Asia,Asia,"Ghor, Baghlan, Sar-e-Pul, Faryab, Takhar, Kand...",-33.0,°C,2023,1,166.0,2023-01-09,2023-01-23,14 days,14


In [26]:
df_cold = pd.merge(left= cold_deaths_no_usa.drop(columns=['disaster_subtype', 'magnitude_scale']), right= dataframe_no_usa.drop(columns=['location','exact_latitude', 'exact_longitude']), how= 'left', on='country')

In [27]:
df_cold

Unnamed: 0,country,subregion,region,location,temperature,start_year,start_month,total_deaths,start_date,end_date,duration_days,duration,latitude,longitude
0,Afghanistan,Southern Asia,Asia,"Hirat, Faryab, Jawzjan, Balkh, Samangan, Sar-e...",-25.0,2001,1,150.0,2001-01-31,2001-02-20,20 days,20,30,70
1,Poland,Eastern Europe,Europe,"Dolnoslaskie, Kujawsko-Pomorskie, Lodzkie, Lub...",-25.0,2001,10,270.0,2001-10-10,2002-01-20,102 days,102,50,20
2,Poland,Eastern Europe,Europe,"Dolnoslaskie, Kujawsko-Pomorskie, Lodzkie, Lub...",-25.0,2001,10,270.0,2001-10-10,2002-01-20,102 days,102,50,20
3,Peru,Latin America and the Caribbean,Americas,"Ayacucho, Huancavelica, Tacna, Moquegua, Arequ...",-28.0,2003,7,339.0,2003-07-07,2003-08-15,39 days,39,-10,-70
4,Germany,Western Europe,Europe,Nordrhein-Westfalen province,,2005,11,1.0,2005-11-25,2005-11-27,2 days,2,50,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82,Romania,Eastern Europe,Europe,,,2018,2,2.0,2018-02-23,2018-02-28,5 days,5,50,20
83,Algeria,Northern Africa,Africa,"Tizi Ouzou, Medea, Setif, Bourdj Bouriridj, Ba...",,2019,1,8.0,2019-01-11,2019-03-22,70 days,70,30,0
84,State of Palestine,Western Asia,Asia,West Bank and Gaza Strip,0.0,2020,1,3.0,2020-01-18,2020-02-05,18 days,18,30,40
85,Afghanistan,Southern Asia,Asia,"Ghor, Baghlan, Sar-e-Pul, Faryab, Takhar, Kand...",-33.0,2023,1,166.0,2023-01-09,2023-01-23,14 days,14,30,70


In [29]:
df_cold[df_cold['start_year']<2020].dropna(subset=['temperature'])

Unnamed: 0,country,subregion,region,location,temperature,start_year,start_month,total_deaths,start_date,end_date,duration_days,duration,latitude,longitude
0,Afghanistan,Southern Asia,Asia,"Hirat, Faryab, Jawzjan, Balkh, Samangan, Sar-e...",-25.0,2001,1,150.0,2001-01-31,2001-02-20,20 days,20,30,70
1,Poland,Eastern Europe,Europe,"Dolnoslaskie, Kujawsko-Pomorskie, Lodzkie, Lub...",-25.0,2001,10,270.0,2001-10-10,2002-01-20,102 days,102,50,20
2,Poland,Eastern Europe,Europe,"Dolnoslaskie, Kujawsko-Pomorskie, Lodzkie, Lub...",-25.0,2001,10,270.0,2001-10-10,2002-01-20,102 days,102,50,20
3,Peru,Latin America and the Caribbean,Americas,"Ayacucho, Huancavelica, Tacna, Moquegua, Arequ...",-28.0,2003,7,339.0,2003-07-07,2003-08-15,39 days,39,-10,-70
6,South Africa,Sub-Saharan Africa,Africa,"Gauteng, Eastern Cape provinces",0.0,2007,5,22.0,2007-05-21,2007-05-27,6 days,6,-30,30
9,Poland,Eastern Europe,Europe,"Dolnoslaskie, Kujawsko-Pomorskie, Lodzkie, Lub...",-25.0,2008,11,82.0,2008-11-01,2009-01-08,68 days,68,50,20
10,Poland,Eastern Europe,Europe,"Dolnoslaskie, Kujawsko-Pomorskie, Lodzkie, Lub...",-25.0,2008,11,82.0,2008-11-01,2009-01-08,68 days,68,50,20
11,Romania,Eastern Europe,Europe,"Alba, Arad, Bihor, Bistrita-nasaud, Brasov, Bu...",-31.0,2009,1,43.0,2009-01-01,2009-01-15,14 days,14,50,20
12,Germany,Western Europe,Europe,"Bayern, Berlin, Hamburg provinces, Leipzig dis...",-33.6,2009,12,14.0,2009-12-18,2010-01-25,38 days,38,50,10
14,Poland,Eastern Europe,Europe,"Dolnoslaskie, Kujawsko-Pomorskie, Lodzkie, Lub...",-35.0,2009,11,298.0,2009-11-01,2010-01-26,86 days,86,50,20


In [30]:
df_cold[df_cold['start_year']>=2020]

Unnamed: 0,country,subregion,region,location,temperature,start_year,start_month,total_deaths,start_date,end_date,duration_days,duration,latitude,longitude
84,State of Palestine,Western Asia,Asia,West Bank and Gaza Strip,0.0,2020,1,3.0,2020-01-18,2020-02-05,18 days,18,30,40
85,Afghanistan,Southern Asia,Asia,"Ghor, Baghlan, Sar-e-Pul, Faryab, Takhar, Kand...",-33.0,2023,1,166.0,2023-01-09,2023-01-23,14 days,14,30,70
86,Mongolia,Eastern Asia,Asia,"Dornod, Dornogovi, Dundgovi, Govi-Altai, Khent...",,2023,5,2.0,2023-05-19,2023-05-22,3 days,3,50,100


In [129]:
np.mean([-5.9440, -5.3286,0.0471, 2.3412])

-2.221075

In [31]:
df_cold.loc[71,'temperature'] = -2.22

In [32]:
df_cold = df_cold.dropna(subset='temperature')

In [33]:
df_cold.dtypes

country           object
subregion         object
region            object
location          object
temperature      float64
start_year         int64
start_month        int64
total_deaths     float64
start_date        object
end_date          object
duration_days     object
duration          object
latitude           int64
longitude          int64
dtype: object

In [52]:
def temp_change(df, i):
    """
    takes a dataframe, df, and an index, i

    returns the average difference between the temperature in the data row and the extreme temperature in both degrees Celsius and standard deviations
    """
    lat = int(df.loc[i,'latitude'])
    long = int(df.loc[i,'longitude'])
    start_date = dt.datetime.fromisoformat(df.loc[i,'start_date'])
    end_date = dt.datetime.fromisoformat(df.loc[i,'end_date'])
    temp = df.loc[i,'temperature']
    #duration = df.loc[i,'duration']
    
    monthly_sum = pd.read_csv(f"../data/temp_events/monthly_summary_lat={lat}_long={long}.csv", index_col='month', usecols=['month', 'min_std', 'min_extreme'])
    
    
    dates = pd.date_range(start= start_date, end= end_date)
    delta_C = []
    delta_std = []
    
    for date in dates:
        delta_temp_C = temp - monthly_sum.loc[date.month, 'min_extreme']
        delta_C.append(delta_temp_C)
        
        delta_std.append(delta_temp_C / monthly_sum.loc[date.month, 'min_std'])

    return np.mean(delta_C), np.mean(delta_std)

In [42]:
temp_change(df_cold,71)

(7.573735382951979, 1.9008804003733517)

In [35]:
delta_temps = []

for i in df_cold.index:
    delta_C, delta_std = temp_change(df_cold,i)
    delta_temps.append({'delta_temp_C': delta_C, 'delta_temp_std': delta_std})

In [39]:
df_cold = df_cold.reset_index(drop=True)

In [36]:
cold_df = pd.merge(left= df_cold, right= pd.DataFrame(delta_temps), left_index=True, right_index=True)

In [37]:
cold_df

Unnamed: 0,country,subregion,region,location,temperature,start_year,start_month,total_deaths,start_date,end_date,duration_days,duration,latitude,longitude,delta_temp_C,delta_temp_std
0,Afghanistan,Southern Asia,Asia,"Hirat, Faryab, Jawzjan, Balkh, Samangan, Sar-e...",-25.0,2001,1,150.0,2001-01-31,2001-02-20,20 days,20,30,70,-24.856821,-8.561158
1,Poland,Eastern Europe,Europe,"Dolnoslaskie, Kujawsko-Pomorskie, Lodzkie, Lub...",-25.0,2001,10,270.0,2001-10-10,2002-01-20,102 days,102,50,20,-15.941674,-4.306348
2,Poland,Eastern Europe,Europe,"Dolnoslaskie, Kujawsko-Pomorskie, Lodzkie, Lub...",-25.0,2001,10,270.0,2001-10-10,2002-01-20,102 days,102,50,20,-15.941674,-4.306348
3,Peru,Latin America and the Caribbean,Americas,"Ayacucho, Huancavelica, Tacna, Moquegua, Arequ...",-28.0,2003,7,339.0,2003-07-07,2003-08-15,39 days,39,-10,-70,-43.303703,-20.441972
6,South Africa,Sub-Saharan Africa,Africa,"Gauteng, Eastern Cape provinces",0.0,2007,5,22.0,2007-05-21,2007-05-27,6 days,6,-30,30,-14.382446,-3.69479
9,Poland,Eastern Europe,Europe,"Dolnoslaskie, Kujawsko-Pomorskie, Lodzkie, Lub...",-25.0,2008,11,82.0,2008-11-01,2009-01-08,68 days,68,50,20,-23.757152,-5.926544
10,Poland,Eastern Europe,Europe,"Dolnoslaskie, Kujawsko-Pomorskie, Lodzkie, Lub...",-25.0,2008,11,82.0,2008-11-01,2009-01-08,68 days,68,50,20,-23.757152,-5.926544
11,Romania,Eastern Europe,Europe,"Alba, Arad, Bihor, Bistrita-nasaud, Brasov, Bu...",-31.0,2009,1,43.0,2009-01-01,2009-01-15,14 days,14,50,20,-7.234823,-1.665499
12,Germany,Western Europe,Europe,"Bayern, Berlin, Hamburg provinces, Leipzig dis...",-33.6,2009,12,14.0,2009-12-18,2010-01-25,38 days,38,50,10,23.619512,6.039956
14,Poland,Eastern Europe,Europe,"Dolnoslaskie, Kujawsko-Pomorskie, Lodzkie, Lub...",-35.0,2009,11,298.0,2009-11-01,2010-01-26,86 days,86,50,20,-20.360192,-4.593242


In [51]:
cold_df.columns

Index(['country', 'subregion', 'region', 'location', 'temperature',
       'start_year', 'start_month', 'total_deaths', 'start_date', 'end_date',
       'duration_days', 'duration', 'latitude', 'longitude', 'delta_temp_C',
       'delta_temp_std'],
      dtype='object')

In [38]:
X = cold_df.drop(columns=['country','subregion','location', 'start_date', 'end_date', 'duration_days', 'latitude', 'longitude', 'total_deaths', 'temperature'])

In [39]:
y = cold_df['total_deaths']

In [40]:
X

Unnamed: 0,region,start_year,start_month,duration,delta_temp_C,delta_temp_std
0,Asia,2001,1,20,-24.856821,-8.561158
1,Europe,2001,10,102,-15.941674,-4.306348
2,Europe,2001,10,102,-15.941674,-4.306348
3,Americas,2003,7,39,-43.303703,-20.441972
6,Africa,2007,5,6,-14.382446,-3.69479
9,Europe,2008,11,68,-23.757152,-5.926544
10,Europe,2008,11,68,-23.757152,-5.926544
11,Europe,2009,1,14,-7.234823,-1.665499
12,Europe,2009,12,38,23.619512,6.039956
14,Europe,2009,11,86,-20.360192,-4.593242


## Model Preprocessing

In [41]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=92)

In [42]:
ct = ColumnTransformer([
    ('ss', StandardScaler(), ['duration', 'delta_temp_C', 'delta_temp_std']),
    ('ohe', OneHotEncoder(drop='first', handle_unknown='ignore'), ['region', 'start_year', 'start_month'])
], remainder='passthrough')

In [43]:
pipe_lr = Pipeline([
    ('ct', ct),
    ('lr', LinearRegression())
])

In [44]:
pipe_lr.fit(X_train, y_train)

In [45]:
pipe_lr.score(X_train, y_train)

0.9999999999997234

In [46]:
pipe_lr.score(X_test, y_test)

-13.707267854352187