# Modeling

In [70]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.neighbors import KNeighborsRegressor

## Cold Deaths

In [2]:
cold_deaths = pd.read_csv('../data/extreme_temp/cold_deaths_reported.csv').drop(columns= ['Unnamed: 0'])

In [3]:
cold_deaths.head(30)

Unnamed: 0,disaster_subtype,country,subregion,region,location,temperature,magnitude_scale,start_year,start_month,total_deaths,start_date,end_date,duration_days
0,Cold wave,Afghanistan,Southern Asia,Asia,"Hirat, Faryab, Jawzjan, Balkh, Samangan, Sar-e...",-25.0,°C,2001,1,150.0,2001-01-31,2001-02-20,20 days
1,Cold wave,Poland,Eastern Europe,Europe,"Dolnoslaskie, Kujawsko-Pomorskie, Lodzkie, Lub...",-25.0,°C,2001,10,270.0,2001-10-10,2002-01-20,102 days
2,Cold wave,Peru,Latin America and the Caribbean,Americas,"Ayacucho, Huancavelica, Tacna, Moquegua, Arequ...",-28.0,°C,2003,7,339.0,2003-07-07,2003-08-15,39 days
3,Cold wave,United States of America,Northern America,Americas,"Connecticut, Delaware, Maryland, Maine, New Ha...",,°C,2004,1,3.0,2004-01-09,2004-01-12,3 days
4,Cold wave,Germany,Western Europe,Europe,Nordrhein-Westfalen province,,°C,2005,11,1.0,2005-11-25,2005-11-27,2 days
5,Cold wave,France,Western Europe,Europe,"Alsace, Aquitaine, Auvergne, Basse-Normandie, ...",,°C,2005,11,6.0,2005-11-23,2005-11-27,4 days
6,Cold wave,South Africa,Sub-Saharan Africa,Africa,"Gauteng, Eastern Cape provinces",0.0,°C,2007,5,22.0,2007-05-21,2007-05-27,6 days
7,Cold wave,India,Southern Asia,Asia,Jammu and Kashmir province,,°C,2007,3,66.0,2007-03-11,2007-03-14,3 days
8,Cold wave,France,Western Europe,Europe,"Ile-de-France, Nord-Pas-de-Calais, Provence-Al...",,°C,2009,1,2.0,2009-01-06,2009-01-08,2 days
9,Cold wave,Poland,Eastern Europe,Europe,"Dolnoslaskie, Kujawsko-Pomorskie, Lodzkie, Lub...",-25.0,°C,2008,11,82.0,2008-11-01,2009-01-08,68 days


In [4]:
cold_deaths.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75 entries, 0 to 74
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   disaster_subtype  75 non-null     object 
 1   country           75 non-null     object 
 2   subregion         75 non-null     object 
 3   region            75 non-null     object 
 4   location          67 non-null     object 
 5   temperature       36 non-null     float64
 6   magnitude_scale   75 non-null     object 
 7   start_year        75 non-null     int64  
 8   start_month       75 non-null     int64  
 9   total_deaths      75 non-null     float64
 10  start_date        75 non-null     object 
 11  end_date          75 non-null     object 
 12  duration_days     75 non-null     object 
dtypes: float64(2), int64(2), object(9)
memory usage: 7.7+ KB


In [5]:
for i in cold_deaths.index:
    cold_deaths.loc[i, 'duration'] = (cold_deaths.loc[i,'duration_days'].split(' ')[0])

In [6]:
data = [
    {'country': 'Afghanistan', 'location': 'Kabul', 'exact_latitude': 34.555, 'exact_longitude':69.2, 'latitude': 30, 'longitude': 70}, 
    {'country': 'Poland', 'location': 'Warsaw', 'exact_latitude': 52.2, 'exact_longitude': 21.0, 'latitude': 50, 'longitude': 20}, 
    {'country': 'Peru', 'exact_latitude': -9.2, 'exact_longitude': -75, 'latitude': -10, 'longitude':-70}, 
    {'country': 'United States of America', 'location': 'Connecticut, Delaware, Maryland, Maine, New Ha...', 'exact_latitude': 40, 'exact_longitude': -80, 'latitude': 40, 'longitude':-80},
    {'country': 'United States of America', 'location': 'New York, Minnesota, Georgia, Maryland, Alaska...', 'latitude': 40, 'longitude':-90},
    {'country': 'United States of America', 'location': 'Nebraska, Kansas',  'latitude': 40, 'longitude': -100},
    {'country': 'United States of America', 'location': 'Utah',  'latitude': 40, 'longitude':-110},
    {'country': 'United States of America', 'location': 'California, Nevada', 'latitude': 40, 'longitude':-120},
    {'country': 'United States of America', 'location': 'Louisiana', 'latitude': 30, 'longitude':-90},
    {'country': 'United States of America', 'location': 'Texas, Pennsylvania, Oklahoma, Kentucky, Misso...',  'latitude': 30, 'longitude':-100},
    {'country': 'United States of America', 'location': 'Alaska', 'latitude': 60, 'longitude':-160},
    {'country': 'United States of America', 'location': 'Alaska',  'latitude': 70, 'longitude':-160},
    {'country': 'United States of America', 'location': 'Alaska',  'latitude': 60, 'longitude':-150},
    {'country': 'United States of America', 'location': 'Alaska',  'latitude': 70, 'longitude':-150},
    {'country': 'United States of America', 'location': 'Alaska',  'latitude': 60, 'longitude':-140},
    {'country': 'Germany',  'exact_latitude': 51, 'exact_longitude': 10.5, 'latitude': 50, 'longitude': 10}, 
    {'country': 'France', 'exact_latitude': 46.2, 'exact_longitude': 2.2, 'latitude': 50, 'longitude':0}, 
    {'country': 'South Africa', 'latitude': -30, 'longitude': 30}, 
    {'country': 'India',  'latitude': 20, 'longitude': 80}, 
    {'country': 'Romania',  'exact_latitude': 45, 'exact_longitude': 25, 'latitude': 50, 'longitude':20}, 
    {'country': 'United Kingdom of Great Britain and Northern Ireland',  'latitude': 51, 'longitude':0}, 
    {'country': 'Bangladesh',  'exact_latitude': 25, 'exact_longitude': 90, 'latitude': 30, 'longitude': 90}, 
    {'country': 'Russian Federation', 'latitude': 60, 'longitude': 30}, 
    {'country': 'Lithuania',  'exact_latitude': 55, 'exact_longitude': 24, 'latitude': 60, 'longitude': 30}, 
    {'country': 'China',  'exact_latitude': 30, 'exact_longitude': 110, 'latitude': 30, 'longitude': 110}, 
    {'country': 'Mexico', 'latitude': 20, 'longitude':-100}, 
    {'country': 'Nepal', 'exact_latitude': 28.5, 'exact_longitude': 84, 'latitude': 30, 'longitude': 80}, 
    {'country': 'Bosnia and Herzegovina',  'exact_latitude': 44, 'exact_longitude': 18, 'latitude': 40, 'longitude': 20}, 
    {'country': 'Republic of Moldova', 'exact_latitude': 47, 'exact_longitude': 29, 'latitude': 50, 'longitude':30}, 
    {'country': 'Bulgaria', 'exact_latitude': 43, 'exact_longitude': 26, 'latitude': 40, 'longitude':30}, 
    {'country': 'Switzerland', 'exact_latitude': 47, 'exact_longitude': 8, 'latitude': 50, 'longitude':10}, 
    {'country': 'Croatia', 'exact_latitude': 45, 'exact_longitude': 15, 'latitude': 40, 'longitude': 20}, 
    {'country': 'Serbia', 'exact_latitude': 44, 'exact_longitude': 21, 'latitude': 40, 'longitude': 20},
    {'country': 'Ukraine', 'exact_latitude': 50, 'exact_longitude': 30, 'latitude': 50, 'longitude': 30},
    {'country': 'Kyrgyzstan', 'exact_latitude': 41, 'exact_longitude': 75, 'latitude': 40, 'longitude': 70},
    {'country': 'Japan', 'exact_latitude': 36, 'exact_longitude': 138, 'latitude': 40, 'longitude': 140},
    {'country': 'Republic of Korea', 'exact_latitude': 36, 'exact_longitude': 127, 'latitude': 30, 'longitude':120},
    {'country': 'Thailand', 'exact_latitude': 16, 'exact_longitude': 101, 'latitude': 20, 'longitude': 100},
    {'country': 'Taiwan (Province of China)', 'exact_latitude': 24, 'exact_longitude': 121, 'latitude': 30, 'longitude': 120},
    {'country': 'Czechia', 'exact_latitude': 50, 'exact_longitude': 15, 'latitude': 50, 'longitude': 20},
    {'country': 'Estonia', 'exact_latitude': 59, 'exact_longitude': 25, 'latitude': 60, 'longitude': 30},
    {'country': 'Italy', 'exact_latitude': 42, 'exact_longitude': 13, 'latitude': 40, 'longitude': 20},
    {'country': 'Algeria', 'exact_latitude': 30, 'exact_longitude': 0, 'latitude': 30, 'longitude':0},
    {'country': 'State of Palestine',  'exact_latitude': 32, 'exact_longitude': 35, 'latitude': 30, 'longitude': 40},
    {'country': 'Mongolia', 'exact_latitude': 47, 'exact_longitude': 105, 'latitude': 50, 'longitude':100}
]

In [7]:
dataframe = pd.DataFrame(data)

In [8]:
dataframe.to_csv('../data/extreme_temp/locations_lat_long.csv', index=False)

In [9]:
cold_deaths['country'].unique()

array(['Afghanistan', 'Poland', 'Peru', 'United States of America',
       'Germany', 'France', 'South Africa', 'India', 'Romania',
       'United Kingdom of Great Britain and Northern Ireland',
       'Bangladesh', 'Russian Federation', 'Lithuania', 'China', 'Mexico',
       'Nepal', 'Bosnia and Herzegovina', 'Republic of Moldova',
       'Bulgaria', 'Switzerland', 'Croatia', 'Serbia', 'Ukraine',
       'Kyrgyzstan', 'Japan', 'Republic of Korea', 'Thailand',
       'Taiwan (Province of China)', 'Czechia', 'Estonia', 'Italy',
       'Algeria', 'State of Palestine', 'Mongolia'], dtype=object)

In [10]:
cold_deaths[cold_deaths['country']=='United States of America']['location']

3     Connecticut, Delaware, Maryland, Maine, New Ha...
11    Texas, Pennsylvania, Oklahoma, Kentucky, Misso...
48    New York, Minnesota, Georgia, Maryland, Alaska...
Name: location, dtype: object

USA 48: covers longitude: -125 to -70, 30 to 48 latitude
Alaska includes 60 to 70 latitude and -140 to -165 longitude

In [20]:
cold_deaths[cold_deaths['country']=='Russian Federation']['location']

19    Moskva, Moskovskaya Oblast, Sankt-peterburg, L...
41    Khabarovsk city (Khabarovskiy Kray province), ...
Name: location, dtype: object

Moskva is Moscow at about 55.5 latitude and 37.5 longitude.  
St. Petersburg is 60 latitude, 30 longitude.

In [11]:
cold_countries = ['Afghanistan', 'Poland', 'Peru',
       'Germany', 'France', 'South Africa', 'India', 'Romania',
       'United Kingdom of Great Britain and Northern Ireland',
       'Bangladesh', 'Russian Federation', 'Lithuania', 'China', 'Mexico',
       'Nepal', 'Bosnia and Herzegovina', 'Republic of Moldova',
       'Bulgaria', 'Switzerland', 'Croatia', 'Serbia', 'Ukraine',
       'Kyrgyzstan', 'Japan', 'Republic of Korea', 'Thailand',
       'Taiwan (Province of China)', 'Czechia', 'Estonia', 'Italy',
       'Algeria', 'State of Palestine', 'Mongolia']

In [12]:
cold_deaths_no_usa = cold_deaths[cold_deaths['country']!='United States of America']
dataframe_no_usa = dataframe[dataframe['country']!='United States of America']

In [13]:
cold_deaths

Unnamed: 0,disaster_subtype,country,subregion,region,location,temperature,magnitude_scale,start_year,start_month,total_deaths,start_date,end_date,duration_days,duration
0,Cold wave,Afghanistan,Southern Asia,Asia,"Hirat, Faryab, Jawzjan, Balkh, Samangan, Sar-e...",-25.0,°C,2001,1,150.0,2001-01-31,2001-02-20,20 days,20
1,Cold wave,Poland,Eastern Europe,Europe,"Dolnoslaskie, Kujawsko-Pomorskie, Lodzkie, Lub...",-25.0,°C,2001,10,270.0,2001-10-10,2002-01-20,102 days,102
2,Cold wave,Peru,Latin America and the Caribbean,Americas,"Ayacucho, Huancavelica, Tacna, Moquegua, Arequ...",-28.0,°C,2003,7,339.0,2003-07-07,2003-08-15,39 days,39
3,Cold wave,United States of America,Northern America,Americas,"Connecticut, Delaware, Maryland, Maine, New Ha...",,°C,2004,1,3.0,2004-01-09,2004-01-12,3 days,3
4,Cold wave,Germany,Western Europe,Europe,Nordrhein-Westfalen province,,°C,2005,11,1.0,2005-11-25,2005-11-27,2 days,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70,Cold wave,Romania,Eastern Europe,Europe,,,°C,2018,2,2.0,2018-02-23,2018-02-28,5 days,5
71,Cold wave,Algeria,Northern Africa,Africa,"Tizi Ouzou, Medea, Setif, Bourdj Bouriridj, Ba...",,°C,2019,1,8.0,2019-01-11,2019-03-22,70 days,70
72,Cold wave,State of Palestine,Western Asia,Asia,West Bank and Gaza Strip,0.0,°C,2020,1,3.0,2020-01-18,2020-02-05,18 days,18
73,Cold wave,Afghanistan,Southern Asia,Asia,"Ghor, Baghlan, Sar-e-Pul, Faryab, Takhar, Kand...",-33.0,°C,2023,1,166.0,2023-01-09,2023-01-23,14 days,14


In [14]:
df = pd.merge(left= cold_deaths_no_usa.drop(columns=['disaster_subtype', 'magnitude_scale']), right= dataframe_no_usa.drop(columns=['location','exact_latitude', 'exact_longitude']), how= 'left', on='country')

In [15]:
df

Unnamed: 0,country,subregion,region,location,temperature,start_year,start_month,total_deaths,start_date,end_date,duration_days,duration,latitude,longitude
0,Afghanistan,Southern Asia,Asia,"Hirat, Faryab, Jawzjan, Balkh, Samangan, Sar-e...",-25.0,2001,1,150.0,2001-01-31,2001-02-20,20 days,20,30,70
1,Poland,Eastern Europe,Europe,"Dolnoslaskie, Kujawsko-Pomorskie, Lodzkie, Lub...",-25.0,2001,10,270.0,2001-10-10,2002-01-20,102 days,102,50,20
2,Peru,Latin America and the Caribbean,Americas,"Ayacucho, Huancavelica, Tacna, Moquegua, Arequ...",-28.0,2003,7,339.0,2003-07-07,2003-08-15,39 days,39,-10,-70
3,Germany,Western Europe,Europe,Nordrhein-Westfalen province,,2005,11,1.0,2005-11-25,2005-11-27,2 days,2,50,10
4,France,Western Europe,Europe,"Alsace, Aquitaine, Auvergne, Basse-Normandie, ...",,2005,11,6.0,2005-11-23,2005-11-27,4 days,4,50,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67,Romania,Eastern Europe,Europe,,,2018,2,2.0,2018-02-23,2018-02-28,5 days,5,50,20
68,Algeria,Northern Africa,Africa,"Tizi Ouzou, Medea, Setif, Bourdj Bouriridj, Ba...",,2019,1,8.0,2019-01-11,2019-03-22,70 days,70,30,0
69,State of Palestine,Western Asia,Asia,West Bank and Gaza Strip,0.0,2020,1,3.0,2020-01-18,2020-02-05,18 days,18,30,40
70,Afghanistan,Southern Asia,Asia,"Ghor, Baghlan, Sar-e-Pul, Faryab, Takhar, Kand...",-33.0,2023,1,166.0,2023-01-09,2023-01-23,14 days,14,30,70


In [16]:
df[df['start_year']<2020].dropna(subset=['temperature'])

Unnamed: 0,country,subregion,region,location,temperature,start_year,start_month,total_deaths,start_date,end_date,duration_days,duration,latitude,longitude
0,Afghanistan,Southern Asia,Asia,"Hirat, Faryab, Jawzjan, Balkh, Samangan, Sar-e...",-25.0,2001,1,150.0,2001-01-31,2001-02-20,20 days,20,30,70
1,Poland,Eastern Europe,Europe,"Dolnoslaskie, Kujawsko-Pomorskie, Lodzkie, Lub...",-25.0,2001,10,270.0,2001-10-10,2002-01-20,102 days,102,50,20
2,Peru,Latin America and the Caribbean,Americas,"Ayacucho, Huancavelica, Tacna, Moquegua, Arequ...",-28.0,2003,7,339.0,2003-07-07,2003-08-15,39 days,39,-10,-70
5,South Africa,Sub-Saharan Africa,Africa,"Gauteng, Eastern Cape provinces",0.0,2007,5,22.0,2007-05-21,2007-05-27,6 days,6,-30,30
8,Poland,Eastern Europe,Europe,"Dolnoslaskie, Kujawsko-Pomorskie, Lodzkie, Lub...",-25.0,2008,11,82.0,2008-11-01,2009-01-08,68 days,68,50,20
9,Romania,Eastern Europe,Europe,"Alba, Arad, Bihor, Bistrita-nasaud, Brasov, Bu...",-31.0,2009,1,43.0,2009-01-01,2009-01-15,14 days,14,50,20
10,Germany,Western Europe,Europe,"Bayern, Berlin, Hamburg provinces, Leipzig dis...",-33.6,2009,12,14.0,2009-12-18,2010-01-25,38 days,38,50,10
12,Poland,Eastern Europe,Europe,"Dolnoslaskie, Kujawsko-Pomorskie, Lodzkie, Lub...",-35.0,2009,11,298.0,2009-11-01,2010-01-26,86 days,86,50,20
13,Romania,Eastern Europe,Europe,"Alba, Arad, Arges, Bacau, Bihor, Bistrita-nasa...",-20.0,2009,12,11.0,2009-12-20,2009-12-23,3 days,3,50,20
14,Bangladesh,Southern Asia,Asia,"Rajbari, Manikganj, Tangail, Sherpur districts...",-6.0,2009,12,135.0,2009-12-15,2010-01-31,47 days,47,30,90


In [17]:
df[df['start_year']>=2020]

Unnamed: 0,country,subregion,region,location,temperature,start_year,start_month,total_deaths,start_date,end_date,duration_days,duration,latitude,longitude
69,State of Palestine,Western Asia,Asia,West Bank and Gaza Strip,0.0,2020,1,3.0,2020-01-18,2020-02-05,18 days,18,30,40
70,Afghanistan,Southern Asia,Asia,"Ghor, Baghlan, Sar-e-Pul, Faryab, Takhar, Kand...",-33.0,2023,1,166.0,2023-01-09,2023-01-23,14 days,14,30,70
71,Mongolia,Eastern Asia,Asia,"Dornod, Dornogovi, Dundgovi, Govi-Altai, Khent...",,2023,5,2.0,2023-05-19,2023-05-22,3 days,3,50,100


In [129]:
np.mean([-5.9440, -5.3286,0.0471, 2.3412])

-2.221075

In [18]:
df.loc[71,'temperature'] = -2.22

In [19]:
df = df.dropna(subset='temperature')

In [22]:
df.dtypes

country           object
subregion         object
region            object
location          object
temperature      float64
start_year         int64
start_month        int64
total_deaths     float64
start_date        object
end_date          object
duration_days     object
duration          object
latitude           int64
longitude          int64
dtype: object

In [41]:
def temp_change(df, i):
    """
    takes a dataframe, df, and an index, i

    returns the average difference between the temperature in the data row and the extreme temperature in both degrees Celsius and standard deviations
    """
    lat = df.loc[i,'latitude']
    long = df.loc[i,'longitude']
    start_date = dt.datetime.fromisoformat(df.loc[i,'start_date'])
    end_date = dt.datetime.fromisoformat(df.loc[i,'end_date'])
    temp = df.loc[i,'temperature']
    #duration = df.loc[i,'duration']
    
    monthly_sum = pd.read_csv(f"../data/temp_events/monthly_summary_lat={lat}_long={long}.csv", index_col='month', usecols=['month', 'min_std', 'min_extreme'])
    
    
    dates = pd.date_range(start= start_date, end= end_date)
    delta_C = []
    delta_std = []
    
    for date in dates:
        delta_temp_C = temp - monthly_sum.loc[date.month, 'min_extreme']
        delta_C.append(delta_temp_C)
        
        delta_std.append(delta_temp_C / monthly_sum.loc[date.month, 'min_std'])

    return np.mean(delta_C), np.mean(delta_std)

In [42]:
temp_change(df,71)

(7.573735382951979, 1.9008804003733517)

In [43]:
delta_temps = []

for i in df.index:
    delta_C, delta_std = temp_change(df,i)
    delta_temps.append({'delta_temp_C': delta_C, 'delta_temp_std': delta_std})

In [47]:
df = df.reset_index(drop=True)

In [49]:
cold_df = pd.merge(left= df, right= pd.DataFrame(delta_temps), left_index=True, right_index=True)

In [50]:
cold_df

Unnamed: 0,country,subregion,region,location,temperature,start_year,start_month,total_deaths,start_date,end_date,duration_days,duration,latitude,longitude,delta_temp_C,delta_temp_std
0,Afghanistan,Southern Asia,Asia,"Hirat, Faryab, Jawzjan, Balkh, Samangan, Sar-e...",-25.0,2001,1,150.0,2001-01-31,2001-02-20,20 days,20,30,70,-24.856821,-8.561158
1,Poland,Eastern Europe,Europe,"Dolnoslaskie, Kujawsko-Pomorskie, Lodzkie, Lub...",-25.0,2001,10,270.0,2001-10-10,2002-01-20,102 days,102,50,20,-15.941674,-4.306348
2,Peru,Latin America and the Caribbean,Americas,"Ayacucho, Huancavelica, Tacna, Moquegua, Arequ...",-28.0,2003,7,339.0,2003-07-07,2003-08-15,39 days,39,-10,-70,-43.303703,-20.441972
3,South Africa,Sub-Saharan Africa,Africa,"Gauteng, Eastern Cape provinces",0.0,2007,5,22.0,2007-05-21,2007-05-27,6 days,6,-30,30,-1.690848,-0.672643
4,Poland,Eastern Europe,Europe,"Dolnoslaskie, Kujawsko-Pomorskie, Lodzkie, Lub...",-25.0,2008,11,82.0,2008-11-01,2009-01-08,68 days,68,50,20,-14.382446,-3.69479
5,Romania,Eastern Europe,Europe,"Alba, Arad, Bihor, Bistrita-nasaud, Brasov, Bu...",-31.0,2009,1,43.0,2009-01-01,2009-01-15,14 days,14,50,20,-17.360192,-3.916444
6,Germany,Western Europe,Europe,"Bayern, Berlin, Hamburg provinces, Leipzig dis...",-33.6,2009,12,14.0,2009-12-18,2010-01-25,38 days,38,50,10,-24.188667,-6.483161
7,Poland,Eastern Europe,Europe,"Dolnoslaskie, Kujawsko-Pomorskie, Lodzkie, Lub...",-35.0,2009,11,298.0,2009-11-01,2010-01-26,86 days,86,50,20,-23.757152,-5.926544
8,Romania,Eastern Europe,Europe,"Alba, Arad, Arges, Bacau, Bihor, Bistrita-nasa...",-20.0,2009,12,11.0,2009-12-20,2009-12-23,3 days,3,50,20,-7.234823,-1.665499
9,Bangladesh,Southern Asia,Asia,"Rajbari, Manikganj, Tangail, Sherpur districts...",-6.0,2009,12,135.0,2009-12-15,2010-01-31,47 days,47,30,90,23.619512,6.039956


In [51]:
cold_df.columns

Index(['country', 'subregion', 'region', 'location', 'temperature',
       'start_year', 'start_month', 'total_deaths', 'start_date', 'end_date',
       'duration_days', 'duration', 'latitude', 'longitude', 'delta_temp_C',
       'delta_temp_std'],
      dtype='object')

In [99]:
X = cold_df.drop(columns=['country','subregion','location', 'start_date', 'end_date', 'duration_days', 'latitude', 'longitude', 'total_deaths', 'temperature'])

In [100]:
y = cold_df['total_deaths']

In [101]:
X

Unnamed: 0,region,start_year,start_month,duration,delta_temp_C,delta_temp_std
0,Asia,2001,1,20,-24.856821,-8.561158
1,Europe,2001,10,102,-15.941674,-4.306348
2,Americas,2003,7,39,-43.303703,-20.441972
3,Africa,2007,5,6,-1.690848,-0.672643
4,Europe,2008,11,68,-14.382446,-3.69479
5,Europe,2009,1,14,-17.360192,-3.916444
6,Europe,2009,12,38,-24.188667,-6.483161
7,Europe,2009,11,86,-23.757152,-5.926544
8,Europe,2009,12,3,-7.234823,-1.665499
9,Asia,2009,12,47,23.619512,6.039956


## Model Preprocessing

In [115]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=92)

In [116]:
ct = ColumnTransformer([
    ('ss', StandardScaler(), ['duration', 'delta_temp_C', 'delta_temp_std']),
    ('ohe', OneHotEncoder(drop='first', handle_unknown='ignore'), ['region', 'start_year', 'start_month'])
], remainder='passthrough')

In [117]:
pipe_lr = Pipeline([
    ('ct', ct),
    ('lr', LinearRegression())
])

In [118]:
pipe_lr.fit(X_train, y_train)

In [119]:
pipe_lr.score(X_train, y_train)

0.9589540923335533

In [120]:
pipe_lr.score(X_test, y_test)

-1.2709934014306672