# Import Libs

In [13]:
import os
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import jupyternotify
ip = get_ipython()
ip.register_magics(jupyternotify.JupyterNotifyMagics)

ModuleNotFoundError: No module named 'jupyternotify'

# Load data

In [5]:
url_to_covid = 'https://covid.ourworldindata.org/data/owid-covid-data.csv'

In [6]:
df_orig = pd.read_csv(url_to_covid)

# Understand structure

In [7]:
df_orig.location.unique()

array(['Afghanistan', 'Africa', 'Albania', 'Algeria', 'Andorra', 'Angola',
       'Anguilla', 'Antigua and Barbuda', 'Argentina', 'Armenia', 'Aruba',
       'Asia', 'Australia', 'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain',
       'Bangladesh', 'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin',
       'Bermuda', 'Bhutan', 'Bolivia', 'Bonaire Sint Eustatius and Saba',
       'Bosnia and Herzegovina', 'Botswana', 'Brazil',
       'British Virgin Islands', 'Brunei', 'Bulgaria', 'Burkina Faso',
       'Burundi', 'Cambodia', 'Cameroon', 'Canada', 'Cape Verde',
       'Cayman Islands', 'Central African Republic', 'Chad', 'Chile',
       'China', 'Colombia', 'Comoros', 'Congo', 'Cook Islands',
       'Costa Rica', "Cote d'Ivoire", 'Croatia', 'Cuba', 'Curacao',
       'Cyprus', 'Czechia', 'Democratic Republic of Congo', 'Denmark',
       'Djibouti', 'Dominica', 'Dominican Republic', 'Ecuador', 'Egypt',
       'El Salvador', 'England', 'Equatorial Guinea', 'Eritrea',
       'Estonia', 'Eswat

In [8]:
df = df_orig[df_orig.location == 'Austria']

In [9]:
df

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,...,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,population,excess_mortality_cumulative_absolute,excess_mortality_cumulative,excess_mortality,excess_mortality_cumulative_per_million
13787,AUT,Europe,Austria,2020-02-25,2.0,2.0,,,,,...,30.9,,7.37,81.54,0.922,8939617.0,,,,
13788,AUT,Europe,Austria,2020-02-26,1.0,,,,,,...,30.9,,7.37,81.54,0.922,8939617.0,,,,
13789,AUT,Europe,Austria,2020-02-27,1.0,0.0,,,,,...,30.9,,7.37,81.54,0.922,8939617.0,,,,
13790,AUT,Europe,Austria,2020-02-28,1.0,0.0,,,,,...,30.9,,7.37,81.54,0.922,8939617.0,,,,
13791,AUT,Europe,Austria,2020-02-29,3.0,2.0,,,,,...,30.9,,7.37,81.54,0.922,8939617.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14839,AUT,Europe,Austria,2023-01-12,5736630.0,2306.0,2435.000,21546.0,10.0,10.429,...,30.9,,7.37,81.54,0.922,8939617.0,,,,
14840,AUT,Europe,Austria,2023-01-13,5738797.0,2167.0,2317.571,21558.0,12.0,10.857,...,30.9,,7.37,81.54,0.922,8939617.0,,,,
14841,AUT,Europe,Austria,2023-01-14,5740581.0,1784.0,2338.000,21564.0,6.0,11.429,...,30.9,,7.37,81.54,0.922,8939617.0,,,,
14842,AUT,Europe,Austria,2023-01-15,5742055.0,1474.0,2252.571,21564.0,0.0,11.000,...,30.9,,7.37,81.54,0.922,8939617.0,,,,


In [14]:
_ = plt.figure(figsize=(30, 15))
sns.scatterplot(sorted(df.date), df.new_cases);

TypeError: scatterplot() takes from 0 to 1 positional arguments but 2 were given

<Figure size 3000x1500 with 0 Axes>

## Check missing

In [15]:
df = df_orig.copy()

In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250400 entries, 0 to 250399
Data columns (total 67 columns):
 #   Column                                      Non-Null Count   Dtype  
---  ------                                      --------------   -----  
 0   iso_code                                    250400 non-null  object 
 1   continent                                   236330 non-null  object 
 2   location                                    250400 non-null  object 
 3   date                                        250400 non-null  object 
 4   total_cases                                 236094 non-null  float64
 5   new_cases                                   235745 non-null  float64
 6   new_cases_smoothed                          234541 non-null  float64
 7   total_deaths                                216561 non-null  float64
 8   new_deaths                                  216449 non-null  float64
 9   new_deaths_smoothed                         215263 non-null  float64
 

In [17]:
percent_missing = df.isnull().sum() * 100 / len(df)
missing_value_df = pd.DataFrame({'column_name': df.columns,
                                 'percent_missing': percent_missing})

missing_value_df.sort_values('percent_missing', inplace=True, ascending=False)
missing_value_df

Unnamed: 0,column_name,percent_missing
excess_mortality_cumulative_per_million,excess_mortality_cumulative_per_million,96.709265
excess_mortality_cumulative,excess_mortality_cumulative,96.709265
excess_mortality_cumulative_absolute,excess_mortality_cumulative_absolute,96.709265
excess_mortality,excess_mortality,96.693690
weekly_icu_admissions,weekly_icu_admissions,96.626597
...,...,...
continent,continent,5.619010
population,population,0.428914
date,date,0.000000
location,location,0.000000


In [18]:
cols_too_many_missing = missing_value_df[missing_value_df.percent_missing > 50].index.tolist()
len(cols_too_many_missing)
cols_too_many_missing

31

['excess_mortality_cumulative_per_million',
 'excess_mortality_cumulative',
 'excess_mortality_cumulative_absolute',
 'excess_mortality',
 'weekly_icu_admissions',
 'weekly_icu_admissions_per_million',
 'weekly_hosp_admissions',
 'weekly_hosp_admissions_per_million',
 'icu_patients',
 'icu_patients_per_million',
 'hosp_patients_per_million',
 'hosp_patients',
 'total_boosters',
 'total_boosters_per_hundred',
 'new_vaccinations',
 'people_fully_vaccinated',
 'people_fully_vaccinated_per_hundred',
 'people_vaccinated_per_hundred',
 'people_vaccinated',
 'total_vaccinations_per_hundred',
 'total_vaccinations',
 'new_tests_per_thousand',
 'new_tests',
 'total_tests_per_thousand',
 'total_tests',
 'tests_per_case',
 'positive_rate',
 'handwashing_facilities',
 'new_tests_smoothed_per_thousand',
 'new_tests_smoothed',
 'tests_units']

In [19]:
len(df.columns)

67

In [20]:
df_reduced = df.drop(columns=cols_too_many_missing)

In [21]:
len(df_reduced.columns)

36

In [22]:
df_reduced

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,...,gdp_per_capita,extreme_poverty,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,hospital_beds_per_thousand,life_expectancy,human_development_index,population
0,AFG,Asia,Afghanistan,2020-02-24,5.0,5.0,,,,,...,1803.987,,597.029,9.59,,,0.5,64.83,0.511,41128772.0
1,AFG,Asia,Afghanistan,2020-02-25,5.0,0.0,,,,,...,1803.987,,597.029,9.59,,,0.5,64.83,0.511,41128772.0
2,AFG,Asia,Afghanistan,2020-02-26,5.0,0.0,,,,,...,1803.987,,597.029,9.59,,,0.5,64.83,0.511,41128772.0
3,AFG,Asia,Afghanistan,2020-02-27,5.0,0.0,,,,,...,1803.987,,597.029,9.59,,,0.5,64.83,0.511,41128772.0
4,AFG,Asia,Afghanistan,2020-02-28,5.0,0.0,,,,,...,1803.987,,597.029,9.59,,,0.5,64.83,0.511,41128772.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
250395,ZWE,Africa,Zimbabwe,2023-01-11,259981.0,0.0,0.0,5637.0,0.0,0.0,...,1899.775,21.4,307.846,1.82,1.6,30.7,1.7,61.49,0.571,16320539.0
250396,ZWE,Africa,Zimbabwe,2023-01-12,259981.0,0.0,0.0,5637.0,0.0,0.0,...,1899.775,21.4,307.846,1.82,1.6,30.7,1.7,61.49,0.571,16320539.0
250397,ZWE,Africa,Zimbabwe,2023-01-13,259981.0,0.0,0.0,5637.0,0.0,0.0,...,1899.775,21.4,307.846,1.82,1.6,30.7,1.7,61.49,0.571,16320539.0
250398,ZWE,Africa,Zimbabwe,2023-01-14,259981.0,0.0,0.0,5637.0,0.0,0.0,...,1899.775,21.4,307.846,1.82,1.6,30.7,1.7,61.49,0.571,16320539.0


In [23]:
df = df_reduced

In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44996 entries, 0 to 44995
Data columns (total 31 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   iso_code                         44731 non-null  object 
 1   continent                        44466 non-null  object 
 2   location                         44996 non-null  object 
 3   date                             44996 non-null  object 
 4   total_cases                      44393 non-null  float64
 5   new_cases                        44192 non-null  float64
 6   new_cases_smoothed               43410 non-null  float64
 7   total_deaths                     44393 non-null  float64
 8   new_deaths                       44192 non-null  float64
 9   new_deaths_smoothed              43410 non-null  float64
 10  total_cases_per_million          44128 non-null  float64
 11  new_cases_per_million            44128 non-null  float64
 12  new_cases_smoothed

In [24]:
missing_iso_code = df[df.iso_code.isna()]
df = df.drop(index=missing_iso_code.index)

In [25]:
missing_continent = df[df.continent.isna()]
df = df.drop(index=missing_continent.index)

In [26]:
for col in df.columns: 
    col, df[col].isna().sum()

('iso_code', 0)

('continent', 0)

('location', 0)

('date', 0)

('total_cases', 14293)

('new_cases', 14649)

('new_cases_smoothed', 15788)

('total_deaths', 33641)

('new_deaths', 33932)

('new_deaths_smoothed', 35053)

('total_cases_per_million', 14293)

('new_cases_per_million', 14649)

('new_cases_smoothed_per_million', 15788)

('total_deaths_per_million', 33641)

('new_deaths_per_million', 33932)

('new_deaths_smoothed_per_million', 35053)

('reproduction_rate', 52589)

('new_vaccinations_smoothed', 96077)

('new_vaccinations_smoothed_per_million', 96077)

('new_people_vaccinated_smoothed', 96332)

('new_people_vaccinated_smoothed_per_hundred', 96332)

('stringency_index', 52369)

('population_density', 19271)

('median_age', 35639)

('aged_65_older', 37766)

('aged_70_older', 36694)

('gdp_per_capita', 36037)

('extreme_poverty', 106274)

('cardiovasc_death_rate', 36226)

('diabetes_prevalence', 25692)

('female_smokers', 84889)

('male_smokers', 86976)

('hospital_beds_per_thousand', 58080)

('life_expectancy', 7707)

('human_development_index', 40824)

('population', 0)

Now we have removed the rows and columns that contained too many Nans.

# Preprocess for ML

Now we need to encode the nominal variables and impute nans of the numerical variables.

In [35]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

## Encode categorical

In [36]:
nominal = df.select_dtypes(include=['object']).copy()
nominal_cols = nominal.columns.tolist()
nominal_cols

['iso_code', 'continent', 'location', 'date']

In [37]:
encoder = LabelEncoder()
for col in nominal_cols:
    col
    if df[col].isna().sum() > 0:
        df[col].fillna('MISSING', inplace=True)
    df[col] = encoder.fit_transform(df[col])

'iso_code'

'continent'

'location'

'date'

In [38]:
for col in nominal_cols:
    df[col].unique()

array([  1,   4,  56,   5,   2,   3,   9,   7,   8,   0,  10,  11,  12,
        21,  20,  18,  28,  23,  14,  24,  15,  25,  30,  26,  16,  22,
        31,  27, 225,  29,  19,  17,  13, 107,  38,  33,  44,  48,  32,
       203,  35,  36,  42,  43,  40,  41,  45,  37,  88,  46,  47,  49,
        50,  39,  54,  52,  53,  55,  57,  58, 187, 157,  79,  59,  62,
       198,  63,  68,  66,  65,  64,  67, 175,  70,  77,  72,  51,  74,
        75,  80,  82,  81,  84,  83,  73,  76,  78,  85,  89,  87,  86,
        90,  97,  93,  91,  95,  96,  94,  92,  98,  99, 100, 103, 101,
       102, 104, 105, 108, 158, 111, 106, 112, 122, 113, 119, 114, 115,
       117, 120, 121, 123, 127, 142, 143, 128, 132, 133, 130, 139, 141,
       129,  69, 126, 125, 136, 135, 140, 124, 138, 134, 144, 153, 152,
       150, 145, 154, 148, 146, 147, 149, 171, 131, 156, 159, 137, 151,
       155, 162, 167, 174, 163, 168, 173, 165, 166, 164, 169, 172, 170,
       176, 177, 178, 179, 184, 109, 116, 190, 223, 230, 188, 19

array([1, 2, 0, 3, 5, 4])

array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
       104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
       117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
       130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
       143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
       156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
       169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 18

array([  54,   55,   56, ...,   52,   53, 1111])

## Impute missing values of numerical

In [39]:
numerical = df.select_dtypes(include=['float64']).copy()
numerical

Unnamed: 0,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,total_cases_per_million,new_cases_per_million,new_cases_smoothed_per_million,total_deaths_per_million,...,gdp_per_capita,extreme_poverty,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,hospital_beds_per_thousand,life_expectancy,human_development_index,population
0,5.0,5.0,,,,,0.122,0.122,,,...,1803.987,,597.029,9.59,,,0.5,64.83,0.511,41128772.0
1,5.0,0.0,,,,,0.122,0.000,,,...,1803.987,,597.029,9.59,,,0.5,64.83,0.511,41128772.0
2,5.0,0.0,,,,,0.122,0.000,,,...,1803.987,,597.029,9.59,,,0.5,64.83,0.511,41128772.0
3,5.0,0.0,,,,,0.122,0.000,,,...,1803.987,,597.029,9.59,,,0.5,64.83,0.511,41128772.0
4,5.0,0.0,,,,,0.122,0.000,,,...,1803.987,,597.029,9.59,,,0.5,64.83,0.511,41128772.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
250395,259981.0,0.0,0.0,5637.0,0.0,0.0,15929.682,0.000,0.0,345.393,...,1899.775,21.4,307.846,1.82,1.6,30.7,1.7,61.49,0.571,16320539.0
250396,259981.0,0.0,0.0,5637.0,0.0,0.0,15929.682,0.000,0.0,345.393,...,1899.775,21.4,307.846,1.82,1.6,30.7,1.7,61.49,0.571,16320539.0
250397,259981.0,0.0,0.0,5637.0,0.0,0.0,15929.682,0.000,0.0,345.393,...,1899.775,21.4,307.846,1.82,1.6,30.7,1.7,61.49,0.571,16320539.0
250398,259981.0,0.0,0.0,5637.0,0.0,0.0,15929.682,0.000,0.0,345.393,...,1899.775,21.4,307.846,1.82,1.6,30.7,1.7,61.49,0.571,16320539.0


In [40]:
df.total_cases

0              5.0
1              5.0
2              5.0
3              5.0
4              5.0
            ...   
250395    259981.0
250396    259981.0
250397    259981.0
250398    259981.0
250399    259981.0
Name: total_cases, Length: 236330, dtype: float64

In [41]:
for col in numerical:
    df[col].fillna((df[col].mean()), inplace=True)

In [42]:
df.isna().sum().sum() == 0

True

Now the dataset has no Nans and is completely encoded.

## Split into train and test set

In [43]:
X = df.drop(columns=['new_cases'])
y = df.new_cases
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)

In [44]:
X_train

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,total_cases_per_million,...,gdp_per_capita,extreme_poverty,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,hospital_beds_per_thousand,life_expectancy,human_development_index,population
129320,120,2,118,1080,1282732.0,386.143,9452.000000,0.000000,1.571000,466438.162,...,29524.265,0.700000,342.989000,3.67,21.30000,38.000000,6.560000,75.93,0.882,2.750058e+06
84287,51,2,76,810,18772331.0,219800.286,126929.000000,283.000000,191.286000,225169.330,...,45229.245,13.682884,156.139000,8.31,28.20000,33.100000,8.000000,81.33,0.947,8.336984e+07
95359,85,5,87,980,71133.0,14.000,1279.000000,0.000000,0.143000,87956.752,...,7435.047,13.682884,373.159000,11.62,10.71614,32.804898,1.600000,69.91,0.682,8.087270e+05
139480,133,2,126,681,38117.0,43.429,462.000000,0.000000,0.000000,71474.780,...,36513.323,0.200000,168.711000,8.83,20.90000,30.200000,4.485000,82.53,0.895,5.332930e+05
93681,78,0,86,342,2444.0,0.429,44.000000,0.000000,0.000000,1160.725,...,1548.675,67.100000,382.474000,2.42,10.71614,32.804898,3.092173,58.32,0.480,2.105580e+06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
187792,109,3,174,180,15.0,0.000,20014.173783,33.033434,33.131258,314.591,...,24654.385,13.682884,261.903685,12.84,10.71614,32.804898,2.300000,76.23,0.779,4.768100e+04
124454,119,0,114,378,6241.0,300.571,87.000000,2.000000,2.000000,2706.622,...,2851.153,59.600000,405.126000,3.94,0.40000,53.900000,3.092173,54.33,0.527,2.305826e+06
184514,177,2,170,910,2919461.0,567.714,65739.000000,0.000000,1.857000,148503.022,...,23313.199,5.700000,370.946000,9.74,22.90000,37.100000,6.892000,76.05,0.828,1.965927e+07
45726,36,1,41,629,107010.0,71.000,4851.000000,0.000000,0.143000,75.048,...,15308.712,0.700000,261.899000,9.74,1.90000,48.400000,4.340000,76.91,0.761,1.425887e+09


# Add ML

In [45]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score 
from sklearn.model_selection import RandomizedSearchCV
from joblib import dump, load

In [46]:
rf = RandomForestRegressor(
    n_estimators = 100, # 400 
    random_state = 0, 
    max_depth=30)

In [47]:
rf.fit(X_train, y_train)

RandomForestRegressor(max_depth=30, random_state=0)

In [48]:
y_pred = rf.predict(X_test)

In [49]:
print(f'{r2_score(y_test, y_pred):.2%}')

94.05%


## Improve hyperparameters

Best params for n_estimators and max_depth are

`{'n_estimators': 400, 'max_depth': 30}`

In [53]:
random_grid = {'n_estimators': np.arange(200,600,100),
#                'max_features': ['auto', 'sqrt'],
               'max_depth': np.arange(10,40,10)}
#                'min_samples_split': [2, 5],
#                'min_samples_leaf': [2,4]}#,
#                'bootstrap': [True, False]}

In [56]:
rf_random = RandomizedSearchCV(
    estimator = rf, 
    param_distributions = random_grid, 
    n_iter = 3, cv = 5, verbose=2, random_state=42)

In [57]:
rf_random.fit(X_train, y_train)

Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV] END .....................max_depth=30, n_estimators=400; total time=26.5min
[CV] END .....................max_depth=30, n_estimators=400; total time=23.0min
[CV] END .....................max_depth=30, n_estimators=400; total time=22.5min
[CV] END .....................max_depth=30, n_estimators=400; total time=22.3min
[CV] END .....................max_depth=30, n_estimators=400; total time=22.5min
[CV] END .....................max_depth=30, n_estimators=300; total time=16.8min
[CV] END .....................max_depth=30, n_estimators=300; total time=16.9min
[CV] END .....................max_depth=30, n_estimators=300; total time=18.9min
[CV] END .....................max_depth=30, n_estimators=300; total time=19.1min
[CV] END .....................max_depth=30, n_estimators=300; total time=19.2min
[CV] END .....................max_depth=10, n_estimators=200; total time= 6.1min
[CV] END .....................max_depth=10, n_est

RandomizedSearchCV(cv=5,
                   estimator=RandomForestRegressor(max_depth=30,
                                                   random_state=0),
                   n_iter=3,
                   param_distributions={'max_depth': array([10, 20, 30]),
                                        'n_estimators': array([200, 300, 400, 500])},
                   random_state=42, verbose=2)

In [59]:
rf_random.best_params_

{'n_estimators': 300, 'max_depth': 30}

## Re-run 

In [44]:
# rf = RandomForestRegressor(**rf_random.best_params_, random_state = 1)

In [45]:
# y_pred = rf.predict(X_test)

In [46]:
# print(f'{r2_score(y_test, y_pred):.2%}')

# Save model

In [60]:
# dump(rf, 'rf_model.joblib') 
dump(rf, 'rf_model.joblib',compress=3)
# dump(rf, 'rf_model.pkl.z')

['rf_model.joblib']

# Predict on country

In [48]:
input_val = 'Germany'

In [49]:
encoder.fit_transform(df_orig['location'])

array([ 0,  0,  0, ..., 92, 92, 92])

In [50]:
encode_ind = (encoder.classes_).tolist().index(input_val)

In [51]:
df_orig[df_orig.location == input_val]

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,...,gdp_per_capita,extreme_poverty,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index
15535,DEU,Europe,Germany,2019-12-31,0.0,0.0,,0.0,0.0,,...,45229.245,,156.139,8.31,28.2,33.1,,8.0,81.33,0.936
15536,DEU,Europe,Germany,2020-01-01,0.0,0.0,,0.0,0.0,,...,45229.245,,156.139,8.31,28.2,33.1,,8.0,81.33,0.936
15537,DEU,Europe,Germany,2020-01-02,0.0,0.0,,0.0,0.0,,...,45229.245,,156.139,8.31,28.2,33.1,,8.0,81.33,0.936
15538,DEU,Europe,Germany,2020-01-03,0.0,0.0,,0.0,0.0,,...,45229.245,,156.139,8.31,28.2,33.1,,8.0,81.33,0.936
15539,DEU,Europe,Germany,2020-01-04,0.0,0.0,,0.0,0.0,,...,45229.245,,156.139,8.31,28.2,33.1,,8.0,81.33,0.936
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15795,DEU,Europe,Germany,2020-09-16,263663.0,1901.0,1455.571,9368.0,6.0,4.286,...,45229.245,,156.139,8.31,28.2,33.1,,8.0,81.33,0.936
15796,DEU,Europe,Germany,2020-09-17,265857.0,2194.0,1498.714,9371.0,3.0,4.286,...,45229.245,,156.139,8.31,28.2,33.1,,8.0,81.33,0.936
15797,DEU,Europe,Germany,2020-09-18,267773.0,1916.0,1560.429,9378.0,7.0,5.143,...,45229.245,,156.139,8.31,28.2,33.1,,8.0,81.33,0.936
15798,DEU,Europe,Germany,2020-09-19,270070.0,2297.0,1655.714,9384.0,6.0,5.286,...,45229.245,,156.139,8.31,28.2,33.1,,8.0,81.33,0.936


In [52]:
to_pred = X[X.location == encode_ind].iloc[-1].values.reshape(1,-1)

In [53]:
rf.predict(to_pred)[0] 

1838.76