# COVID19 Global Forecasting

The COVID-19 Global Forecasting project leverages Automated Machine Learning (AutoML) to predict the spread and impact of the COVID-19 pandemic across different regions worldwide. This project aims to provide accurate forecasts for key metrics such as infection rates, hospitalization needs, and mortality rates by utilizing advanced AutoML techniques to analyze large datasets containing epidemiological data, public health measures, and socio-economic factors.

In [1]:
import pandas as pd
import numpy as np

In [2]:
from pathlib import Path

# Single place to change when the CSVs live somewhere else; "/content" is the
# directory this notebook originally read them from.
DATA_DIR = Path("/content")

# ss: submission template, train: labelled history, test: rows to forecast.
ss = pd.read_csv(DATA_DIR / "submission.csv")
train = pd.read_csv(DATA_DIR / "train.csv")
test = pd.read_csv(DATA_DIR / "test.csv")

In [3]:
!pip install ydata-profiling



In [4]:
import ydata_profiling

In [5]:
# Render the ydata-profiling report for the training frame.
# The `profile_report` accessor is only available once `ydata_profiling` has been imported.
train.profile_report()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



In [6]:
# Preview the submission template loaded from submission.csv.
ss.head()

Unnamed: 0,ForecastId,ConfirmedCases,Fatalities
0,1,1,1
1,2,1,1
2,3,1,1
3,4,1,1
4,5,1,1


In [7]:
# Preview the first rows of the training data.
train.head()

Unnamed: 0,Id,Province/State,Country/Region,Lat,Long,Date,ConfirmedCases,Fatalities
0,1,,Afghanistan,33.0,65.0,2020-01-22,0.0,0.0
1,2,,Afghanistan,33.0,65.0,2020-01-23,0.0,0.0
2,3,,Afghanistan,33.0,65.0,2020-01-24,0.0,0.0
3,4,,Afghanistan,33.0,65.0,2020-01-25,0.0,0.0
4,5,,Afghanistan,33.0,65.0,2020-01-26,0.0,0.0


In [8]:
# Preview the first rows of the frame to be forecast.
test.head()

Unnamed: 0,ForecastId,Province/State,Country/Region,Lat,Long,Date
0,1,,Afghanistan,33.0,65.0,2020-03-12
1,2,,Afghanistan,33.0,65.0,2020-03-13
2,3,,Afghanistan,33.0,65.0,2020-03-14
3,4,,Afghanistan,33.0,65.0,2020-03-15
4,5,,Afghanistan,33.0,65.0,2020-03-16


In [9]:
# Column dtypes and non-null counts for the training data.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17892 entries, 0 to 17891
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Id              17892 non-null  int64  
 1   Province/State  8190 non-null   object 
 2   Country/Region  17892 non-null  object 
 3   Lat             17892 non-null  float64
 4   Long            17892 non-null  float64
 5   Date            17892 non-null  object 
 6   ConfirmedCases  17892 non-null  float64
 7   Fatalities      17892 non-null  float64
dtypes: float64(4), int64(1), object(3)
memory usage: 1.1+ MB


In [10]:
# Drop identifier, region and date columns so only Lat/Long (plus the targets in
# `train`) remain.
# Reassigning with errors="ignore" instead of inplace=True keeps this cell
# idempotent: re-running it after the columns are gone is a no-op, not a KeyError.
# NOTE(review): removing "Date" and "Country/Region" leaves the models with
# coordinates only — confirm that discarding the time dimension is intended.
train = train.drop(
    columns=["Id", "Province/State", "Date", "Country/Region"],
    errors="ignore",
)
test = test.drop(
    columns=["ForecastId", "Province/State", "Date", "Country/Region"],
    errors="ignore",
)

In [11]:
# Regression frame for ConfirmedCases: everything in `train` except the Fatalities column.
train1 = train.drop(columns=["Fatalities"])

In [12]:
pip install pycaret



In [13]:
from pycaret.regression import*

In [14]:
# Initialise the PyCaret regression experiment with ConfirmedCases as the target.
# session_id pins the random seed so the train/test split and the model scores
# are reproducible on re-run; 3162 is the seed of the recorded run.
setup(data=train1, target="ConfirmedCases", session_id=3162)

Unnamed: 0,Description,Value
0,Session id,3162
1,Target,ConfirmedCases
2,Target type,Regression
3,Original data shape,"(17892, 3)"
4,Transformed data shape,"(17892, 3)"
5,Transformed train set shape,"(12524, 3)"
6,Transformed test set shape,"(5368, 3)"
7,Numeric features,2
8,Preprocess,True
9,Imputation type,simple


<pycaret.regression.oop.RegressionExperiment at 0x7d628a954760>

In [15]:
# Cross-validate the available regressors and display the comparison table.
# The return value is not stored here.
compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
rf,Random Forest Regressor,281.1629,4262819.7094,2035.6113,0.5862,2.3688,36.7344,0.943
gbr,Gradient Boosting Regressor,302.5746,4286742.9467,2040.1043,0.5854,3.2243,39.9823,0.636
dt,Decision Tree Regressor,280.9744,4268968.6686,2036.6503,0.5848,2.3694,36.7636,0.049
lightgbm,Light Gradient Boosting Machine,281.3796,4269024.3709,2036.6528,0.5848,2.4785,36.7992,1.034
xgboost,Extreme Gradient Boosting,280.9954,4268964.8875,2036.648,0.5848,2.3736,36.7636,0.123
et,Extra Trees Regressor,280.9744,4268968.6686,2036.6503,0.5848,2.3694,36.7636,0.374
ada,AdaBoost Regressor,427.1537,5041114.6499,2212.5053,0.533,3.8968,68.3321,0.058
knn,K Neighbors Regressor,291.0873,4935570.75,2204.9571,0.4991,2.2504,54.5369,0.037
br,Bayesian Ridge,612.4656,12806619.0,3482.8851,0.0071,4.5625,90.7349,0.055
en,Elastic Net,615.7229,12806537.4,3482.9206,0.0071,4.5799,91.3932,0.042


Processing:   0%|          | 0/81 [00:00<?, ?it/s]

In [16]:
# Train a Decision Tree Regressor and show its per-fold scores.
# NOTE(review): the comparison table ranks "rf" first (R2 0.5862 vs 0.5848 for "dt");
# "dt" is hard-coded here — presumably for its far shorter training time, confirm.
best_model=create_model("dt")

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,257.5065,3168418.1137,1780.0051,0.8544,2.3301,18.1458
1,358.0744,7119872.9675,2668.309,0.6566,2.3712,55.5712
2,270.8619,4995978.408,2235.1685,0.2694,2.3454,37.5005
3,268.4793,4120941.8589,2030.0103,0.7252,2.3993,36.6097
4,284.91,2974111.79,1724.5613,0.748,2.3906,40.1063
5,273.2933,4197282.8919,2048.7271,0.4325,2.3769,43.2909
6,276.0008,4267508.7143,2065.7949,0.1895,2.3687,18.752
7,210.7214,1743794.1124,1320.528,0.8512,2.4123,41.4134
8,272.1793,5306297.0206,2303.5401,0.3564,2.3695,49.144
9,337.7168,4795480.8086,2189.8586,0.7648,2.3296,27.102


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

In [17]:
import pickle

# Serialise the fitted regressor to model.pkl in the working directory.
with open("model.pkl", "wb") as model_file:
    model_file.write(pickle.dumps(best_model))

In [18]:
# Round-trip check: read model.pkl back and show what was stored.
# Only unpickle files you trust.
with open("model.pkl", "rb") as model_file:
    loaded_model = pickle.loads(model_file.read())
print(loaded_model)

DecisionTreeRegressor(random_state=3162)


In [19]:
# Score the test frame with the regressor and write the result into the
# ConfirmedCases column of the submission frame.
prediction = best_model.predict(test)
ss = ss.assign(ConfirmedCases=prediction)

In [20]:
from sklearn.utils import resample

# Build a class-balanced frame for the Fatalities classifier.
# NOTE(review): only rows whose Fatalities is exactly 0 or exactly 1 are selected
# below; rows with any other value land in neither subset and are left out of
# train_balanced — confirm this is intended.
train_majority = train.loc[train["Fatalities"].eq(0)]
train_minority = train.loc[train["Fatalities"].eq(1)]

# Sample the Fatalities == 1 rows with replacement until their count matches
# the number of Fatalities == 0 rows.
train_minority_upsampled = resample(
    train_minority,
    replace=True,
    n_samples=train_majority.shape[0],
    random_state=123,
)

train_balanced = pd.concat([train_majority, train_minority_upsampled])
print(train_balanced["Fatalities"].value_counts())

# Keep the two coordinate features plus the label and renumber rows from 0.
train_balanced = train_balanced[["Lat", "Long", "Fatalities"]].reset_index(drop=True)

Fatalities
0.0    15424
1.0    15424
Name: count, dtype: int64


In [21]:
from pycaret.classification import*

In [22]:
# Initialise the PyCaret classification experiment with Fatalities as the target.
# session_id pins the random seed so the split and the scores are reproducible
# on re-run; 1436 is the seed of the recorded run.
setup(train_balanced, target="Fatalities", session_id=1436)

Unnamed: 0,Description,Value
0,Session id,1436
1,Target,Fatalities
2,Target type,Binary
3,Original data shape,"(30848, 3)"
4,Transformed data shape,"(30848, 3)"
5,Transformed train set shape,"(21593, 3)"
6,Transformed test set shape,"(9255, 3)"
7,Numeric features,2
8,Preprocess,True
9,Imputation type,simple


<pycaret.classification.oop.ClassificationExperiment at 0x7d62a0e7a830>

In [23]:
# Cross-validate the available classifiers and display the comparison table.
# The return value is not stored here.
compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.8383,0.9276,0.8829,0.8108,0.8452,0.6767,0.6795,1.251
dt,Decision Tree Classifier,0.8377,0.9276,0.8807,0.8111,0.8444,0.6754,0.6781,0.054
et,Extra Trees Classifier,0.8377,0.9276,0.8807,0.8111,0.8444,0.6754,0.6781,0.818
xgboost,Extreme Gradient Boosting,0.8376,0.9276,0.879,0.812,0.8441,0.6753,0.6777,0.178
lightgbm,Light Gradient Boosting Machine,0.8374,0.9274,0.8677,0.8182,0.8422,0.6748,0.6761,2.57
knn,K Neighbors Classifier,0.8236,0.899,0.8524,0.8063,0.8285,0.6472,0.6485,0.169
gbc,Gradient Boosting Classifier,0.7995,0.8971,0.7115,0.8638,0.7801,0.599,0.6087,0.866
ada,Ada Boost Classifier,0.7419,0.8291,0.6176,0.8224,0.7051,0.4837,0.4997,0.506
nb,Naive Bayes,0.6631,0.6973,0.6519,0.6669,0.6593,0.3262,0.3263,0.043
qda,Quadratic Discriminant Analysis,0.6618,0.7023,0.6618,0.6619,0.6617,0.3236,0.3237,0.045


Processing:   0%|          | 0/65 [00:00<?, ?it/s]

In [24]:
# Train an Extra Trees Classifier and show its per-fold scores.
# NOTE(review): the comparison table ranks "rf" first (accuracy 0.8383 vs 0.8377 for "et");
# "et" is hard-coded here — confirm the choice.
best_model2=create_model("et")

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8426,0.9264,0.8972,0.8088,0.8507,0.6852,0.6893
1,0.8454,0.9313,0.8796,0.8232,0.8505,0.6907,0.6924
2,0.8338,0.9285,0.8898,0.8002,0.8426,0.6676,0.6718
3,0.8309,0.9227,0.8769,0.8032,0.8384,0.6619,0.6647
4,0.8411,0.9297,0.8815,0.8158,0.8474,0.6822,0.6845
5,0.8416,0.9323,0.887,0.8132,0.8485,0.6832,0.686
6,0.8291,0.9231,0.862,0.8089,0.8346,0.6582,0.6596
7,0.843,0.9291,0.8767,0.8212,0.8481,0.686,0.6875
8,0.8407,0.9333,0.8693,0.8221,0.845,0.6813,0.6825
9,0.8291,0.9195,0.8869,0.7949,0.8384,0.6582,0.6627


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

In [25]:
# Serialise the fitted classifier to model2.pkl ...
with open("model2.pkl", "wb") as model_file:
    model_file.write(pickle.dumps(best_model2))

# ... then read it straight back and show what was stored.
# Only unpickle files you trust.
with open("model2.pkl", "rb") as model_file:
    loaded_model = pickle.loads(model_file.read())
print(loaded_model)

ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features='sqrt',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_samples_leaf=1,
                     min_samples_split=2, min_weight_fraction_leaf=0.0,
                     monotonic_cst=None, n_estimators=100, n_jobs=-1,
                     oob_score=False, random_state=1436, verbose=0,
                     warm_start=False)


In [26]:
# Score the test frame with the classifier and write the result into the
# Fatalities column of the submission frame, then display the frame.
# NOTE(review): best_model2 was fitted on a binary target, so this column holds
# class labels rather than fatality counts — confirm this is intended.
pred = best_model2.predict(test)
ss = ss.assign(Fatalities=pred)
ss

Unnamed: 0,ForecastId,ConfirmedCases,Fatalities
0,1,6.297872,0.0
1,2,6.297872,0.0
2,3,6.297872,0.0
3,4,6.297872,0.0
4,5,6.297872,0.0
...,...,...,...
12207,12208,0.217391,0.0
12208,12209,0.217391,0.0
12209,12210,0.217391,0.0
12210,12211,0.217391,0.0
