In [1]:
## 08 (1) Density forecasting for individual models and (2) calculate most recent errors for combination weights

In [18]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, mean_absolute_error
import itertools
import statsmodels.api as sm
import os
from joblib import Parallel, delayed
import properscoring as ps
from scipy.stats import norm
from scipy.stats import gaussian_kde

In [19]:
from epiweeks import Week, Year
from datetime import date
def create_epiweek(date):
    """Return the epidemiological week for ``date``.

    Thin wrapper that delegates to ``Week.fromdate`` and returns its result
    unchanged.
    """
    week = Week.fromdate(date)
    return week
def create_epiweekplot(epiweek):
    """Format an epiweek as a plot label of the form ``Y<first 4 chars>W<rest>``.

    ``epiweek`` is converted with ``str()`` first, so an int such as 201604
    yields ``'Y2016W04'``.
    """
    text = str(epiweek)
    year_part, week_part = text[:4], text[4:]
    return 'Y' + year_part + 'W' + week_part
def filename_to_epiweek(filename):
    """Build a ``Week`` from a file name that starts with ``YYYYWW``.

    Characters 0-3 are taken as the year and characters 4-5 as the week; they
    are joined as ``'<year>W<week>'`` and handed to ``Week.fromstring``.
    Anything after the sixth character (e.g. ``'.csv'``) is ignored.
    """
    year_digits = filename[:4]
    week_digits = filename[4:6]
    return Week.fromstring(year_digits + 'W' + week_digits)
def create_epiweek_fromstr(str):
    """Parse an epiweek string into a ``Week`` via ``Week.fromstring``.

    NOTE: the parameter is named ``str`` and therefore hides the built-in
    ``str`` inside this function; the name is kept so keyword callers keep
    working.
    """
    week_text = str
    return Week.fromstring(week_text)
def create_epiweek_fromint(int):
    """Convert an integer epiweek (e.g. 201604) into a ``Week``.

    The value is turned into text with ``str()`` and parsed by
    ``Week.fromstring``.

    NOTE: the parameter is named ``int`` and hides the built-in ``int`` inside
    this function; the name is kept so keyword callers keep working.
    """
    week_text = str(int)
    return Week.fromstring(week_text)

In [11]:
import os
import pandas as pd
import numpy as np
from joblib import Parallel, delayed
import properscoring as ps

def _row_crps(pred_table):
    """Return one ensemble CRPS value per row of ``pred_table``.

    Every column after the first is converted to float64. Of those, the first
    is passed to ``ps.crps_ensemble`` as the observation and the remaining
    ones as the ensemble members of that row.
    """
    values = pred_table.iloc[:, 1:].astype('float64').to_numpy()
    return np.array(
        [ps.crps_ensemble(row[0], row[1:]) for row in values],
        dtype='float64',
    )


def outofsamples_crps(target_var, pred_directory, crps_directory, comparison_operator):
    """Write per-epiweek mean CRPS tables, one CSV per forecast step.

    For every sub-directory ``<target_var>/<pred_directory>/<step_name>`` each
    ``*.csv`` file is read as one model's predictions (first column
    ``epiweek``, second column the observed value, remaining columns the
    ensemble members). For every epiweek in the file the mean CRPS over the
    selected rows is stored, and the table of all models is written to
    ``<target_var>/<crps_directory>/<step_name>.csv``.

    Parameters
    ----------
    target_var : str
        Root directory of the target variable.
    pred_directory : str
        Name of the predictions directory below ``target_var``.
    crps_directory : str
        Name of the output directory below ``target_var`` (created if absent).
    comparison_operator : {'<=', '=='}
        ``'<='`` averages the CRPS of all rows whose epiweek is less than or
        equal to the current one; ``'=='`` uses only the rows of the current
        epiweek.

    Raises
    ------
    ValueError
        If ``comparison_operator`` is neither ``'<='`` nor ``'=='``. This is
        checked before anything is read or written.
    KeyError
        If a step directory lacks a prediction file for one of the models in
        ``model_names_order``.
    """
    # Validate up front so a bad argument fails before any directory is created.
    if comparison_operator not in ('<=', '=='):
        raise ValueError("Invalid comparison_operator: must be '<=' or '=='")

    model_names_order = ['naive', 'historymean', 'ar_pure', 'ar_env', 'ridge', 'lasso', 'alasso', 'sgl',
                 'elasticnet', 'purefactor', 'knn', 'xgboost']
    # Paths setup
    pred_directory_path = os.path.join(target_var, pred_directory)
    crps_directory_path = os.path.join(target_var, crps_directory)
    os.makedirs(crps_directory_path, exist_ok=True)

    for step_name in os.listdir(pred_directory_path):
        pred_models_path = os.path.join(pred_directory_path, step_name)
        if not os.path.isdir(pred_models_path):
            continue

        model_columns = []
        for model_name in os.listdir(pred_models_path):
            pred_file = os.path.join(pred_models_path, model_name)
            model, extension = os.path.splitext(model_name)
            # Only '.csv' files are prediction files; skip stray files such as
            # editor or OS artefacts instead of trying to parse them.
            if extension.lower() != '.csv' or not os.path.isfile(pred_file):
                continue

            # Note from the original author: 'Ill-disease L8_S9 lasso.csv' was
            # observed to have a format issue when read this way.
            y_pred = pd.read_csv(pred_file, parse_dates=[0], dayfirst=True)
            epiweeks = y_pred['epiweek']

            # Score every row exactly once; the per-epiweek means below only
            # re-use these values, so the costly CRPS call is not repeated for
            # each later epiweek.
            row_crps = _row_crps(y_pred)

            mean_by_epiweek = {}
            for filename in epiweeks:
                if comparison_operator == '<=':
                    selected = (epiweeks <= filename).to_numpy()
                else:
                    selected = (epiweeks == filename).to_numpy()
                mean_by_epiweek[filename] = row_crps[selected].mean()

            model_columns.append(pd.Series(mean_by_epiweek, name=model, dtype='float64'))

        full_crps_df = pd.concat(model_columns, axis=1) if model_columns else pd.DataFrame()
        # Fixed column order; raises KeyError when a model file is missing.
        full_crps_df = full_crps_df[model_names_order].sort_index()
        full_crps_df.to_csv(os.path.join(crps_directory_path, f'{step_name}.csv'))

def generate_full_crps_P1(target_var, pred_directory, crps_directory_P1):
    """Write an all-ones table per forecast step.

    For each sub-directory ``<target_var>/<pred_directory>/<step_name>`` the
    first regular file returned by ``os.listdir`` is read, its unique
    ``epiweek`` values become the (sorted) index, and a table filled with 1
    for every model in ``model_names_order`` is saved as
    ``<target_var>/<crps_directory_P1>/<step_name>.csv``. Step directories
    that contain no regular file produce no output.
    """
    model_names_order = ['naive', 'historymean', 'ar_pure', 'ar_env', 'ridge', 'lasso', 'alasso', 'sgl',
                 'elasticnet', 'purefactor', 'knn', 'xgboost']
    # Paths setup
    source_root = os.path.join(target_var, pred_directory)
    output_root = os.path.join(target_var, crps_directory_P1)

    if not os.path.exists(output_root):
        os.makedirs(output_root)

    for step_name in os.listdir(source_root):
        step_path = os.path.join(source_root, step_name)
        if not os.path.isdir(step_path):
            continue

        # Only one prediction file per step is needed to learn the epiweeks.
        candidates = (os.path.join(step_path, entry) for entry in os.listdir(step_path))
        first_file = next((path for path in candidates if os.path.isfile(path)), None)
        if first_file is None:
            continue

        y_pred = pd.read_csv(first_file, parse_dates=[0], dayfirst=True)
        weeks = y_pred['epiweek'].unique()
        ones_table = pd.DataFrame(1, index=weeks, columns=model_names_order).sort_index()
        ones_table.to_csv(os.path.join(output_root, f'{step_name}.csv'))


In [12]:
def run_crps(target_variables_file, pred_directory, crps_directory_P3, crps_directory_P2, crps_directory_P1):
    """Run the three CRPS table builders for every listed target variable.

    ``target_variables_file`` is read line by line (surrounding whitespace
    stripped). Three parallel passes then run in sequence, one job per target:
    ``outofsamples_crps`` with ``'<='`` into ``crps_directory_P3``,
    ``outofsamples_crps`` with ``'=='`` into ``crps_directory_P2``, and
    ``generate_full_crps_P1`` into ``crps_directory_P1``.
    """
    with open(target_variables_file, 'r') as file:
        target_variables = [line.strip() for line in file]

    print(target_variables)

    # Pass 1: CRPS over all epiweeks up to and including each epiweek.
    cumulative_jobs = (
        delayed(outofsamples_crps)(name, pred_directory, crps_directory_P3, '<=')
        for name in target_variables
    )
    Parallel(n_jobs=-1, verbose=51)(cumulative_jobs)

    # Pass 2: CRPS of each epiweek on its own.
    single_week_jobs = (
        delayed(outofsamples_crps)(name, pred_directory, crps_directory_P2, '==')
        for name in target_variables
    )
    Parallel(n_jobs=-1, verbose=51)(single_week_jobs)

    # Pass 3: constant (all-ones) tables.
    constant_jobs = (
        delayed(generate_full_crps_P1)(name, pred_directory, crps_directory_P1)
        for name in target_variables
    )
    Parallel(n_jobs=-1, verbose=51)(constant_jobs)

# Execute the CRPS pipeline for every target listed in 'target_variables_new.txt':
# predictions are read from each target's 'pred' directory and the tables are written
# to its 'full_crps_P3', 'full_crps_P2' and 'full_crps_P1' directories.
run_crps('target_variables_new.txt', 'pred', 'full_crps_P3', 'full_crps_P2', 'full_crps_P1')

['Cardiovascular disease', 'Chronic respiratory disease', 'Factors influencing health status and contact with health services', 'Digestive disease', 'Endocrine disorders', 'Malignant neoplasms', 'Diabetes mellitus', 'Genitourinary disorders', 'Musculoskeletal disease', 'Infectious and Parasitic Diseases', 'Neurological and sense disorders', 'Oral Diseases', 'Other neoplasms', 'Respiratory Infection', 'Skin diseases']
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done   2 out of  15 | elapsed:  2.6min remaining: 16.9min
[Parallel(n_jobs=-1)]: Done   3 out of  15 | elapsed:  2.6min remaining: 10.4min
[Parallel(n_jobs=-1)]: Done   4 out of  15 | elapsed:  2.9min remaining:  7.9min
[Parallel(n_jobs=-1)]: Done   5 out of  15 | elapsed:  2.9min remaining:  5.8min
[Parallel(n_jobs=-1)]: Done   6 out of  15 | elapsed:  2.9min remaining:  4.3min
[Parallel(n_jobs=-1)]: Done   7 out

## calculate crps/log score for submodels

In [13]:
def crps(y_val, y_pred, model, target_var):
    """Score one model with the ensemble CRPS, one value per epiweek.

    For every label in ``y_val.index`` the observation
    ``y_val.loc[label, target_var]`` is scored against the row
    ``y_pred.loc[label]`` (cast to float64) with ``ps.crps_ensemble``.

    Returns a DataFrame indexed by those labels with a single column named
    ``model``.
    """
    # Resets NumPy's global RNG on every call; nothing below draws random
    # numbers, but the side effect is kept as in the original.
    np.random.seed(0)
    scores = pd.DataFrame()

    for week in y_val.index:
        observed = y_val.loc[week, target_var]
        ensemble = np.array(y_pred.loc[week], dtype='float64')
        scores.at[week, model] = ps.crps_ensemble(observed, ensemble)

    return scores

In [14]:
# def dss(y_val, y_pred, model, target_var):
#     dss_df = pd.DataFrame()

#     for epiweek in y_val.index:
#         mean = np.mean(y_pred.loc[epiweek])

#         # Calculate the variance (sample variance)
#         variance = np.var(y_pred.loc[epiweek], ddof=1)
#         variance = np.maximum(variance, 1e-6)
#         # Calculate DSS for the current epiweek and model
#         dss = ((y_val.loc[epiweek, target_var] - mean)**2 / variance) + np.log(variance)
#         dss_df.at[epiweek, model] = dss

#     return dss_df

In [15]:
def log(y_val, y_pred, model, target_var):
    """Compute the logarithmic score of one model, one value per epiweek.

    For every label in ``y_val.index`` a Gaussian kernel density estimate is
    fitted to the samples in ``y_pred.loc[label]`` and evaluated at the
    observation ``y_val.loc[label, target_var]``. The score is the negative
    natural log of that density, with the density floored at 1e-9 so that an
    observation far outside the samples does not produce ``log(0)``.

    Parameters
    ----------
    y_val : pandas.DataFrame
        Observations; must contain the column ``target_var``.
    y_pred : pandas.DataFrame
        Sampled forecasts, one row per label of ``y_val.index``.
    model : str
        Name used for the single output column.
    target_var : str
        Column of ``y_val`` that holds the observation.

    Returns
    -------
    pandas.DataFrame
        Indexed by the labels of ``y_val.index`` with one column ``model``.
    """
    density_floor = 1e-9  # To avoid log(0)
    log_df = pd.DataFrame()

    for epiweek in y_val.index:
        samples = np.array(y_pred.loc[epiweek], dtype='float64')
        kde = gaussian_kde(samples)
        observed = y_val.loc[epiweek, target_var]
        # kde(...) returns a 1-element array for a scalar point. Pull out the
        # scalar explicitly: comparing or converting the array itself relies
        # on array-to-scalar conversion, which NumPy has deprecated.
        density = float(np.ravel(kde(observed))[0])
        log_df.at[epiweek, model] = -np.log(max(density, density_floor))

    return log_df

In [16]:
def disease_crps(target_var, pred_directory, density_forecast_directory):
    """Summarise density-forecast quality per model for every forecast step.

    For each sub-directory ``<target_var>/<pred_directory>/<step_name>`` every
    regular file is read as one model's predictions (the model name is the
    file name without its last four characters). The ``epiweek`` column is
    converted with ``create_epiweek_fromstr`` and used as index; the column
    named ``target_var`` is the observation and all columns after the first
    are the samples. ``crps`` and ``log`` score the model per epiweek, and the
    per-model means are written to
    ``<target_var>/<density_forecast_directory>/<step_name>.csv`` with columns
    ``crps_DENSITY_FORECAST`` and ``log_DENSITY_FORECAST``.
    """
    model_names_order = ['naive', 'historymean', 'ar_pure', 'ar_env', 'ridge', 'lasso', 'alasso', 'sgl',
                 'elasticnet', 'purefactor', 'knn', 'xgboost']
    pred_root = os.path.join(target_var, pred_directory)
    output_root = os.path.join(target_var, density_forecast_directory)
    if not os.path.exists(output_root):
        os.makedirs(output_root)

    for step_name in os.listdir(pred_root):
        step_path = os.path.join(pred_root, step_name)
        if not os.path.isdir(step_path):
            continue

        seen_models = []  # model names in the order their files were read
        crps_scores = pd.DataFrame()
        log_scores = pd.DataFrame()

        for model_name in os.listdir(step_path):  # 'model_name' includes the '.csv'
            pred_file = os.path.join(step_path, model_name)
            if not os.path.isfile(pred_file):
                continue

            model = model_name[:-4]
            seen_models.append(model)

            y_pred = pd.read_csv(pred_file, parse_dates=[0], dayfirst=True)
            y_pred['epiweek'] = y_pred['epiweek'].apply(create_epiweek_fromstr)
            y_pred = y_pred.set_index('epiweek')

            crps_col = crps(y_pred[[target_var]].copy(), y_pred.iloc[:, 1:].copy(), model, target_var)
            crps_scores = pd.concat([crps_scores, crps_col], axis=1)
            log_col = log(y_pred[[target_var]].copy(), y_pred.iloc[:, 1:].copy(), model, target_var)
            log_scores = pd.concat([log_scores, log_col], axis=1)

        crps_scores.columns = seen_models
        log_scores.columns = seen_models

        # Fixed reporting order; raises KeyError if a model file is missing.
        crps_scores = crps_scores[model_names_order]
        log_scores = log_scores[model_names_order]

        crps_means = {name: crps_scores[name].mean() for name in crps_scores.columns}
        log_means = {name: log_scores[name].mean() for name in crps_scores.columns}
        density_forecast_output = pd.DataFrame({
            'crps_DENSITY_FORECAST': pd.Series(crps_means),
            'log_DENSITY_FORECAST': pd.Series(log_means),
        })
        density_forecast_output.to_csv(os.path.join(output_root, f'{step_name}.csv'))
            
# Example (single target): disease_crps('Cardiovascular disease', 'pred', 'density_forecast_metrics')

In [17]:
## Runs the density-forecast scoring (mean CRPS and mean log score per model) for every
## target variable listed in a text file, one parallel job per target variable.

def run_full_crps(target_variables_file, pred_directory, density_forecast_directory):
    """Run ``disease_crps`` in parallel for every target listed in a file.

    Parameters
    ----------
    target_variables_file : str
        Text file with one target-variable name per line.
    pred_directory : str
        Name of the predictions directory below each target directory.
    density_forecast_directory : str
        Name of the output directory below each target directory.
    """
    with open(target_variables_file, 'r') as file:
        # strip() instead of line[:-1]: slicing removed the last character of
        # the final name when the file did not end with a newline, and kept a
        # trailing '\r' for CRLF files. Blank lines are not target names.
        target_variables = [name for name in (line.strip() for line in file) if name]
    print(target_variables)
    Parallel(n_jobs=-1, verbose=51)(
        delayed(disease_crps)(target_var, pred_directory, density_forecast_directory)
        for target_var in target_variables
    )
    
# Score every target listed in 'target_variables_new.txt': predictions are read from each
# target's 'pred' directory and summaries are written to its 'density_forecast_metrics' directory.
run_full_crps('target_variables_new.txt', 'pred', 'density_forecast_metrics')

['Cardiovascular disease', 'Chronic respiratory disease', 'Factors influencing health status and contact with health services', 'Digestive disease', 'Endocrine disorders', 'Malignant neoplasms', 'Diabetes mellitus', 'Genitourinary disorders', 'Musculoskeletal disease', 'Infectious and Parasitic Diseases', 'Neurological and sense disorders', 'Oral Diseases', 'Other neoplasms', 'Respiratory Infection', 'Skin diseases']
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   13.2s
[Parallel(n_jobs=-1)]: Done   2 out of  15 | elapsed:   13.4s remaining:  1.5min
[Parallel(n_jobs=-1)]: Done   3 out of  15 | elapsed:   13.5s remaining:   54.1s
[Parallel(n_jobs=-1)]: Done   4 out of  15 | elapsed:   16.5s remaining:   45.4s
[Parallel(n_jobs=-1)]: Done   5 out of  15 | elapsed:   16.6s remaining:   33.2s
[Parallel(n_jobs=-1)]: Done   6 out of  15 | elapsed:   16.6s remaining:   24.9s
[Parallel(n_jobs=-1)]: Done   7 out

### Tests (scratch checks)

In [25]:
# Scratch check: load the naive model's predictions for 'Cardiovascular disease' (step L8_S1),
# convert the 'epiweek' column with create_epiweek_fromstr, index by it, and display the frame.
sb = pd.read_csv('Cardiovascular disease/pred/L8_S1/naive.csv', parse_dates = [0], dayfirst = True)
sb['epiweek'] = sb['epiweek'].apply(create_epiweek_fromstr)
sb = sb.set_index('epiweek') 
sb

Unnamed: 0_level_0,Cardiovascular disease,0,1,2,3,4,5,6,7,8,...,990,991,992,993,994,995,996,997,998,999
epiweek,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
201604,885.0,903.0,919.0,952.0,942.0,835.0,892.0,1024.0,923.0,956.0,...,889.0,872.0,922.0,974.0,917.0,903.0,880.0,932.0,921.0,923.0
201605,906.0,938.0,985.0,934.0,847.0,1014.0,908.0,967.0,896.0,891.0,...,978.0,926.0,967.0,927.0,928.0,986.0,966.0,916.0,919.0,999.0
201606,861.0,864.0,927.0,890.0,937.0,910.0,1015.0,854.0,906.0,889.0,...,884.0,961.0,899.0,849.0,899.0,858.0,862.0,905.0,919.0,886.0
201607,948.0,843.0,941.0,894.0,974.0,847.0,897.0,879.0,879.0,938.0,...,904.0,900.0,820.0,872.0,884.0,812.0,923.0,976.0,952.0,841.0
201608,1042.0,829.0,742.0,827.0,839.0,840.0,885.0,870.0,936.0,867.0,...,898.0,800.0,865.0,865.0,833.0,838.0,808.0,841.0,846.0,837.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
201848,955.0,979.0,867.0,878.0,991.0,862.0,774.0,877.0,939.0,934.0,...,852.0,1006.0,947.0,846.0,833.0,758.0,917.0,805.0,848.0,986.0
201849,775.0,1017.0,1071.0,1043.0,974.0,952.0,1066.0,873.0,974.0,975.0,...,834.0,1005.0,914.0,1097.0,879.0,1165.0,1038.0,1079.0,1069.0,979.0
201850,862.0,868.0,962.0,939.0,951.0,1019.0,1023.0,1025.0,1003.0,925.0,...,1130.0,979.0,1047.0,931.0,1014.0,755.0,1063.0,980.0,1000.0,976.0
201851,921.0,761.0,710.0,805.0,804.0,732.0,753.0,793.0,709.0,743.0,...,774.0,737.0,690.0,728.0,754.0,753.0,716.0,716.0,799.0,774.0


In [41]:
# Scratch check: keep only the first two rows of `sb` (this overwrites `sb`), then copy the
# 'Cardiovascular disease' column into a one-column frame `a` whose column is renamed to '0'.
sb = sb.iloc[0:2]
a = pd.DataFrame(sb['Cardiovascular disease'])
a.columns = ['0']

In [42]:
# Sanity check: dropping the last four characters removes the '.csv' suffix of a file name.
'201604.csv'[0:-4]

'201604'

In [11]:
# Scratch check: look up the integer row position of the second index label and the integer
# position of the 'randomforest' column.
# NOTE(review): `sb2` is not defined in any cell visible here, so this cell presumably depends
# on leftover kernel state and will fail on a fresh kernel — confirm or remove.
index_name = sb2.index[1]
row_position = sb2.index.get_loc(index_name)
column_position = sb2.columns.get_loc('randomforest')
print("Row Position:", row_position)
print("Column Position:", column_position)

Row Position: 1
Column Position: 3


In [9]:
## Scratch check for the 'pred' directory layout: build the path to the naive model's
## prediction file for 'Cardiovascular disease' at step L8_S1, read it, and display it.
target_var  = "Cardiovascular disease"
pred_directory = 'pred'
pred_directory_path = os.path.join(target_var, pred_directory)
pred_models_path = os.path.join(pred_directory_path, 'L8_S1')
# Disabled loop over every model file in the step directory:
# for model_name in os.listdir(pred_models_path):
#     pred_file = os.path.join(pred_models_path, model_name) 
pred_file = os.path.join(pred_models_path, 'naive.csv')
y_pred = pd.read_csv(pred_file, parse_dates = [0], dayfirst = True)
y_pred

Unnamed: 0,epiweek,Cardiovascular disease,0,1,2,3,4,5,6,7,...,990,991,992,993,994,995,996,997,998,999
0,201604,885.0,870.0,920.0,873.0,930.0,874.0,1142.0,872.0,807.0,...,878.0,882.0,1000.0,949.0,958.0,998.0,878.0,1012.0,944.0,923.0
1,201605,906.0,915.0,934.0,1069.0,888.0,945.0,925.0,926.0,939.0,...,942.0,1006.0,909.0,915.0,1017.0,963.0,942.0,885.0,912.0,896.0
2,201606,861.0,889.0,857.0,827.0,902.0,923.0,705.0,961.0,933.0,...,842.0,837.0,946.0,879.0,863.0,923.0,842.0,916.0,911.0,906.0
3,201607,948.0,928.0,894.0,928.0,906.0,869.0,1081.0,900.0,954.0,...,848.0,920.0,901.0,907.0,870.0,880.0,848.0,915.0,856.0,879.0
4,201608,1042.0,787.0,882.0,761.0,889.0,887.0,860.0,800.0,895.0,...,1076.0,686.0,860.0,720.0,827.0,652.0,1076.0,890.0,915.0,936.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148,201848,955.0,902.0,842.0,813.0,871.0,861.0,893.0,889.0,865.0,...,905.0,794.0,910.0,872.0,931.0,892.0,948.0,907.0,897.0,920.0
149,201849,775.0,981.0,905.0,1051.0,1033.0,943.0,842.0,943.0,961.0,...,986.0,994.0,920.0,1052.0,1059.0,1010.0,1024.0,1023.0,1040.0,994.0
150,201850,862.0,896.0,1005.0,925.0,957.0,949.0,836.0,1012.0,980.0,...,831.0,831.0,973.0,998.0,1130.0,869.0,806.0,977.0,940.0,1037.0
151,201851,921.0,1011.0,776.0,830.0,832.0,735.0,747.0,681.0,716.0,...,780.0,780.0,759.0,741.0,774.0,779.0,929.0,675.0,704.0,683.0


In [94]:
# Inspect the data structure of one pred_outofsamples file (naive model, step L8_S1,
# file 201605.csv); the first CSV column is used as the index.
test_pred_outofsamples = pd.read_csv('./Cardiovascular disease/pred_outofsamples/L8_S1/naive/201605.csv', index_col = 0)
# The matching actual_outofsamples file can be inspected the same way (disabled):
# test_actual_outofsamples = pd.read_csv('./Cardiovascular disease/actual_outofsamples/L8_S1/naive/201605.csv', index_col = 0)
# test_actual_outofsamples
test_pred_outofsamples

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
0,870.0,920.0,873.0,930.0,874.0,1142.0,872.0,807.0,957.0,956.0,...,878.0,882.0,1000.0,949.0,958.0,998.0,878.0,1012.0,944.0,923.0
1,915.0,934.0,1069.0,888.0,945.0,925.0,926.0,939.0,936.0,949.0,...,942.0,1006.0,909.0,915.0,1017.0,963.0,942.0,885.0,912.0,896.0


In [14]:
# Check the type np.var returns for one row of `sb` (second index label, all columns after the first).
type(np.var(sb.iloc[:,1:].loc[sb.index[1]]))

float

In [13]:
# List every value of the 'epiweek' column of `y_pred`, one per line.
for filename in y_pred['epiweek']:
    print(filename)

201604
201605
201606
201607
201608
201609
201610
201611
201612
201613
201614
201615
201616
201617
201618
201619
201620
201621
201622
201623
201624
201625
201626
201627
201628
201629
201630
201631
201632
201633
201634
201635
201636
201637
201638
201639
201640
201641
201642
201643
201644
201645
201646
201647
201648
201649
201650
201651
201652
201701
201702
201703
201704
201705
201706
201707
201708
201709
201710
201711
201712
201713
201714
201715
201716
201717
201718
201719
201720
201721
201722
201723
201724
201725
201726
201727
201728
201729
201730
201731
201732
201733
201734
201735
201736
201737
201738
201739
201740
201741
201742
201743
201744
201745
201746
201747
201748
201749
201750
201751
201752
201801
201802
201803
201804
201805
201806
201807
201808
201809
201810
201811
201812
201813
201814
201815
201816
201817
201818
201819
201820
201821
201822
201823
201824
201825
201826
201827
201828
201829
201830
201831
201832
201833
201834
201835
201836
201837
201838
201839
201840
201841
201842

In [29]:
# Ten evenly spaced values from 0 to 1 inclusive (0/9, 1/9, ..., 9/9).
haha = np.arange(10) / (10-1)
haha

array([0.        , 0.11111111, 0.22222222, 0.33333333, 0.44444444,
       0.55555556, 0.66666667, 0.77777778, 0.88888889, 1.        ])

In [18]:
# Scratch check: load the lasso predictions for 'Ill-defined diseases' (step L8_S9), warning on
# malformed lines instead of failing, then index the frame by epiweek and display it.
sb = pd.read_csv('./Ill-defined diseases/pred/L8_S9/lasso.csv', parse_dates = [0], dayfirst = True, on_bad_lines='warn')  
sb['epiweek'] = sb['epiweek'].apply(create_epiweek_fromstr)
sb = sb.set_index('epiweek')
sb

Unnamed: 0_level_0,Ill-defined diseases,0,1,2,3,4,5,6,7,8,...,990,991,992,993,994,995,996,997,998,999
epiweek,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
201550,3519.0,3865.070219,3816.322739,3641.260136,3671.225570,3806.801721,3724.974713,3770.131708,3591.821260,3662.181240,...,3667.674198,3724.433311,3811.778405,3775.795744,3848.384728,3646.513804,3621.074379,3564.069116,3688.020819,3753.123753
201551,3663.0,3830.642100,3754.910116,3684.973768,3634.141381,3844.007284,3725.020041,3811.854624,3613.610230,3673.923279,...,3637.227268,3825.509993,3775.417201,3757.931686,3807.015760,3761.770492,3712.686476,3633.104617,3695.908721,3758.533574
201552,3947.0,3924.201581,3762.629648,3819.557274,3788.301016,3820.280535,3769.442279,3800.956185,3690.351995,3719.370416,...,3694.170970,3903.701742,3843.414978,3825.489310,3783.040437,3850.027342,3822.266593,3541.981839,3848.070811,3822.802645
201601,3905.0,3899.415970,3708.756472,3791.588216,3776.066767,3854.224371,3858.161838,3842.095282,3692.356105,3730.903892,...,3641.102614,3900.787588,3787.461415,3740.444176,3828.064310,3892.463277,3772.590403,3480.040314,3893.249355,3757.299083
201602,3844.0,3830.535899,3717.349047,3743.571787,3830.045929,3971.399101,3955.852262,3904.511476,3764.804688,3799.248141,...,3622.431386,3893.074436,3799.979596,3778.887977,3849.848408,3899.252092,3767.355713,3526.707742,3826.409911,3778.211409
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
201840,4384.0,3780.699869,3795.077401,3672.332530,3774.967240,3763.873307,3802.824770,3814.852255,3833.931879,3892.950888,...,3805.200196,3814.581083,3792.222455,3871.062184,3864.220991,3792.504976,3824.627407,3792.562617,3747.898588,3701.476246
201841,3631.0,3848.130531,3873.553514,3728.853677,3798.224204,3853.425537,3908.051097,3861.876536,3854.443663,3892.706882,...,3920.289445,3846.260787,3820.706814,3926.424140,3874.030753,3855.531303,3815.574704,3841.858081,3828.014647,3747.877406
201842,4290.0,3756.960565,3912.111476,3732.609248,3720.342783,3767.457488,3809.990617,3781.460881,3741.802550,3735.131846,...,3801.441960,3699.125239,3775.656945,3825.220498,3897.572999,3863.953150,3778.430702,3865.312993,3723.938773,3689.625294
201843,4336.0,3735.834146,3791.925068,3672.264748,3628.906389,3724.245098,3775.196936,3699.249859,3675.129846,3850.745797,...,3756.441572,3710.684167,3829.490049,3800.703250,3768.270642,3935.611642,3730.739863,3777.791978,3631.831767,3684.426324


In [85]:
# Scan the ridge prediction files of 'Malignant neoplasms' for steps L8_S1 .. L8_S12 and
# print the rows that contain at least one NaN.
for i in range(1,13):
    sb = pd.read_csv(f'./Malignant neoplasms/pred/L8_S{i}/ridge.csv', parse_dates = [0], dayfirst = True)  
    sb['epiweek'] = sb['epiweek'].apply(create_epiweek_fromstr)
    sb = sb.set_index('epiweek')
    nan_rows = sb[sb.isna().any(axis=1)]
    print("Rows with NaN values:")
    print(nan_rows)    
    
# Disabled: the same scan for infinite values.
#     inf_rows = sb[sb.isin([np.inf, -np.inf]).any(axis=1)]
#     print("Rows with inf values:")
#     print(inf_rows)

##### Author's finding: './Malignant neoplasms/pred/L8_S4/sgl.csv' has missing values.
##### Author's finding: './Malignant neoplasms/pred/L8_S4/xgboost.csv' has missing values.

Rows with NaN values:
Empty DataFrame
Columns: [Malignant neoplasms, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, ...]
Index: []

[0 rows x 1001 columns]
Rows with NaN values:
Empty DataFrame
Columns: [Malignant neoplasms, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, ...]
Index: []

[0 rows x 1001 columns]
Rows with 

In [4]:
# Scan the xgboost prediction files of 'Malignant neoplasms' (steps L8_S1 .. L8_S12) row by
# row and stop at the first row whose length is not 1001.
stop_flag = False

for i in range(1, 13):
    # Leave the outer loop once an irregular row has been found.
    if stop_flag:
        break 
    sb = pd.read_csv(f'./Malignant neoplasms/pred/L8_S{i}/xgboost.csv', parse_dates=[0], dayfirst=True)
    sb['epiweek'] = sb['epiweek'].apply(create_epiweek_fromstr)
    sb = sb.set_index('epiweek')
    for j in range(len(sb)):
        # Prints one progress line per row: step number, row number, row length.
        print(i, j, len(sb.iloc[j]))
        if len(sb.iloc[j]) != 1001:
            print(f"Stopping at i={i}, j={j} because row length is {len(sb.iloc[j])}")
            stop_flag = True  
            break  

1 0 1001
1 1 1001
1 2 1001
1 3 1001
1 4 1001
1 5 1001
1 6 1001
1 7 1001
1 8 1001
1 9 1001
1 10 1001
1 11 1001
1 12 1001
1 13 1001
1 14 1001
1 15 1001
1 16 1001
1 17 1001
1 18 1001
1 19 1001
1 20 1001
1 21 1001
1 22 1001
1 23 1001
1 24 1001
1 25 1001
1 26 1001
1 27 1001
1 28 1001
1 29 1001
1 30 1001
1 31 1001
1 32 1001
1 33 1001
1 34 1001
1 35 1001
1 36 1001
1 37 1001
1 38 1001
1 39 1001
1 40 1001
1 41 1001
1 42 1001
1 43 1001
1 44 1001
1 45 1001
1 46 1001
1 47 1001
1 48 1001
1 49 1001
1 50 1001
1 51 1001
1 52 1001
1 53 1001
1 54 1001
1 55 1001
1 56 1001
1 57 1001
1 58 1001
1 59 1001
1 60 1001
1 61 1001
1 62 1001
1 63 1001
1 64 1001
1 65 1001
1 66 1001
1 67 1001
1 68 1001
1 69 1001
1 70 1001
1 71 1001
1 72 1001
1 73 1001
1 74 1001
1 75 1001
1 76 1001
1 77 1001
1 78 1001
1 79 1001
1 80 1001
1 81 1001
1 82 1001
1 83 1001
1 84 1001
1 85 1001
1 86 1001
1 87 1001
1 88 1001
1 89 1001
1 90 1001
1 91 1001
1 92 1001
1 93 1001
1 94 1001
1 95 1001
1 96 1001
1 97 1001
1 98 1001
1 99 1001
1 100 1001

In [20]:
# Scratch check: load the ar_env predictions for 'Ill-defined diseases' (step L8_S3),
# index the frame by epiweek, and display it.
sb = pd.read_csv('./Ill-defined diseases/pred/L8_S3/ar_env.csv', parse_dates = [0], dayfirst = True)  
sb['epiweek'] = sb['epiweek'].apply(create_epiweek_fromstr)
sb = sb.set_index('epiweek')
sb

Unnamed: 0_level_0,Ill-defined diseases,0,1,2,3,4,5,6,7,8,...,990,991,992,993,994,995,996,997,998,999
epiweek,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
201602,3930.0,3767.013431,3873.763060,3930.923308,3816.645035,3901.874497,3873.263336,3717.526420,3858.361096,3882.710062,...,3928.323532,4002.037013,3712.953141,3781.796370,3767.112838,3852.879062,3743.315043,3868.836755,3832.469189,3753.386840
201603,3786.0,3859.511813,3977.960935,3976.741932,3940.286867,4085.288418,3931.385537,3786.025033,3707.854166,3998.549674,...,3957.057894,3984.581112,3746.702284,3837.656917,3971.255388,3834.591223,3733.621488,3905.112675,3845.208562,3878.290460
201604,3519.0,3930.062362,3946.538546,3946.840275,3939.426694,4004.078862,3931.105463,3850.260341,3679.785898,3876.015354,...,3911.013309,3840.499447,3851.027504,3998.855303,4061.823507,4044.393487,3893.238411,3869.289500,3838.435962,3874.770853
201605,3663.0,3959.943702,3866.622956,3903.856314,3903.126344,3953.035452,3917.294877,3971.965629,3816.777886,3778.185916,...,3875.904138,3714.476285,3930.133474,4028.528237,4064.506031,3982.022219,3928.245001,3891.371615,3888.784131,3884.553184
201606,3947.0,3899.190034,4021.346738,3996.332874,3919.182693,3929.824499,3964.994758,4080.284233,3991.236911,3887.431736,...,3903.058606,3821.605486,4032.534470,3975.638959,3954.553154,4043.579964,3970.522193,3946.056574,3975.556154,3943.319537
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
201846,4384.0,4118.045513,4390.983906,3967.418687,4035.483819,4144.339064,4107.089188,4025.226623,4118.290613,3907.661759,...,4076.199819,4192.642926,4115.878522,4100.970372,4060.495197,4037.344945,3952.603357,4169.837677,4072.819084,4201.643429
201847,3631.0,3936.804869,4177.343391,3920.880326,3789.752958,3855.838879,4107.737990,3855.077054,3995.885637,3826.346666,...,4028.634066,3965.746913,4043.133749,4004.153153,3859.917553,4171.143821,3949.049654,4063.343434,3925.327944,3993.043386
201848,4290.0,4165.311088,4232.087001,4254.073677,3905.336716,4299.974739,4151.672911,4260.492468,4201.849857,4097.344327,...,4292.202582,4259.485472,4293.480815,4231.036880,4300.749446,4188.124615,4191.695612,4126.449737,4221.884046,4201.248161
201849,4336.0,4433.874742,4464.574324,4447.860809,4218.181736,4465.814414,4483.910209,4531.564203,4305.210693,4348.952407,...,4554.585082,4319.716573,4476.196179,4399.070961,4516.673986,4343.414883,4482.393801,4334.632795,4264.895200,4421.043134
