In [None]:
import numpy as np
import pandas as pd

In [None]:
import seaborn as sns
import plotly.graph_objs as go

In [None]:
from datetime import date

# Preprocessing

In [None]:
# Load the two weekly Excel exports (EPI_MAIN and LABD_PCR).
# Paths are relative to the notebook's working directory.
df_epi = pd.read_excel("data/export_EPI_MAIN_weekly_201540_201939_FINAL.xlsx")
df_lab = pd.read_excel("data/export_LABD_PCR_weekly_201540_201939_FINAL.xlsx")

In [None]:
# Replace the spreadsheet headers with short snake_case names (assigned by position).
# NOTE(review): the suffixes look like age brackets (0-2, 3-6, 7-14, 15+, all) — confirm
# against the source files.
df_epi.columns = [
    "region", "city", "informator", "year", "week",
    "orz_0_2", "orz_3_6", "orz_7_14", "orz_15", "orz_all",
    "gripp_0_2", "gripp_3_6", "gripp_7_14", "gripp_15", "gripp_all",
    "p_0_2", "p_3_6", "p_7_14", "p_15", "p_all",
]
df_lab.columns = [
    "region", "city", "informator", "year", "week",
    "tests_gripp_0_2", "tests_gripp_3_6", "tests_gripp_7_14", "tests_gripp_15",
    "tests_gripp_all",
    "tests_gripp_pos_0_2", "tests_gripp_pos_3_6", "tests_gripp_pos_7_14", "tests_gripp_pos_15",
    "tests_gripp_pos_all",
    "tests_orz_0_2", "tests_orz_3_6", "tests_orz_7_14", "tests_orz_15",
    "tests_orz_all",
    "tests_orz_pos_0_2", "tests_orz_pos_3_6", "tests_orz_pos_7_14", "tests_orz_pos_15",
    "tests_orz_pos_all",
]

In [None]:
# Drop the columns that are not used downstream. Reassigning (instead of
# inplace=True) and ignoring already-missing columns keeps this cell safe to
# re-run: the in-place version raises KeyError on a second execution.
df_epi = df_epi.drop(columns=['region', 'informator'], errors='ignore')

In [None]:
def create_dateime_col(full_df, day):
    """Build a datetime column from the ISO ``year`` / ``week`` columns.

    Parameters
    ----------
    full_df : pandas.DataFrame
        Frame with integer-valued ``year`` and ``week`` columns (ISO calendar).
    day : int
        ISO weekday each week is anchored on (1 = Monday ... 7 = Sunday).

    Returns
    -------
    pandas.Series
        ``datetime64`` values sharing ``full_df``'s index.

    Raises
    ------
    ValueError
        If a year/week/day combination is not a valid ISO calendar date, or a
        year/week value is missing.
    """
    # Columns are read by label: positional access (``row[0]``) on a
    # label-indexed row is deprecated in pandas. ``int(...)`` also accepts
    # float-typed columns, which ``date.fromisocalendar`` rejects.
    iso_dates = [
        date.fromisocalendar(int(year), int(week), day)
        for year, week in zip(full_df['year'], full_df['week'])
    ]
    # dtype=object keeps an empty input well-defined before conversion.
    return pd.to_datetime(pd.Series(iso_dates, index=full_df.index, dtype=object))

In [None]:
# Anchor every (year, week) pair on ISO weekday 1 (Monday) of that week.
for weekly_frame in (df_epi, df_lab):
    weekly_frame["date"] = create_dateime_col(weekly_frame, 1)

In [None]:
# Index both frames by (city, date) so rows can be selected per city and week.
df_epi, df_lab = (
    weekly_frame.set_index(["city", "date"]) for weekly_frame in (df_epi, df_lab)
)

In [None]:
# Order rows by city, then chronologically within each city.
df_epi = df_epi.sort_index(level=["city", "date"])
df_lab = df_lab.sort_index(level=["city", "date"])


In [None]:
# Column groups sharing the same bracket suffixes, in the same order, so they
# can be zipped position-by-position: ORZ cases, flu cases, population.
orz, gripp, pop = (
    [f"{prefix}_{bracket}" for bracket in ("0_2", "3_6", "7_14", "15", "all")]
    for prefix in ("orz", "gripp", "p")
)

### Log target

In [None]:
# df_epi[orz+gripp] = np.log(1+df_epi[orz+gripp])

### Diffs

In [None]:
# df_epi[orz+gripp] = (df_epi[orz+gripp] - df_epi.groupby('city')[orz+gripp].shift()).bfill()

In [None]:
# Inspect the indexed frame before the population scaling in the next cell.
df_epi

In [None]:
# Express population in units of 100,000 so the rates computed next are per 100k.
# NOTE: this rescales the stored columns, so re-running the cell divides again.
df_epi[pop] = df_epi[pop].div(1e5)

In [None]:
# Turn raw counts into rates: each ORZ / flu column is divided by the population
# column at the same position in `pop`. The columns are overwritten, so
# re-running this cell divides a second time.
for case_cols in (orz, gripp):
    for cases, population in zip(case_cols, pop):
        df_epi[cases] = df_epi[cases] / df_epi[population]
    

In [None]:
# Distinct city labels, in order of first appearance in the index.
cities = df_epi.index.unique(level='city')

In [None]:
def draw_series(df, cities, cols):
    """Plot the selected columns as one time-series trace per (city, column).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index has ``city`` and ``date`` levels.
    cities : iterable
        City labels to plot.
    cols : iterable of str
        Columns to plot for every city.

    Returns
    -------
    plotly.graph_objs.Figure
        Figure holding one trace per city/column pair, named ``"<city>_<col>"``.
    """
    city_level = df.index.get_level_values('city')
    traces = []

    for city in cities:
        city_chunk = df.loc[city_level == city]
        # x must come from the city's own rows: using the timeline of the whole
        # frame shifts the curve whenever a city is missing some weeks.
        city_dates = city_chunk.index.get_level_values('date')
        for col in cols:
            traces.append({'x': city_dates,
                           'y': city_chunk[col],
                           'name': f'{city}_{col}'})

    return go.Figure(traces)

In [None]:
def _week_dates_by_year(df, week):
    """Map each value of the ``year`` column to the date of its rows with the given week."""
    rows = df.loc[df['week'] == week]
    return dict(zip(rows['year'], rows.index.get_level_values('date')))


def _season_spans(df, first_week, last_week):
    """Return (start, end) date pairs for every year that has both border weeks."""
    starts = _week_dates_by_year(df, first_week)
    ends = _week_dates_by_year(df, last_week)
    return [(starts[year], ends[year]) for year in sorted(starts.keys() & ends.keys())]


def add_lines(fig, df, summer_weeks=(22, 35), fall_weeks=(36, 47)):
    """Shade the summer and fall periods of every year on a figure.

    Parameters
    ----------
    fig : plotly figure
        Figure to annotate; it is modified through ``add_vrect`` and returned.
    df : pandas.DataFrame
        Frame with ``year`` and ``week`` columns and a ``date`` index level.
    summer_weeks, fall_weeks : tuple of (int, int)
        First and last week number of each shaded period.

    Returns
    -------
    The same ``fig`` object.
    """
    # Borders are paired by year, so a year lacking either border week is
    # skipped instead of being paired with a neighbouring year. The borders
    # are read from the `df` argument, not from a notebook global.
    for start, end in _season_spans(df, *summer_weeks):
        fig.add_vrect(x0=start, x1=end,
                      annotation_text="Summer", annotation_position="top left",
                      fillcolor="red", opacity=0.25, line_width=0)

    for start, end in _season_spans(df, *fall_weeks):
        fig.add_vrect(x0=start, x1=end,
                      annotation_text="Fall", annotation_position="top right",
                      fillcolor="green", opacity=0.25, line_width=0)
    return fig

In [None]:
# cities = ['Москва', 'Казань', 'Нижний Новгород', 'Новосибирск']
# columns = ['orz_all']

# fig = draw_series(df_epi, cities, columns)
# fig = add_lines(fig, df_epi)
# fig.show(renderer='iframe')

In [None]:
# df_epi_mean = df_epi.groupby(['city', 'week']).mean()

In [None]:
# df_epi_std = df_epi.groupby(['city', 'week']).std()

In [None]:
# def draw_stats(mean, std, col):
#     fig = go.Figure(data=go.Scatter(x=mean[col], y=std[col], mode='markers', text=mean.index))
#     return fig

# fig2 = draw_stats(df_epi_mean, df_epi_std, 'orz_all')
# fig2.show(renderer='iframe')

In [None]:
def draw_stats(df, cities, col):
    """Scatter one column against the week number, one marker trace per city.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index has ``city`` and ``week`` levels.
    cities : iterable
        City labels to plot.
    col : str
        Column plotted on the y axis.

    Returns
    -------
    plotly.graph_objs.Figure
        Figure with one ``markers`` trace per city.
    """
    city_level = df.index.get_level_values('city')
    traces = []
    for city in cities:
        city_chunk = df.loc[city_level == city]
        # Weeks are taken from the city's own rows so x and y always have the
        # same length and refer to the same observations.
        city_weeks = city_chunk.index.get_level_values('week')
        traces.append(go.Scatter(x=city_weeks, y=city_chunk[col], mode='markers', name=city))

    return go.Figure(data=traces)

In [None]:
# cities = df_epi.index.get_level_values('city').unique()
# col = 'gripp_all'
# fig2 = draw_stats(df_epi_mean, cities, col)
# fig2.show(renderer='iframe')

In [None]:
# cities = df_epi.index.get_level_values('city').unique()
# col = 'orz_all'
# fig3 = draw_stats(df_epi_std, cities, col)
# fig3.show(renderer='iframe')

### Make target

In [None]:
# Keep only the rows of weeks 24 through 33 (both ends included).
df_summer = df_epi[df_epi['week'].between(24, 33)]

In [None]:
# Target: per-city mean of `orz_all` over the summer weeks.
# The column is selected before aggregating, so only it is averaged; averaging
# every column first is wasted work and fails if any column is non-numeric.
df_target = df_summer.groupby('city')[['orz_all']].mean()

### Make Dataset

In [None]:
# Load the feature table (it is re-indexed by city a few cells below).
df_data = pd.read_csv('data/very_big_dump.csv')

In [None]:
# Remove columns that are not used as features. Reassigning (instead of
# inplace=True) and ignoring already-missing columns keeps this cell safe to
# re-run: the in-place version raises KeyError on a second execution.
df_data = df_data.drop(columns=['level', 'lon', 'county'], errors='ignore')

In [None]:
# 'Unnamed: 0' is presumably the CSV's header-less first column; it is used as
# the city key from here on.
df_data = df_data.rename(columns={'Unnamed: 0': 'city'})

In [None]:
# Index the features by city so they can be matched with the per-city target.
df_data = df_data.set_index('city')

In [None]:
# Columns pandas parsed as generic objects; the result is a Series of dtypes
# whose index holds the affected column names.
bad_cols = df_data.dtypes.loc[lambda dtypes: dtypes == 'object']

In [None]:
# Convert the object columns to floats after stripping the spaces embedded in
# their values (the space is matched literally).
for text_col in bad_cols.index:
    without_spaces = df_data[text_col].str.replace(' ', '', regex=False)
    df_data[text_col] = without_spaces.astype(float)

In [None]:
from catboost import Pool, cv, CatBoostRegressor
from sklearn.model_selection import train_test_split
import shap

### Train / validation split

In [None]:
# Hold out 20% of the cities for validation. A fixed seed keeps the split, and
# therefore every downstream metric and SHAP plot, reproducible across runs
# (random_state=None produced a different split on every execution).
train_idx, valid_idx = train_test_split(
    df_data.index,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

In [None]:
# Keep only cities that have a target value, then pick the targets by the
# feature rows' labels. Filtering the two frames independently can leave them
# with different row counts or a different row order, which would silently pair
# features with the wrong label once both are handed to a Pool.
has_target = df_data.index.isin(df_target.index)
train_data = df_data.loc[has_target & df_data.index.isin(train_idx)]
valid_data = df_data.loc[has_target & df_data.index.isin(valid_idx)]

# `train_targer` (sic) is the name the following cells expect.
train_targer = df_target.loc[train_data.index]
valid_target = df_target.loc[valid_data.index]

In [None]:
# Wrap features and labels in CatBoost Pool objects for training and validation.
# `train_targer` (sic) is the training target frame built in the split cell.
train_pool = Pool(data=train_data, label=train_targer,)
val_pool = Pool(data=valid_data, label=valid_target,)

### Set Model

In [None]:
# Constructor-level CatBoost settings, applied via `set_params` below.
# Options tried earlier and currently disabled: learning_rate=0.01,
# bagging_temperature=0.8.
model_params = {
    'thread_count': 8,
    'iterations': 2000,
    'loss_function': 'RMSE',
    'eval_metric': 'RMSE',
    'depth': 4,
    'rsm': 0.8,
    'allow_writing_files': False,
    'save_snapshot': False,
}

In [None]:
# Keyword arguments forwarded to `model.fit`.
training_params = {
    'use_best_model': True,
    'early_stopping_rounds': 50,
    'verbose': 100,
}

In [None]:
# Create the regressor and apply the hyper-parameters from `model_params`.
model = CatBoostRegressor()
model.set_params(**model_params)

In [None]:
# Fit on the training pool; the validation pool is passed as eval_set and the
# remaining options come from `training_params`.
model = model.fit(train_pool, eval_set=val_pool, **training_params)

In [None]:
# Predict for the validation pool and label the rows with the validation
# target's index so predictions can be read next to the true values.
prediction = pd.DataFrame(model.predict(val_pool), index=valid_target.index)

In [None]:
# Display the validation predictions.
prediction

In [None]:
# Display the true validation targets for comparison with the predictions.
valid_target

In [None]:
def _make_feature_importance_df_(model, data_pool):
    feature_importance = model.get_feature_importance(data_pool)
    feature_names = data_pool.get_feature_names()

    importance = {}
    for score, name in sorted(zip(feature_importance, feature_names), reverse=True):
        importance[name] = score

    feature_importance_df = pd.DataFrame.from_dict(importance, orient='index', columns=['score'])
    feature_importance_df.index.name = 'features'
    return feature_importance_df

In [None]:
# Feature-importance table for the fitted model, computed on the training pool.
_make_feature_importance_df_(model, train_pool)

In [None]:
# Build a SHAP explainer for the fitted model.
explainer = shap.Explainer(model)

In [None]:
# Compute SHAP values for the training features.
shap_values = explainer(train_data)

In [None]:
# Beeswarm plot of the SHAP values computed above.
shap.plots.beeswarm(shap_values)

In [None]:
# Summary plot of the same SHAP values, passed together with the training features.
# NOTE(review): the commented-out line below needs matplotlib (`plt`), which is
# not imported in the visible cells.
# fig = plt.figure(figsize=(10, 10))
shap.summary_plot(shap_values, train_data)