In [1]:
%matplotlib inline

import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import time
import json

In [2]:
fname = 'sp-pos-quot-dep-2021-03-28-19h20-reg3C.csv'
d = pd.read_csv(fname, 
                sep = ';', 
                parse_dates = ['jour'], 
                dtype = {'P': int, 'T': int},
                skiprows = [1])
d.sample(3)

Unnamed: 0,reg,jour,three_class,P,T,pop
11639,Occitanie,2020-07-08,whole,24,4363,5924858.0
15185,Pays de la Loire,2021-03-18,20-59,772,11157,1824237.0
7010,Grand Est,2020-10-27,60+,967,7412,1512834.0


In [3]:
d.dtypes

reg                    object
jour           datetime64[ns]
three_class            object
P                       int64
T                       int64
pop                   float64
dtype: object

In [6]:
regs = d['reg'].unique()
three_class = d['three_class'].unique()

In [7]:
d['incidence'] = d['P'] / d['pop'] * 100000
d['positivite'] = d['P'] / d['T'] * 100
d

Unnamed: 0,reg,jour,three_class,P,T,pop,incidence,positivite
0,Auvergne-Rhône-Alpes,2020-05-13,0-19,3,377,1940253.0,0.154619,0.795756
1,Auvergne-Rhône-Alpes,2020-05-13,20-59,24,2253,3980515.0,0.602937,1.065246
2,Auvergne-Rhône-Alpes,2020-05-13,60+,29,1414,2111609.0,1.373360,2.050919
3,Auvergne-Rhône-Alpes,2020-05-13,whole,57,4051,8032377.0,0.709628,1.407060
4,Auvergne-Rhône-Alpes,2020-05-14,0-19,11,490,1940253.0,0.566936,2.244898
...,...,...,...,...,...,...,...,...
17747,Île-de-France,2021-03-24,whole,13180,119429,12278210.0,107.344637,11.035846
17748,Île-de-France,2021-03-25,0-19,2894,36066,3140965.0,92.137289,8.024178
17749,Île-de-France,2021-03-25,20-59,7722,75490,6620212.0,116.642790,10.229169
17750,Île-de-France,2021-03-25,60+,1588,16916,2517033.0,63.090154,9.387562


In [9]:
tic1 = time.time()
d['incidence hebdo'] = np.zeros(len(d))
for reg in regs:
    for age_class in three_class:
        d2 = d[(d['reg'] == reg) & (d['three_class'] == age_class)]
        d['temp'] = d2.apply(lambda x : (d2[ (d2['jour'] <= x['jour']) 
                                                        & (d2['jour'] > x['jour'] - np.timedelta64(1,'W'))]
                                                    ['incidence']
                                                    .sum()), axis = 1)
        d['incidence hebdo'] = d['incidence hebdo'] + d['temp'].fillna(0)
    toc1 = time.time()
    print('{:.2f} s : région {}'.format(-tic1+toc1, reg))

0.76 s : région Auvergne-Rhône-Alpes
1.52 s : région Bourgogne-Franche-Comté
2.28 s : région Bretagne
3.05 s : région Centre-Val de Loire
3.83 s : région Corse
4.60 s : région Grand Est
5.36 s : région Hauts-de-France
6.11 s : région Normandie
6.88 s : région Nouvelle-Aquitaine
7.67 s : région Occitanie
8.45 s : région Outre-mer
9.23 s : région Pays de la Loire
10.03 s : région Provence-Alpes-Côte d'Azur
10.78 s : région Île-de-France


In [10]:
d = d.drop(columns = ['temp'])
d.sample(3)

Unnamed: 0,reg,jour,three_class,P,T,pop,incidence,positivite,incidence hebdo
7481,Grand Est,2021-02-22,20-59,1419,21808,2740790.0,51.773394,6.506787,221.943308
1350,Bourgogne-Franche-Comté,2020-06-02,60+,16,971,848947.0,1.884688,1.647786,5.53627
1694,Bourgogne-Franche-Comté,2020-08-27,60+,21,1726,848947.0,2.473653,1.216686,12.014884


In [11]:
fname2 = fname[:-4] + '-processed.csv'
d.to_csv(fname2, sep = ';', index = False)