In [66]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import seaborn as sns
import json
from datetime import date, timedelta
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [67]:
DATA_PATH = "data/"

INTERVENTIONS_DATASET = DATA_PATH + "interventions.csv"
APPLE_MOBILITY_DATASET = DATA_PATH + "applemobilitytrends.csv"
GLOBAL_MOBILITY_DATASET = DATA_PATH + "Global_Mobility_Report.csv.gz"

#Source NL https://pageviews.wmcloud.org/?project=nl.wikipedia.org&platform=all-access&agent=user&redirects=0&start=2020-01-01&end=2020-08-31&pages=Stress|Bezorgdheid|Eenzaamheid|Slapeloosheid|Depressie_(klinisch)
#Source IT https://pageviews.wmcloud.org/?project=it.wikipedia.org&platform=all-access&agent=user&redirects=0&start=2020-01-01&end=2020-08-31&pages=Stress|Ansia|Disturbo_depressivo|Insonnia
NL_PAGEVIEWS_DATASET = DATA_PATH + "pageviews_nl.csv"
IT_PAGEVIEWS_DATASET = DATA_PATH + "pageviews_it.csv"

In [68]:
EFFECT_DURATION = 10
LANGUAGES = ['nl', 'it']

In [69]:
interventions_data = pd.read_csv(INTERVENTIONS_DATASET)
apple_mobility_data = pd.read_csv(APPLE_MOBILITY_DATASET, low_memory=False)
google_mobility_data = pd.read_csv(GLOBAL_MOBILITY_DATASET, compression='gzip', low_memory=False)

pageviews_data = {}
pageviews_data['nl'] = pd.read_csv(NL_PAGEVIEWS_DATASET, low_memory=False)
pageviews_data['it'] = pd.read_csv(IT_PAGEVIEWS_DATASET, low_memory=False)

In [70]:
interventions_data.set_index('lang', inplace=True)

for col in interventions_data.columns:
    interventions_data[col] = pd.to_datetime(interventions_data[col])


In [71]:
def is_affected(country_code, effect, day): #TODO: placeholder
    if day == pd.NaN:
        return 0
    return 1

In [72]:
diseases_names = {
    'en':['anxiety', 'depression', 'insomnia', 'loneliness', 'stress'],
    'nl':['Bezorgdheid', 'Depressie (klinisch)', 'Slapeloosheid', 'Eenzaamheid', 'Stress'],
    'it':['Ansia', 'Disturbo depressivo', 'Insonnia', 'Stress', 'Stress']
}
#TODO: replace placeholder with loneliness wikipedia page for italy, also update the data!

In [73]:
def get_data(lang):
    df = pageviews_data[lang].copy()
    df.rename(columns = dict(zip(diseases_names[lang], diseases_names['en'])), inplace = True)
    df['Date'] = pd.to_datetime(df['Date'])
    df['Day_of_the_week'] = df['Date'].apply(lambda x: x.strftime("%A"))
    df.set_index('Date', inplace=True)
    df = df[diseases_names['en'] + ['Day_of_the_week']]
    return df

In [74]:
#APPLE
apple_mobility_data = apple_mobility_data.drop(columns = ['alternative_name', 'country', 'sub-region'], axis=1)

In [75]:
region = {
    'nl':'Netherlands',
    'it':'Italy'
}

In [76]:
def get_apple_data(lang):
    df = apple_mobility_data.loc[apple_mobility_data['region'] == region[lang]]
    df.index = ['driving', 'transit', 'walking']
    df = df.drop(columns = ['geo_type', 'region' ,'transportation_type'], axis = 1)
    df = df.T
    df.index = pd.to_datetime(df.index)
    return df

In [77]:
#GOOGLE
def get_google_data(lang):
    df = google_mobility_data[google_mobility_data['country_region_code'] == lang.upper()]
    df = df[df['sub_region_1'].isna()] #keep only rows with country level data (for italy only using sub_region_1 is enough)
    df = df.dropna(axis=1, how='all') #keep only rows without NaN values(should drop all "sub_region"s)
    df = df.interpolate() #replace NaN values with average of bfill and ffill (No NaN values in this filtered dataframe but not necessarily the case)
    df.set_index(pd.DatetimeIndex(df['date']), inplace = True)
    del df['date']
    df = df.drop_duplicates()
    df.rename(columns = {'retail_and_recreation_percent_change_from_baseline':'retail',
                        'grocery_and_pharmacy_percent_change_from_baseline':'grocery',
                        'parks_percent_change_from_baseline':'parks',
                        'transit_stations_percent_change_from_baseline':'stations',
                        'workplaces_percent_change_from_baseline':'workplaces',
                        'residential_percent_change_from_baseline':'residential'},
                        inplace = True)
    return df

In [78]:
def merge_datasets(pageviews, apple, google):
    df = pd.merge(pageviews, apple, left_index=True, right_index=True)
    df = pd.merge(df, google, left_index=True, right_index=True)
    return df

In [79]:
def regression(disease, lang, df):
    mod = smf.ols(formula= disease + ' ~ driving + transit + walking + retail + grocery + parks + stations + workplaces + residential + C(Day_of_the_week)', data=df)
    np.random.seed(2)
    res = mod.fit()
    print('Regression for ' + disease + ' in ' + region[lang] + '\n')
    print(res.summary())

In [80]:
def loop(langs, diseases):
    for lang in langs:
        pageviews = get_data(lang)
        apple = get_apple_data(lang)
        google = get_google_data(lang)
        df = merge_datasets(pageviews, apple, google)
        for disease in diseases:
            regression(disease, lang, df)

In [81]:
loop(LANGUAGES, diseases_names['en'])

Regression for anxiety in Netherlands

                            OLS Regression Results                            
Dep. Variable:                anxiety   R-squared:                       0.187
Model:                            OLS   Adj. R-squared:                  0.117
Method:                 Least Squares   F-statistic:                     2.684
Date:                Sat, 17 Dec 2022   Prob (F-statistic):            0.00105
Time:                        12:09:51   Log-Likelihood:                -522.97
No. Observations:                 191   AIC:                             1078.
Df Residuals:                     175   BIC:                             1130.
Df Model:                          15                                         
Covariance Type:            nonrobust                                         
                                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------

KeyError: "['loneliness'] not in index"