# Step 1
Use CH government data (main references: [From generation to generation - development of causes of death 1970 to 2004](https://opendata.swiss/en/dataset/von-generation-zu-generation-entwicklung-der-todesursachen-1970-bis-2004), and [Pocket Health Statistics](https://opendata.swiss/en/dataset/gesundheit-taschenstatistik-2019)) to build baseline numbers.

In [1]:
import numpy as np
import pandas as pd

baseline_men_death_rates_percentiles_2000_2004 = [41, 52, 66, 77, 85, 90, 93]
baseline_women_death_rates_percentiles_2000_2004 = [52, 62, 76, 84, 90, 94, 96]

swiss_baseline_behavior_data = pd.DataFrame([ ['Male', 77.8, 62.8, 51.0, 31.0, 14.9],
                                              ['Female', 73.6, 73.4, 33.0, 23.3, 7.1]], 
    columns=['Biological Gender', 'Physically Active', 'Pays attention to diet', 
             'Overweight or Obese', 'Smoker', 'Consumes alcohol daily'])

In [2]:
swiss_baseline_behavior_data.head()

Unnamed: 0,Biological Gender,Physically Active,Pays attention to diet,Overweight or Obese,Smoker,Consumes alcohol daily
0,Male,77.8,62.8,51.0,31.0,14.9
1,Female,73.6,73.4,33.0,23.3,7.1


# Step 2
Augment the baseline data by generating several samples based on all the metrics extracted from the papers (listed at the end of this section).

In [3]:
# set up categories
bio_genders = ['Female', 'Male']
diet_choices = ['Mediterranean', 'Intermittent Fasting', 'Keto', 'Vegan/Plant-Based', 'Vegetarian', 'Carnivore', 'Paleo']
water_consumption_choices = ['Less than recommended', 'Normal']
smoking_behavior = ['Never smoked', 'Regular smoker', 'Ex-smoker']
alcohol_choices = ['Abstainer/rare drinker', 'Weekly drinker', '<15g alcohol per day', 'Between 15g and 50g alcohol per day', '>50g alcohol per day']
hours_sleep_choices = ['Less 6 hours per night', '6 to 8 hours per night', '8 to 10 hours per night', 'More than 10 hours per night']

In [4]:
def BMI(height, weight): 
    bmi = weight/(height**2) 
    return bmi 

In [5]:
# generate life-expectancy influencing fields
import random

NB_SAMPLES = 100000
np.random.seed(2021)

# data extracted from [15]
average_adult_height_men = 178.4
average_adult_height_men_std = 7.6

average_adult_height_women = 164.7
average_adult_height_women_std = 7.1

# data from [16]
average_adult_weight_men = 77.5
average_adult_weight_men_std = 12
average_adult_weight_women = 63.15
average_adult_weight_women_std = 11


generated_samples = []
for _ in range(NB_SAMPLES):
    if np.random.rand() >= 0.5:
        # gender, height and weight
        gender = 'Female'
        height = np.random.normal(loc=average_adult_height_women, scale=average_adult_height_women_std)
        weight = np.random.normal(loc=average_adult_weight_women, scale=average_adult_weight_women_std)
        baseline_longevity = baseline_women_death_rates_percentiles_2000_2004[3]
    else:   
        # gender, height and weight
        gender = 'Male'
        height = np.random.normal(loc=average_adult_height_men, scale=average_adult_height_men_std)
        weight = np.random.normal(loc=average_adult_weight_men, scale=average_adult_weight_men_std)
        baseline_longevity = baseline_men_death_rates_percentiles_2000_2004[3]
        
    # adjusting for switzerland's overweight levels
    prob_overweight = swiss_baseline_behavior_data[swiss_baseline_behavior_data['Biological Gender'] == gender]['Overweight or Obese'].iloc[0] / 100.0
    weight_offset = 0
    if np.random.rand() < prob_overweight:
        while (BMI(height/100.0, weight) < 25):
            weight = (1 + np.random.rand()) * weight

    # smoking habits, adjusted for switzerland data
    prob_smoker = swiss_baseline_behavior_data[swiss_baseline_behavior_data['Biological Gender'] == gender]['Smoker'].iloc[0] / 100.0
    smoking_freq = 'Regular smoker' if np.random.rand() < prob_smoker else random.choice(['Never smoked', 'Ex-smoker'])

    # alcohol drinking habits, adjusted for switzerland data
    prob_daily_drinker = swiss_baseline_behavior_data[swiss_baseline_behavior_data['Biological Gender'] == gender]['Consumes alcohol daily'].iloc[0] / 100.0
    alcohol_consump = random.choice(['<15g alcohol per day', 'Between 15g and 50g alcohol per day', '>50g alcohol per day']) \
                                    if np.random.rand() < prob_daily_drinker else random.choice(['Abstainer/rare drinker', 'Weekly drinker'])
        
                
    diet = random.choice(diet_choices) # paying attention to diet doesn't affect choice of diet
    
    # no explicit info on these two fields
    water_intake = random.choice(water_consumption_choices)
    hours_sleep = random.choice(hours_sleep_choices)
    
    # compute expected life expectancy, using data from papers [1-14]
    longevity = (baseline_longevity 
                     - 0.4 * (weight - (average_adult_weight_women if gender == 'Female' else average_adult_weight_men))
                     + ((np.random.rand()*2+1.5)/100.0 * baseline_longevity if diet == 'Mediterranean' else
                        (np.random.rand()*10)/100.0 * baseline_longevity if diet == 'Intermittent Fasting' else
                        (np.random.rand()*15-5)/100.0 * baseline_longevity if diet == 'Keto' else
                        (np.random.rand()*17+5)/100.0 * baseline_longevity if diet == 'Vegan/Plant-Based' else
                        (12)/100.0 * baseline_longevity if diet == 'Vegetarian' else 
                        (-5)/100.0 * baseline_longevity) #'Carnivore' and 'Paleo'
                     - (0.01 *  baseline_longevity if water_intake == 'Less than recommended' else 0)
                     + ((2.5)/100.0  * baseline_longevity if smoking_freq == 'Never smoked' else
                        (-10)/100.0  * baseline_longevity if smoking_freq == 'Regular smoker' else
                        0) # ex-smoker
                     + ((np.random.rand()*4-2)/100.0 * baseline_longevity if alcohol_consump == '<15g alcohol per day' or alcohol_consump == 'Weekly drinker' else
                        (-5.0)/100.0  * baseline_longevity if alcohol_consump == 'Between 15g and 50g alcohol per day' else
                        (np.random.rand()*10-20)/100.0  * baseline_longevity if alcohol_consump == '>50g alcohol per day' else
                        0) # Abstainer/rare drinker
                     + ((-3)/100.0  * baseline_longevity if hours_sleep == 'Less 6 hours per night' or alcohol_consump == 'Weekly drinker' else
                        (-2.5)/100.0  * baseline_longevity if hours_sleep == '8 to 10 hours per night' else
                        (-4.5)/100.0  * baseline_longevity if hours_sleep == 'More than 10 hours per night' else
                        0) # 6 to 8 hours per night
                     )
    
    # save generated health data
    generated_samples.append([gender, weight, height, diet, water_intake, 
                                            smoking_freq, alcohol_consump, hours_sleep, 
                                            longevity])

In [6]:
augmented_data = pd.DataFrame(generated_samples, columns=['Biological Gender', 'Weight', 'Height', 'Diet', 'Daily water intake', 
                               'Smoking Frequency', 'Alcohol', 'Hours of Sleep', 'Longevity'])
augmented_data.to_csv('augmented_data_df.csv')

In [7]:
augmented_data.head()

Unnamed: 0,Biological Gender,Weight,Height,Diet,Daily water intake,Smoking Frequency,Alcohol,Hours of Sleep,Longevity
0,Female,67.791249,160.065238,Mediterranean,Normal,Never smoked,Weekly drinker,More than 10 hours per night,84.134038
1,Female,126.819543,163.350293,Carnivore,Less than recommended,Ex-smoker,Weekly drinker,6 to 8 hours per night,51.178058
2,Female,68.576106,165.261044,Carnivore,Less than recommended,Never smoked,Abstainer/rare drinker,More than 10 hours per night,75.109558
3,Female,48.788133,167.244835,Keto,Normal,Never smoked,Abstainer/rare drinker,Less 6 hours per night,94.350782
4,Male,94.015129,187.118993,Vegetarian,Less than recommended,Regular smoker,Abstainer/rare drinker,More than 10 hours per night,67.698948


```
[1] Samaras, T. T., & Storms, L. H. (1992). Impact of height and weight on life span. Bulletin of the World Health Organization, 70(2), 259–267.
[2] Thomas T Samaras, Lowell H Storms, Harold Elrick (2002). Longevity, mortality and body weight. Ageing Research Reviews, Volume 1, Issue 4, 673-691.
[3] Martinez-Gonzalez, M. A., & Martin-Calvo, N. (2016). Mediterranean diet and life expectancy; beyond olive oil, fruits, and vegetables. Current opinion in clinical nutrition and metabolic care, 19(6), 401–407.
[4] Catterson, J. H., Khericha, M., Dyson, M. C., Vincent, A. J., Callard, R., Haveron, S. M., Rajasingam, A., Ahmad, M., & Partridge, L. (2018). Short-Term, Intermittent Fasting Induces Long-Lasting Gut Health and TOR-Independent Lifespan Extension. Current biology : CB, 28(11), 1714–1724.e4. 
[5]Xie, K., Neff, F., Markert, A., Rozman, J., Aguilar-Pimentel, J. A., Amarie, O. V., Becker, L., Brommage, R., Garrett, L., Henzel, K. S., Hölter, S. M., Janik, D., Lehmann, I., Moreth, K., Pearson, B. L., Racz, I., Rathkolb, B., Ryan, D. P., Schröder, S., Treise, I., … Ehninger, D. (2017). Every-other-day feeding extends lifespan but fails to delay many symptoms of aging in mice. Nature communications, 8(1), 155.
[6] Roberts, M. N., Wallace, M. A., Tomilov, A. A., Zhou, Z., Marcotte, G. R., Tran, D., Perez, G., Gutierrez-Casado, E., Koike, S., Knotts, T. A., Imai, D. M., Griffey, S. M., Kim, K., Hagopian, K., McMackin, M. Z., Haj, F. G., Baar, K., Cortopassi, G. A., Ramsey, J. J., & Lopez-Dominguez, J. A. (2017). A Ketogenic Diet Extends Longevity and Healthspan in Adult Mice. Cell metabolism, 26(3), 539–546.e5. 
[7] Naghshi S, Sadeghi O, Willett W C, Esmaillzadeh A. (2020) Dietary intake of total, animal, and plant proteins and risk of all cause, cardiovascular, and cancer mortality: systematic review and dose-response meta-analysis of prospective cohort studies. BMJ 2020, 370 :m2412 doi:10.1136/bmj.m2412. 
[8] Orlich MJ, Singh PN, Sabaté J, et al. Vegetarian Dietary Patterns and Mortality in Adventist Health Study 2. JAMA Intern Med. 2013;173(13):1230–1238. doi:10.1001/jamainternmed.2013.6473
[9] Seidelmann SB, Claggett B, Cheng S, Henglin M, Shah A, Steffen LM, Folsom AR, Rimm EB, Willett WC, Solomon SD. (2018) Dietary carbohydrate intake and mortality: a prospective cohort study and meta-analysis. The Lancet, volume 3, issue 9, 419-428.
[10] Dmitrieva NI, Burg MB. (2015) Elevated Sodium and Dehydration Stimulate Inflammatory Signaling in Endothelial Cells and Promote Atherosclerosis. PLOS ONE 10(6): e0128870.
[11] Darden, M., Gilleskie, D. B., & Strumpf, K. (2018). Smoking and Mortality: New Evidence from a Long Panel. International economic review, 59(3), 1571–1619.
[12] van den Brandt, P. A., & Brandts, L. (2020). Alcohol consumption in later life and reaching longevity: the Netherlands Cohort Study. Age and ageing, 49(3), 395–402.
[13] Burton R, Sheron N. (2018). No level of alcohol consumption improves health. The Lancet, volume 392, issue 10152, 987-988.
[14] Cappuccio, F. P., D'Elia, L., Strazzullo, P., & Miller, M. A. (2010). Sleep duration and all-cause mortality: a systematic review and meta-analysis of prospective studies. Sleep, 33(5), 585–592.
[15] Roser M, Appel C, Ritchie H. (2013). Human Height. OurWorldInData.org, 'https://ourworldindata.org/human-height' [Online Resource]
[16] Millar WJ. (1986). Distribution of body weight and height: Comparison of estimates based on self-reported and observed measures. Journal of Epidemiology and Community Health, 40, 319-323.


---
Papers [1], [2], [15], [16] related to weight, height and/or BMI (and on some, longevity).
Papers [3], [4], [5], [6], [7], [8], and [9] related to diet choices and longevity.
Paper  [10] related to water consumption and longevity.
Paper  [11] related to smoking and longevity.
Papers [12] and [13] related to alcohol consumption and longevity.
Paper  [14] related to sleep habits and longevity.
```

# Step 3
Use augmented dataset to build a first instance of the predictive model.

In [2]:
augmented_data = pd.read_csv('augmented_data_df.csv').drop(['Unnamed: 0'], axis=1)
augmented_data.tail()

Unnamed: 0,Biological Gender,Weight,Height,Diet,Daily water intake,Smoking Frequency,Alcohol,Hours of Sleep,Longevity
99995,Female,93.621495,160.257449,Vegetarian,Normal,Never smoked,Abstainer/rare drinker,6 to 8 hours per night,83.991402
99996,Male,84.063275,181.556145,Mediterranean,Normal,Never smoked,Abstainer/rare drinker,Less 6 hours per night,75.671197
99997,Female,57.268023,166.014096,Paleo,Normal,Never smoked,Weekly drinker,Less 6 hours per night,81.106788
99998,Female,82.8099,172.001727,Paleo,Less than recommended,Regular smoker,Weekly drinker,6 to 8 hours per night,60.029553
99999,Male,63.315653,195.631114,Mediterranean,Less than recommended,Ex-smoker,Weekly drinker,6 to 8 hours per night,82.958765


In [3]:
Y = augmented_data.iloc[:, -1].values

augmented_data = augmented_data.drop(['Longevity'], axis=1)

for catg in ['Biological Gender', 'Diet', 'Daily water intake', 'Smoking Frequency', 'Alcohol', 'Hours of Sleep']:
    augmented_data = pd.concat([augmented_data.drop([catg], axis=1), 
                                pd.get_dummies(augmented_data[catg], prefix=catg)], axis=1)

X = augmented_data.values

In [105]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental import preprocessing

def build_model():
    normalizer = preprocessing.Normalization(axis=-1)
    normalizer.adapt(np.array(X))
    
    model = tf.keras.Sequential([
        normalizer,
        layers.Dense(30, input_dim=25, kernel_initializer='normal', activation='relu'),
        layers.Dense(14, kernel_initializer='normal', activation='relu', kernel_regularizer='l2'),
        layers.Dense(1, kernel_initializer='normal')])
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model

In [106]:
pred_model = build_model()

pred_model.compile(
    optimizer=tf.optimizers.Adam(learning_rate=0.1),
    loss='mean_absolute_error')

In [108]:
from tensorflow.keras.callbacks import ModelCheckpoint

# add a checkpoint to save the lowest validation loss
filepath = 'tf2_best_model.hdf5'
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, \
                             save_best_only=True, save_weights_only=False, \
                             mode='auto', save_frequency=1)

In [109]:
%%time
history = pred_model.fit(
    X, Y,
    epochs=100,
    verbose=0, # no logging
    validation_split = 0.15,
    callbacks=[checkpoint])


Epoch 00001: val_loss improved from inf to 2.04549, saving model to tf2_best_model.hdf5

Epoch 00002: val_loss improved from 2.04549 to 2.00536, saving model to tf2_best_model.hdf5

Epoch 00003: val_loss did not improve from 2.00536

Epoch 00004: val_loss did not improve from 2.00536

Epoch 00005: val_loss improved from 2.00536 to 1.83894, saving model to tf2_best_model.hdf5

Epoch 00006: val_loss did not improve from 1.83894

Epoch 00007: val_loss did not improve from 1.83894

Epoch 00008: val_loss did not improve from 1.83894

Epoch 00009: val_loss did not improve from 1.83894

Epoch 00010: val_loss did not improve from 1.83894

Epoch 00011: val_loss did not improve from 1.83894

Epoch 00012: val_loss did not improve from 1.83894

Epoch 00013: val_loss did not improve from 1.83894

Epoch 00014: val_loss did not improve from 1.83894

Epoch 00015: val_loss did not improve from 1.83894

Epoch 00016: val_loss did not improve from 1.83894

Epoch 00017: val_loss did not improve from 1.838

In [110]:
pred_model.save('health_outcome_model')

2021-09-25 22:03:48.915829: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


INFO:tensorflow:Assets written to: health_outcome_model/assets


In [111]:
# # Just checking loading works fine

# rec_health_model = keras.models.load_model("health_outcome_model")

# np.testing.assert_allclose(
#     pred_model.predict(X[:, :100]), rec_health_model.predict(X[:, :100])
# )