In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats

### Data Import and cleaning

In [None]:
# Grocery purchase table (referred to below as the Tesco dataset), indexed by area_id.
grocery_csv = "data/year_osward_grocery.csv"
grc = pd.read_csv(grocery_csv, index_col="area_id")
grc.head()

In [None]:
# Diabetes estimates table, indexed by the same area_id key as the grocery table.
diabetes_csv = "data/diabetes_estimates_osward_2016.csv"
diab = pd.read_csv(diabetes_csv, index_col="area_id")
diab.head()

In [None]:
# Area ids that are present in both the grocery and the diabetes tables.
areas = set(grc.index) & set(diab.index)

# Report how much of the grocery data is covered by the diabetes estimates.
print(f"There are {len(grc)} areas in the Tesco dataset.")
print(f"{len(areas)} of them appear in the Diabetes dataset.")

In [None]:
# Keep only the areas present in both tables, then merge them column-wise.
idx = grc.index.intersection(diab.index)
grc = grc.loc[idx]
diab = diab.loc[idx]

# Both frames were just selected with the same `idx`, so they already share
# their row labels; no further re-alignment is needed. The former bare
# `diab.reindex(grc.index)` call returned a new frame that was thrown away,
# so it had no effect and has been removed.
df = pd.concat([grc, diab], axis=1)
df.head()

### Data Preprocessing

We are going to create new features: young, mid_age and old densities for each area, which help us characterize the kind of area we are dealing with. We will then check whether these features are correlated with the areas' product purchase density (f_product). (done for msoa)

For stronger results, we consider only areas with a representativeness_norm > 25%. We also only take into account correlations with p-values < 0.05.

In [None]:
# Minimum representativeness_norm an area must exceed to be kept in the analysis.
MIN_REPRESENTATIVENESS = 0.25

# Share of each age band in the area's population.
df["young_density"] = df["age_0_17"] / df["population"]
df["mid_age_density"] = df["age_18_64"] / df["population"]
df["old_density"] = df["age_65+"] / df["population"]

# `.copy()` makes the filtered frame independent of the unfiltered one, so
# re-running this cell (which assigns columns on `df`) does not trigger
# pandas' SettingWithCopyWarning.
df = df.loc[df["representativeness_norm"] > MIN_REPRESENTATIVENESS].copy()

# A later cell adds GP columns to this frame, so it must be an explicit copy
# rather than a column slice of `df`.
df_age_diab = df[
    ["young_density", "mid_age_density", "old_density", "estimated_diabetes_prevalence"]
].copy()

df_age_diab.describe()

From this quick description, we notice that the densities have roughly the same std, that the mean diabetes prevalence is 6.42 and, most importantly, that every area has at least 56% of middle-aged population! By comparison, every area has at least 8.78% for young_density and 3.6% for old_density. This is logical. However, we have to take this into account when analyzing future results.

### Correlation analysis

Now, we check whether age densities correlate with diabetes prevalence. To do so, we compute the Spearman rank correlation and keep only correlations with p < 0.05.

In [None]:
# Pairwise Spearman rank correlations between the columns (axis=0 treats each
# column as a variable), together with the matching p-values.
spearman_result = stats.spearmanr(df_age_diab, axis=0)
rho, p_val = spearman_result

In [None]:
# Label the correlation matrix with the column names and replace every
# coefficient whose p-value is not below 0.05 with NaN.
labels = df_age_diab.columns
is_significant = p_val < 0.05
rho = pd.DataFrame(rho, index=labels, columns=labels).where(is_significant)
rho

In [None]:
# Correlation of each age density with diabetes prevalence; the target's
# own row is removed so it is not plotted against itself.
target = "estimated_diabetes_prevalence"
corr_with_diabetes = rho[target].drop(target)

plt.figure(figsize=(10, 5))
sns.barplot(x=corr_with_diabetes.index, y=corr_with_diabetes)

plt.title("Correlation between diabetes prevalence and age densities")
plt.show()

### Interesting results so far !

No need to say much about the results, which are quite visual! The younger the population, the higher the estimated diabetes prevalence.
The correlation with old density was not found to be significant, though.

Let's compare the number of GP patients in younger areas with the number of GP patients in other areas.


In [None]:
# Add the GP registration columns from `df`. `assign` returns a new frame
# (values are aligned on the index), which avoids the SettingWithCopyWarning
# that `.loc[:, col] = ...` raises when `df_age_diab` is a slice of `df`.
df_age_diab = df_age_diab.assign(
    gp_patients=df["gp_patients"],
    gp_patients_diabetes=df["gp_patients_diabetes"],
)
df_age_diab.describe()


In [None]:
# Recompute the Spearman correlations and p-values over the current columns
# of df_age_diab.
spearman_result = stats.spearmanr(df_age_diab, axis=0)
rho, p_val = spearman_result

# Label the matrix and replace coefficients with p >= 0.05 by NaN.
labels = df_age_diab.columns
is_significant = p_val < 0.05
rho = pd.DataFrame(rho, index=labels, columns=labels).where(is_significant)
rho

In [None]:
# Correlation of every remaining variable with gp_patients; both GP columns
# are removed from the rows before plotting.
gp_cols = ["gp_patients", "gp_patients_diabetes"]
corr_with_gp = rho["gp_patients"].drop(gp_cols)

plt.figure(figsize=(14, 5))
sns.barplot(x=corr_with_gp.index, y=corr_with_gp)

plt.title("Correlation between age densities and gp_patients subscribers")
plt.show()

Mid-age density is slightly more correlated with registrations at a GP practice than young density is. But the number of GP patients is also slightly positively correlated with the estimated diabetes prevalence.

From this correlation plot, we conclude that the correlations found in the previous figure are meaningful.

We also see that the older the population is, the fewer GP subscribers there are, which may explain why old_density did not pass our correlation filter (p < 0.05).

We are therefore primarily going to focus on the young and mid_age populations.

 ### So, why ? 

We would now like to know why younger populations tend to have a higher diabetes prevalence estimate.
To do so, we are going to compare the food habits of young populations with those of the others. We only keep correlations > 0.25 in absolute value.

### Which types of products do young people prefer?

In [None]:
# Product purchase columns plus the three age densities.
product_cols = [
    "f_beer", "f_dairy", "f_eggs", "f_fats_oils",
    "f_fish", "f_fruit_veg", "f_grains", "f_meat_red",
    "f_poultry", "f_readymade", "f_sauces", "f_soft_drinks",
    "f_spirits", "f_sweets", "f_tea_coffee", "f_water", "f_wine",
]
age_cols = ["young_density", "mid_age_density", "old_density"]

# A later cell adds a diabetes_prevalence column to this frame, so take an
# explicit copy; writing to a plain column slice of `df` raises pandas'
# SettingWithCopyWarning.
df_prod = df[product_cols + age_cols].copy()

In [None]:
# Spearman correlations (and p-values) between all columns of df_prod.
spearman_result = stats.spearmanr(df_prod, axis=0)
rho, p_val = spearman_result

labels = df_prod.columns
rho = pd.DataFrame(rho, index=labels, columns=labels)

# Two successive filters, each replacing rejected coefficients with NaN:
# first drop those with p >= 0.05, then those with |rho| <= 0.25.
rho = rho.where(p_val < 0.05)
rho = rho.where(rho.abs() > 0.25)

In [None]:
# Correlation of each product column with young_density; the three age
# density rows are removed so only products are shown.
age_rows = ["young_density", "mid_age_density", "old_density"]
corr_with_young = rho["young_density"].drop(age_rows)

plt.figure(figsize=(20, 5))
sns.barplot(x=corr_with_young.index, y=corr_with_young)

# Title and axis labels so the figure can be read on its own.
plt.title("Correlation between young density and product purchases")
plt.xlabel("product")
plt.ylabel("Spearman correlation with young_density")
plt.show()

In [None]:
# Correlation of each product column with mid_age_density; the three age
# density rows are removed so only products are shown.
age_rows = ["young_density", "mid_age_density", "old_density"]
corr_with_mid_age = rho["mid_age_density"].drop(age_rows)

plt.figure(figsize=(20, 5))
sns.barplot(x=corr_with_mid_age.index, y=corr_with_mid_age)

# Title and axis labels so the figure can be read on its own.
plt.title("Correlation between mid-age density and product purchases")
plt.xlabel("product")
plt.ylabel("Spearman correlation with mid_age_density")
plt.show()

In [None]:
# Prepare the data for plotting both correlation series side by side.
# Filtered-out correlations (NaN) are shown as 0. `fillna` returns a new
# frame, so the slice of `rho` is never modified in place (an in-place fill
# on a column slice triggers pandas' SettingWithCopyWarning).
plot_df = rho[["mid_age_density", "young_density"]].fillna(0)

# Drop rows that carry no information, i.e. both correlations were filtered out.
no_information = (plot_df["mid_age_density"] == 0) & (plot_df["young_density"] == 0)
idx_to_drop = plot_df[no_information].index
plot_df = plot_df.drop(idx_to_drop)
idx_to_drop

In [None]:
# Grouped bar chart: for each retained product, its correlation with
# mid_age_density (blue) next to its correlation with young_density (green).
age_rows = ["young_density", "mid_age_density", "old_density"]

# errors="ignore": an age row may already have been removed from plot_df as
# uninformative, in which case a plain drop would raise KeyError.
product_corr = plot_df.drop(index=age_rows, errors="ignore")
x_ticks = product_corr.index

# One bar position per retained product. This was previously hard-coded to
# 10, which fails whenever the filters keep a different number of products.
x = np.arange(len(x_ticks))
w = 0.3  # bar width

fig, ax1 = plt.subplots(figsize=(15, 5))
ax1.set_ylim([-1, 1])
# Tick labels sit midway between the two bars of each group.
plt.xticks(x + w/2, x_ticks, rotation = "vertical")
mid = ax1.bar(x, product_corr["mid_age_density"], width = w, color = 'b', align = "center")

# Second series on a twin axis with the same limits, shifted by one bar width.
ax2 = ax1.twinx()
ax2.set_ylim([-1, 1])
young = ax2.bar(x + w, product_corr["young_density"], width = w, color = 'g', align = "center")

plt.legend([mid, young], ["middle aged density", "young density"], loc = 'upper left')
plt.title("Correlations between age density of the areas and products purchases")
plt.show()

 ### Interpretation
 
 Unsurprisingly, younger populations are strongly negatively correlated with alcohol products (beer & wine). Indeed, it is illegal to buy alcohol before the age of 18 in London!
 
 However, they are bad students when it comes to fruit and vegetables and fish. But they are keen on sweets, soft drinks and grain products!
 
 On the side of the mid-age density populations, it is quite the opposite (which is logical, since the lower the young density, the higher the mid-age density).

### So how are product purchases correlated with diabetes prevalence?

In [None]:
# Add the diabetes estimate under a shorter column name. `assign` returns a
# new frame (values are aligned on the index), which avoids the
# SettingWithCopyWarning that `.loc[:, col] = ...` raises when `df_prod` is a
# slice of `df`.
df_prod = df_prod.assign(diabetes_prevalence=df["estimated_diabetes_prevalence"])

In [None]:
# Recompute the Spearman correlations and p-values over the current columns
# of df_prod.
spearman_result = stats.spearmanr(df_prod, axis=0)
rho, p_val = spearman_result

labels = df_prod.columns
rho = pd.DataFrame(rho, index=labels, columns=labels)

# Two successive filters, each replacing rejected coefficients with NaN:
# first drop those with p >= 0.05, then those with |rho| <= 0.25.
rho = rho.where(p_val < 0.05)
rho = rho.where(rho.abs() > 0.25)

In [None]:
# Correlation of each product column with diabetes_prevalence; the age
# density rows and the target's own row are removed before plotting.
non_product_rows = ["young_density", "mid_age_density", "old_density", "diabetes_prevalence"]
corr_with_diabetes = rho["diabetes_prevalence"].drop(non_product_rows)

plt.figure(figsize=(20, 5))
sns.barplot(x=corr_with_diabetes.index, y=corr_with_diabetes)

# Title and axis labels so the figure can be read on its own.
plt.title("Correlation between diabetes prevalence and product purchases")
plt.xlabel("product")
plt.ylabel("Spearman correlation with diabetes_prevalence")
plt.show()

## Déjà vu?
Indeed, all the products follow the same correlation trends with diabetes_prevalence as with young_density!

### But wait, what ?? Is this saying that alcohol consumption decreases the risk of getting diabetes ? 

Well, according to [this](https://time.com/4876998/drinking-wine-diabetes/#:~:text=People%20who%20had%20the%20lowest,had%20a%2058%25%20reduced%20risk.) study, yes. But stay moderate !


## Some plots we could use in the story/study

### young density vs diabetes prevalence (Scatter Plot)

In [None]:
# Scatter of young_density against diabetes_prevalence, one point per row of df_prod.
plt.scatter(df_prod["young_density"], df_prod["diabetes_prevalence"])

# Axis labels and title so the figure can be read on its own.
plt.xlabel("young_density")
plt.ylabel("diabetes_prevalence")
plt.title("Young density vs diabetes prevalence")
plt.show()

### mid-age density vs diabetes prevalence

In [None]:
# Scatter of mid_age_density against diabetes_prevalence, one point per row of df_prod.
plt.scatter(df_prod["mid_age_density"], df_prod["diabetes_prevalence"])

# Axis labels and title so the figure can be read on its own.
plt.xlabel("mid_age_density")
plt.ylabel("diabetes_prevalence")
plt.title("Mid-age density vs diabetes prevalence")
plt.show()

## Experimenting with plotly

In [None]:
import plotly.graph_objs as go
from plotly.offline import iplot


In [None]:
# Interactive grouped bar chart of the product correlations with the
# mid-age and young densities.
age_rows = ["young_density", "mid_age_density", "old_density"]

# errors="ignore": an age row may already have been removed from plot_df as
# uninformative, in which case a plain drop would raise KeyError.
product_corr = plot_df.drop(index=age_rows, errors="ignore")

# Both traces take their x values from the same frame as their y values, so
# the bars and their labels cannot get out of step.
# Trace 1: correlations with mid_age_density.
trace1 = go.Bar(
    x = product_corr.index,
    y = product_corr["mid_age_density"],
    name = "adults density",
    marker = dict(color = 'rgba(255, 174, 255, 0.5)',
                  line = dict(color = 'rgb(0,0,0)', width = 1.5)),
)

# Trace 2: correlations with young_density.
trace2 = go.Bar(
    x = product_corr.index,
    y = product_corr["young_density"],
    name = "young density",
    marker = dict(color = 'rgba(255, 255, 128, 0.5)',
                  line = dict(color = 'rgb(0,0,0)', width = 1.5)),
)

data = [trace1, trace2]
layout = go.Layout(barmode = "group", title = "Correlations between products purchases and age density")
fig = go.Figure(data = data, layout = layout)
iplot(fig)