In [148]:
import pandas as pd
import numpy as np 
import seaborn as sns
import plotly.graph_objs as go
from scipy import stats
import plotly.express as px

#### Import data

In [149]:
inc = pd.read_excel("data/ons-model-based-income-estimates-msoa.xls",
                       sheet_name="2015-16 (annual income)",
                       index_col="MSOA code")

In [150]:
grc = pd.read_csv("data/year_msoa_grocery.csv", index_col="area_id")
grc.head()

Unnamed: 0_level_0,weight,weight_perc2.5,weight_perc25,weight_perc50,weight_perc75,weight_perc97.5,weight_std,weight_ci95,volume,volume_perc2.5,...,man_day,population,male,female,age_0_17,age_18_64,age_65+,avg_age,area_sq_km,people_per_sq_km
area_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
E02000001,323.180804,35.0,150.0,250.0,400.0,1000.0,323.685705,1.00136,98.849228,13.0,...,103934,6687.0,3697.0,2990.0,1018.0,4349.0,1320.0,43.910124,2.9,2305.862069
E02000002,397.651232,32.5,150.0,300.0,500.0,1020.0,550.434424,6.030232,118.288417,11.0,...,9952,7379.0,3549.0,3830.0,2150.0,4151.0,1078.0,35.567286,2.16,3416.203704
E02000003,371.276702,30.0,140.0,250.0,450.0,1200.0,564.404826,4.398728,118.385615,12.0,...,19576,10720.0,5297.0,5423.0,2762.0,6649.0,1309.0,35.700466,2.14,5009.345794
E02000004,347.481292,29.0,126.0,250.0,430.0,1000.0,479.372575,5.297349,104.435407,11.0,...,10689,6536.0,3119.0,3417.0,1454.0,4044.0,1038.0,38.493115,2.48,2635.483871
E02000005,400.576688,32.5,150.0,296.0,500.0,1050.0,593.021809,5.107669,128.997438,11.0,...,15071,9243.0,4478.0,4765.0,2678.0,5554.0,1011.0,34.156335,1.19,7767.226891


#### filter unwanted columns : percentiles, confidence intervals, std

In [151]:
grc = grc[grc.columns.drop(list(grc.filter(regex='perc')))]
grc = grc[grc.columns.drop(list(grc.filter(regex = 'ci95')))]
grc = grc[grc.columns.drop(list(grc.filter(regex = 'std')))]   

#### Convert age_x into densities (age_x/population). This is easier to interpret as it is a percentage.  Doing the same for gender


In [152]:
grc["age_0_17"] = grc["age_0_17"]/grc["population"]
grc["age_18_64"] = grc["age_18_64"]/grc["population"]
grc["age_65+"] = grc["age_65+"]/grc["population"]
grc["male"] = grc["male"]/grc["population"]
grc["female"] = grc["female"]/grc["population"]

#### merge income and groceries dataframes, use only "Net annual income" and "Local authority name" features from inc df

In [153]:
#using only shared areas and merging both df
inc = inc[["Net annual income (£)", "Local authority name"]]
idx = grc.index.intersection(inc.index)
grc = grc.loc[idx]
inc = inc.loc[idx]
inc.reindex(grc.index)
df = pd.concat([grc, inc], axis = 1)

#### Since we have access to locla authority names, we are going to group areas belonging to same local authority by means of the mean operator 

In [154]:
df = df.groupby("Local authority name").mean()


#### We consider only local authorities having a representativeness bigger than 20%

In [155]:
df = df[df["representativeness_norm"] > 0.20]

We have quite a lot of features, 76 in total ! Some of them are highly correlated (or even linearly dependant) with others, the aim of the next step is to compare several of these features (such as type of product bought or nutrient intake) with features like Net annual income, people_per_sq_km etc...


#### 1st, let's see how the features are correlated, we are only taking into account correaltions having p-value < 0.05

In [156]:
rho, p_val = stats.spearmanr(df, axis = 0)

rho = pd.DataFrame(rho, index = df.columns, columns = df.columns)

#place NaN on unrelevant correlations
rho.where(p_val < 0.05, inplace = True)

In [157]:
rho #NaN when correlation has a p-value >= 0.05

Unnamed: 0,weight,volume,fat,saturate,salt,sugar,protein,carb,fibre,alcohol,...,population,male,female,age_0_17,age_18_64,age_65+,avg_age,area_sq_km,people_per_sq_km,Net annual income (£)
weight,1.000000,0.793913,,,,,,,-0.622609,-0.550435,...,,,,0.495652,,,,,,-0.617391
volume,0.793913,1.000000,0.501739,0.628696,,0.610435,-0.453913,0.724348,,-0.761739,...,,,,0.741739,-0.466957,,,,,-0.544348
fat,,0.501739,1.000000,0.839130,,0.753913,,0.736522,,-0.577391,...,,,,0.713043,,,,,,-0.441739
saturate,,0.628696,0.839130,1.000000,,0.676522,-0.409565,0.581739,,-0.587826,...,,,,0.573913,,,,,,
salt,,,,,1.000000,,0.641739,,-0.504348,,...,,,,,,-0.496522,-0.578261,,,-0.597391
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
age_65+,,,,,-0.496522,,,,0.479130,,...,,,,,-0.765217,1.000000,0.971304,0.673043,-0.723478,0.428696
avg_age,,,,,-0.578261,,,,0.494783,,...,,,,,-0.655652,0.971304,1.000000,0.564348,-0.600870,0.499130
area_sq_km,,,,,,0.517391,-0.457391,0.457391,,,...,,,,0.542609,-0.873913,0.673043,0.564348,1.000000,-0.934783,
people_per_sq_km,,,,,,,,,,,...,,,,-0.430435,0.853913,-0.723478,-0.600870,-0.934783,1.000000,


In [158]:
trace1 = {
  "type": "heatmap", 
  "x": rho.columns, 
  "y": rho.columns, 
  "z": rho
}
data = go.Data([trace1])
layout = {"title": "Features Correlation Matrix"}
fig = go.Figure(data=data, layout=layout)
fig.show()


plotly.graph_objs.Data is deprecated.
Please replace it with a list or tuple of instances of the following types
  - plotly.graph_objs.Scatter
  - plotly.graph_objs.Bar
  - plotly.graph_objs.Area
  - plotly.graph_objs.Histogram
  - etc.




In [159]:
#fig.write_html("corr_matrix_local_auth.html")

We are going to separate the dataframe into low, medium and high income and see how product purchase differ

In [160]:
df["Net annual income (£)"].describe()

count       24.000000
mean     39859.645523
std       2953.019523
min      35670.270270
25%      37408.251634
50%      39696.401985
75%      42120.416667
max      47237.837838
Name: Net annual income (£), dtype: float64

In [161]:
#plot incomes with categorical low, med, high : 

df["income group"] = df["Net annual income (£)"].apply(lambda x: "Low" if x <=37408 else "Medium" if x <= 42120 else "High")

In [162]:
fig = px.bar(df, x=df.index, y="Net annual income (£)", color = "income group")
fig.update_yaxes(range = [35000, 48000])
fig.show()

In [163]:
#fig.write_html("Income_class.html")

#### What about these income groups and differences in product purchases ? 

In [164]:
df_classes = df.groupby("income group").mean()

In [165]:
df_classes

Unnamed: 0_level_0,weight,volume,fat,saturate,salt,sugar,protein,carb,fibre,alcohol,...,population,male,female,age_0_17,age_18_64,age_65+,avg_age,area_sq_km,people_per_sq_km,Net annual income (£)
income group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
High,358.731398,107.03461,8.789719,3.498986,0.558266,9.646598,5.259406,16.840343,1.621996,0.274367,...,7771.092887,0.501823,0.498177,0.194778,0.670226,0.134995,38.050145,1.374102,8976.853226,43636.845989
Low,411.78248,117.064227,9.027571,3.551179,0.59945,9.986834,5.282442,18.284315,1.57855,0.228695,...,8854.332922,0.499439,0.500561,0.238333,0.666253,0.095414,34.486773,1.182109,10403.992641,36279.370442
Medium,381.198044,111.444042,8.977441,3.524088,0.579207,10.137658,5.255349,18.202135,1.617542,0.231732,...,9241.817227,0.501143,0.498857,0.220275,0.670248,0.109476,35.58559,1.45444,9897.897382,39761.182831


In [166]:
food_cols=[
 'f_beer',
 'f_dairy',
 'f_eggs',
 'f_fats_oils',
 'f_fish',
 'f_fruit_veg',
 'f_grains',
 'f_meat_red',
 'f_poultry',
 'f_readymade',
 'f_sauces',
 'f_soft_drinks',
 'f_spirits',
 'f_sweets',
 'f_tea_coffee',
 'f_water',
 'f_wine']

In [167]:
fig = go.Figure(data=[
    go.Bar(name='High', x=food_cols, y= df_classes.loc["High",food_cols]),
    go.Bar(name='Medium', x=food_cols, y= df_classes.loc["Medium", food_cols]),
    go.Bar(name = 'Low', x = food_cols, y = df_classes.loc["Low", food_cols])
])
# Change the bar mode
fig.update_layout(barmode='group')
fig.show()
#fig.write_html("html/prod_income.html")

We observe that High income areas buy less products positively correlated with diabetes prevalence (from other data study), and more products negatively correlated with diabetes prevalence. Even if the difference is quite small, it does so for every product !

But a good point to note is that the most frequently bought products for each local authority belong to the fruits & veggies class !

# OSWARD analysis (diabetes prevalence feature available)

#### Same procedure as 1st data import

In [168]:
grc = pd.read_csv("data/year_osward_grocery.csv", index_col="area_id")
grc.head()

Unnamed: 0_level_0,weight,weight_perc2.5,weight_perc25,weight_perc50,weight_perc75,weight_perc97.5,weight_std,weight_ci95,volume,volume_perc2.5,...,man_day,population,male,female,age_0_17,age_18_64,age_65+,avg_age,area_sq_km,people_per_sq_km
area_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
E05000026,449.535137,32.5,166.4,300.0,500.0,1500.0,771.349423,3.716832,125.960833,11.0,...,37315,14370.0,7469.0,6901.0,4211.0,9421.0,738.0,29.572999,1.26,11404.761905
E05000027,413.130263,32.5,150.0,300.0,500.0,1500.0,626.395178,5.164174,110.664114,15.0,...,14474,10845.0,5228.0,5617.0,3205.0,6608.0,1032.0,33.568004,1.36,7974.264706
E05000028,407.100472,32.5,160.0,300.0,500.0,1200.0,545.890959,2.710677,121.99071,11.0,...,32138,13856.0,6750.0,7106.0,4180.0,8537.0,1139.0,32.032693,1.29,10741.085271
E05000029,384.173858,30.0,150.0,250.0,454.0,1500.0,591.837557,4.968373,122.245578,11.0,...,16223,10850.0,5300.0,5550.0,3023.0,6251.0,1576.0,36.004793,3.38,3210.059172
E05000030,356.882607,30.0,140.0,250.0,450.0,1000.0,465.28418,3.880963,109.959688,13.69,...,17522,11348.0,5515.0,5833.0,2747.0,6961.0,1640.0,37.247444,3.45,3289.275362


In [169]:
diab = pd.read_csv("data/diabetes_estimates_osward_2016.csv", index_col = "area_id")
diab.head()

Unnamed: 0_level_0,gp_patients,gp_patients_diabetes,estimated_diabetes_prevalence
area_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
E05000026,13136,1068,8.1
E05000027,8954,631,7.0
E05000028,12032,958,8.0
E05000029,8853,700,7.9
E05000030,8813,640,7.3


In [170]:
areas = set(grc.index).intersection(set(diab.index))
print(f"There are {len(grc)} areas in the Tesco dataset.")
print(f"{len(areas)} of them appear in the Diabetes dataset.")

There are 638 areas in the Tesco dataset.
547 of them appear in the Diabetes dataset.


In [171]:
#using only shared areas and merging both df
idx = grc.index.intersection(diab.index)
grc = grc.loc[idx]
diab = diab.loc[idx]
diab.reindex(grc.index)
df = pd.concat([grc, diab], axis = 1)

In [172]:
df = df[df.columns.drop(list(df.filter(regex='perc')))]
df = df[df.columns.drop(list(df.filter(regex = 'ci95')))]
df = df[df.columns.drop(list(df.filter(regex = 'std')))]  

df["age_0_17"] = df["age_0_17"]/df["population"]
df["age_18_64"] = df["age_18_64"]/df["population"]
df["age_65+"] = df["age_65+"]/df["population"]
df["male"] = df["male"]/df["population"]
df.head()

Unnamed: 0_level_0,weight,volume,fat,saturate,salt,sugar,protein,carb,fibre,alcohol,...,female,age_0_17,age_18_64,age_65+,avg_age,area_sq_km,people_per_sq_km,gp_patients,gp_patients_diabetes,estimated_diabetes_prevalence
area_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
E05000026,449.535137,125.960833,9.488797,3.693721,0.58324,10.966213,4.97756,19.381951,1.564721,0.198172,...,6901.0,0.293041,0.655602,0.051357,29.572999,1.26,11404.761905,13136,1068,8.1
E05000027,413.130263,110.664114,9.733634,3.565913,0.568184,10.514427,5.211694,18.950348,1.58196,0.209917,...,5617.0,0.295528,0.609313,0.095159,33.568004,1.36,7974.264706,8954,631,7.0
E05000028,407.100472,121.99071,9.21631,3.613582,0.610536,10.690272,5.192412,19.662048,1.572323,0.225425,...,7106.0,0.301674,0.616123,0.082203,32.032693,1.29,10741.085271,12032,958,8.0
E05000029,384.173858,122.245578,9.700137,3.961264,0.53318,12.938606,4.718184,20.084734,1.550344,0.20038,...,5550.0,0.278618,0.576129,0.145253,36.004793,3.38,3210.059172,8853,700,7.9
E05000030,356.882607,109.959688,9.381808,3.614663,0.566784,11.332898,5.307003,19.581403,1.607947,0.168952,...,5833.0,0.242069,0.613412,0.144519,37.247444,3.45,3289.275362,8813,640,7.3


## Correlations between age densities and diabetes prevalence
#### (always keep correaltions with p < 0.05)

In [173]:
age_densitites_diab =  ["age_0_17","age_18_64", "age_65+", "estimated_diabetes_prevalence"]

rho, p_val = stats.spearmanr(df[age_densitites_diab], axis = 0)

In [174]:
rho = pd.DataFrame(rho, index = df[age_densitites_diab].columns, columns = df[age_densitites_diab].columns)

#place NaN on unrelevant correlations
rho.where(p_val < 0.05, inplace = True)
rho

Unnamed: 0,age_0_17,age_18_64,age_65+,estimated_diabetes_prevalence
age_0_17,1.0,-0.591945,,0.642505
age_18_64,-0.591945,1.0,-0.675707,-0.396497
age_65+,,-0.675707,1.0,
estimated_diabetes_prevalence,0.642505,-0.396497,,1.0


#### plot correlations between diabetes prevalence and age_densities

In [175]:
rho_plot = rho.drop("estimated_diabetes_prevalence")
rho_plot["color"] = rho_plot["estimated_diabetes_prevalence"].apply(lambda c : 1 if c > 0 else 0)
trace1 = go.Bar(x = rho_plot.index, y = rho_plot["estimated_diabetes_prevalence"])
layout = go.Layout(barmode = "group")

data = trace1
fig = go.Figure(data = data, layout = layout)
fig.show()

In [176]:
#fig.write_html("html/age_diab_corr.html")

### This high correlation between younger populations and age densities are interesting, Let's check the type of products that young populations prefer. 

In [177]:
cols = [
 'f_beer',
 'f_dairy',
 'f_eggs',
 'f_fats_oils',
 'f_fish',
 'f_fruit_veg',
 'f_grains',
 'f_meat_red',
 'f_poultry',
 'f_readymade',
 'f_sauces',
 'f_soft_drinks',
 'f_spirits',
 'f_sweets',
 'f_tea_coffee',
 'f_water',
 'f_wine', 
 'age_0_17', 
 'age_18_64']
rho, p_val = stats.spearmanr(df[cols])

In [178]:
rho = pd.DataFrame(rho, index = cols, columns = cols)
rho.where(p_val < 0.05, inplace = True)
rho["age_18_64"]

f_beer           0.349122
f_dairy          0.284448
f_eggs           0.472766
f_fats_oils      0.180309
f_fish           0.499065
f_fruit_veg      0.409391
f_grains        -0.465175
f_meat_red       0.204208
f_poultry        0.227549
f_readymade     -0.344177
f_sauces         0.142840
f_soft_drinks   -0.183621
f_spirits             NaN
f_sweets        -0.510212
f_tea_coffee    -0.119143
f_water               NaN
f_wine           0.323166
age_0_17        -0.591945
age_18_64        1.000000
Name: age_18_64, dtype: float64

In [179]:
# create trace1 
trace1 = go.Bar(
                x = food_cols,
                y = rho.drop(["age_0_17", "age_18_64"])["age_18_64"],
                name = "adults density",
                marker = dict(color = 'rgba(255, 174, 255, 0.5)',
                             line=dict(color='rgb(0,0,0)',width=1.5)),
                #text = df2014.country)
                )
    
# create trace2 
trace2 = go.Bar(
                x = food_cols,
                y = rho.drop(["age_0_17", "age_18_64"])["age_0_17"] ,
                name = "young density",
                marker = dict(color = 'rgba(255, 255, 128, 0.5)',
                              line=dict(color='rgb(0,0,0)',width=1.5)),
                #text = df2014.country)
                )
    
data = [trace1, trace2]
layout = go.Layout(barmode = "group")
fig = go.Figure(data = data, layout = layout)
fig.show()

In [180]:
#fig.write_html("html/age_prod_corr.html")

### and now let's check the types of product positively correlated with diabetes prevalence

In [181]:
cols =  [
 'f_beer',
 'f_dairy',
 'f_eggs',
 'f_fats_oils',
 'f_fish',
 'f_fruit_veg',
 'f_grains',
 'f_meat_red',
 'f_poultry',
 'f_readymade',
 'f_sauces',
 'f_soft_drinks',
 'f_spirits',
 'f_sweets',
 'f_tea_coffee',
 'f_water',
 'f_wine', 
 'estimated_diabetes_prevalence']

In [182]:
rho, p_val = stats.spearmanr(df[cols])
rho = pd.DataFrame(rho, index = cols, columns = cols)
rho.where(p_val < 0.05, inplace = True)

In [183]:
trace1 = go.Bar(
                x = food_cols,
                y = rho.drop("estimated_diabetes_prevalence")["estimated_diabetes_prevalence"],
                marker = dict(color = 'rgba(255, 255, 128, 0.5)',
                             line=dict(color='rgb(0,0,0)',width=1.5)),
                
                )

data = trace1
layout = go.Layout(barmode = "group")
fig = go.Figure(data = data, layout = layout)
fig.show()

In [184]:
#fig.write_html("html/prod_diab_corr.html")

### Ok we now understand why younger populations have a higher diabetes prevalence score, it seems to be because their purchase habits are correlated with higher diabetes prevalence score !

In [185]:
fig = px.scatter(df, x="age_0_17", y="estimated_diabetes_prevalence",
                 size="gp_patients", color="h_nutrients_calories",
                 log_x=False )
fig.show()

In [186]:
#fig.write_html("html/age_diab_scatter.html")

Seeing this, we definitely would like to try to perform linear regression on it ! But which other features than age could we include in the model ? It has to be correlated with diabetes prevalence, but preferably **uncorrelated** with age.


In [187]:
nutrients = ["fat", "saturate", "sugar", "protein", "carb", "fibre","age_0_17", "h_nutrients_calories", "estimated_diabetes_prevalence"]

In [188]:
rho, p_val = stats.spearmanr(df[nutrients])
rho = pd.DataFrame(rho, index = nutrients, columns = nutrients)
rho.where(p_val < 0.05, inplace = True)
rho

Unnamed: 0,fat,saturate,sugar,protein,carb,fibre,age_0_17,h_nutrients_calories,estimated_diabetes_prevalence
fat,1.0,0.796702,0.618986,,0.598552,0.162188,0.410582,-0.618823,0.364667
saturate,0.796702,1.0,0.463348,-0.156344,0.369775,,0.328466,-0.47478,0.355926
sugar,0.618986,0.463348,1.0,-0.441407,0.868335,,0.519648,-0.795606,0.534064
protein,,-0.156344,-0.441407,1.0,-0.283176,0.265792,-0.360633,0.519626,-0.467627
carb,0.598552,0.369775,0.868335,-0.283176,1.0,,0.614608,-0.855694,0.684758
fibre,0.162188,,,0.265792,,1.0,-0.253723,0.140226,-0.343192
age_0_17,0.410582,0.328466,0.519648,-0.360633,0.614608,-0.253723,1.0,-0.65337,0.642505
h_nutrients_calories,-0.618823,-0.47478,-0.795606,0.519626,-0.855694,0.140226,-0.65337,1.0,-0.774751
estimated_diabetes_prevalence,0.364667,0.355926,0.534064,-0.467627,0.684758,-0.343192,0.642505,-0.774751,1.0


In [189]:
fig = px.scatter(df, x="age_0_17", y="h_nutrients_calories",
                 size="gp_patients", color="estimated_diabetes_prevalence",
                 log_x=False )
fig.show()

In [190]:
#fig.write_html("html/h_age_scatter_diab.html")

Based on this, we are going to perform a classification model that will predict if an area has a high or low diabetes risk. 
As features, we are going to use young age density and h_nutrient_calories (entropy of nutrients) and we are going to use the LogisticRegression model. 

In [191]:
df["estimated_diabetes_prevalence"].describe()

count    547.000000
mean       6.264168
std        1.952249
min        2.000000
25%        4.800000
50%        6.100000
75%        7.400000
max       12.600000
Name: estimated_diabetes_prevalence, dtype: float64

From the above description, we set a threshold that indicates if an area has high, medium or low risk of diabetes. 

In [192]:
#df["diabetes_risk"] = df["estimated_diabetes_prevalence"].apply(lambda x :
 #                                                               "Low" if x <= 4.8 else "Medium" if x <= 7.4 else "High")


In [193]:
#fig.write_html("html/rocs.html")

## Below lies the plot with true labels as colors

In [194]:
"""#df["y_scores"] = y_scores
fig = px.scatter(df, x="age_0_17", y="h_nutrients_calories",
                  color="diabetes_risk",
                 log_x=False )
fig.show()"""


'#df["y_scores"] = y_scores\nfig = px.scatter(df, x="age_0_17", y="h_nutrients_calories",\n                  color="diabetes_risk",\n                 log_x=False )\nfig.show()'

In [195]:
#fig.write_html("html/classes.html")

In [196]:
df["diabetes_risk"] = df["estimated_diabetes_prevalence"].apply(lambda x :
                                                                0 if x <= 6.1 else 1 )

X = df[["age_0_17", "h_nutrients_calories"]]
y = df["diabetes_risk"]

In [197]:
fig = px.scatter(df, x="age_0_17", y="h_nutrients_calories",
                  color="diabetes_risk",
                 log_x=False )
fig.show()
#fig.write_html("html/true_plot.html")

In [198]:
import dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output
from jupyter_dash import JupyterDash
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn import metrics, datasets
from sklearn import linear_model, tree, neighbors, ensemble

X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42)

MODELS = {'Logistic': linear_model.LogisticRegression,
          'Decision Tree': tree.DecisionTreeClassifier,
          'k-NN': neighbors.KNeighborsClassifier,
          'Random Forest': ensemble.RandomForestClassifier,
          }

app = JupyterDash(__name__)

app.layout = html.Div([
    html.P("Train Model:"),
    dcc.Dropdown(
        id='model-name',
        options=[{'label': x, 'value': x} 
                 for x in MODELS],
        value='Logistic',
        clearable=False
    ),
    dcc.Graph(id="graph"),
])

@app.callback(
    Output("graph", "figure"), 
    [Input('model-name', "value")])
def train_and_display(name):
    model = MODELS[name]()
    model.fit(X_train, y_train)

    y_score = model.predict_proba(X_test)[:, 1]

    fpr, tpr, thresholds = metrics.roc_curve(y_test, y_score)
    score = metrics.auc(fpr, tpr)

    fig = px.area(
        x=fpr, y=tpr,
        title=f'ROC Curve (AUC={score:.4f})',
        labels=dict(
            x='False Positive Rate', 
            y='True Positive Rate'))
    fig.add_shape(
        type='line', line=dict(dash='dash'),
        x0=0, x1=1, y0=0, y1=1)
    #fig.write_html("ROC_models.html")
    return fig

app.run_server(debug=True)

Dash app running on http://127.0.0.1:8050/


In [199]:
models = {"Logistic Regression":linear_model.LogisticRegression(),
          "Decision Tree":tree.DecisionTreeClassifier(), 
          "k-NN":neighbors.KNeighborsClassifier(), 
          "Random Forest":ensemble.RandomForestClassifier()}

colors = pd.DataFrame()
for model in models:
    models[model].fit(X_train, y_train)
    y_hat = models[model].predict(X)
    colors[model] = y_hat



In [200]:
models = ["Logistic Regression", "Decision Tree", "k-NN", "Random Forest"]
col_options = [dict(label = x, value = x) for x in models]

In [201]:
app = JupyterDash(__name__)

app.layout = html.Div(
    [
        html.Div(
            html.P(["model", dcc.Dropdown(id="model", options=col_options)]),
            style={"width": "25%", "float": "left"},
        ),
        dcc.Graph(id="graph", style={"width": "75%", "display": "inline-block"}),
    ]
)

@app.callback(Output("graph", "figure"), Input("model", "value"))
def make_figure(color):
    fig =  px.scatter(
        data_frame = colors,
        x=X["age_0_17"],
        y=X["h_nutrients_calories"],
        color=color
    )
    #fig.wirte_html("classification_models.html")
    return fig

app.run_server(debug = True)

Dash app running on http://127.0.0.1:8050/


In [202]:
model = linear_model.LogisticRegression()
model.fit(X_train,y_train)
y_lr = model.predict(X)
fig = px.scatter(x = X["age_0_17"], y = X["h_nutrients_calories"], color = y_lr)
fig.show()
#fig.write_html("LogisticReg.html")

In [203]:
model =ensemble.RandomForestClassifier()
model.fit(X_train,y_train)
y_lr = model.predict(X_test)
fig = px.scatter(x = X_test["age_0_17"], y = X_test["h_nutrients_calories"], color = y_lr)
fig.show()
#fig.write_html("LogisticReg.html")

In [204]:
fig = px.scatter(x = X["age_0_17"], y = X["h_nutrients_calories"], color = colors["Random Forest"])
fig.show()
#fig.write_html("RandomForest.html")

In [209]:
from sklearn.metrics import roc_curve, roc_auc_score
model = ensemble.RandomForestClassifier()
model.fit(X_train, y_train)
y_score = model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_score)

# The histogram of scores compared to true labels
fig_hist = px.histogram(
    x=y_score, color=y_test, nbins=50,
    labels=dict(color='True Labels', x='Score')
)

fig_hist.show()
#fig_hist.write_html("html/score_hist.html")
# Evaluating model performance at various thresholds
df = pd.DataFrame({
    'False Positive Rate': fpr,
    'True Positive Rate': tpr
}, index=thresholds)
df.index.name = "Thresholds"
df.columns.name = "Rate"

fig_thresh = px.line(
    df,
    width=700, height=500
)

fig_thresh.update_yaxes(scaleanchor="x", scaleratio=1)
fig_thresh.update_xaxes(range=[0, 1], constrain='domain')
fig_thresh.show()
fig_thresh.write_html("html/fpr_tpr_test.html")

In [206]:
from sklearn.metrics import roc_curve, auc
fig = px.area(
    x=fpr, y=tpr,
    title=f'ROC Curve (AUC={auc(fpr, tpr):.4f})',
    labels=dict(x='False Positive Rate', y='True Positive Rate'),
    width=700, height=500
)
fig.add_shape(
    type='line', line=dict(dash='dash'),
    x0=0, x1=1, y0=0, y1=1
)

fig.update_yaxes(scaleanchor="x", scaleratio=1)
fig.update_xaxes(constrain='domain')
fig.show()
#fig.write_html("html/roc_rf_test.html")