In [81]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression,\
                                RidgeCV, LassoCV, LogisticRegression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OneHotEncoder
from sklearn.metrics import r2_score, mean_squared_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.decomposition import PCA
import eli5

[BOOKMARK](#BOOKMARK)

### Functions

In [2]:
results = pd.DataFrame(columns = ['Model', 'Train Score', 'Val Score', 'X Val Score', 'RMSE Train', 'RMSE Val'])

def update_df(model, model_name):
    '''fn updates a dataframe for quick reference of R squared scores and RMSE'''
    import warnings
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        
        train_score = model.score(Xs_train, y_train)
        val_score = model.score(Xs_test, y_test)
        x_val_score = cross_val_score(model, Xs_train, y_train).mean()
        rmse1 = mean_squared_error(y_train, lr.predict(Xs_train))**0.5
        rmse2 = mean_squared_error(y_test, lr.predict(Xs_test))**0.5
        
        results.loc[len(results.index)] = [model_name, train_score, val_score, x_val_score, rmse1, rmse2] 
        
    return results

### Read in data

In [3]:
df = pd.read_csv('./cleaned_data_descriptive/combined_df.csv',index_col=[0])

In [4]:
df.head()

Unnamed: 0,county,sex,xiang,q010,q015,q016,q022,q023,q024,q025,...,u024,u025,u026,u027,u028,u029,u030,u031,u032,u033
0,AA,F,1,,,,0.0,,,,...,,,,,,,,,,
1,AA,F,2,,,,0.0,,,,...,,,,,,,,,,
2,AA,F,3,,,,0.0,,,,...,,,,,,,,,,
3,AA,T,3,40.2,60.0,70.0,3.3,22.4,41.9,35.7,...,,,,,,,,,,
4,AB,F,1,,,,0.0,,,,...,,,,,,,,,,


In [5]:
df.shape

(275, 296)

In [6]:
df[df['sex'] == 'T'].shape

(69, 296)

In [7]:
ss = StandardScaler()

In [8]:
# df.columns.to_list() # sanity check

In [9]:
clean_df = df[(df['sex'] == 'T') & (df['xiang'] == 3)]\
.dropna(subset='m005_ALL35_69').dropna(axis = 'columns')

In [10]:
clean_df['m005_ALL35_69']

3      11.29
7      11.82
11     10.55
15     16.19
19     13.59
       ...  
259    17.05
263    13.29
266    15.63
270    18.88
274    17.95
Name: m005_ALL35_69, Length: 66, dtype: float64

In [11]:
clean_df.shape

(66, 201)

In [12]:
clean_df.describe()

Unnamed: 0,xiang,q010,q015,q016,q022,q023,q024,q025,q037,q038,...,r017,r018,r019,r020,r021,r022,r023,r024,r025,r026
count,66.0,66.0,66.0,66.0,66.0,66.0,66.0,66.0,66.0,66.0,...,66.0,66.0,66.0,66.0,66.0,66.0,66.0,66.0,66.0,66.0
mean,3.0,75.156061,59.174242,72.848485,3.065152,29.019697,45.112121,24.787879,2112.939394,1004.287879,...,0.911818,0.855152,8.555152,0.394091,0.555303,3.926212,9.411364,0.537727,1.667424,11.307576
std,0.0,19.578026,19.377126,16.551544,2.69121,12.444637,9.117178,9.574333,1013.991153,832.39173,...,0.385239,0.739628,3.392067,0.313894,0.332858,1.55735,1.29692,0.093936,0.266373,2.896327
min,3.0,0.0,13.0,19.0,0.0,4.7,19.4,9.3,691.0,63.0,...,0.28,0.08,3.73,0.04,0.13,1.19,6.54,0.36,1.17,7.5
25%,3.0,71.775,45.625,62.5,1.6,20.15,38.9,18.225,1451.25,418.75,...,0.595,0.3125,5.7,0.2,0.34,2.625,8.5075,0.49,1.45,9.2
50%,3.0,81.95,58.25,75.0,2.35,28.35,45.6,23.4,1983.0,706.5,...,0.83,0.58,7.7,0.265,0.465,4.03,9.3,0.53,1.625,10.55
75%,3.0,88.625,72.375,84.5,4.375,35.475,51.675,32.025,2505.5,1350.25,...,1.25,1.185,11.3125,0.605,0.6825,5.25,10.06,0.56,1.8175,12.075
max,3.0,96.7,95.0,100.0,12.5,63.0,64.8,49.6,7238.0,3925.0,...,1.65,2.72,17.32,1.83,1.98,6.46,12.78,0.78,2.35,19.6


### Linear Regression

Linear Regression makes the most sense to do since all of our data is numeric albeit on different numeric scales. We can use standard scale and go from there.

In [13]:
X = clean_df.drop(columns = ['county', 'sex', 'xiang','m005_ALL35_69'])
y = clean_df['m005_ALL35_69']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=33)

We're suffering from dimensionality here; we have too many features. We need to figure out which ones to focus on; trying to limit to 10 features for our 69 or so rows. Lasso and/or Ridge could help us out here.

In [14]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((49, 197), (17, 197), (49,), (17,))

Standard scaler to get everything on the same page.

In [15]:
Xs_train = ss.fit_transform(X_train)
Xs_test = ss.transform(X_test)

In [16]:
lr = LinearRegression()

In [17]:
lr.fit(Xs_train, y_train)

In [18]:
update_df(lr, 'm005_ALL_35-69.LinReg')

Unnamed: 0,Model,Train Score,Val Score,X Val Score,RMSE Train,RMSE Val
0,m005_ALL_35-69.LinReg,1.0,0.88294,0.616316,8.404957e-15,1.206978


In [19]:
ridge = RidgeCV(alphas = np.logspace(0,2,100)) # 

In [20]:
lasso = LassoCV(alphas = np.arange(0.001, 10, 1))

In [21]:
ridge.fit(Xs_train, y_train)

In [22]:
ridge.alpha_

1.0

In [23]:
update_df(ridge, "m005_ALL_35-69.L2-1")

Unnamed: 0,Model,Train Score,Val Score,X Val Score,RMSE Train,RMSE Val
0,m005_ALL_35-69.LinReg,1.0,0.88294,0.616316,8.404957e-15,1.206978
1,m005_ALL_35-69.L2-1,0.999965,0.881611,0.614525,8.404957e-15,1.206978


In [24]:
lasso.fit(Xs_train, y_train)

In [25]:
update_df(lasso, "m005_ALL_35-69.L1-1")

Unnamed: 0,Model,Train Score,Val Score,X Val Score,RMSE Train,RMSE Val
0,m005_ALL_35-69.LinReg,1.0,0.88294,0.616316,8.404957e-15,1.206978
1,m005_ALL_35-69.L2-1,0.999965,0.881611,0.614525,8.404957e-15,1.206978
2,m005_ALL_35-69.L1-1,0.999998,0.999996,0.999285,8.404957e-15,1.206978


In [26]:
importance = np.abs(lasso.coef_)
features = X.columns
np.array(features)[importance>0]

array(['q022', 'q096', 'q102', 'q105', 'q121', 'q122', 'q126', 'q152',
       'd038', 'm008_MEDICALc', 'm010_NONMEDc', 'p027', 'p034', 'p042',
       'r002', 'r016', 'r026'], dtype=object)

In [27]:
# np.array(features)
# importance
feat_imp = pd.DataFrame({'features':features, 'importance':importance}, columns=['features','importance'])

It'd be nice to be able to pull the code descriptions without searching the whole sheet one at a time:

In [28]:
with open('./data/CHNAME.txt', 'r') as file:
    data = file.read()
    rows = data.split('\n')

descriptions = pd.DataFrame(rows)

In [29]:
# descriptions.head()
# return every other row
text_df = descriptions.iloc[::2,:]
text_df['code'] = text_df[0].str.split(expand=True).iloc[:,0]
text_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  text_df['code'] = text_df[0].str.split(expand=True).iloc[:,0]


Unnamed: 0,0,code
0,M001 ALL0-4 mortality ALL CAUSES AGE 0-4 (...,M001
2,M002 ALL5-14 mortality ALL CAUSES AGE 5-14 ...,M002
4,M003 ALL15-34 mortality ALL CAUSES AGE 15-34...,M003
6,M004 ALL0-34 mortality ALL CAUSES AGE 0-34 ...,M004
8,M005 ALL35-69 mortality ALL CAUSES AGE 35-69...,M005


In [30]:
text_df.rename(columns={0:'description'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  text_df.rename(columns={0:'description'}, inplace=True)


In [31]:
text_df['code'] = text_df['code'].str.lower()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  text_df['code'] = text_df['code'].str.lower()


In [32]:
text_df.head()

Unnamed: 0,description,code
0,M001 ALL0-4 mortality ALL CAUSES AGE 0-4 (...,m001
2,M002 ALL5-14 mortality ALL CAUSES AGE 5-14 ...,m002
4,M003 ALL15-34 mortality ALL CAUSES AGE 15-34...,m003
6,M004 ALL0-34 mortality ALL CAUSES AGE 0-34 ...,m004
8,M005 ALL35-69 mortality ALL CAUSES AGE 35-69...,m005


In [33]:
feat_imp = feat_imp.merge(text_df, left_on='features', right_on='code', how='left').drop(columns = ['code'])

In [34]:
pd.set_option('max_colwidth', 150) # so I can see the full descriptions

In [35]:
feat_imp[feat_imp['importance']>0]

Unnamed: 0,features,importance,description
3,q022,0.000505,Q022 dEDUCATED questionnaire PERCENTAGE WHO ARE WELL-EDUCATED
23,q096,2.7e-05,Q096 dMALARIA questionnaire PERCENTAGE WITH HISTORY OF MALARIA DIAGNOSIS
29,q102,0.000886,Q102 dPHLEGMw questionnaire PERCENTAGE WHO COUGH UP PHLEGM MOST MORNINGS IN WINTER
32,q105,0.000454,Q105 dPHLEGMyr questionnaire NUMBER OF YEARS TROUBLED BY PHLEGM (years)
40,q121,2.5e-05,Q121 dANTIBIOT questionnaire PERCENTAGE USED MAINLY WESTERN ANTIBIOTICS DURING PAST 6 MONTHS
41,q122,0.000499,Q122 dANTACID questionnaire PERCENTAGE USED MAINLY WESTERN ANTACIDS DURING PAST 6 MONTHS
45,q126,0.000855,Q126 dWTLOSS questionnaire PERCENTAGE WHO LOST WEIGHT DURING FOOD SHORTAGE
61,q152,7e-06,Q152 dWINE questionnaire PERCENTAGE WHO HAVE EVER DRUNK WINE 3 OR MORE DAYS A WEEK FOR 6 MONTHS
107,d038,0.000116,"D038 WHTFLOUR diet survey WHEAT FLOUR INTAKE (g/day/reference man, air-dry basis)"
127,m008_MEDICALc,2.879213,


In [36]:
clean_df['m008_MEDICALc'].describe()

count    66.000000
mean     12.991667
std       3.039937
min       8.010000
25%      10.467500
50%      13.040000
75%      14.940000
max      21.230000
Name: m008_MEDICALc, dtype: float64

We don't want other mortality reasons to affect our predictors. We will remove them from the feature list moving forward.

In [37]:
drop_columns = ['county', 'sex', 'xiang','m010_NONMEDc','m005_ALL35_69']

In [38]:
X = clean_df.drop(columns = drop_columns)
y = clean_df['m005_ALL35_69']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=33)

In [39]:
Xs_train = ss.fit_transform(X_train)
Xs_test = ss.transform(X_test)

In [40]:
lr.fit(Xs_train, y_train)
lasso.fit(Xs_train, y_train)
update_df(lasso, "m005_ALL_35-69.L1-2")

  model = cd_fast.enet_coordinate_descent(


Unnamed: 0,Model,Train Score,Val Score,X Val Score,RMSE Train,RMSE Val
0,m005_ALL_35-69.LinReg,1.0,0.88294,0.616316,8.404957e-15,1.206978
1,m005_ALL_35-69.L2-1,0.999965,0.881611,0.614525,8.404957e-15,1.206978
2,m005_ALL_35-69.L1-1,0.999998,0.999996,0.999285,8.404957e-15,1.206978
3,m005_ALL_35-69.L1-2,0.999989,0.988186,0.931555,5.548124e-15,1.264333


Great scores for Lasso 2 overall. We can use this model's settings as a springboard for the other mortalities.

## BOOKMARK

In [41]:
# clean_df.columns.to_list()
 # All the mortality columns
 # 'm005_ALL35_69',
 # 'm008_MEDICALc',
 # 'm065_STROKEc',
 # 'm023_ALLCAc',
 # 'm059_ALLVASCc',
 # 'm028_OESOPHCAc',
 # 'm072_COPDc',
 # 'm010_NONMEDc',

In [42]:
features_dropped = ['county', 'sex', 'xiang','m010_NONMEDc',\
                    'm005_ALL35_69', 'm008_MEDICALc', 'm065_STROKEc',\
                   'm023_ALLCAc', 'm059_ALLVASCc', 'm028_OESOPHCAc', 'm072_COPDc',\
                   'm072_COPDc']

mortality_feat = ['m010_NONMEDc', 'm005_ALL35_69', 'm008_MEDICALc', 'm065_STROKEc',\
                   'm023_ALLCAc', 'm059_ALLVASCc', 'm028_OESOPHCAc', 'm072_COPDc']

In [43]:
# Creating one code block to take care of all the models in one go:
lr = LinearRegression()
ss = StandardScaler()
lasso = LassoCV(alphas = np.arange(0.001, 10, 1))

imp_features = pd.DataFrame()
# base code modified from Jahnavi's new_model function
# looping over each target
for i in mortality_feat:
    # suppressing the convergence warnings if any
    import warnings
    warnings.filterwarnings("ignore")

    X = clean_df.drop(columns = features_dropped)
    y = clean_df[i]
    # train test split ('X before y, train before test' -James 2022)
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=33)
    
    ss.fit(X_train)
    Xs_train = ss.transform(X_train)
    Xs_test = ss.transform(X_test)
    
    lr.fit(Xs_train, y_train)
    lasso.fit(Xs_train, y_train)
    
    mortal = i

    update_df(lasso, mortal)
    
    # creating a df to get feature importance out
    feature_imp = pd.Series(lasso.coef_, index = X.columns)
    feature_imp = feature_imp.sort_values(ascending=False)
    
    imp_features[i] = feature_imp


In [44]:
results

Unnamed: 0,Model,Train Score,Val Score,X Val Score,RMSE Train,RMSE Val
0,m005_ALL_35-69.LinReg,1.0,0.88294,0.616316,8.404957e-15,1.206978
1,m005_ALL_35-69.L2-1,0.999965,0.881611,0.614525,8.404957e-15,1.206978
2,m005_ALL_35-69.L1-1,0.999998,0.999996,0.999285,8.404957e-15,1.206978
3,m005_ALL_35-69.L1-2,0.999989,0.988186,0.931555,5.548124e-15,1.264333
4,m010_NONMEDc,0.482118,0.169658,-0.124616,1.332323e-13,44.534195
5,m005_ALL35_69,0.999993,0.056338,0.070911,7.825686e-15,2.689775
6,m008_MEDICALc,0.999991,0.107569,0.160962,6.704399e-15,2.591834
7,m065_STROKEc,0.770332,0.290192,0.00949,2.591966e-13,130.003377
8,m023_ALLCAc,0.999981,-0.538021,-0.46649,5.398386e-15,1.862238
9,m059_ALLVASCc,0.999958,0.138958,-0.10403,3.350999e-15,1.742879


In [68]:
# imp_features['m010_NONMEDc'][imp_features['m010_NONMEDc'] != 0].to_frame().sort_values(by='m010_NONMEDc', ascending=False)
# Forming a new df to store scores with the desciptive codes

In [67]:
text_df.columns

Index(['description', 'code'], dtype='object')

In [51]:
imp_features.reset_index(inplace=True)

In [54]:
score_df = imp_features.merge(text_df, left_on='index', right_on='code', how='left').drop(columns = ['index'])

In [66]:
score_df.head(2)

Unnamed: 0,m010_NONMEDc,m005_ALL35_69,m008_MEDICALc,m065_STROKEc,m023_ALLCAc,m059_ALLVASCc,m028_OESOPHCAc,m072_COPDc,description,code
0,7.084762,0.0,-0.0,0.0,-0.0,0.0,-0.0,0.0,Q153 dWINEday questionnaire CURRENT DAILY CONSUMPTION OF WINE (g per person),q153
1,5.215087,0.087319,0.0,-1.88745,0.056419,-0.047364,4.606957,0.0,Q023 c%NOSCHL questionnaire PERCENTAGE OF HOUSEHOLD HEADS ATTENDED NO SCHOOL,q023


In [59]:
# This filtering will show only non-zero scores; 
# showing the weights of what the lasso regression actually used and how.
score_df[['m005_ALL35_69', 'description', 'code']][score_df['m005_ALL35_69'] != 0]\
.sort_values(by='m005_ALL35_69', ascending=False)

Unnamed: 0,m005_ALL35_69,description,code
117,1.352670,Q099 dBRTHFAST questionnaire PERCENTAGE WHO SUFFER BREATHLESSNESS WHEN HURRYING OR WALKING UPHILL,q099
92,0.726593,P043 HBsAb plasma HEPATITIS B ANTI-SURFACE ANTIGEN ANTIBODY,p043
8,0.658357,"D044 SALTVEG diet survey DRIED AND SALT-PRESERVED VEGETABLE INTAKE (g/day/reference man, as-consumed basis)",d044
42,0.482376,"D042 LIGHTVEG diet survey LIGHT COLOURED VEGETABLE INTAKE (g/day/reference man, fresh weight)",d042
150,0.454607,Q171 dSALTVEG questionnaire DAYS PER YEAR EAT SALT PRESERVED VEGETABLES,q171
...,...,...,...
179,-0.300691,Q155 dLIQRday questionnaire CURRENT DAILY CONSUMPTION OF LIQUOR (g per person),q155
128,-0.410812,Q040 c%INCALC questionnaire PERCENTAGE OF 1989 HOUSEHOLD INCOME SPENT ON ALCOHOL,q040
25,-0.545873,P009 B-CAROT plasma BETA CAROTENE (ug/dL),p009
88,-0.673292,P038 PEPSIN plasma PEPSINOGEN I/II,p038


In [60]:
imp_features['m008_MEDICALc'][imp_features['m008_MEDICALc'] != 0].to_frame().sort_values(by='m008_MEDICALc', ascending=False)
score_df[['m008_MEDICALc', 'description', 'code']][score_df['m008_MEDICALc'] != 0]\
.sort_values(by='m008_MEDICALc', ascending=False)

Unnamed: 0,m008_MEDICALc,description,code
117,1.446953,Q099 dBRTHFAST questionnaire PERCENTAGE WHO SUFFER BREATHLESSNESS WHEN HURRYING OR WALKING UPHILL,q099
92,0.578106,P043 HBsAb plasma HEPATITIS B ANTI-SURFACE ANTIGEN ANTIBODY,p043
8,0.488523,"D044 SALTVEG diet survey DRIED AND SALT-PRESERVED VEGETABLE INTAKE (g/day/reference man, as-consumed basis)",d044
42,0.484209,"D042 LIGHTVEG diet survey LIGHT COLOURED VEGETABLE INTAKE (g/day/reference man, fresh weight)",d042
113,0.420315,Q125 dFAMINDUR questionnaire TOTAL DURATION OF SEVERE FOOD SHORTAGES DURING THE LAST 30 YEARS (months),q125
...,...,...,...
25,-0.368200,P009 B-CAROT plasma BETA CAROTENE (ug/dL),p009
50,-0.398946,Q010 c%AGRICUL questionnaire PERCENTAGE OF HOUSEHOLD HEADS WHOSE PRIMARY OCCUPATION IS AGRICULTURE,q010
10,-0.489826,P002 HDLCHOL plasma HIGH DENSITY LIPOPROTEIN CHOLESTEROL (mg/dL),p002
188,-0.562347,P027 Cu plasma COPPER (mg/dL),p027


In [61]:
imp_features['m065_STROKEc'][imp_features['m065_STROKEc'] != 0].to_frame().sort_values(by='m065_STROKEc', ascending=False)
score_df[['m065_STROKEc', 'description', 'code']][score_df['m065_STROKEc'] != 0]\
.sort_values(by='m065_STROKEc', ascending=False)

Unnamed: 0,m065_STROKEc,description,code
98,26.61212,Q100 dBRTHLEV questionnaire PERCENTAGE WHO SUFFER BREATHLESSNESS WHEN WALKING WITH OTHERS ON LEVEL GROUND,q100
182,17.366433,Q158 dWHEAT questionnaire DAILY CONSUMPTION OF WHEAT (g/day air-dry basis),q158
103,17.271656,Q106 dWHEEZE questionnaire PERCENTAGE WHOSE CHEST OFTEN SOUNDS WHEEZY,q106
5,12.302131,R024 20:2n6 red blood cell TOTAL LIPID EICOSADIENOIC ACID (20:2(6)) (% of total fatty acid by weight),r024
61,7.621683,R013 22:0 red blood cell TOTAL LIPID BEHENIC ACID (22:0) (% of total fatty acid by weight),r013
92,6.231466,P043 HBsAb plasma HEPATITIS B ANTI-SURFACE ANTIGEN ANTIBODY,p043
142,6.15892,Q137 dCIGCONS questionnaire CURRENT DAILY CONSUMPTION OF MANUFACTURED CIGARETTES (no. per person),q137
105,1.866571,Q109 dDBP questionnaire DIASTOLIC BLOOD PRESSURE (mm Hg),q109
60,0.316645,R012 20:0 red blood cell TOTAL LIPID ARACHIDIC ACID (20:0) (% of total fatty acid by weight),r012
117,0.217354,Q099 dBRTHFAST questionnaire PERCENTAGE WHO SUFFER BREATHLESSNESS WHEN HURRYING OR WALKING UPHILL,q099


In [62]:
imp_features['m023_ALLCAc'][imp_features['m023_ALLCAc'] != 0].to_frame().sort_values(by='m023_ALLCAc', ascending=False)
score_df[['m023_ALLCAc', 'description', 'code']][score_df['m023_ALLCAc'] != 0]\
.sort_values(by='m023_ALLCAc', ascending=False)

Unnamed: 0,m023_ALLCAc,description,code
117,0.683157,Q099 dBRTHFAST questionnaire PERCENTAGE WHO SUFFER BREATHLESSNESS WHEN HURRYING OR WALKING UPHILL,q099
113,0.576475,Q125 dFAMINDUR questionnaire TOTAL DURATION OF SEVERE FOOD SHORTAGES DURING THE LAST 30 YEARS (months),q125
24,0.483848,P008 A-CAROT plasma ALPHA CAROTENE (ug/dL),p008
163,0.475164,D013 VITE diet survey TOTAL VITAMIN E INTAKE (mg/day/reference man),d013
127,0.453919,Q039 cSUPINC89 questionnaire HOUSEHOLD SIDELINE AND BUSINESS INCOME FOR 1989 (Yuan),q039
...,...,...,...
177,-0.308406,Q152 dWINE questionnaire PERCENTAGE WHO HAVE EVER DRUNK WINE 3 OR MORE DAYS A WEEK FOR 6 MONTHS,q152
169,-0.308743,Q144 dHOMECIG questionnaire PERCENTAGE WHO HAVE EVER SMOKED HOMEMADE CIGARETTES DAILY FOR MORE THAN 6 MONTHS,q144
172,-0.352945,Q147 dCIGAR questionnaire PERCENTAGE WHO HAVE EVER SMOKED CIGARS DAILY FOR MORE THAN 6 MONTHS,q147
88,-0.630836,P038 PEPSIN plasma PEPSINOGEN I/II,p038


In [63]:
imp_features['m059_ALLVASCc'][imp_features['m059_ALLVASCc'] != 0].to_frame().sort_values(by='m059_ALLVASCc', ascending=False)
score_df[['m059_ALLVASCc', 'description', 'code']][score_df['m059_ALLVASCc'] != 0]\
.sort_values(by='m059_ALLVASCc', ascending=False)

Unnamed: 0,m059_ALLVASCc,description,code
117,0.400425,Q099 dBRTHFAST questionnaire PERCENTAGE WHO SUFFER BREATHLESSNESS WHEN HURRYING OR WALKING UPHILL,q099
182,0.376016,Q158 dWHEAT questionnaire DAILY CONSUMPTION OF WHEAT (g/day air-dry basis),q158
42,0.315672,"D042 LIGHTVEG diet survey LIGHT COLOURED VEGETABLE INTAKE (g/day/reference man, fresh weight)",d042
92,0.229516,P043 HBsAb plasma HEPATITIS B ANTI-SURFACE ANTIGEN ANTIBODY,p043
125,0.221994,Q037 cAGINC89 questionnaire HOUSEHOLD AGRICULTURAL INCOME FOR 1989 (Yuan),q037
43,0.159212,"D043 GREENVEG diet survey GREEN VEGETABLE INTAKE (g/day/reference man, fresh weight)",d043
155,0.150553,Q176 dEGGS questionnaire DAYS PER YEAR EAT EGGS,q176
69,0.147163,R022 22:6n3 red blood cell TOTAL LIPID DOCOSAHEXAENOIC ACID (22:6(3)) (% of total fatty acid by weight),r022
98,0.142196,Q100 dBRTHLEV questionnaire PERCENTAGE WHO SUFFER BREATHLESSNESS WHEN WALKING WITH OTHERS ON LEVEL GROUND,q100
103,0.135022,Q106 dWHEEZE questionnaire PERCENTAGE WHOSE CHEST OFTEN SOUNDS WHEEZY,q106


In [64]:
imp_features['m028_OESOPHCAc'][imp_features['m028_OESOPHCAc'] != 0].to_frame().sort_values(by='m028_OESOPHCAc', ascending=False)
score_df[['m028_OESOPHCAc', 'description', 'code']][score_df['m028_OESOPHCAc'] != 0]\
.sort_values(by='m028_OESOPHCAc', ascending=False)

Unnamed: 0,m028_OESOPHCAc,description,code
91,27.256861,P042 HBsAg plasma HEPATITIS B SURFACE ANTIGEN,p042
24,15.77851,P008 A-CAROT plasma ALPHA CAROTENE (ug/dL),p008
163,12.768285,D013 VITE diet survey TOTAL VITAMIN E INTAKE (mg/day/reference man),d013
113,10.214094,Q125 dFAMINDUR questionnaire TOTAL DURATION OF SEVERE FOOD SHORTAGES DURING THE LAST 30 YEARS (months),q125
21,4.894704,P006 ALBUMIN plasma ALBUMIN (g/dL) (non-pooled analysis),p006
1,4.606957,Q023 c%NOSCHL questionnaire PERCENTAGE OF HOUSEHOLD HEADS ATTENDED NO SCHOOL,q023
12,3.043453,G003 ELEVATION general features ELEVATION (meters),g003
108,2.931746,Q120 dWESTMED questionnaire PERCENTAGE USED WESTERN MEDICINE REGULARLY DURING PAST 6 MONTHS,q120
127,2.595203,Q039 cSUPINC89 questionnaire HOUSEHOLD SIDELINE AND BUSINESS INCOME FOR 1989 (Yuan),q039
110,2.317649,Q122 dANTACID questionnaire PERCENTAGE USED MAINLY WESTERN ANTACIDS DURING PAST 6 MONTHS,q122


In [65]:
imp_features['m072_COPDc'][imp_features['m072_COPDc'] > 0].to_frame().sort_values(by='m072_COPDc', ascending=False)
score_df[['m072_COPDc', 'description', 'code']][score_df['m072_COPDc'] != 0]\
.sort_values(by='m072_COPDc', ascending=False)

Unnamed: 0,m072_COPDc,description,code
103,29.647219,Q106 dWHEEZE questionnaire PERCENTAGE WHOSE CHEST OFTEN SOUNDS WHEEZY,q106
116,20.350247,Q128 dSMOKE questionnaire PERCENTAGE WHO HAVE EVER SMOKED ANY FORM OF TOBACCO DAILY FOR MORE THAN 6 MONTHS,q128
5,18.650919,R024 20:2n6 red blood cell TOTAL LIPID EICOSADIENOIC ACID (20:2(6)) (% of total fatty acid by weight),r024
7,15.99384,D028 PLNTFOOD diet survey PLANT FOOD INTAKE (g/day/reference man),d028
18,15.980178,P010 G-CAROT plasma GAMMA CAROTENE (ug/dL),p010
42,14.556566,"D042 LIGHTVEG diet survey LIGHT COLOURED VEGETABLE INTAKE (g/day/reference man, fresh weight)",d042
40,13.283286,"D040 STCHTUBER diet survey STARCHY TUBER INTAKE (g/day/reference man, fresh weight)",d040
144,12.236443,Q165 dSMOKFOOD questionnaire PERCENTAGE EVER EAT SMOKED FOOD,q165
41,8.882953,"D041 LEGUME diet survey LEGUME AND LEGUME PRODUCT INTAKE (g/day/reference man, fresh weight)",d041
92,8.726487,P043 HBsAb plasma HEPATITIS B ANTI-SURFACE ANTIGEN ANTIBODY,p043


In [82]:
# I have discovered eli5 package and installed it
# <BIAS> is the expected average score of the model
eli5.show_weights(lasso, feature_names = X_train.columns.to_list(), top=-1)

Weight?,Feature
268.554,<BIAS>
29.647,q106
20.35,q128
18.651,r024
15.994,d028
15.98,p010
14.557,d042
13.283,d040
12.236,q165
8.883,d041
