In [1]:
from __future__ import print_function
import numpy as np
import pandas as pd


from sklearn.ensemble  import RandomForestClassifier as rfc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression as logreg

import statsmodels as sm
import statsmodels.formula.api as smf
import statsmodels.api as sma

from sklearn.feature_selection import chi2

import scipy as sp
from scipy import stats

from sklearn.preprocessing import LabelEncoder


In [2]:
# Directory that is meant to contain all the input files.
# NOTE(review): no statement in the visible notebook reads this variable; the
# CSV files are opened by bare filename, i.e. relative to the working directory.
dataDirectory = 'data/'

In [3]:
# Load 'llcp2017_formatted.csv' into `df`, the frame every later cell derives from.
# NOTE(review): the path is relative to the working directory and does not use
# `dataDirectory` - confirm where the file actually lives.
file = 'llcp2017_formatted.csv'
df = pd.read_csv(file)

  interactivity=interactivity, compiler=compiler, result=result)


In [26]:
# Preview the first five rows to sanity-check the load.
df.head()

Unnamed: 0,STATE FIPS CODE,FILE MONTH,INTERVIEW DATE,INTERVIEW MONTH,INTERVIEW DAY,INTERVIEW YEAR,FINAL DISPOSITION,ANNUAL SEQUENCE NUMBER,PRIMARY SAMPLING UNIT,CORRECT TELEPHONE NUMBER?,...,_IMPCAGE,_IMPCRAC,_IMPCSEX,_IMPEDUC,_IMPHOME,_IMPMRTL,_IMPNPH,_IMPSEX,_M_RACE,_URBNRRL
0,Alabama,January,1302017,1,30,2017,1100,2017000001,2017000001,Yes,...,Not asked or Missing,Missing,Missing,Missing,Missing,Missing,,Not asked or Missing,Not asked or Missing,Missing
1,Alabama,January,1122017,1,12,2017,1100,2017000002,2017000002,Yes,...,Not asked or Missing,Missing,Missing,Missing,Missing,Missing,,Not asked or Missing,Not asked or Missing,Missing
2,Alabama,January,1102017,1,10,2017,1100,2017000003,2017000003,Yes,...,Not asked or Missing,Missing,Missing,Missing,Missing,Missing,,Not asked or Missing,Not asked or Missing,Missing
3,Alabama,January,2082017,2,8,2017,1200,2017000004,2017000004,Yes,...,Not asked or Missing,Missing,Missing,Missing,Missing,Missing,,Not asked or Missing,Not asked or Missing,Missing
4,Alabama,January,1302017,1,30,2017,1100,2017000005,2017000005,Yes,...,Not asked or Missing,Missing,Missing,Missing,Missing,Missing,,Not asked or Missing,Not asked or Missing,Missing


In [27]:
# Report how many rows were loaded.
print("Number of records: ",len(df))

Number of records:  450642


### Helper Functions

In [28]:
def containsKeyword(sent, keywords=None):
    """Return True when `sent` contains at least one keyword.

    `sent` is lower-cased before matching; the keywords are used as given, so
    they should be supplied in lower case.

    Parameters
    ----------
    sent : str
        Text to search (a column name in this notebook).
    keywords : iterable of str, optional
        Substrings to look for. When omitted, the module-level ``filterList``
        is read at call time, which is the original behaviour.

    Returns
    -------
    bool
        True if any keyword occurs in the lower-cased text, else False
        (including when there are no keywords at all).
    """
    if keywords is None:
        # Backward-compatible fallback: callers that set the notebook global
        # `filterList` before calling keep working unchanged.
        keywords = filterList
    lowered = sent.lower()
    return any(keyword in lowered for keyword in keywords)

In [29]:
def normalizeNumeric(df):
    """Standardise every NumPy-numeric column of `df` and return `df`.

    Each numeric column is replaced, in the frame that was passed in, by
    ``(column - mean) / std`` where ``std`` is taken with NumPy's default
    ``ddof=0``. Columns of any other dtype are left untouched.

    A column whose standard deviation is zero (or not a number) is only
    centred: dividing by it would turn the whole column into NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to normalise; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    for col in df.columns:
        try:
            isNumeric = np.issubdtype(df[col].dtype, np.number)
        except TypeError:
            # NumPy cannot interpret pandas extension dtypes (e.g. category);
            # treat those as non-numeric instead of failing.
            isNumeric = False
        if not isNumeric:
            continue

        centred = df[col] - np.mean(df[col])
        spread = np.std(df[col])
        # `spread > 0` is False for both 0 and NaN, so constant or all-missing
        # columns skip the division.
        if spread > 0:
            centred = centred / spread
        df[col] = centred
    return df
            
#[ (c, np.sum(np.isnan(ecigdfOH[c])))  for c in ecigdfOH.columns if np.issubdtype(ecigdfOH[c].dtype, np.number) ]

In [30]:
# Remove class imbalance by down-sampling every class to the smallest class size.
def classImballanceDownSample(df, ycol, random_state=50):
    """Return a shuffled copy of `df` in which every class has equal size.

    Every class found in ``df[ycol]`` is randomly sampled down to the row
    count of the least frequent class. Rows whose label is not counted by
    ``value_counts`` (missing labels) are carried over unchanged. The class
    counts before and after are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; it is not modified.
    ycol : str
        Name of the label column.
    random_state : int, optional
        Seed for both the per-class sampling and the final shuffle, so that
        repeated calls return the same frame. Defaults to 50, the seed that
        was previously hard-coded for the per-class sampling.

    Returns
    -------
    pandas.DataFrame
        Balanced, shuffled frame with a fresh 0..n-1 index.
    """
    valueCount = df[ycol].value_counts()
    print("Before Class Imballance Treatment: ")
    print(valueCount)

    if valueCount.empty:
        # No labelled rows: there is nothing to balance and no minimum to take.
        balanced = df.copy().reset_index(drop=True)
    else:
        minClassSize = int(valueCount.min())
        parts = [
            df[df[ycol] == clas].sample(n=minClassSize, random_state=random_state)
            for clas in valueCount.index
        ]
        # Rows with a label that value_counts() skipped were kept by the
        # original loop as well, so they are kept here.
        uncounted = df[~df[ycol].isin(valueCount.index)]
        if len(uncounted) > 0:
            parts.append(uncounted)
        # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
        balanced = pd.concat(parts)
        # Shuffle so the classes are interleaved; seeded for reproducibility.
        balanced = balanced.sample(frac=1, random_state=random_state).reset_index(drop=True)

    print("After Class Imballance Treatment: ")
    print(balanced[ycol].value_counts())
    return balanced

In [31]:
def labelEncodeCategoricalFeatures(DF):
    """Return a copy of `DF` with every object-dtype column label-encoded.

    Each column whose dtype name is 'object' is replaced by the output of
    ``LabelEncoder.fit_transform`` applied to that column alone, so codes are
    assigned per column. Other columns are copied as they are, and `DF`
    itself is not modified.
    """
    encoded = DF.copy()
    encoder = LabelEncoder()
    objectColumns = [name for name in encoded.columns
                     if encoded[name].dtype.name == 'object']
    for name in objectColumns:
        # fit_transform refits on every call, so one encoder instance serves
        # all columns.
        encoded[name] = encoder.fit_transform(encoded[name])
    return encoded

In [32]:
def performChiSquareDependencyTest(df,label):
    """Score every column of `df` except `label` against `label` with chi2.

    Object-dtype feature columns are label-encoded first. The result is a
    DataFrame with columns 'Variable', 'Chi' and 'P_value', ordered by 'Chi'
    from largest to smallest and re-indexed from 0.

    NOTE(review): sklearn documents chi2 for non-negative features such as
    counts or booleans; whether label codes and continuous measurements are
    meaningful inputs here should be confirmed.
    """
    featureNames = [name for name in df.columns if name != label]
    encodedFeatures = labelEncodeCategoricalFeatures(df[featureNames])
    chiScores, pValues = chi2(encodedFeatures, df[label])
    table = pd.DataFrame({'Variable': featureNames,
                          'Chi': chiScores,
                          'P_value': pValues})
    ranked = table.sort_values(by=['Chi'], ascending=False)
    return ranked.reset_index(drop=True)

In [33]:
def getKbestChiFeatures(df,label,k):
    """Return `df` reduced to its `k` highest chi2-scoring features plus `label`.

    Every column other than `label` is scored against `label` with chi2
    (object-dtype columns are label-encoded first). Features are ranked by
    score, highest first, with ties broken by column name in descending
    order. A feature whose score is NaN is ranked last: NaN compares False
    against everything, so leaving it in a plain sort makes the resulting
    order unreliable.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the feature columns and the label column.
    label : str
        Name of the label column.
    k : int
        Number of feature columns to keep.

    Returns
    -------
    pandas.DataFrame
        The selected feature columns in rank order, followed by `label`.
    """
    xcols = [c for c in df.columns if c != label]
    scores, _pvalues = chi2(labelEncodeCategoricalFeatures(df[xcols]), df[label])

    def rankKey(pair):
        score, name = pair
        # Map NaN to -inf so undefined scores sort after every real score.
        return (-np.inf if np.isnan(score) else score, name)

    ranked = sorted(zip(scores, xcols), key=rankKey, reverse=True)
    kcols = [name for _score, name in ranked]
    return df[kcols[:k] + [label]]

### Create Computed Column Dataframe

In [34]:
# Columns whose name mentions smoking or cigarettes.
# containsKeyword reads the global `filterList`, so it is set before each scan.
filterList = ['smok','cig']
smokecols = list(filter(containsKeyword, df.columns))

## Columns that hold calculated / computed variables
filterList = ['computed', 'calculated']
calcols = list(filter(containsKeyword, df.columns))
print("Number of calculated/computed columns: ",len(calcols))

# Data frame restricted to the calculated/computed columns
comDF = df[calcols]

Number of calculated/computed columns:  41


### Create Smoking Columns

In [35]:
# Work on a copy of the computed-variable frame, with short names for the
# e-cigarette and cigarette indicator columns.
renamedColumns = {
    'CURRENT E-CIGARETTE USER CALCULATED VARI': 'esmoke',
    'CURRENT SMOKING CALCULATED VARIABLE': 'smoke',
}
ecigdf = comDF.copy()
ecigdf.rename(columns=renamedColumns, inplace=True)

# Collapse the e-cigarette answers to Yes/No; any other answer becomes NaN.
mapper = {'Current E-cigarette user': 'Yes', 'Not currently using E-cigarettes': 'No' }
ecigdf['esmoke'] = ecigdf['esmoke'].map(mapper)

print("data size: ",len(ecigdf))
# Keep only respondents with a definite Yes/No for both indicators.
validAnswers = ['Yes','No']
ecigdf = ecigdf[ecigdf['smoke'].isin(validAnswers)]
ecigdf = ecigdf[ecigdf['esmoke'].isin(validAnswers)]
print("clean data size: ",len(ecigdf))

print("Frequency distributions:")
print(ecigdf['esmoke'].value_counts())
print(ecigdf['smoke'].value_counts())

data size:  450642
clean data size:  427526
Frequency distributions:
No     413906
Yes     13620
Name: esmoke, dtype: int64
No     364794
Yes     62732
Name: smoke, dtype: int64


In [36]:
# Cross-tabulate cigarette use against e-cigarette use.
smokesYes = ecigdf['smoke'] == 'Yes'
smokesNo = ecigdf['smoke'] == 'No'
esmokesYes = ecigdf['esmoke'] == 'Yes'
esmokesNo = ecigdf['esmoke'] == 'No'

print("Just smoke :", np.sum(smokesYes & esmokesNo))
print("Just esmoke:", np.sum(smokesNo & esmokesYes))
print("SMoke Both :", np.sum(smokesYes & esmokesYes))
print("Smoke none :", np.sum(smokesNo & esmokesNo))

Just smoke : 55523
Just esmoke: 6411
SMoke Both : 7209
Smoke none : 358383


### Data Cleansing

In [15]:
# Remove respondents who use neither product: keep rows where the cigarette
# indicator or the e-cigarette indicator (or both) is 'Yes'.
ecigdf = ecigdf[ (ecigdf.smoke == 'Yes') | (ecigdf.esmoke == 'Yes') ]
print("clean data size: ",len(ecigdf))

clean data size:  69143


In [16]:
# Drop every row that has a missing value in any column, reporting the row
# count before and after.
print("Records: ",len(ecigdf))
ecigdf = ecigdf.dropna()
print("Records after cleaning Nan: ",len(ecigdf))

Records:  69143
Records after cleaning Nan:  61225


In [18]:
# label is True for current e-cigarette users and False otherwise
# (a boolean column, not 1/0).
ecigdf['label'] = ecigdf.esmoke == 'Yes'

In [19]:
# Drop every smoking/cigarette-related column found earlier, plus the two
# renamed indicator columns; `label` was derived from 'esmoke'.
# NOTE(review): presumably done so the label cannot be read back from the
# features - confirm.
excludecolumns = smokecols + ['esmoke', 'smoke']
ecigdf = ecigdf[[c for c in ecigdf.columns if c not in excludecolumns ]]
print('Columns : ',len(ecigdf.columns))
print("Frequency distributions:")
print(ecigdf.label.value_counts())

Columns :  38
Frequency distributions:
False    48998
True     12227
Name: label, dtype: int64


In [39]:
'''smoker/non-smoker'''

# Rebuild `ecigdf` from the computed-variable frame. This replaces whatever
# `ecigdf` held before this cell ran.
renamedColumns = {
    'CURRENT E-CIGARETTE USER CALCULATED VARI': 'esmoke',
    'CURRENT SMOKING CALCULATED VARIABLE': 'smoke',
}
ecigdf = comDF.copy()
ecigdf.rename(columns=renamedColumns, inplace=True)

# Collapse the e-cigarette answers to Yes/No; any other answer becomes NaN.
mapper = {'Current E-cigarette user': 'Yes', 'Not currently using E-cigarettes': 'No' }
ecigdf['esmoke'] = ecigdf['esmoke'].map(mapper)

print("data size: ",len(ecigdf))
# Only the cigarette indicator is filtered to Yes/No here; the matching
# e-cigarette filter is left disabled below.
ecigdf = ecigdf[ecigdf['smoke'].isin(['Yes','No'])]
#ecigdf = ecigdf[ecigdf.esmoke.apply(lambda x: x in ['Yes','No']) ]
print("clean data size: ",len(ecigdf))

print("Frequency distributions:")
print(ecigdf['esmoke'].value_counts())
print(ecigdf['smoke'].value_counts())

data size:  450642
clean data size:  431888
Frequency distributions:
No     413906
Yes     13620
Name: esmoke, dtype: int64
No     368326
Yes     63562
Name: smoke, dtype: int64


In [40]:
# Drop every row that has a missing value in any column, reporting the row
# count before and after. Rows whose 'esmoke' answer was mapped to NaN are
# removed here as well.
print("Records: ",len(ecigdf))
ecigdf = ecigdf.dropna()
print("Records after cleaning Nan: ",len(ecigdf))

Records:  431888
Records after cleaning Nan:  377435


In [41]:
# label is True for current cigarette smokers and False otherwise (boolean column).
ecigdf['label'] = ecigdf.smoke == 'Yes'

In [42]:
# Drop the smoking/cigarette-related columns found earlier and the 'smoke'
# column that `label` was derived from. 'esmoke' is not in the exclusion
# list, so it stays in the frame as a feature.
excludecolumns = smokecols + [ 'smoke']
ecigdf = ecigdf[[c for c in ecigdf.columns if c not in excludecolumns ]]
print('Columns : ',len(ecigdf.columns))
print("Frequency distributions:")
print(ecigdf.label.value_counts())

Columns :  39
Frequency distributions:
False    321970
True      55465
Name: label, dtype: int64


## Chi Square Test

In [43]:
# Rank every remaining column of `ecigdf` by its chi2 score against `label`.
performChiSquareDependencyTest(ecigdf,'label')

Unnamed: 0,Variable,Chi,P_value
0,COMPUTED FRUIT INTAKE IN TIMES PER DAY,478485.164277,0.0
1,COMPUTED DARK GREEN VEGETABLE INTAKE IN,138270.608094,0.0
2,COMPUTED OTHER VEGETABLE INTAKE IN TIMES,90753.142239,0.0
3,COMPUTED WEIGHT IN KILOGRAMS,67737.046817,0.0
4,COMPUTED POTATO SERVINGS PER DAY,33901.762729,0.0
5,esmoke,14229.94611,0.0
6,COMPUTED FRUIT JUICE INTAKE IN TIMES PER,12850.247131,0.0
7,PNEUMONIA VACCINATION CALCULATED VARIABL,11628.061237,0.0
8,FLU SHOT CALCULATED VARIABLE,11547.850349,0.0
9,COMPUTED LEVEL OF EDUCATION COMPLETED CA,5440.318877,0.0


### Manual chi-square calculation for one column (not pursued further)

In [19]:
# Contingency table for the first column of `ecigdf`: category counts among
# the label == True rows (s1) and the label == False rows (s2).
firstColumn = ecigdf.iloc[:, 0]
s1 = firstColumn[ecigdf['label'] == True].value_counts()
s2 = firstColumn[ecigdf['label'] == False].value_counts()

# concat aligns the two count series on category. A category that occurs in
# only one group would come out as NaN and propagate into the chi-square
# result, so such gaps are filled with 0 before casting back to int.
vcountDF = pd.concat([s1, s2], axis=1, keys=['s1','s2']).fillna(0).astype(int)
vcountDF

Unnamed: 0,s1,s2
Zero days when physical health not good,6220,26015
1-13 days when physical health not good,3370,11557
14+ days when physical health not good,2453,10571
Don�t know/Refused/Missing,184,855


In [21]:
# Chi-square test of independence on the contingency table; the result holds
# the test statistic, the p-value, the degrees of freedom and the table of
# expected frequencies, in that order.
sp.stats.chi2_contingency(vcountDF)

(86.45216064490617,
 1.265957018899482e-18,
 3,
 array([[ 6437.52298898, 25797.47701102],
        [ 2981.01149857, 11945.98850143],
        [ 2600.97097591, 10423.02902409],
        [  207.49453655,   831.50546345]]))

In [103]:
# Scratch arithmetic: base-10 logarithm of 1.33.
# NOTE(review): not referenced by any other visible cell - confirm it is still needed.
np.log10(1.33)

0.12385164096708583

In [102]:
# Scratch arithmetic: 10 raised to the power 0.13.
# NOTE(review): not referenced by any other visible cell - confirm it is still needed.
10**.13

1.3489628825916535

## Clean Data

In [110]:
# Load 'cleanData.csv' from the working directory into `cleandf`.
# NOTE(review): the cell that writes this file is not visible here - confirm its provenance.
cleandf = pd.read_csv('cleanData.csv')

In [111]:
# Drop every row that has a missing value in any column, reporting the row
# count before and after.
print("Records: ",len(cleandf))
cleandf = cleandf.dropna()
print("Records after cleaning Nan: ",len(cleandf))

Records:  61934
Records after cleaning Nan:  55953


In [112]:
# Rank every column of `cleandf` other than `label` by its chi2 score against `label`.
performChiSquareDependencyTest(cleandf,'label')

Unnamed: 0,Variable,Chi,P_value
0,ads18_weight,47810.28941,0.0
1,ads18_fruitIntake,12152.722525,0.0
2,ads18_age18_25,3908.33199,0.0
3,ads18_vegIntake,1264.437342,6.047396e-277
4,ads18_older40,646.877423,1.067119e-142
5,ads18_potatoIntake,606.094508,7.910539e-134
6,ads18_income,405.922157,2.829951e-90
7,ads18_highBP,201.154078,1.169472e-45
8,ads18_unknownChl,200.688173,1.477945e-45
9,ads18_age25_30,192.920787,7.325061e-44
