In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pickle
import matplotlib.pyplot as plt
from implementations import *
import os
import csv
from helpers import *

In [2]:
# Folder holding the CSV inputs and the pickle caches read/written by the loading cell.
DATA_FOLDER = 'data/'

In [3]:
# Load the cached arrays when every pickle is readable; otherwise parse the raw
# CSV files once and write the caches for the next run.
# NOTE: pickle.load can execute arbitrary code - only keep cache files in
# DATA_FOLDER that this notebook wrote itself.
try:
    with open(DATA_FOLDER + 'x_train.pickle', 'rb') as f:
        x_train = pickle.load(f)
    with open(DATA_FOLDER + 'x_test.pickle', 'rb') as f:
        x_test = pickle.load(f)
    with open(DATA_FOLDER + 'y_train.pickle', 'rb') as f:
        y_train = pickle.load(f)
    with open(DATA_FOLDER + 'test_ids.pickle', 'rb') as f:
        test_ids = pickle.load(f)
    with open(DATA_FOLDER + 'names_map.pickle', 'rb') as f:
        names_map = pickle.load(f)
except (OSError, EOFError, pickle.UnpicklingError):
    # Only a missing / unreadable / truncated cache triggers the rebuild.
    # Unrelated failures (KeyboardInterrupt, NameError, ...) now propagate
    # instead of being silently swallowed by a bare `except:`.
    # NOTE: train_ids is only defined on this path; it is not cached.
    x_train, x_test, y_train, train_ids, test_ids = load_csv_data(DATA_FOLDER, sub_sample=False)

    # Header row of x_train.csv with its first entry dropped (presumably the
    # Id column - confirm against the CSV), mapped name -> column index.
    names = np.genfromtxt(DATA_FOLDER + 'x_train.csv', delimiter=",", dtype=str, max_rows=1)
    names = np.delete(names, 0)
    names_map = {name: i for i, name in enumerate(names)}

    with open(DATA_FOLDER + 'x_train.pickle', 'wb') as f:
        pickle.dump(x_train, f)

    with open(DATA_FOLDER + 'x_test.pickle', 'wb') as f:
        pickle.dump(x_test, f)

    with open(DATA_FOLDER + 'y_train.pickle', 'wb') as f:
        pickle.dump(y_train, f)

    with open(DATA_FOLDER + 'test_ids.pickle', 'wb') as f:
        pickle.dump(test_ids, f)

    with open(DATA_FOLDER + 'names_map.pickle', 'wb') as f:
        pickle.dump(names_map, f)

In [4]:
# Recode the -1 labels as 0; every other label value is kept as is.
y_train = np.where(y_train == -1, 0, y_train)

In [5]:
# Work on copies so the in-place column edits made further down do not touch
# the loaded x_train / y_train arrays.
x_tr = x_train.copy()
y_tr = y_train.copy()

# Testing preprocessing function

In [6]:
import preprocessing

In [7]:
# Run the project's preprocessing on the training copies to inspect its output.
# NOTE(review): this rebinds `x_test`, which held the test features loaded
# above, to the result computed from the *training* data; the cells below rely
# on that name, so a rename has to be done across all of them together.
x_test, y_test = preprocessing.clean_data(x_tr, y_tr, names_map=names_map)

In [8]:
# HEIGHT3 column of x_tr (the copy of x_train), to compare with the clean_data output.
x_tr[:, names_map["HEIGHT3"]]

array([501., 510., 507., ..., 502., 510., 508.])

In [9]:
# HEIGHT3 column of the first value returned by preprocessing.clean_data
# (x_test was rebound to it above; it is not the test set here).
x_test[:, names_map["HEIGHT3"]]

array([224.54, 250.4 , 241.78, ..., 228.08, 250.4 , 244.32])

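PROBLEM: the raw value 501 encodes 5 ft 1 in, i.e. 154.94 cm, but `clean_data` returns 224.54 for that entry, so its HEIGHT3 conversion is wrong.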

In [18]:
# WEIGHT2 column of x_tr (the copy of x_train), to compare with the clean_data output.
x_tr[:, names_map["WEIGHT2"]]

array([ 110.,  200., 9999., ...,  320.,  250.,  150.])

In [17]:
# WEIGHT2 column of the first value returned by preprocessing.clean_data
# (x_test was rebound to it above; it is not the test set here).
x_test[:, names_map["WEIGHT2"]]

array([  49.89512,   90.7184 , 9999.     , ...,  145.14944,  113.398  ,
         68.0388 ])

WEIGHT2 OK for values given in pounds (e.g. 110 lb → 49.895 kg); note that the code 9999 is left unchanged.

# Pre-processing

In [None]:
# Convert WEIGHT2 to kilograms and set the "no answer" codes to NaN.
#   50-999     weight in pounds       -> multiplied by 0.453592
#   9000-9998  weight in kg + 9000    -> the 9000 offset is removed
#   7777/9999  no answer              -> NaN
# The missing-answer code is 9999, not 9998: 9998 lies inside the kilogram
# range (998 kg), while 9999 used to be left in the data as a huge "weight".
# NOTE: this cell rewrites x_tr in place and is not idempotent - running it
# twice would treat already converted kilogram values as pounds.
array = x_tr[:, names_map["WEIGHT2"]]  # view: writes below land in x_tr

# All index sets are computed before any value is rewritten.
pounds_indices  = np.where((array >= 50) & (array <= 999))
kg_indices      = np.where((array >= 9000) & (array <= 9998))
none_indices    = np.where((array == 7777) | (array == 9999))

array[pounds_indices] = 0.453592 * array[pounds_indices]
array[kg_indices] = array[kg_indices] % 9000
array[none_indices] = np.nan

In [None]:
# Convert HEIGHT3 to metres and set the "no answer" codes to NaN.
#   200-711    feet * 100 + inches    -> 0.3048 * feet + 0.0254 * inches
#   9000-9998  centimetres + 9000     -> (value - 9000) / 100
#   7777/9999  no answer              -> NaN
# Example: 501 = 5 ft 1 in = 1.5494 m.
# NOTE: this cell rewrites x_tr in place and is not idempotent.
array = x_tr[:, names_map["HEIGHT3"]]  # view: writes below land in x_tr

# All index sets are computed before any value is rewritten.
imperial_indices    = np.where((array >= 200) & (array <= 711))
cm_indices          = np.where((array >= 9000) & (array <= 9998))
none_indices        = np.where((array == 7777) | (array == 9999))

# `*`, `//` and `%` share one precedence level and associate left to right, so
# the feet / inches extraction must be parenthesised explicitly.
feet_inches = array[imperial_indices]
array[imperial_indices] = 0.3048 * (feet_inches // 100) + 0.0254 * (feet_inches % 100)
# Metric answers are in centimetres; divide by 100 so both branches yield metres.
array[cm_indices] = (array[cm_indices] % 9000) / 100
array[none_indices] = np.nan

In [None]:
# Convert FLSHTMY2 (a month/year value written MMYYYY, e.g. 12014 or 122015)
# to a month count, month + 12 * year, and set the "no answer" codes
# 777777 / 999999 to NaN.
# NOTE: this cell rewrites x_tr in place and is not idempotent.
array = x_tr[:, names_map["FLSHTMY2"]]  # view: writes below land in x_tr

none_indices = np.where((array == 777777) | (array == 999999))
# Name kept from the original cell; these are the valid MMYYYY dates.
days_indices = np.where((array >= 12014) & (array <= 122015))

array[none_indices] = np.nan
# `12 * x % 10000` parses as `(12 * x) % 10000`; the year has to be extracted
# first and only then multiplied by 12.
dates = array[days_indices]
array[days_indices] = dates // 10000 + 12 * (dates % 10000)

In [None]:
#define a few functions used to clean and scale the data properly
def frequency_scaler(df, col):
    """Rewrite, in place, one coded frequency column of `df` as a per-day rate.

    `col` is resolved to a column index through the global `names_map`.
    Codes handled:
      101-199  times per day    -> code % 100
      201-299  times per week   -> (code % 200) / 7
      301-399  times per month  -> (code % 300) / 30
      300                       -> 1/30
      555                       -> 0
      777, 999                  -> NaN
    Any other value is left untouched. Nothing is returned.
    """
    column = df[:, names_map[col]]  # view: assignments below modify df

    # Classify every entry before any value is rewritten.
    per_day = (column >= 101) & (column <= 199)
    per_week = (column >= 201) & (column <= 299)
    per_month = (column >= 301) & (column <= 399)
    missing = (column == 777) | (column == 999)

    column[per_day] = column[per_day] % 100
    column[per_week] = (column[per_week] % 200) / 7
    column[per_month] = (column[per_month] % 300) / 30
    # These two masks are evaluated on the partially converted column.
    column[column == 300] = 1 / 30
    column[column == 555] = 0
    column[missing] = np.nan

def weekly_frequency_scaler(df, col):
    """Rewrite, in place, one week/month coded frequency column of `df`.

    `col` is resolved to a column index through the global `names_map`.
    Codes handled:
      101-199  per-week count   -> (code % 100) / 7
      201-299  per-month count  -> (code % 200) / 30
      888                       -> 0
      777, 999                  -> NaN
    Any other value is left untouched. Nothing is returned.
    """
    column = df[:, names_map[col]]  # view: assignments below modify df

    # Classify every entry before any value is rewritten.
    per_week = (column >= 101) & (column <= 199)
    per_month = (column >= 201) & (column <= 299)
    missing = (column == 777) | (column == 999)

    column[per_week] = (column[per_week] % 100) / 7
    column[per_month] = (column[per_month] % 200) / 30
    # This mask is evaluated on the partially converted column.
    column[column == 888] = 0
    column[missing] = np.nan

def hours_to_minutes(df, col):
    """Convert, in place, one HHMM-coded duration column of `df` to minutes.

    `col` is resolved to a column index through the global `names_map`.
    Values in 1-759 or 800-959 are read as hours * 100 + minutes and become
    60 * hours + minutes (e.g. 130 -> 90). The codes 777 and 999 become NaN.
    Any other value is left untouched. Nothing is returned.
    """
    column = df[:, names_map[col]]  # view: assignments below modify df

    hour_indices = np.where(((column >= 1) & (column <= 759)) | ((column >= 800) & (column <= 959)))
    none_indices = np.where((column == 777) | (column == 999))

    # `60 * x // 100` parses as `(60 * x) // 100`; the hours have to be
    # extracted first and only then multiplied by 60.
    hhmm = column[hour_indices]
    column[hour_indices] = 60 * (hhmm // 100) + hhmm % 100
    column[none_indices] = np.nan

In [None]:
# Rescale the coded frequency / duration columns of x_tr in place, one column at a time.
for day_week_month_col in ("FRUITJU1", "FRUIT1", "FVBEANS", "FVGREEN", "FVORANG", "VEGETAB1"):
    frequency_scaler(x_tr, day_week_month_col)

for hhmm_col in ("EXERHMM1", "EXERHMM2"):
    hours_to_minutes(x_tr, hhmm_col)

for week_month_col in ("ALCDAY5", "EXEROFT1", "EXEROFT2", "STRENGTH"):
    weekly_frequency_scaler(x_tr, week_month_col)

In [None]:
# Per-column recoding table: {column name: {raw code: replacement value}}.
# Most entries send "don't know" / "refused"-style codes to NaN; a few map a
# code to a concrete value instead (e.g. 88 -> 0).
# "HLTHPLN1" used to appear twice; in a dict literal the later entry wins, so
# the 7 -> NaN mapping was silently dropped. Only the complete entry is kept.
dico_transfos={"GENHLTH":{7:np.nan,8:np.nan,9:np.nan},"POORHLTH":{88:0,77:np.nan,99:np.nan},"HLTHPLN1":{7:np.nan,9:np.nan},"CHECKUP1":{8:15,7:np.nan,9:np.nan},
               "BPMEDS":{7:np.nan,9:np.nan}, "TOLDHI2":{7:np.nan,9:np.nan}, "PHYSHLTH":{88:0,77:np.nan,99:np.nan},
               "MENTHLTH":{88:0,77:np.nan,99:np.nan}, "CVDSTRK3":{7:np.nan, 9:np.nan}, "CHCOCNCR":{7:np.nan, 9:np.nan},
               "HAVARTH3":{7:np.nan, 9:np.nan}, "CHCKIDNY":{7:np.nan, 9:np.nan}, "DIABETE3":{7:np.nan, 9:np.nan}, "CHCCOPD1":{7:np.nan, 9:np.nan},
               "ASTHMA3":{7:np.nan, 9:np.nan}, "ASTHNOW":{7:np.nan, 9:np.nan}, "CHCSCNCR":{7:np.nan, 9:np.nan}, "ADDEPEV2":{7:np.nan, 9:np.nan},
               "DIABAGE2":{98:np.nan, 99:np.nan}, "EDUCA":{9:np.nan}, "INCOME2":{77:np.nan, 99:np.nan}, "QLACTLM2":{7:np.nan, 9:np.nan},
               "USEEQUIP":{7:np.nan, 9:np.nan}, "BLIND":{7:np.nan, 9:np.nan}, "DECIDE":{7:np.nan, 9:np.nan}, "DIFFWALK":{7:np.nan, 9:np.nan},
               "DIFFDRES":{7:np.nan, 9:np.nan}, "DIFFALON":{7:np.nan, 9:np.nan}, "SMOKE100":{7:np.nan, 9:np.nan}, "SMOKDAY2":{7:np.nan, 9:np.nan},
               "USENOW3":{7:np.nan, 9:np.nan}, "AVEDRNK2":{77:np.nan, 99:np.nan}, "DRNK3GE5":{77:np.nan, 88:np.nan, 99:np.nan}, "MAXDRNKS":{77:np.nan, 99:np.nan},
               "EXERANY2":{7:np.nan, 9:np.nan}, "EXERHMM1":{777:np.nan, 999:np.nan}, "SEATBELT":{7:np.nan, 8:np.nan, 9:np.nan}, "PNEUVAC3":{7:np.nan, 9:np.nan},
               "ARTHDIS2":{7:np.nan, 9:np.nan}, "ARTHSOCL":{7:np.nan, 9:np.nan}, "JOINPAIN":{77:np.nan, 99:np.nan}, "ARTHEDU":{7:np.nan, 9:np.nan}, "FLUSHOT6":{7:np.nan, 9:np.nan},
               "DOCTDIAB":{88:0, 77:np.nan, 99:np.nan}, "DIABEYE":{7:np.nan, 9:np.nan}, "CRGVMST2":{7:np.nan, 9:np.nan}, "VIDFCLT2":{7:np.nan}, "VIREDIF3":{7:np.nan},
               "VICTRCT4":{7:np.nan}, "VIGLUMA2":{7:np.nan}, "VIMACDG2":{7:np.nan}, "CIMEMLOS":{7:np.nan, 9:np.nan}, "CDSOCIAL":{7:np.nan, 9:np.nan}, "DRADVISE":{7:np.nan, 9:np.nan},
               "ASTHMAGE":{97:6, 98:np.nan, 99:np.nan}, "ASERVIST":{88:0}, "CVDASPRN":{7:np.nan, 9:np.nan}, "RDUCHART":{7:np.nan, 9:np.nan}, "ARTHEXER":{7:2, 9:2},
               "HPVADVC2":{7:np.nan, 9:np.nan}, "HPVADSHT":{77:np.nan, 99:np.nan}, "PCPSARE1":{7:np.nan, 9:np.nan}, "MISTMNT":{7:np.nan, 9:np.nan},
               "_CHISPNC":{9:np.nan}, "_RFCHOL":{9:np.nan}, "_LTASTH1":{9:1}, "_CASTHM1":{9:0}, "_ASTHMS1":{9:3}, "_HISPANC":{9:2}, "_AGEG5YR":{14:np.nan}, "_CHLDCNT":{9:np.nan},
               "_EDUCAG":{9:np.nan}, "_INCOMG":{9:np.nan}, "_SMOKER3":{9:4}, "_RFSMOK3":{9:1}, "DRNKANY5":{9:1, 7:1}, "DROCDY3_":{900:np.nan},
               "_TOTINDA":{9:np.nan}, "_LMTSCL1":{9:np.nan}, "_RFSEAT2":{9:0}, "_PASTRNG":{9:2}, "_PACAT1":{9:2}, "STRFREQ_":{99000:np.nan}
               }

In [None]:
# Apply every code -> value substitution of dico_transfos on a copy of x_tr,
# so x_tr itself keeps its pre-substitution values.
x_tr_temp = x_tr.copy()
for col, substitutions in dico_transfos.items():
    col_idx = names_map[col]
    for key, replacement in substitutions.items():
        # The mask is recomputed per code, i.e. on the already partly recoded column.
        x_tr_temp[x_tr_temp[:, col_idx] == key, col_idx] = replacement

In [None]:
# Helper used by the imputation cells below.
def replace_nan(dataframe, column, value):
    """Overwrite, in place, every NaN of one named column of a 2-D array.

    `dataframe` is indexed as `dataframe[:, j]` (a NumPy-style 2-D array, not a
    pandas DataFrame), `column` is a feature name resolved through the global
    `names_map`, and `value` is written into each NaN position.
    Nothing is returned.
    """
    col_idx = names_map[column]
    missing = np.isnan(dataframe[:, col_idx])
    dataframe[missing, col_idx] = value

In [None]:
# Fixed fill-in values for the features listed in value_features.
CRGVMST2_value    = 6
VICTRCT4_value    = 3
ARTHEXER_value    = 2
HPVADSHT_value    = 0
PCPSARE1_value    = 2


#list of features to replace NaN with the mean ("WEIGHT2" was listed twice; once is enough)
mean_features   = ["POORHLTH", "PHYSHLTH", "MENTHLTH", "WEIGHT2", "DIABAGE2", "HEIGHT3", "FRUITJU1", "FRUIT1", "FVBEANS", "FVGREEN", "FVORANG", "VEGETAB1",
                   "EXERHMM1", "FLSHTMY2", "FTJUDA1_", "FRUTDA1_", "BEANDAY_", "GRENDAY_", "ORNGDAY_", "VEGEDA1_", "STRFREQ_"]

#list of features to replace NaN with the median
median_features = ["GENHLTH", "HLTHPLN1", "CHECKUP1", "BPMEDS", "TOLDHI2", "CVDSTRK3", "CHCKIDNY", "CHCOCNCR", "HAVARTH3", "DIABETE3", "CHCCOPD1", "ASTHMA3", "ASTHNOW", "CHCSCNCR",
                   "ADDEPEV2", "EDUCA", "INCOME2", "QLACTLM2", "USEEQUIP", "BLIND", "DECIDE", "DIFFWALK", "DIFFDRES", "DIFFALON", "SMOKE100", "SMOKDAY2", "USENOW3", "ALCDAY5", "AVEDRNK2",
                   "DRNK3GE5", "MAXDRNKS", "EXERANY2", "SEATBELT", "PNEUVAC3", "ARTHDIS2", "ARTHSOCL", "JOINPAIN", "FLUSHOT6", "DOCTDIAB", "VIREDIF3", "VIGLUMA2", "VIMACDG2", "CIMEMLOS",
                   "CDSOCIAL", "DRADVISE", "HPVADVC2", "_CHISPNC", "_DRDXAR1", "_AGEG5YR", "DROCDY3_", "_CHLDCNT", "_TOTINDA", "_LMTSCL1", "ARTHEDU", "_INCOMG", "_EDUCAG", "_RFCHOL",
                   "MISTMNT", "DIABEYE", "_BMI5"]

#list of features whose NaN are replaced with the fixed <NAME>_value constants above
value_features  = ["CRGVMST2", "VICTRCT4", "ARTHEXER", "HPVADSHT", "PCPSARE1"]

def replace_nan_by_mean(dataframe, col):
    """Fill, in place, the NaN entries of column `col` with that column's mean.

    The mean is taken over the non-NaN entries only (np.nanmean): a plain
    `.mean()` of a column that contains NaN is itself NaN, which would leave
    the column unchanged. `col` is a feature name resolved through the global
    `names_map`; the fill itself is delegated to `replace_nan`.
    """
    # Use the `col` argument, not a loop variable that happens to be global.
    replace_nan(dataframe, col, np.nanmean(dataframe[:, names_map[col]]))

def replace_nan_by_median(dataframe, col):
    """Fill, in place, the NaN entries of column `col` with that column's median.

    The median is taken over the non-NaN entries only (np.nanmedian): a plain
    np.median of a column that contains NaN is itself NaN, which would leave
    the column unchanged. `col` is a feature name resolved through the global
    `names_map`; the fill itself is delegated to `replace_nan`.
    """
    # Use the `col` argument, not a loop variable that happens to be global.
    replace_nan(dataframe, col, np.nanmedian(dataframe[:, names_map[col]]))


#replace the NaN with the mean
for feature in mean_features:
    replace_nan_by_mean(x_tr_temp, feature)

#replace the NaN with the median
for feature in median_features:
    replace_nan_by_median(x_tr_temp, feature)

#replace the NaN with a specific value
# Explicit name -> value table instead of eval() on a constructed variable
# name: a missing or misspelled constant now fails here, at the table.
specific_values = {
    "CRGVMST2": CRGVMST2_value,
    "VICTRCT4": VICTRCT4_value,
    "ARTHEXER": ARTHEXER_value,
    "HPVADSHT": HPVADSHT_value,
    "PCPSARE1": PCPSARE1_value,
}
for feature in value_features:
    replace_nan(x_tr_temp, feature, specific_values[feature])

In [None]:
# Build the model inputs: the selected feature columns with NaN filled, and the
# binary target.
# NOTE(review): this cell uses the pandas API (column selection by name,
# .fillna, y_tr["_MICHD"]), whereas the cells above build x_tr / y_tr with
# NumPy-style indexing, and `intresting_features` is not defined in the cells
# shown here - confirm where the DataFrames and the feature list come from.
# NOTE(review): it reads x_tr, not the recoded and imputed x_tr_temp - confirm
# that this is intended.
fill_mean = False

x_tr_cleaned = x_tr[intresting_features].copy()
if fill_mean:
    # Column-wise mean imputation.
    x_tr_cleaned = x_tr_cleaned.fillna(x_tr_cleaned.mean())
else:
    # Zero imputation on the *selected* columns only. This branch used to fill
    # the whole of x_tr, so the model was fitted on every column while the
    # weight listing and the submission cell assume len(intresting_features).
    x_tr_cleaned = x_tr_cleaned.fillna(0)

# Binary target: -1 is recoded as 0; y_tr itself is left untouched.
y_tr_cleaned = y_tr["_MICHD"].replace({-1: 0})

In [None]:
# Display the cleaned training features.
x_tr_cleaned

In [None]:
# Positions of any NaN still present in the cleaned features
# (the returned index arrays are empty when none is left).
np.where(np.isnan(x_tr_cleaned.values))

In [None]:
# Shape of the cleaned feature table.
x_tr_cleaned.shape

In [None]:
# Shape of the cleaned target; compare with the feature table shape above.
y_tr_cleaned.shape

In [None]:
# Split the cleaned features and the flattened target into a training part
# (x_tra, y_tra) and a validation part (x_val, y_val).
# NOTE(review): presumably ratio=0.75 is the training fraction and seed=69
# fixes the shuffle - confirm against split_data.
x_tra, x_val, y_tra, y_val=split_data(x_tr_cleaned.values,y_tr_cleaned.values.ravel(),ratio=0.75,seed=69)

In [None]:
def make_predictions(x, w, threshold=None, apply_sigmoid=False):
    """Turn linear scores into hard 0/1 predictions.

    Parameters
    ----------
    x : 2-D array of samples (one row per sample).
    w : weight vector; flattened before use, so (d,) and (d, 1) both work.
    threshold : decision threshold; None means 0.5.
    apply_sigmoid : when True, `sigmoid` is applied to the scores before
        thresholding.

    Returns
    -------
    Integer array holding 0 where the score is strictly below the threshold
    and 1 everywhere else.

    The two defaults mirror those of `compute_scores`; existing calls that
    pass all four arguments behave as before.
    """
    if threshold is None:
        threshold = 0.5
    scores = x.dot(w.ravel())
    if apply_sigmoid:
        scores = sigmoid(scores)
    # Vectorised form of "0 if score < threshold else 1".
    return np.where(np.asarray(scores) < threshold, 0, 1)

In [None]:
def compute_scores(x,w,y,threshold=None,apply_sigmoid=False):
    """Return (precision, recall, f1) of the thresholded predictions against `y`.

    Predictions come from `make_predictions(x, w, threshold, apply_sigmoid)`;
    `y` holds the true 0/1 labels. A ratio whose denominator is zero (no
    positive prediction, no positive label, or precision + recall == 0) is
    reported as 0.0 instead of NaN, so a threshold sweep never yields NaN.
    """
    y_pred = make_predictions(x, w, threshold, apply_sigmoid)
    TP = int(np.sum(np.logical_and(y_pred == 1, y == 1)))
    FP = int(np.sum(np.logical_and(y_pred == 1, y == 0)))
    FN = int(np.sum(np.logical_and(y_pred == 0, y == 1)))

    precision = TP / (TP + FP) if TP + FP > 0 else 0.0
    recall = TP / (TP + FN) if TP + FN > 0 else 0.0
    if precision + recall > 0:
        f1 = 2 * precision * recall / (precision + recall)
    else:
        f1 = 0.0
    return precision, recall, f1
    

In [None]:
# Standardise both splits with the statistics of the training split only.
tra_mean = np.mean(x_tra, axis=0)
tra_std = np.std(x_tra, axis=0)
x_tra_scaled = (x_tra - tra_mean[None, :]) / tra_std
x_val_scaled = (x_val - tra_mean[None, :]) / tra_std

In [None]:
# Shape of the training labels.
y_tra.shape

In [None]:
# Shape of the standardised training features.
x_tra_scaled.shape

In [None]:
# Display the training labels.
y_tra

In [None]:
# Fit the logistic-regression weights on the standardised training split,
# starting from an all-zero column vector with one weight per feature.
# NOTE(review): max_iters=15 and gamma=0.2 are hard-coded; presumably gamma is
# the step size and gd=True selects plain gradient descent - confirm against
# implementations.logistic_regression.
w_opt,loss=logistic_regression(y_tra,x_tra_scaled,initial_w=np.zeros((x_tra.shape[1],1)),max_iters=15,gamma=0.2,gd=True)

In [None]:
# Fitted weight of each selected feature, keyed by feature name.
{feature_name: w_opt[k][0] for k, feature_name in enumerate(intresting_features)}

In [None]:
# (precision, recall, f1) on the validation split at a decision threshold of 0.58.
compute_scores(x_val_scaled,w_opt,y_val,threshold=0.58,apply_sigmoid=True)

In [None]:
# Sweep the decision threshold from 0.30 up to (but excluding) 0.70 in steps
# of 0.01 and plot the validation F1 score for each value.
thr_l = np.arange(0.3, 0.7, 0.01)
f1_scores = [compute_scores(x_val_scaled, w_opt, y_val, threshold=t, apply_sigmoid=True)[2] for t in thr_l]

# Explicit figure/axes with labels so the plot stands alone when skimmed.
fig, ax = plt.subplots()
ax.plot(thr_l, f1_scores, marker='o')
ax.set(xlabel="decision threshold", ylabel="validation F1", title="Validation F1 vs. decision threshold");

In [None]:
# Save the fitted weights to a .npy file in the current working directory.
np.save("w_log_reg_20f_fillnamean.npy",w_opt)

## Submission

In [None]:
# Reload the raw data; this rebinds x_train, x_test, y_train, train_ids and
# test_ids, discarding e.g. the -1 -> 0 recoding applied to y_train earlier.
# NOTE(review): the first cell passes DATA_FOLDER ('data/') and
# sub_sample=False, this call passes "data" and relies on the defaults -
# confirm both resolve to the same files.
x_train, x_test, y_train, train_ids, test_ids=load_csv_data("data")

In [None]:
# Re-read the test features as a pandas DataFrame, replacing the x_test loaded just above.
# NOTE(review): no `import pandas as pd` appears in the import cell; confirm
# `pd` is provided by one of the star imports or add the import there.
x_test=pd.read_csv("data/x_test.csv")

In [None]:
# Apply the training-time recoding, imputation and scaling to the test features.
# NOTE(review): fill_mean is True here but False in the training cell - confirm
# which imputation the saved weights were actually fitted with.
x_test2 = x_test.copy()
fill_mean = True

for col in intresting_features:
    # Assign the result back instead of calling replace(..., inplace=True) on
    # the x_test2[col] selection: that chained in-place form acts on a
    # temporary and leaves x_test2 unchanged under pandas copy-on-write.
    # Features without an entry in dico_transfos need no recoding.
    if col in dico_transfos:
        x_test2[col] = x_test2[col].replace(dico_transfos[col])

if fill_mean:
    x_test_cleaned = x_test2[intresting_features].copy()
    for col in intresting_features:
        # Impute with the *training* column mean.
        x_test_cleaned[col] = x_test_cleaned[col].fillna(x_tr_cleaned[col].mean())
else:
    x_test_cleaned = x_test2[intresting_features].fillna(0).copy()

# Standardise with the statistics of the training split.
x_test_scaled = (x_test_cleaned - np.mean(x_tra, axis=0)[None, :]) / np.std(x_tra, axis=0)

In [None]:
# Shape of the standardised test features.
x_test_scaled.shape

In [None]:
# Hard 0/1 predictions on the test set, using the same 0.58 threshold as the
# validation scoring cell above.
predictions=make_predictions(x_test_scaled,w_opt,threshold=0.58,apply_sigmoid=True)

In [None]:
# Indices of the samples predicted as class 1.
np.where(predictions==1)

In [None]:
# Recode the 0 predictions as -1, in place.
predictions[predictions == 0] = -1

In [None]:
# Display the final predictions (0 entries were recoded to -1 above).
predictions

In [None]:
# Write the test ids and their predictions to the submission CSV.
create_csv_submission(test_ids,predictions,name="predictionslog20f_fillnamean.csv")