In [1]:
import numpy as np
import pandas as pd
import csv
import math
import matplotlib.pyplot as plt
from sklearn.naive_bayes import GaussianNB, MultinomialNB,BernoulliNB
from sklearn import metrics, svm
from sklearn.svm import SVR
from sklearn.metrics import roc_curve, roc_auc_score, mutual_info_score, accuracy_score, mean_squared_error, r2_score, explained_variance_score
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression, Lasso, LassoCV, ElasticNet,ElasticNetCV
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_selection import chi2
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, Imputer
from sklearn.model_selection import ShuffleSplit
from scipy import stats
from sklearn.utils import resample

%matplotlib inline

In [2]:
# Function to check that there is a float or integer valued string
# Cite the following website: http://pythoncentral.io/how-to-check-if-a-string-is-a-number-in-python-including-unicode/#
def is_number(s):
   
    try:
        float(s)
        return True
    except ValueError:
        pass

    try:
        import unicodedata
        unicodedata.numeric(s)
        return True
    except (TypeError, ValueError):
        pass
   
    return False

    if(math.isnan(s)):           # ensure that NA term is regarded as false
        return False



In [3]:
df=pd.read_csv('fragilefamilieschallenge/background_2.csv', sep=',',header=None, dtype = str)  # imputed data of 4242 families
background_data = df.values
M,N = background_data.shape
M,N

(4243, 11880)

In [4]:
challenge_ID = (background_data[1:,N-1])           # challenge ID in the background file (do not consider 'idnum')
background_features = background_data[1:,2:N-1]    # Holds the features given in the background.csv file
                                                    # Exclude the 'momID' column entry in the feature set.
    
feature_names= background_data[0,:]    

In [5]:
#Impute the 'NA' entries with a -1000.0

for i in range(M-1):
    for j in range(N-3):
        if(is_number(background_features[i,j])==False or math.isnan(float(background_features[i,j]))==True):     # Ensure that all values labelled as "other" are replaced by a 1.0
            background_features[i,j]=-1000.0
        else:
            background_features[i,j]=float(background_features[i,j])
            #print(background_features[i,j])
                
background_features=background_features.astype(np.float)  # ensure that each entry is float data type
            


In [6]:
# Actual imputation of NA with the average of the column

imp= Imputer(missing_values=-1000,strategy='mean', axis=0)
background_features=imp.fit_transform(background_features)

In [7]:
train_data = np.genfromtxt('fragilefamilieschallenge/train.csv', delimiter = ',')
P,Q = train_data.shape
train_ID = train_data[1:,0]                        # Challenge ID of the family in training set
train_outcome = train_data[1:,1:]                  # The 6 outcomes at age 15
                                                   # Need to impute the training outcome for the 6 outcomes as
                                                   # some values are missing

x = train_outcome[:,0]
Z=np.count_nonzero(~np.isnan(x))   # no. of valid entries for the outcome 'GPA'

In [8]:
size = background_features.shape[1]

train_feature = np.zeros((P-1,size))                # stores the features for the total training data for GPA
test_feature = np.zeros((P-1,size))                 # stores the features for the 2121 test data points
                                                   # same size as train_feature
gpa_train = np.zeros((Z,))        # stores the training GPA data for valid entries

miss_feature = np.zeros((P-1-Z,size))    # missing entries' features for GPA
avail_feature = np.zeros((Z,size))   # valid entries' features for GPA

In [10]:
k = 0
t = 0

for i in range(P-1):
    for j in range(M-1):
        if(float(challenge_ID[j]) == train_ID[i]):        # match the train_ID with the ID in background.csv file to get the corresponding features
            train_feature[i,:] = (background_features[j,:])
            if(math.isnan(train_outcome[i,0])==False):    # check for the valid entries with numerical GPA values
                gpa_train[k]= train_outcome[i,0]
                avail_feature[k,:]= background_features[j,:]
                k = k +1
            else:
                miss_feature[t,:]= background_features[j,:] 
                t= t +1
                
                    
        #else:
        #    test_feature[i,:] = (background_features[j,:])

print('done')

done


In [11]:
clf = LassoCV(eps=1e-3, n_alphas=100,cv=5, max_iter=4000)
clf.fit(avail_feature, gpa_train)
pred = clf.predict(avail_feature)
err = mean_squared_error(gpa_train,pred)

In [13]:
pred_miss = clf.predict(miss_feature)

In [14]:
l = 0
train_outcome_new = train_outcome[:,0]  # the completed training label for GPA
for i in range(P-1):
    if(math.isnan(train_outcome[i,0])==True):
        train_outcome_new[i] = pred_miss[l]
        l = l+1

In [15]:
clf.fit(train_feature, train_outcome_new)
pred_new = clf.predict(train_feature)
err_new = mean_squared_error(train_outcome_new,pred_new)

print(err_new)

0.216687544034


In [17]:
n_comp = 10
ids = clf.coef_.argsort()[::-1][:n_comp]

In [18]:
feature_names[ids]

array(['m2l11', 'f5i4', 'm3l6a', 'm4l6a', 'cm5twoc', 'm2g13',
       'cf5md_case_con', 'cf5twoc', 'm3i0l', 'm5e3a'], dtype=object)