In [1]:
# Importing numpy and all functions in implementations.py
import numpy as np
from implementations import *
import helpers as h

In [9]:

def loadData(dataPath):
    ''' Reads a comma-separated file into a masked numpy array together with its header row. A masked array flags its invalid entries, so reductions such as .mean() leave them out
    Args:
        dataPath: Path of the csv file to read
    Returns:
        data: masked numpy array of floats holding every row below the header; (N,d) for N data rows and d columns, or (N,) when the file has a single column
        header: array of strings holding the first row of the file, i.e. the column names
    '''
    # The file is read twice: first only its top row (as strings), then everything below that row (as floats)
    columnNames = np.genfromtxt(dataPath, dtype=str, delimiter=',', max_rows=1)
    values = np.genfromtxt(
        dataPath,
        dtype=float,
        delimiter=',',
        skip_header=1,  # the top row holds the column names, not data
        usemask=True,   # return a masked array instead of a plain ndarray
    )
    return values, columnNames

# Reading the features and the labels from disk, then reporting the shape of each array
x, xHeader = loadData('./Data/x_train.csv')
y, yHeader = loadData('./Data/y_train.csv')
print(f'y: {y.shape}, x: {x.shape}')

y: (328135,), x: (328135, 321)


In [15]:
# Counting the valid (non-masked) values of each feature, and the fraction of samples in which each feature is valid
validFeatureVals = x.count(axis=0)
validFeatureValsPercent = validFeatureVals/x.shape[0]

# Splitting the feature indices by whether their valid fraction reaches the threeshold.
# The two comparisons are complementary (>= and <): a feature sitting exactly on the threeshold is kept
# instead of falling out of both groups, so the two counts always add up to the number of features
threeshold = 0.7
featureIndicesAboveThreeshold = np.flatnonzero(validFeatureValsPercent >= threeshold) # Indices i of the features valid in at least threeshold of the samples
featureIndicesBelowThreeshold = np.flatnonzero(validFeatureValsPercent < threeshold)  # Indices i of all remaining features, as flat (i,) arrays
print(f'For a threeshold of {threeshold}, there are {len(featureIndicesAboveThreeshold)} valid features, and {len(featureIndicesBelowThreeshold)} invalid features')

For a threeshold of 0.7, there are 144 valid values, and 177 invalid values


In [14]:
# Keeping only the feature columns whose fraction of valid entries cleared the threeshold
xFeaturesRemoved = x[:, featureIndicesAboveThreeshold]
print(np.shape(xFeaturesRemoved))

(328135, 144)


In [32]:
# Per-sample number of valid entries among the retained features, and that number as a fraction of the retained features
validSampleVals = xFeaturesRemoved.count(axis=1)
validSampleValsPercent = validSampleVals/xFeaturesRemoved.shape[1]

# Collecting the indices of the samples that lack no more than acceptableMissingValues entries
acceptableMissingValues = 5
missingSampleVals = xFeaturesRemoved.shape[1] - validSampleVals  # number of masked entries in each sample
sampleIndicesAboveThreeshold = np.flatnonzero(missingSampleVals <= acceptableMissingValues)
print(f'There are {len(sampleIndicesAboveThreeshold)} samples with at most {acceptableMissingValues} missing values, for a threeshold of {threeshold} for considering a feature valid')


There are 289870 samples with at most 5 missing values, for a threeshold of 0.7 for considering a feature valid


In [42]:
# Keeping only the rows (samples) that have at most {acceptableMissingValues} missing values
# NOTE(review): the labels y are not subset here; presumably they must be indexed with the same
# sampleIndicesAboveThreeshold before being paired with this array - confirm where that happens
xFeaturesAndSamplesRemoved = xFeaturesRemoved[sampleIndicesAboveThreeshold, :]
print(np.shape(xFeaturesAndSamplesRemoved))

(289870, 144)


In [45]:
# Counting the entries that are still masked after the feature and sample filtering.
# The count walks the whole array, so it is computed once and reused for both reported figures
totalEntries = xFeaturesAndSamplesRemoved.size
invalidEntries = totalEntries - xFeaturesAndSamplesRemoved.count()
# Guarding the ratio so an empty array reports 0.0 instead of dividing by zero
invalidFraction = invalidEntries/totalEntries if totalEntries else 0.0
print(f'The number of invalid entries remaing in the dataset is {invalidEntries}\nThat is {invalidFraction} parts of the whole dataset')

The number of invalid entries remaing in the dataset is 272938
That is 0.006538802834987332 parts of the whole dataset


In [52]:
# Standardizing every feature to zero mean and unit standard deviation. The array is masked, so the
# statistics are taken over the valid entries only. The divisor is the standard deviation, not the
# variance: dividing by the variance would leave the features on different scales
featureMeans = xFeaturesAndSamplesRemoved.mean(axis=0)
featureStds = xFeaturesAndSamplesRemoved.std(axis=0)
xNormalize = (xFeaturesAndSamplesRemoved - featureMeans) / featureStds

# Filling the missing entries of the *standardized* data with 0, which after centering equals the feature mean.
# numpy.ma masks the result of a division by zero, so a constant feature (std 0) also ends up as 0 rather than inf/nan
xClean = np.ma.filled(xNormalize, fill_value=0)