In [15]:
# Importing numpy and all functions in implementations.py
import numpy as np
from implementations import *
import helpers as h

In [16]:

def loadData(dataPath):
    ''' Loads data and returns it as masked numpy array. A masked array contains information about which values are invalid, ensuring methods like .mean() ignores the masked values
    Args:
        dataPath: The file path of the data
    Returns:
        data: (N,d) masked numpy array, where N is the number of samples, and d is the dimension of the x values, or 1 if the data in question are the labels
        header: (d,) array with the column names
    '''
    data = np.genfromtxt(dataPath, delimiter=',', skip_header=1, dtype=float, usemask=True) # Loading the data as a masked array (with usemask=True), skipping the header, and specifying that the values are floats
    header = np.genfromtxt(dataPath, delimiter=',', dtype=str, max_rows=1) # Loading the first row of the csv file, i.e. the header
    return data , header

# Loading the data and printing the shapes
y, yHeader = loadData('./Data/y_train.csv')
x, xHeader = loadData('./Data/x_train.csv')
print(f'y: {y.shape}, x: {x.shape}')

y: (328135,), x: (328135, 321)


In [17]:
print(np.sum(y)/len(y))

0.08830207079403295


In [18]:
# Counting the number of valid values for each feature, and calculating the percentage of valid entries
validFeatureVals = x.count(axis=0)
validFeatureValsPercent = validFeatureVals/x.shape[0]

# Finding the indices of all the features with number of features above and below a threeshold
threeshold = 0.7
featureIndicesAboveThreeshold = np.argwhere(validFeatureValsPercent > threeshold).flatten() # Finding the indiced i where there are more than threeshold percent
featureIndicesBelowThreeshold = np.argwhere(validFeatureValsPercent < threeshold).flatten() # valid values, and flattening the array from (i,1) to (i,)
print(f'For a threeshold of {threeshold}, there are {len(featureIndicesAboveThreeshold)} valid features, and {len(featureIndicesBelowThreeshold)} invalid features')

For a threeshold of 0.7, there are 144 valid features, and 177 invalid features


In [19]:
# Removing the features that appears less than {threeshold} of the time
xFeaturesRemoved = x[:,featureIndicesAboveThreeshold]
print(xFeaturesRemoved.shape)

(328135, 144)


In [20]:
# Counting the number of valid values for each remaining sample
validSampleVals = xFeaturesRemoved.count(axis=1)
validSampleValsPercent = validSampleVals/xFeaturesRemoved.shape[1]

acceptableMissingValues = 5
sampleIndicesAboveThreeshold = np.argwhere(validSampleVals >= xFeaturesRemoved.shape[1]-acceptableMissingValues).flatten()
print(f'There are {len(sampleIndicesAboveThreeshold)} samples with at most {acceptableMissingValues} missing values, for a threeshold of {threeshold} for considering a feature valid')


There are 289870 samples with at most 5 missing values, for a threeshold of 0.7 for considering a feature valid


In [21]:
# Removing samples with more than {acceptableMissingValues} missing values
xFeaturesAndSamplesRemoved = xFeaturesRemoved[sampleIndicesAboveThreeshold]
ySamplesRemoved = y[sampleIndicesAboveThreeshold]
print(xFeaturesAndSamplesRemoved.shape)

(289870, 144)


In [22]:
print(f'The number of invalid entries remaing in the dataset is {xFeaturesAndSamplesRemoved.size - xFeaturesAndSamplesRemoved.count()}\nThat is {(xFeaturesAndSamplesRemoved.size - xFeaturesAndSamplesRemoved.count())/xFeaturesAndSamplesRemoved.size} parts of the whole dataset')

The number of invalid entries remaing in the dataset is 272938
That is 0.006538802834987332 parts of the whole dataset


In [28]:
xNormalize = (xFeaturesAndSamplesRemoved - np.mean(xFeaturesAndSamplesRemoved,axis=0)) / np.var(xFeaturesAndSamplesRemoved, axis=0)
xClean = np.ma.filled(xFeaturesAndSamplesRemoved,fill_value=0)
tx = np.c_[np.ones(xClean.shape[0]),xClean]

In [38]:
print(np.max(xClean,axis=0)/np.mean(np.abs(xClean),axis=0))
print(np.argmax(xClean,axis=0))

[  2.38731383   1.88838135   1.87784864   1.87144709   2.17877719
   1.00048482   1.08483842   1.00000898   1.00000898   3.51443721
   1.63946764   1.53811319   8.27270306   6.54314392   4.69803818
   5.8018997    4.09194858   7.45213924   7.08355281   6.29817488
   4.56758328   4.78944528   4.7012299    4.7154376    4.63624683
   5.36457423   4.92568509   4.54867698   3.26785506   1.26812815
   4.05365972   1.81827749   6.69447445   4.80682101   2.2954606
   1.49813518   5.35651849   7.43837069  17.92386086  16.04860394
   5.06783454   4.7657523    4.58357711   4.66788575   4.8770288
   4.58620393   4.6482246    5.64431367   3.03797778   1.88543432
   2.76129914   4.85985628   3.24916582   3.95410539   3.3863199
   4.54609282   7.03583141   1.66873752   7.22815757   5.86264472
   4.43792539   4.82982505   1.55443233   2.89891259   2.38236924
  15.63393272   3.45493538  26.57384611   1.65369843   2.22315866
  55.13363103   7.49668102   2.28518397   6.31486838   6.13247863
   6.85408697

In [66]:
initial_w = np.zeros(tx.shape[1],dtype=np.double)
w_sgd_m, loss_sgd_m, = mean_squared_error_sgd(ySamplesRemoved,tx,initial_w,10**4,2e-19,batch_size=16)
print(compute_loss(ySamplesRemoved,tx,initial_w),loss_sgd_m)

0.08970228033256288 0.1676492133259451


In [41]:
w_logistic = logistic_regression(ySamplesRemoved,tx,initial_w,10**3,1e-19)

In [51]:
errors = np.sum(np.abs((np.sign(tx @ w_logistic - 0.5)+1)/2 - ySamplesRemoved))
predPos = np.sum(np.sign(tx @ w_logistic - 0.5)/2 + 0.5)
print(f'''Number of Errors: {errors}
Predicted positive cases: {predPos}
Number of samples: {len(ySamplesRemoved)}
Number of actually positive cases: {np.sum(ySamplesRemoved)}''')
print(errors - np.sum(ySamplesRemoved))

Number of Errors: 263868.0
Predicted positive cases: 289870.0
Number of samples: 289870
Number of actually positive cases: 26002.0
237866.0


In [67]:
print(w_sgd_m)

[ 4.72123129e-20  3.06953407e-17  1.43846642e-17  5.79569344e-12
  7.17172887e-18 -1.37612979e-16  9.43670478e-17 -2.54404044e-16
  9.50148028e-11  9.50148028e-11  1.58163416e-16 -2.02231242e-15
 -9.71531442e-17 -6.33940835e-18 -1.80783612e-17 -1.29257930e-18
 -3.57896334e-17 -1.22651924e-16 -2.44825676e-17 -1.75412886e-17
 -1.95743631e-17 -1.67340825e-17 -7.33068986e-18 -1.36058042e-17
 -1.29072823e-17 -1.90953476e-17 -4.40803052e-17 -1.30843266e-17
 -9.02341238e-18 -6.65029537e-17 -2.05427579e-17 -2.91349418e-17
 -5.40650242e-17 -2.70709377e-18 -2.37185180e-17  3.42325694e-16
  2.53328604e-15  3.27884318e-17  3.60080408e-17 -1.95636509e-14
 -5.09162266e-15 -4.31983429e-17 -3.39814814e-17 -1.01307170e-17
 -1.39162760e-17 -4.27626940e-17 -1.24018757e-17 -1.80260542e-17
 -3.00624109e-17 -6.89139737e-19  1.83890056e-14  2.20007683e-16
  2.43486490e-15  1.75249576e-15  4.04269194e-15  1.70517854e-15
  1.47686389e-15  2.08008940e-17  1.49147229e-14  1.19685942e-17
 -2.10216299e-17 -9.58016