In [11]:
import numpy as np
import pandas as pd
import scipy.sparse
from sklearn.preprocessing import normalize

In [12]:
'''
#read csv
load_test_set = 'testing.csv'
df = pd.read_csv(load_test_set, header=None)

#convert dataframe to numpy array
#then create a sparse matrix from that
matrix = scipy.sparse.csr_matrix(df.to_numpy())

#print array just to see
print(matrix.toarray())

#save sparse array as npz file 
save_file = 'testing.npz'
scipy.sparse.save_npz(save_file, matrix)
print('done')
# we only needed to run this code twice, once for the testing data and once for the training data
#'''

#column index of the class label in the training matrix; the columns before it
#are the example id (column 0) followed by the attributes
CLASS_COLUMN = 61189

#number of classes; class labels are 1-indexed, so label k lives in row k-1 of Deltatrain
NUM_CLASSES = 20

#load in the training data set from the previously saved .npz
Xtrain = scipy.sparse.load_npz('training.npz')

#save the class values here for later as a flat array
Ytrain = Xtrain[:, CLASS_COLUMN].toarray().flatten()

#extract just the attributes and the ids from the data
Xtrain = Xtrain[:, :CLASS_COLUMN]

#save the IDs for later (might be obsolete)
trainDataSetIDs = Xtrain[:, 0].toarray().flatten()

#overwrite the IDs with ones, this column pairs with the extra weight we will be calculating
Xtrain[:, 0] = 1

#calculate and save the column sums of this matrix for normalization later
#these values are the sums of each attribute across all the examples
colSumsTrain = Xtrain.sum(axis=0).astype(float)

#take the reciprocal of the column sums where they are not 0
#NOTE: np.reciprocal with where= but without out= leaves the masked-out entries
#as uninitialized memory, so we hand it a zero-filled output buffer; columns
#whose sum is 0 therefore get a scale factor of exactly 0
colSumsFlat = np.asarray(colSumsTrain).ravel()
recipricol = np.reciprocal(colSumsFlat, out=np.zeros_like(colSumsFlat), where=colSumsFlat != 0)

#build a diagonal matrix from the previously calculated reciprocals
diag = scipy.sparse.diags(recipricol)

#matrix multiply the train matrix against the diagonal matrix,
#this normalizes the values of each attribute so they sum to 1,
#by elementwise dividing each column by the sum of that column
Xtrain = Xtrain @ diag

#build the delta matrix for the training data,
#this is a matrix where each row is a class and each column an example,
#each example has a one in the row for the class it belongs to.
#it is built directly in sparse form from (row, column) coordinates and sized
#from the data instead of a hard coded example count
numTrainExamples = Xtrain.shape[0]
Deltatrain = scipy.sparse.csr_matrix(
    (
        np.ones(numTrainExamples, dtype=int),
        (Ytrain.astype(int) - 1, np.arange(numTrainExamples)),
    ),
    shape=(NUM_CLASSES, numTrainExamples),
)

#load in the test data from its .npz
Xtest = scipy.sparse.load_npz('testing.npz')

#extract the IDs
testDataSetIDs = Xtest[:, 0].toarray().flatten()

#overwrite the IDs with ones to account for our extra weight
Xtest[:, 0] = 1

#normalize the matrix using the values from the training data
Xtest = Xtest @ diag

In [13]:
def getProbabilitys(W, X):
    """Calculate, for every example, the probability that it belongs to each class.

    W -- weight matrix, one row per class and one column per attribute
    X -- attribute matrix (sparse or dense), one row per example and one column
         per attribute; it must have the same number of columns as W

    Returns a dense array with one row per class and one column per example,
    where every column is non-negative and sums to one.
    """
    #multiply the weights by the transpose of the attributes,
    #this gives us a matrix of scores where the rows are the classes and the columns examples
    unNormalized = np.asarray(W @ X.transpose(), dtype=float)

    #shift each example's scores so its largest score is 0 before exponentiating;
    #this leaves the normalized result unchanged mathematically, but keeps every
    #exponent <= 0 so np.exp cannot overflow no matter how large the weights get
    shifted = unNormalized - unNormalized.max(axis=0, keepdims=True)
    probabilitys = np.exp(shifted)

    #normalize so that each example's probabilities sum to one
    #(every column holds at least one exp(0) == 1, so the divisor is never 0)
    return probabilitys / probabilitys.sum(axis=0, keepdims=True)

In [14]:
def classify(dataSet, eta, lmdb, iterations):
    """Classify a data set with a previously saved weight matrix and write the predictions to a csv.

    The weight matrix is read from the working folder, from the file named
    'class_function_eta<eta>_lmdb<lmdb>_iter<iterations>.csv', so the given
    hyperparameters must correspond to a weight matrix that was already saved there.

    dataSet    -- 'training' selects the training data; any other value selects the testing data
    eta        -- the learning rate that was used to train the saved weights
    lmdb       -- the lambda penalty that was used to train the saved weights
    iterations -- the number of training iterations that was used for the saved weights

    Writes '<DATASET>predictions_eta<eta>_lmdb<lmdb>_iter<iterations>.csv' with
    the columns 'id' and 'class'. Returns nothing.
    """
    #load the saved weight matrix that matches the given hyperparameters
    weightsPath = 'class_function_eta{!s}_lmdb{!s}_iter{!s}.csv'.format(eta, lmdb, iterations)
    W = pd.read_csv(weightsPath, header=None).to_numpy()

    #pick the attribute matrix and the example ids for the requested data set
    X, dataSetIDs = (
        (Xtrain, trainDataSetIDs) if dataSet == 'training' else (Xtest, testDataSetIDs)
    )

    #the predicted class of an example is the row holding its highest probability;
    #argmax is 0 indexed while our class ids are 1 indexed, hence the shift by one
    predictions = 1 + np.argmax(getProbabilitys(W, X), axis=0)

    #pair every example id with its prediction and save them under a descriptive file name
    idsAndPredictions = np.column_stack((dataSetIDs, predictions))
    outputFileName = '{!s}predictions_eta{!s}_lmdb{!s}_iter{!s}.csv'.format(
        dataSet.upper(), eta, lmdb, iterations
    )
    pd.DataFrame(idsAndPredictions).to_csv(outputFileName, header=['id', 'class'], index=False)

In [15]:
def updateWeights(W, eta, delta, probabilitys, X, lmdb):
    """Take one penalized gradient step on the weight matrix and return the new weights.

    W            -- the weight matrix to be updated (it is not modified in place)
    eta          -- our step size hyperparameter
    delta        -- the delta matrix generated from the true classes of the training data
    probabilitys -- probabilities assumed to be generated from the given W by getProbabilitys
    X            -- the attribute matrix of the current data set
    lmdb         -- our lambda penalty hyperparameter

    Returns the updated weights as a numpy array.
    """
    #delta - probabilitys is our error, it could also serve as a stopping criteria
    error = delta - probabilitys

    #multiplying by the attribute matrix spreads the error across the attributes
    errorAcrossAttributes = error @ X

    #the penalty pulls large weights back towards zero
    penalty = lmdb*W

    #scale by the step size and move the weights towards the known true class values
    step = eta*(errorAcrossAttributes - penalty)
    return np.array(W + step)

In [16]:
def train(W, X, eta, delta, lmdb, iterations):
    """Train a weight matrix on the given data set for a fixed number of iterations.

    W          -- the starting weight matrix, usually generated randomly
    X          -- the attribute matrix of the data set to train on
    eta        -- our step size hyperparameter
    delta      -- the delta matrix generated from the true class values of the training data
    lmdb       -- our lambda penalty hyperparameter
    iterations -- the number of weight updates to perform

    Returns the trained weights as a numpy array. Exactly `iterations` updates
    are applied (previously one extra update was performed before the loop), so
    iterations=0 returns an unmodified copy of W.

    note: to implement an error stopping criteria we would do it here,
    updateWeights would return the error value as well and that would be the
    stopping value for the loop.
    """
    #start from a copy so the caller's starting weights are never touched
    WNew = np.array(W)
    for _ in range(iterations):
        WNew = updateWeights(WNew, eta, delta, getProbabilitys(WNew, X), X, lmdb)
    return WNew

In [17]:

#Utilization of this Code
# the code below shows an example of training 5 different weight functions for different eta values
# pieces of it can be uncommented to allow for training over lambda values as well
# each weight function is saved for use by the classify function after it is generated

#The general use case
#1 call train using train(WStart, Xtrain, eta, Deltatrain, lmdb, iterations),
# where eta, lmdb, and iterations are replaced by your chosen values
#2 make sure to save the return value of train to a variable, I usually use W
#3 save that W to the working directory using np.savetxt, passing the file name in the format below,
# i.e. 'class_function_eta'+str(eta)+'_lmdb'+str(lmdb)+'_iter'+str(iterations)+'.csv',
# followed by your trained weight function and delimiter=','
#4 call classify with the data set you would like to classify, in this case either 'testing' or 'training'

#This is split up so that you may run train without saving the result to a file
# or run classify on weight functions already saved in the working folder without having to train them yourself

#The function in the next cell will run through a full train and classify for the given hyperparameters

'''
# train over for parameters eta in range [0.01, 0.007, 0.005, 0.002, 0.001]
# save resulting classFunctions, and predictions over the test set to files
#paramRange = [0.01, 0.005, 0.001]
paramRange = [0.009, 0.008, 0.007, 0.006]
for eta in paramRange:
    #for lmdb in paramRange:
    lmdb=0.01
    W = train(WStart, Xtrain, eta, Deltatrain, lmdb, 10000)
    np.savetxt('class_function_eta'+str(eta)+'_lmdb'+str(lmdb)+'_iter10000.csv', W, delimiter=',')
    classify('testing', eta, lmdb, 10000)
'''

"\n# train over for parameters eta in range [0.01, 0.007, 0.005, 0.002, 0.001]\n# save resulting classFunctions, and predictions over the test set to files\n#paramRange = [0.01, 0.005, 0.001]\nparamRange = [0.009, 0.008, 0.007, 0.006]\nfor eta in paramRange:\n    #for lmdb in paramRange:\n    lmdb=0.01\n    W = train(WStart, Xtrain, eta, Deltatrain, lmdb, 10000)\n    np.savetxt('class_function_eta'+str(eta)+'_lmdb'+str(lmdb)+'_iter10000.csv', W, delimiter=',')\n    classify('testing', eta, lmdb, 10000)\n"

In [20]:
def logisticRegression(dataSet, eta, lmdb, iterations):
    """Run the logistic regression process once: train, save the weights, classify.

    dataSet    -- the data set you would like to classify, 'training' or 'testing'
    eta        -- your step size
    lmdb       -- your penalty term
    iterations -- the number of iterations you would like to train on

    Trains a randomly initialized weight matrix on the training data, saves it to
    'class_function_eta<eta>_lmdb<lmdb>_iter<iterations>.csv' in the working
    directory and then calls classify, which writes the predictions file.
    Returns nothing.
    """
    #random starting weights: one row per class and one column per attribute,
    #sized from the training matrices (20 x 61189 for the data prepared above)
    WStart = np.random.rand(Deltatrain.shape[0], Xtrain.shape[1])
    W = train(WStart, Xtrain, eta, Deltatrain, lmdb, iterations)

    #save the weights under the name classify will look for
    weightsFileName = 'class_function_eta'+str(eta)+'_lmdb'+str(lmdb)+'_iter'+str(iterations)+'.csv'
    np.savetxt(weightsFileName, W, delimiter=',')

    #pass the real iteration count (this used to be a hard coded 10000) so that
    #classify loads the weights file that was just written
    classify(dataSet, eta, lmdb, iterations)