@@ -5,7 +5,7 @@

import logging

# logging.basicConfig only takes effect on the first call, so set the level in one place.
logging.basicConfig(level=logging.DEBUG)  # optionally: filename="logagain.txt"

numpy.seterr(over="ignore")
seed=int(sys.argv[5])  # fifth command-line argument; reported with test accuracy below
@@ -54,31 +54,43 @@ def fit(self,X,y):
        self.W=numpy.zeros(shape=(feature_dim,1))
        self.b=numpy.zeros(shape=(1,))
        epoch=1
        min_cost=0.0
        prev_grad=0
        patience=5
        while True:
            output=self.__sigmoid(self.__output(train_X))
            cost=self.__cost(output,train_y)
            val_cost=self.__cost(self.__sigmoid(self.__output(test_X)),test_y)
            if val_cost > min_cost and epoch != 1:
                # Validation cost rose: keep the existing backup, decay the
                # learning rate, and (once alpha is tiny or training has run
                # long) spend one unit of patience and restore the backup.
                logging.info("Skipping weight backup.")
                logging.info("Decreasing patience and alpha.")
                self.alpha*=0.1
                if self.alpha < 0.0001 or epoch > 10000:
                    patience-=1
                    logging.info("Replacing with previous backup.")
                    self.W=prev_W
            else:
                # Validation cost improved: record it and back up the weights.
                min_cost=val_cost
                prev_W=self.W
            if self.alpha < 0.000001 or epoch > 10000 or patience == 0:
                # Early stopping: restore the backup and report final metrics.
                logging.info("Restoring backup weights")
                self.W=prev_W
                accuracy=self.accuracy(self.predict(test_X),test_y)
                logging.info("Epoch {} error {}. Training error {}. Accuracy {}".format(epoch,val_cost,cost,accuracy))
                break
            grad=self.__gradient(output,train_y,train_X)
            self.b=self.b-(self.alpha*numpy.mean(output-train_y))
            # Gradient step plus 0.9 times the previous epoch's gradient step
            # (a crude momentum term).
            self.W=self.W-(self.alpha*grad)-(0.9*prev_grad)
            prev_grad=self.alpha*grad
            if epoch % 100 == 0:
                accuracy=self.accuracy(self.predict(test_X),test_y)
                logging.info("Epoch {} error {}. Training error {}. Accuracy {}".format(epoch,val_cost,cost,accuracy))
            # Earlier fixed-alpha schedules, kept for reference:
            # if epoch % 1000 == 0:
            #     self.alpha=0.0001
            # if epoch % 5000 == 0:
            #     self.alpha=0.00001
            epoch+=1
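        # How the schedule above plays out (illustrative trace, not executed):
        #   epoch 1           -> val_cost recorded as min_cost, weights backed up
        #   val_cost worsens  -> alpha *= 0.1; once alpha < 1e-4, patience 5 -> 4 ...
        #   alpha < 1e-6, patience == 0, or epoch > 10000 -> restore backup, stop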

    def create_train_test(self,X,y,split=0.1):
        dataset=numpy.hstack((X,y))
@@ -103,13 +115,20 @@ def create_feature_vector(word_counts,max_len):
        try:
            vector[word_count[1]]=word_count[2]
        except IndexError:
            # Word id beyond the enforced feature dimension; the count is dropped.
            pass
    return vector
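# Each row of word_counts is presumably (doc_id, word_id, count): the vector is
# zero-initialised with one slot per training-vocabulary word, so a row like
# (1, 3, 2) sets vector[3] = 2.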

def scale(X):
    # Squash counts into [0,1) by the largest value, shifted by 1 so the
    # maximum itself maps strictly below 1.
    return X/(1+numpy.max(X))

def tf(X):
    # tf-idf weighting: rows of X are documents, columns are word ids.
    # Document frequency of each word, +1 to avoid division by zero.
    num_docs=numpy.apply_along_axis(lambda x:x[x!=0].shape[0],0,X)+1
    idf=numpy.log(X.shape[0]/num_docs)
    # Term frequency: scale each document's counts by its most frequent word
    # (assumes every document contains at least one word).
    tf=numpy.apply_along_axis(lambda x:x/numpy.max(x),1,X)
    return tf*idf
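# Worked example of the weighting above (3 documents, 3 words, illustrative):
#   X = [[2,0,1],[0,4,0],[3,0,0]]
#   document frequency + 1 -> [3,2,2]; idf = log(3/[3,2,2]) ~ [0, 0.405, 0.405]
# Word 0 appears in two of three documents, so its smoothed df reaches 3 and
# idf = log(3/3) = 0: it is zeroed out, while the rarer words keep weight
# proportional to their within-document frequency.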

def get_dataset(filename,enforce=None):
    # load dataset
    logging.info("Loading data.")
@@ -136,8 +155,10 @@ def get_dataset(filename,enforce=None):
logging.info("Vectors created.")
logging.info("Collecting to dataset matrix.")
X=numpy.array(vectors)
X=numpy.apply_along_axis(scale,0,X)
return X
#X=(X-numpy.mean(X))/(1+numpy.var(#X))
#X=numpy.apply_along_axis(scale,0,X)
#print(X)
return tf(X)
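# Note: get_dataset builds one count row per document, stacks them, and applies
# the tf-idf weighting; `enforce` (used for the test set below) appears to pin
# the test matrix to the training shape so both share a feature dimension.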

if __name__=="__main__":

@@ -147,16 +168,19 @@ def get_dataset(filename,enforce=None):
    enforce_shape=X.shape
    logging.info("Data loaded.")
    labels=load_data(sys.argv[2])
    # learn
    LR=LogisticRegression(alpha=0.01)
    LR.fit(X,labels)
    logging.info("Beginning test.")
    X_test=get_dataset(sys.argv[3],enforce_shape)
    y_test=load_data(sys.argv[4])
    logging.error("Testing set accuracy is {} for seed {}".format(LR.accuracy(LR.predict(X_test),y_test[:,numpy.newaxis]),seed))
    out=LR.predict(X_test)
    for i in out[:,0]:
        # Per-example output is disabled; accuracy per seed is logged above.
        # print(int(i))
        pass
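# Expected invocation (five positional arguments, matching the driver below):
#   ./logistic_regression.py train.data train.label test.data test.label SEED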
@@ -0,0 +1,7 @@
import os

# Sweep 100 seeds, running the full train/test cycle once per seed.
for i in range(1000,1100):
    #print("For seed {}".format(i))
    os.system("./logistic_regression.py ../../csci4360-fa17/assignments/assignment1/writeup/data/train.data ../../csci4360-fa17/assignments/assignment1/writeup/data/train.label ../../csci4360-fa17/assignments/assignment1/writeup/data/test_partial.data ../../csci4360-fa17/assignments/assignment1/writeup/data/test_partial.label "+str(i))
    #print()
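# A subprocess-based sketch of the same call, should shell quoting become an
# issue (hypothetical variable names for the four data paths):
# import subprocess
# subprocess.run(["./logistic_regression.py", train_data, train_label,
#                 test_data, test_label, str(i)], check=True)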