@@ -5,7 +5,7 @@
import logging
logging .basicConfig (level = logging .ERROR )
logging .basicConfig (level = logging .DEBUG ) #,filename="logagain.txt" )
numpy .seterr (over = "ignore" )
#seed=int(sys.argv[5])
@@ -54,31 +54,43 @@ def fit(self,X,y):
self .W = numpy .zeros (shape = (feature_dim ,1 ))
self .b = numpy .zeros (shape = (1 ,))
epoch = 1
prev_cost = 0.0
min_cost = 0.0
prev_grad = 0
patience = 5
while epoch :
prev_self_W = self .W
output = self .__sigmoid (self .__output (train_X ))
cost = self .__cost (output ,train_y )
val_cost = self .__cost (self .__sigmoid (self .__output (test_X )),test_y )
#logging.debug("prev {} now {}".format(prev_cost,val_cost))
if prev_cost < val_cost and epoch != 1 :
logging .info ("Alpha is now {}" .format (self .alpha ))
self .W = prev_self_W
if val_cost > min_cost and epoch != 1 :
logging .info ("Skipping weight backup." )
logging .info ("Decreasing patience and alpha." )
self .alpha *= 0.1
#pass
if self .alpha < 0.0001 or epoch > 10000 :
#self.W=prev_self_W
patience -= 1
logging .info ("Replacing with previous backup." )
self .W = prev_W
else :
min_cost = val_cost
prev_W = self .W
#logging.debug("prev {} now {}".format(prev_cost,val_cost))
if self .alpha < 0.000001 or epoch > 10000 or patience == 0 :
logging .info ("Restoring backup weights" )
self .W = prev_W
accuracy = self .accuracy (self .predict (test_X ),test_y )
logging .info ("Epoch {} error {}. Training error {}. Accuracy {}" .format (epoch ,val_cost ,cost ,accuracy ))
break
grad = self .__gradient (output ,train_y ,train_X )
self .b = self .b - (self .alpha * numpy .mean (output - train_y ))
self .W = self .W - (self .alpha * grad )- (0.9 * prev_grad )
prev_grad = self .alpha * grad
if epoch % 10 == 0 :
if epoch % 100 == 0 :
accuracy = self .accuracy (self .predict (test_X ),test_y )
logging .info ("Epoch {} error {}. Training error {}. Accuracy {}" .format (epoch ,val_cost ,cost ,accuracy ))
# if epoch % 1000 == 0:
# self.alpha=0.0001
# if epoch % 5000 == 0:
# self.alpha=0.00001
epoch += 1
prev_cost = val_cost
def create_train_test (self ,X ,y ,split = 0.1 ):
dataset = numpy .hstack ((X ,y ))
@@ -103,13 +115,20 @@ def create_feature_vector(word_counts,max_len):
try :
vector [word_count [1 ]]= word_count [2 ]
except :
vector [- 1 ]= word_count [2 ]
#logging.debug("{}".format(vector))
pass
#vector[-1]=word_count[2]
#logging.debug("{}".format(vector[vector != 0].shape))
return vector
def scale(X):
    """Rescale X by dividing every entry by one more than its global maximum.

    The +1 offset keeps the divisor nonzero when the matrix is all zeros.
    NOTE(review): assumes entries are nonnegative counts — a matrix whose
    maximum is exactly -1 would divide by zero; confirm against callers.
    """
    peak = numpy.max(X)
    return X / (peak + 1)
def tf(X):
    """Compute a smoothed tf-idf weighting of a document-term matrix.

    Despite the name, this returns tf * idf, not term frequency alone.

    Parameters:
        X: 2-D array; rows are documents, columns are terms, entries are
           (assumed nonnegative) term counts — TODO confirm with callers.

    Returns:
        Float array of X's shape: row-wise max-normalized term frequency
        multiplied by log(n_docs / (doc_freq + 1)) per term.
    """
    # Document frequency per term, with +1 smoothing so the log argument
    # is never infinite for terms appearing in no document.  Vectorized
    # replacement for the per-column apply_along_axis pass — same values,
    # one C-level reduction instead of a Python loop.
    doc_freq = numpy.count_nonzero(X, axis=0) + 1
    idf = numpy.log(X.shape[0] / doc_freq)
    # Row-wise max normalization (vectorized form of the per-row pass).
    # NOTE(review): a document whose row is entirely zero divides by zero
    # here, exactly as the original apply_along_axis version did.
    term_freq = X / numpy.max(X, axis=1, keepdims=True)
    return term_freq * idf
def get_dataset (filename ,enforce = None ):
# load dataset
logging .info ("Loading data." )
@@ -136,8 +155,10 @@ def get_dataset(filename,enforce=None):
logging .info ("Vectors created." )
logging .info ("Collecting to dataset matrix." )
X = numpy .array (vectors )
X = numpy .apply_along_axis (scale ,0 ,X )
return X
#X=(X-numpy.mean(X))/(1+numpy.var(#X))
#X=numpy.apply_along_axis(scale,0,X)
#print(X)
return tf (X )
if __name__ == "__main__" :
@@ -147,16 +168,19 @@ def get_dataset(filename,enforce=None):
enforce_shape = X .shape
logging .info ("Data loaded." )
labels = load_data (sys .argv [2 ])
#print(X)
#print(labels[labels==0].shape[0]/labels.shape[0])
#print(X)
#print(tfidf(X))
#X=tf(X)
#sys.exit()
# learn
LR = LogisticRegression (alpha = 0.01 )
LR .fit (X ,labels )
logging .info ("Beginning test." )
X_test = get_dataset (sys .argv [3 ],enforce_shape )
# y_test=load_data(sys.argv[4])
# logging.error("Testing set accuracy is {} for seed {}".format(LR.accuracy(LR.predict(X_test),y_test[:,numpy.newaxis]),seed))
y_test = load_data (sys .argv [4 ])
logging .error ("Testing set accuracy is {} for seed {}" .format (LR .accuracy (LR .predict (X_test ),y_test [:,numpy .newaxis ]),seed ))
out = LR .predict (X_test )
for i in out [:,0 ]:
print (int (i ))
# print(int(i))
pass