In [None]:
import random
import numpy as np
import sys
from random import randint
def lazy_readlines(fn):
    """Yield the lines of the text file ``fn`` one at a time.

    The file is opened in read mode on the first ``next()`` call and is
    closed when the generator finishes or is discarded. Lines are yielded
    exactly as the file iterator produces them (trailing newline included).
    """
    handle = open(fn, 'r')
    try:
        for line in handle:
            yield line
    finally:
        handle.close()
def processLines(fn,reduce_fn,init):
    """Fold ``reduce_fn`` over the lines of file ``fn``.

    Args:
        fn: path of the text file to read.
        reduce_fn: callable ``(accumulator, line) -> accumulator``.
        init: initial accumulator value.

    Returns:
        The accumulator after the last line, or ``init`` for an empty file.
    """
    # ``reduce`` is a builtin only on Python 2; functools.reduce is available
    # on Python 2.6+ and Python 3, so import it locally.
    from functools import reduce
    return reduce(reduce_fn,lazy_readlines(fn),init)
def count_lines(fn):
    """Return the number of lines in file ``fn``."""
    def bump(total, _line):
        # The line content is irrelevant; only the count matters.
        return total + 1
    return processLines(fn, bump, 0)
def addFeature(x, y):
    """Append ``y`` to ``x`` in place and return the same ``x`` object.

    Returning the accumulator lets the function be used as the body of a
    reduce step.
    """
    # Mutates the accumulator; no copy is made.
    x.append(y)
    return x
def getAllFeatures(fn):
    """Parse file ``fn`` into a list of ``[token, number]`` pairs.

    Every line is split on whitespace; the first field is kept as a string
    and the second field is converted with ``int``.
    """
    def collect(pairs, line):
        fields = line.rstrip().split()
        return addFeature(pairs, [fields[0], int(fields[1])])
    return processLines(fn, collect, list())
def getAllFeaturesRev(fn):
    """Parse file ``fn`` into a list of ``[number, token]`` pairs.

    Every line is split on whitespace; the second field is converted with
    ``int`` and placed first, followed by the first field as a string.
    """
    def collect(pairs, line):
        fields = line.rstrip().split()
        return addFeature(pairs, [int(fields[1]), fields[0]])
    return processLines(fn, collect, list())
def getVectors(fn):
    """Read the comma-separated second column of a tab-separated file.

    For each line of ``fn`` the text after the first tab (up to the next tab,
    if any) is split on commas; the resulting lists of strings are returned
    in file order.
    """
    def collect(rows, line):
        second_column = line.rstrip().split('\t')[1]
        return addFeature(rows, second_column.split(','))
    return processLines(fn, collect, list())
def getIntVectors(fn):
    """Read the comma-separated integer column of a tab-separated file.

    For each line of ``fn`` the second tab-separated field is split on commas
    and every piece is converted with ``int``.

    Returns:
        A list with one ``list`` of ints per line, in file order.
    """
    def collect(rows, line):
        pieces = line.rstrip().split('\t')[1].split(',')
        # Build a real list: on Python 3 ``map`` returns a one-shot iterator,
        # which cannot be used for indexing or iterated more than once.
        return addFeature(rows, [int(piece) for piece in pieces])
    return processLines(fn, collect, list())
class MentionData:
    """Training instances plus the feature/label lookup tables they use.

    The constructor sets:
      feature2id, id2feature -- mappings built from ``feature_dict``.
      label2id, id2label     -- mappings built from ``label_dict``.
      data                   -- list of ``Instance`` objects, one per line pair
                                of ``x_file`` / ``y_file``.
    """
    def __init__(self,x_file,y_file,feature_dict,label_dict):
        # Forward (token -> id) and reverse (id -> token) tables.
        self.feature2id = dict(getAllFeatures(feature_dict))
        self.id2feature = dict(getAllFeaturesRev(feature_dict))
        self.id2label = dict(getAllFeaturesRev(label_dict))
        self.label2id = dict(getAllFeatures(label_dict))

        self.data = []
        for feature_ids, label_ids in self.readData(x_file, y_file):
            # Dense indicator vector with 1.0 at every listed label id.
            dense = np.zeros(len(self.id2label), dtype=float)
            dense[label_ids] = 1.
            # Keep only feature ids that occur in the feature dictionary.
            known = [fid for fid in feature_ids if fid in self.id2feature]
            self.data.append(Instance(known, dense, label_ids))

    def readData(self,x_file,y_file):
        """Return paired integer vectors read from ``x_file`` and ``y_file``.

        Asserts that both files have the same number of lines before the
        vectors are read.
        """
        x_count = count_lines(x_file)
        y_count = count_lines(y_file)
        assert x_count == y_count
        x_vectors = getIntVectors(x_file)
        y_vectors = getIntVectors(y_file)
        return zip(x_vectors, y_vectors)
class Instance:
    """One training example.

    Attributes:
      features        -- integer numpy array built from ``features``.
      labels          -- the ``labels`` argument, stored as given.
      sparse_labels   -- the ``sparse_labels`` argument, stored as given.
      negative_labels -- indices ``i`` for which ``labels[i] < 1``.
    """
    def __init__(self,features,labels,sparse_labels):
        self.sparse_labels = sparse_labels
        self.labels = labels
        self.features = np.asarray(features, dtype=int)
        # get_negatives() reads self.labels, so it has to run after that
        # attribute is set.
        self.negative_labels = self.get_negatives()

    def get_negatives(self):
        """Return the positions in ``self.labels`` whose value is below 1."""
        return [idx for idx, value in enumerate(self.labels) if value < 1.]
                


In [None]:
# Directory holding the preprocessed BBN files. All four input paths are
# derived from it so the location only has to be changed in one place.
# NOTE(review): this is an absolute, machine-specific path.
in_dir= "/Users/mayk/working/figer/baseline/PLE/Intermediate/BBN"
a=MentionData(in_dir+"/train_x_new.txt",
              in_dir+"/train_y.txt",
              in_dir+"/feature.txt",in_dir+"/type.txt")

In [None]:

def train(A,B,insts,size,lr,max_it =10,grad_fn=None):
    """Run ``max_it`` passes of per-instance updates over ``insts``.

    Args:
        A: feature embedding matrix, updated in place by the gradient function.
        B: label embedding matrix, updated in place by the gradient function.
        insts: sequence of training instances (``len()`` is taken for the
            progress display).
        size: embedding dimension, forwarded to the gradient function.
        lr: learning rate, forwarded as the ``lr`` keyword.
        max_it: number of passes over ``insts``.
        grad_fn: callable ``(A, B, inst, size, lr=...) -> loss``. Defaults to
            the module-level ``cgradient``, which keeps the original behaviour;
            pass ``gradient`` to train with the pure-Python version.

    Progress and the accumulated loss of the current pass are written to
    stdout every 1000 instances. Nothing is returned.
    """
    if grad_fn is None:
        # Looked up at call time, exactly like the original hard-coded call.
        grad_fn = cgradient
    total = len(insts)
    # range() instead of the Python 2-only xrange(); the epoch count is small.
    for it in range(1,max_it+1):
        error = 0.

        for i,inst in enumerate(insts):
            error+=grad_fn(A,B,inst,size,lr=lr)

            if i % 1000 ==0:
                sys.stdout.write("\rIteration %d " % (it)+ "trained {0:.0f}%".format(float(i)*100/total)+" Loss:{0:.2f}".format(error))
                sys.stdout.flush()
        sys.stdout.write("\n")
def gradient(A,B,inst,size,lr=0.01):
    """Pure-Python single-instance update of ``A`` and ``B``; returns the loss.

    For every positive label of ``inst`` a negative label is drawn at random
    (with replacement) until one scores within a margin of 1 of the positive
    label. The update is weighted by ``rank(neg_num // N)``, where ``N`` is the
    number of draws that were needed.

    Args:
        A: feature embedding matrix, indexed by ``inst.features``; updated in place.
        B: label embedding matrix with one row per label; updated in place.
        inst: object exposing ``features``, ``labels``, ``sparse_labels`` and
            ``negative_labels``.
        size: embedding dimension (length of a row of ``A`` / ``B``).
        lr: learning rate.

    Returns:
        The accumulated margin loss for this instance as a float.
    """
    dA = np.zeros(size)
    dB = np.zeros([len(inst.labels),size])
    # Instance representation: sum of the embeddings of its features.
    x = np.sum(A[inst.features],axis=0)
    error = 0.
    neg_num = len(inst.negative_labels)
    for l in inst.sparse_labels:
        s1 = np.vdot(x,B[l])
        N = 1
        n_sample = -1
        for k in range(neg_num):
            nl = inst.negative_labels[randint(0,neg_num-1)]
            s2 = np.vdot(x,B[nl])
            if s1 - s2 < 1:
                # Margin violated: keep this negative and remember how many
                # draws it took to find it.
                n_sample = nl
                N = k+1
                break
        if n_sample != -1:
            # rank() is the pure-Python helper; crank() is a C-level function
            # of the Cython cell and cannot be called from here. Floor
            # division keeps the argument an integer on Python 3 as well.
            L = rank(neg_num // N)
            error += (1+s2-s1)*L
            dA += L*(B[l]-B[n_sample])
            dB[l] += L*x
            dB[n_sample] -= L*x
    for f in inst.features:
        A[f] += lr*dA
        # Project the row back onto the unit ball.
        norm = np.linalg.norm(A[f])
        if norm > 1:
            A[f] /= norm
    for i in range(len(B)):
        B[i] += lr*dB[i]
        norm = np.linalg.norm(B[i])
        if norm > 1:
            B[i] /= norm
    return error
def rank(k):
    """Return the harmonic sum ``1 + 1/2 + ... + 1/k`` as a float.

    Returns ``0.0`` when ``k`` is zero or negative.
    """
    # range() works on Python 2 and 3 (xrange is Python 2 only); the explicit
    # 0. start value keeps the result a float even for an empty sum.
    return sum((1./i for i in range(1,k+1)), 0.)
def save_to_text(matrix,output):
    """Write a 2-D matrix to ``output`` as plain text.

    The first line holds the two dimensions separated by a space; every
    following line holds one row, values separated by spaces and formatted
    with ``"{0:.5}"``.

    Args:
        matrix: 2-D array-like exposing ``.shape`` and row iteration.
        output: path of the file to create or overwrite.
    """
    shape = matrix.shape
    # The file stays in binary mode so line endings are always "\n"; text is
    # encoded explicitly because a binary file rejects str on Python 3.
    with open(output,'wb') as out:
        out.write(("%d %d\n" % (shape)).encode('ascii'))
        for row in matrix:
            line = " ".join("{0:.5}".format(value) for value in row)
            out.write((line+"\n").encode('ascii'))
    

In [None]:
# NOTE(review): train() calls cgradient, which is defined in the %%cython cell
# further down; that cell has to be executed before this one.
size=50
out_dir = '/Users/mayk/working/figer/baseline/PLE/Results'
# Seed both generators: the initial embeddings use numpy's RNG and the
# negative sampling uses the stdlib ``random`` module.
random.seed(0)
np.random.seed(0)
A= np.random.rand(len(a.feature2id),size)
B= np.random.rand(len(a.label2id),size)
# Pass ``size`` rather than repeating the literal so both stay in sync.
train(A,B,a.data,size,lr=0.01,max_it=10)
save_to_text(A,out_dir+'/warp_py_A.txt')
save_to_text(B,out_dir+'/warp_py_B.txt')

In [None]:
%load_ext Cython


In [None]:
%%cython
import numpy as np

cimport numpy as np
from random import randint

cpdef crank(int k):
    """Return the harmonic sum ``1 + 1/2 + ... + 1/k`` (``0.0`` for ``k <= 0``).

    Declared ``cpdef`` so it is callable both from Cython code in this cell
    and from Python (a plain ``cdef`` function is not exported to the
    notebook namespace).
    """
    # Accumulate in double precision so the result matches the Python float
    # arithmetic of the pure-Python ``rank``.
    cdef double loss = 0.
    cdef int i
    for i in range(1,k+1):
        loss += 1./i
    return loss
def cgradient(A,B,inst,size,lr=0.01):
    """Cython-compiled single-instance update of ``A`` and ``B``; returns the loss.

    For every positive label of ``inst`` a negative label is drawn at random
    (with replacement) until one scores within a margin of 1 of the positive
    label. The update is weighted by ``crank(neg_num // N)``, where ``N`` is
    the number of draws that were needed.

    Args:
        A: feature embedding matrix, indexed by ``inst.features``; updated in place.
        B: label embedding matrix with one row per label; updated in place.
        inst: object exposing ``features``, ``labels``, ``sparse_labels`` and
            ``negative_labels``.
        size: embedding dimension (length of a row of ``A`` / ``B``).
        lr: learning rate.

    Returns:
        The accumulated margin loss for this instance as a float.
    """
    dA = np.zeros(size)
    dB = np.zeros([len(inst.labels),size])
    # Instance representation: sum of the embeddings of its features.
    x = np.sum(A[inst.features],axis=0)
    error = 0.
    neg_num = len(inst.negative_labels)
    cdef int i = 0
    for l in inst.sparse_labels:
        s1 = np.vdot(x,B[l])
        N = 1
        n_sample = -1
        for k in range(neg_num):
            nl = inst.negative_labels[randint(0,neg_num-1)]
            s2 = np.vdot(x,B[nl])
            if s1 - s2 < 1:
                # Margin violated: keep this negative and remember how many
                # draws it took to find it.
                n_sample = nl
                N = k+1
                break
        if n_sample != -1:
            # Floor division: crank() takes a C int, and "/" may be true
            # division depending on the Cython language level.
            L = crank(neg_num // N)
            error += (1+s2-s1)*L
            dA += L*(B[l]-B[n_sample])
            dB[l] += L*x
            dB[n_sample] -= L*x
    for f in inst.features:
        A[f] += lr*dA
        # Project the row back onto the unit ball.
        norm = np.linalg.norm(A[f])
        if norm > 1:
            A[f] /= norm
    for i in range(len(B)):
        B[i] += lr*dB[i]
        norm = np.linalg.norm(B[i])
        if norm > 1:
            B[i] /= norm
    return error

In [None]:
# Micro-benchmark: harmonic-sum helper from the Cython cell.
%timeit crank(10)

In [None]:
# Micro-benchmark: pure-Python harmonic-sum helper.
%timeit rank(10)

In [None]:
# Micro-benchmark: pure-Python update on a single instance.
# Each timed call updates A and B in place.
%timeit gradient(A,B,a.data[2],50,lr=0.01)

In [None]:
# Micro-benchmark: Cython-compiled update on the same instance.
# Each timed call updates A and B in place.
%timeit cgradient(A,B,a.data[2],50,lr=0.01)

In [None]:
# Micro-benchmark: cost of a single np.vdot on one row of B.
%timeit np.vdot(B[1],B[1])

In [None]:
%%cython
import cython
import numpy as np
cimport numpy as np

from libc.math cimport exp
from libc.math cimport log
from libc.string cimport memset

# scipy <= 0.15

import scipy.linalg.blas as fblas

# Alias for the single-precision numpy dtype.
REAL = np.float32
# C declaration of PyCObject_AsVoidPtr, taken from a local header.
# NOTE(review): the header path is absolute and machine-specific.
cdef extern from "/Users/mayk/working/figer/baseline/PLE/Model/warp/voidptr.h":
    void* PyCObject_AsVoidPtr(object obj)
# Compile-time constant; not referenced in the visible part of this cell.
DEF MAX_SENTENCE_LEN = 10000
# Take the C pointer exposed by SciPy's BLAS ``scopy`` wrapper and cast it to
# ``scopy_ptr``.
# NOTE(review): ``scopy_ptr`` is not declared in the visible part of this
# cell -- confirm that a matching ctypedef exists.
cdef scopy_ptr scopy = <scopy_ptr>PyCObject_AsVoidPtr(fblas.scopy._cpointer)  # y = x
