In [1]:
import math
import os
from sys import stdout

import numpy as np
from numpy.linalg import inv
from scipy import stats

import lr
from mcc import mcc

In [None]:
# Directory that holds the Bosch CSV files. File names are appended to this
# string directly (root + filename), so the trailing slash is required.
# An earlier location, '../../Desktop/bosch/', used to be assigned first and
# was immediately overwritten; only the effective value is kept.
root = '../../bosch/'

In [None]:
# https://www.kaggle.com/c/bosch-production-line-performance

In [None]:
"""

1,183,747 training observations
1,183,748 test observations

Response is 1: 6,879 times out of 1,183,747

Excluding Id (and Response in numeric):
 2,140 categorical features (like L1_S25_F2117)
   968 numeric features (like L1_S25_F2117)
 1,156 date features (like L1_S25_D2118)

L1_S25_D2118 in date is paired with L1_S25_F2117 (if L1_S25_F2117 is present)
Four lines -- 0 to 3
Lines have different numbers of sections.

Fit each group in training.
For each x in test, find group for x, get model and predict. If group not found, predict 0.

"""

# NOTE(review): no visible cell reads this module-level `i`; every later `i`
# is a function parameter or a comprehension variable. It looks like leftover
# scratch state -- confirm nothing further down depends on it before removing.
i = 1

In [None]:
def run(filename, base_dir=None):
    """Group the lines of a CSV file by which of their columns are populated.

    The first line (the header) only fixes the expected number of fields.
    Every later line with that many fields is keyed by a "pattern": the
    comma-joined indices of its non-empty fields, e.g. '0,3,7'.

    Parameters
    ----------
    filename : str
        Name of the CSV file, relative to ``base_dir``.
    base_dir : str, optional
        Directory containing the file. Defaults to the module-level ``root``,
        which is what the original one-argument call used.

    Returns
    -------
    dict
        Maps each pattern string to the list of file line numbers that have
        it. Numbering counts the header as line 0, so the first data line is
        1. Lines whose field count differs from the header are left out of
        every group but still advance the numbering.
    """
    if base_dir is None:
        base_dir = root

    groups = {}
    width = None

    with open(os.path.join(base_dir, filename)) as f:
        for n, line in enumerate(f):
            cols = line.strip('\n').split(',')

            if n == 0:
                width = len(cols)
            elif len(cols) == width:
                pattern = ','.join(str(j) for j, x in enumerate(cols) if x != '')
                groups.setdefault(pattern, []).append(n)

    return groups

In [None]:
# Bucket the training lines by which columns are populated. `groups` maps a
# comma-joined list of non-empty column indices to the file line numbers
# (header = line 0) that share that pattern.
groups = run('train_numeric.csv')

In [None]:
def unit_vector(vector):
    """Scale ``vector`` to unit Euclidean length.

    A vector whose norm is exactly zero cannot be normalised; it is returned
    as ``0 * vector`` (all zeros, same shape) instead of dividing by zero.
    """
    length = np.linalg.norm(vector)
    if length == 0:
        return 0 * vector
    return vector / length

def angle_between(v1, v2):
    """Return the angle in radians between ``v1`` and ``v2``.

    Both inputs are normalised with ``unit_vector`` first. The cosine is
    clipped to [-1, 1] before ``arccos`` so that rounding error cannot push
    it outside the function's domain.
    """
    cosine = np.dot(unit_vector(v1), unit_vector(v2))
    return np.arccos(np.clip(cosine, -1.0, 1.0))

def parallel(v1, v2, tol=0.0001):
    """Tell whether ``v1`` is redundant with respect to ``v2``.

    Returns a truthy value when ``v1`` is the zero vector, or when the angle
    between ``v1`` and ``v2`` is below ``tol`` radians.

    The zero-vector test checks that every entry is zero. The previous
    ``sum(v1) == 0`` test also fired for any non-zero vector whose entries
    happen to cancel (e.g. ``[1, -1]``), wrongly flagging it.

    Parameters
    ----------
    v1, v2 : array-like
        Vectors of equal length.
    tol : float, optional
        Angle threshold in radians. The default is the value that used to be
        hard-coded.

    Notes
    -----
    Only ``v1`` is tested for being all zeros. Vectors pointing in opposite
    directions (angle close to pi) are not reported as parallel.
    """
    if not np.any(v1):
        return True
    return abs(angle_between(v1, v2)) < tol

def NLL(w):
    """Negative log-likelihood of the weights ``w`` for a logistic model.

    Uses the module-level ``mu`` for the predicted probabilities and the
    module-level ``y`` for the observed 0/1 responses.
    """
    p = mu(w)
    log_lik_terms = y * np.log(p) + (1 - y) * np.log(1 - p)
    return -sum(log_lik_terms)

# Logistic-regression helpers. They read the module-level design matrix `X`
# and response vector `y` at call time, so they always refer to whatever
# `X` / `y` are currently bound.
# NOTE(review): `sigmoid` is not defined in any visible cell -- it must be
# provided elsewhere before `mu` is called.

def mu(w):
    """Predicted probabilities ``sigmoid(X w)`` for weights ``w``."""
    return sigmoid(X.dot(w))


def jac(w):
    """Gradient of ``NLL`` with respect to ``w``: ``X^T (mu(w) - y)``."""
    return X.T.dot(mu(w) - y)


def S(w, mu):
    """Diagonal matrix with entries ``mu * (1 - mu)``.

    ``w`` is accepted for signature compatibility and is not used; ``mu`` is
    the vector of predicted probabilities (it shadows the function ``mu``).
    The result is a dense n-by-n matrix.
    """
    return np.diag(mu * (1 - mu))


def hess(w):
    """Hessian of ``NLL`` with respect to ``w``: ``X^T S X``.

    The diagonal weights are applied by broadcasting over the columns of
    ``X.T`` rather than by multiplying with the dense n-by-n matrix from
    ``S``; the product is the same, without the n-by-n allocation.
    """
    p = mu(w)
    weights = p * (1 - p)
    return (X.T * weights).dot(X)

In [None]:
# File to load and the number of rows to allocate for it.
filename = 'train_numeric.csv'
N = 1183747

# Only the header is read here; its field count gives the matrix width.
with open(root + filename) as f:
    line = f.readline()
cols = line.strip('\n').split(',')

# One row per observation, one column per CSV field; filled in by the next cell.
Z = np.zeros((N, len(cols)))

In [None]:
# Fill Z row by row from the CSV: empty fields become NaN, everything else is
# parsed as float. Row n of Z holds the (n + 1)-th line of the file.
with open(root + filename) as f:
    f.readline()  # skip the header line
    n = 0
    line = f.readline()
    while line != '':
        Z[n, :] = [float(x) if x != '' else np.nan for x in line.strip('\n').split(',')]
        n += 1
        line = f.readline()

In [None]:
# Fit one model per populated-column pattern.
models = {}   # pattern -> model returned by lr.fit
keep = {}     # pattern -> positions, within that pattern's feature columns, that were kept
count = 0

# .items() works on both Python 2 and 3 (iteritems() exists only on Python 2).
for pattern, group in groups.items():
    # `group` holds file line numbers with the header counted as line 0, so
    # line n lives in row n - 1 of Z.
    rows = np.array(group) - 1
    # The first and last populated columns are dropped from the features; the
    # response is read from the last column of Z below.
    cols = np.array([int(x) for x in pattern.split(',')])[1 : -1]
    y = Z[rows, -1]
    # np.ix_ picks the rows x cols sub-matrix in one step, avoiding the
    # full-width intermediate copy made by Z[rows, :][:, cols].
    X = Z[np.ix_(rows, cols)]

    # Keep column j only if it is not `parallel` to any later column, so that
    # of a set of redundant columns the last one survives.
    # NOTE(review): the last column is never tested itself, so it is kept
    # even if it is all zeros.
    n_features = X.shape[1]
    indices = [
        j for j in range(n_features)
        if not any(parallel(X[:, j], X[:, k]) for k in range(j + 1, n_features))
    ]

    keep[pattern] = indices
    X = X[:, indices]
    models[pattern] = lr.fit(X, y)

    # Single-line progress indicator, rewritten in place via '\r'.
    count += 1
    stdout.write('\r%s/%s' % (count, len(groups)))
    stdout.flush()

In [None]:
def y_hat(i):
    """Predict the response for row ``i`` of the module-level matrix ``Z``.

    The row's pattern (comma-joined indices of its populated columns) selects
    the model fitted for that pattern. Rows whose pattern has no fitted model
    get the prediction 0, as described in the notes at the top of the
    notebook.

    Parameters
    ----------
    i : int
        Row index into ``Z``.

    Returns
    -------
    Whatever ``lr.predict`` returns for the row, or ``0`` for an unknown
    pattern.
    """
    row = Z[i, :]
    # Z stores missing cells as NaN, never as '', so presence has to be
    # tested with isnan; comparing against '' marks every column as present.
    present = np.flatnonzero(~np.isnan(row))
    pattern = ','.join(str(j) for j in present)

    if pattern not in models:
        return 0

    # Same feature selection as at fit time: drop the first and last
    # populated columns, then keep only the columns retained for this pattern.
    cols = present[1 : -1]
    x = row[cols]
    # Look up by the pattern variable; the literal 'pattern' is never a key.
    return lr.predict(x[keep[pattern]], models[pattern])

y = Z[:, -1]
# NOTE: this rebinds the name `y_hat` from the function above to the list of
# predictions. The names are kept as they were so that later cells see the
# same objects.
y_hat = [y_hat(i) for i in range(len(Z))]
print(mcc(y, y_hat))