# Study

## Series Analysis

### Example: Fibonacci

a(0) = 1

a(1) = 1

a(n) = a(n-2) + a(n-1)

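More generally, a second-order linear recurrence has the form a(n) = X·a(n-2) + Y·a(n-1), where X and Y are constant coefficients; Fibonacci is the special case X = Y = 1. The regression used later in this notebook tries to learn such coefficients from the known terms of a sequence.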

## Data load and training / test sets initialization

In [1]:
import numpy  as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, BayesianRidge
from IPython.display import display, HTML
%matplotlib inline

def stoarray(data=None, sep=','):
    """Parse delimiter-separated number strings into numpy arrays.

    Parameters
    ----------
    data : pandas.Series or iterable of str, optional
        One string per sequence, e.g. ``"1,2,3"``. A Series keeps its index;
        any other iterable is wrapped in a new Series. ``None`` (the default)
        is treated as an empty collection.
    sep : str
        Separator between the numbers inside each string.

    Returns
    -------
    pandas.Series
        Series whose elements are 1-D ``np.longdouble`` arrays.
    """
    # The previous default (a shared list) had no ``.map`` method, so calling
    # the function without arguments raised AttributeError.
    if data is None:
        data = []
    if not isinstance(data, pd.Series):
        data = pd.Series(list(data), dtype=object)
    return data.map(lambda x: np.array(x.split(sep), dtype=np.longdouble))

# Load both CSV files, give them the same two column names and parse the
# comma-separated sequence strings into numeric arrays.
colna = ['id', 'seq']

train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")

train.columns = colna
test.columns = colna

parsed_train = stoarray(train['seq'])
parsed_test = stoarray(test['seq'])
train['seq'] = parsed_train
test['seq'] = parsed_test
del parsed_train, parsed_test

# Not the last expression of the cell, so the notebook does not render it.
train.head()

# Show the sequence stored at index 1 as a one-column table of integers.
display(pd.DataFrame(train.seq[1].astype(int)))

Unnamed: 0,0
0,1
1,2
2,1
3,5
4,5
5,1
6,11
7,16
8,7
9,1


## Feature extraction for a given sequence

In [2]:
def getTrainingSet(seq):
    """Build the full prefix design matrix of a sequence.

    For a sequence of length L the result has L rows and L+1 columns. Row i is
    ``[i, seq[0], ..., seq[i-1], 0, ..., 0, seq[i]]``: the position index, all
    earlier terms (left-aligned, zero-padded) and, in the last column, the
    term at that position.

    Parameters
    ----------
    seq : numpy.ndarray
        1-D array holding the sequence.

    Returns
    -------
    (numpy.ndarray, list of str)
        The float matrix and its column names ``"x0" ... "x{L-1}", "y"``.
    """
    size = seq.shape[0]
    matrix = np.zeros([size, size + 1])

    # First column: row position. Last column: the target term.
    matrix[:, 0] = np.arange(size)
    matrix[:, -1] = seq

    # Columns 1..row hold every term that precedes the target.
    for row in range(1, size):
        matrix[row, 1:row + 1] = seq[:row]

    names = ["x" + str(k) for k in range(size)]
    names.append("y")
    # A left/right mirror of the feature columns was tried here and disabled:
    # matrix[:,1:-1] = np.fliplr(matrix[:,1:-1])
    return matrix, names

# Build the design matrix for the training sequence at index 4, leaving out
# its final term.
trainingSet, columns = getTrainingSet(train.seq[4][:-1])
# Disabled: split the matrix into feature columns X and target column y and
# display both.
#X = pd.DataFrame(trainingSet[:,:-1].astype(int), columns=columns[:-1])
#y = pd.DataFrame(trainingSet[:,-1:].astype(int), columns=columns[-1:])
#display(X, y)

### TEST
# Disabled: compare the true last term with the value returned by predictLast.
# NOTE(review): predictLast is defined in a later cell, so these lines only
# work once that cell has been executed.
#np.set_printoptions(precision=99)
#print(train.seq[4][-1:].astype(np.longdouble), predictLast(train.seq[4][:-1]).astype(np.longdouble))
#display(pd.DataFrame(trainingSet.astype(np.longdouble), columns=columns))

## Fit a Linear Regression model

In [3]:
# Hand-written sketch (plain strings, no real data) of the layout the n-gram
# matrix is meant to have: a header row followed by example rows.
# NOTE(review): the cell labels look like an informal illustration of rows
# whose feature window shifts from one row to the next - confirm with the author.
display(pd.DataFrame([
            ["x0", "x1", "x2", "x3", "y"],
            ["x00", "x01", "x02", "x03", "y0"],
            ["x10", "x02", "x03", "x13", "y1"],
            ["x20", "x12", "x13", "x23", "y1"],
        ]))
print("to transform m into shifted ngrams matrix that's idea...")

def getTrainingSetNgram(seq, max_n=15):
    """Build a sliding-window (n-gram) design matrix for a sequence.

    Performs dimensionality reduction compared with ``getTrainingSet``: each
    row keeps only the ``n`` terms that immediately precede its target instead
    of the whole prefix.

    For a sequence of length L, ``n = min((L + 1) // 2, max_n)`` and the result
    has L rows and ``n + 2`` columns. Row i is
    ``[i, <window>, seq[i]]`` where ``<window>`` holds the last ``min(i, n)``
    terms before position i, right-aligned and zero-padded on the left.

    Parameters
    ----------
    seq : array-like
        1-D sequence of numbers.
    max_n : int, optional
        Upper bound for the window width. 15 was found to be the best
        cut-off by trial and error.

    Returns
    -------
    (numpy.ndarray, list of str)
        The float matrix and its column names ``"x0" ... "x{n}", "y"``
        (``x0`` is the position index, ``y`` the target term).

    Raises
    ------
    ValueError
        If ``max_n`` is negative.
    """
    if max_n < 0:
        raise ValueError("max_n must be non-negative")

    seq = np.asarray(seq)
    xlen = seq.shape[0]
    # Same value as int((xlen - 1) / 2 + 1), capped at max_n.
    n = min((xlen + 1) // 2, max_n)

    # The matrix is filled straight from seq. Going through getTrainingSet
    # would allocate an xlen-by-(xlen+1) intermediate only to discard most of it.
    mp = np.zeros([xlen, n + 2])
    mp[:, 0] = np.arange(xlen)
    mp[:, -1] = seq
    for i in range(1, xlen):
        # Early rows have fewer than n predecessors: right-align what exists.
        width = min(i, n)
        mp[i, n + 1 - width:n + 1] = seq[i - width:i]

    # Names are generated here rather than patched into a longer list, which
    # raised IndexError for sequences of length 0 or 1.
    columns = ["x" + str(k) for k in range(n + 1)]
    columns.append("y")
    # Mirroring the window columns / flipping the rows was tried and disabled:
    #    mp[:,1:-1] = np.fliplr(mp[:,1:-1])
    #    mp = np.flipud(mp)
    return mp, columns

# Full prefix matrix for the sequence at index 1 (last term left out).
# NOTE(review): this result is overwritten below before anything reads it, so
# the call only matters if the disabled display line is re-enabled.
trainingSet, columns = getTrainingSet(train.seq[1][:-1])
#display(pd.DataFrame(trainingSet.astype(int), columns=columns))

# N-gram matrix for the same input, rendered with its values cast to int.
trainingSet, columns = getTrainingSetNgram(train.seq[1][:-1])
display(pd.DataFrame(trainingSet.astype(int), columns=columns))

Unnamed: 0,0,1,2,3,4
0,x0,x1,x2,x3,y
1,x00,x01,x02,x03,y0
2,x10,x02,x03,x13,y1
3,x20,x12,x13,x23,y1


to transform m into shifted ngrams matrix that's idea...


Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,y
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2
2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,1
3,3,0,0,0,0,0,0,0,0,0,0,0,0,1,2,1,5
4,4,0,0,0,0,0,0,0,0,0,0,0,1,2,1,5,5
5,5,0,0,0,0,0,0,0,0,0,0,1,2,1,5,5,1
6,6,0,0,0,0,0,0,0,0,0,1,2,1,5,5,1,11
7,7,0,0,0,0,0,0,0,0,1,2,1,5,5,1,11,16
8,8,0,0,0,0,0,0,0,1,2,1,5,5,1,11,16,7
9,9,0,0,0,0,0,0,1,2,1,5,5,1,11,16,7,1


### Generate computed features

In [4]:
def generateExtraFeatures(mX):
    """Append first-difference columns to a feature matrix.

    With ``n`` input columns, the differences ``mX[:, i+1] - mX[:, i]`` for
    ``i = 1 .. n-3`` are appended on the right, in that order. Column 0 and
    the last column never take part in a difference. Matrices with fewer than
    four columns are returned unchanged.

    Other derived features (squares, cos, exp, log, second-order differences,
    angles) were tried in earlier versions of this function and disabled.

    Parameters
    ----------
    mX : numpy.ndarray
        2-D matrix, one sample per row.

    Returns
    -------
    numpy.ndarray
        ``mX`` itself when nothing is appended, otherwise a new matrix with
        ``n + (n - 3)`` columns.
    """
    width = mX.shape[1]
    if width < 4:
        # No column pair qualifies; hand back the input untouched.
        return mX

    # X(k+1) - X(k) for every adjacent pair inside columns 1 .. width-2.
    later = mX[:, 2:width - 1]
    earlier = mX[:, 1:width - 2]
    return np.concatenate((mX, later - earlier), axis=1)

# Demo on the sequence at index 2 (last term left out): build the n-gram
# matrix and append the difference features. The whole matrix, including its
# final "y" column, is passed in here. The bare call is the last expression
# of the cell, so the notebook renders the returned array.
trainingSet, columns = getTrainingSetNgram(train.seq[2][:-1])
generateExtraFeatures(trainingSet)

array([[  0.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
       [  1.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
          0.00000000e+00,   0.00000000e+00,   1.00000000e+00],
       [  2.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
          0.00000000e+00,   1.00000000e+00,   1.00000000e+00],
       ..., 
       [  3.70000000e+01,   4.09600000e+03,   5.12000000e+03, ...,
          9.83040000e+04,   6.55360000e+04,   1.96608000e+05],
       [  3.80000000e+01,   5.12000000e+03,   8.19200000e+03, ...,
          6.55360000e+04,   1.96608000e+05,   1.31072000e+05],
       [  3.90000000e+01,   8.19200000e+03,   1.02400000e+04, ...,
          1.96608000e+05,   1.31072000e+05,   3.93216000e+05]])

## Predict Model function

In [13]:
def predictLast(seq):
    """Predict the term that follows ``seq`` with a per-sequence linear model.

    An n-gram design matrix is built from ``seq`` (see
    ``getTrainingSetNgram``), difference features are appended (see
    ``generateExtraFeatures``), and a ``LinearRegression`` without intercept
    is fitted on it. The model is then evaluated on one extra row describing
    the position right after the end of the sequence.

    Parameters
    ----------
    seq : numpy.ndarray
        1-D array with the known terms.

    Returns
    -------
    int, scalar or numpy.ndarray
        ``0`` for an empty sequence, ``seq[0]`` for sequences of length 1 or
        2, otherwise the prediction as an array of shape ``(1, 1)``.
    """
    length = seq.shape[0]
    if length == 0:
        return 0
    if length <= 2:
        # Too short to fit anything: fall back to the first term.
        return seq[0]

    trainingSet, _ = getTrainingSetNgram(seq)

    # Features are every column but the last; the target stays 2-D so that
    # predict() keeps returning a (1, 1) array as before.
    X = generateExtraFeatures(trainingSet[:, :-1])
    y = trainingSet[:, -1:]

    # Row to predict: next position index followed by the last training
    # window shifted by one, i.e. columns 2.. of the last row (its final
    # entry being the last known term).
    testIndex = np.array([int(length)])
    testRow = trainingSet[-1, 2:]
    testRowX = generateExtraFeatures(np.array([np.concatenate((testIndex, testRow))]))

    # train
    # Plain numpy arrays are passed to scikit-learn: DataFrame.as_matrix() no
    # longer exists in pandas >= 1.0. The ``normalize`` argument (previously
    # passed with its default value False) was removed in scikit-learn 1.2.
    regresor = LinearRegression(fit_intercept=False, copy_X=True, n_jobs=3)
    regresor.fit(X, y)

    outFinal = regresor.predict(testRowX)
    return outFinal[-1:]

### Test predictive model with Training set

In [12]:
# Back-test on the training data: for each sequence, hide its last term,
# predict it with predictLast and count exact hits.
# pmatches: hits when predicting the second-to-last term
# matches:  hits when predicting the last term
# zeros:    final predictions equal to 0
# output:   [id, predicted last term] rows written to the CSV at the end
pmatches = 0
matches = 0
zeros = 0
output = []
#for i in range(len(train)):
# NOTE(review): hard-coded range - index 0 is skipped and the loop assumes
# train has at least 16000 rows.
for i in range(1,16000):
    seq = train.seq[i].astype(np.longdouble)
    
    # Sanity check: predict the known second-to-last term from the terms
    # before it. plast is a one-element slice, not a scalar.
    plast = seq[-2:-1]
    plastpredicted = predictLast(seq[:-2])
    if np.isnan(plastpredicted):
        plastpredicted = 0
    
    # Actual experiment: predict the last term from all earlier terms.
    last = seq[-1:]
    lastpredicted = predictLast(seq[:-1])
    if np.isnan(lastpredicted):
        lastpredicted = 0
    
    # NOTE(review): int() truncates toward zero, so any error smaller than 1
    # in magnitude yields 0 here.
    pdiff = int(plast - plastpredicted)
    
    # If the model misses the second-to-last term by more than 100, its
    # prediction for the last term is discarded and replaced by 0.
    if np.abs(pdiff) > 100:
        lastpredicted = 0
    
    # NOTE(review): a "match" is int(last - lastpredicted) == 0, while the CSV
    # stores int(lastpredicted); the two can disagree for non-integer
    # predictions - confirm whether rounding was intended.
    diff = int(last - lastpredicted)
    
    #print('result = %d; expected = %d; diff = %d' % (int(lastpredicted), int(last), diff))
    output.append([train.id[i], int(lastpredicted)])
    if pdiff == 0:
        pmatches += 1
    if diff == 0:
        matches += 1
    if lastpredicted == 0:
        zeros += 1
    # Progress line every 2000 sequences; because the loop starts at 1, i is
    # also the number of sequences processed so far.
    if i>0 and i%2000==0:
        print("%d, %d / %d matches; zeros = %d, accuracy = %f " % (pmatches, matches, i, zeros, float(matches)/float(i)))
              
print("%d total matches" % matches)

# Write the predictions as a two-column CSV (Id, Last).
pd.DataFrame(output).to_csv("train_submission.csv", header=["Id", "Last"], index=False, index_label=False)

357, 351 / 2000 matches; zeros = 1094, accuracy = 0.175500 
740, 729 / 4000 matches; zeros = 2136, accuracy = 0.182250 
1121, 1096 / 6000 matches; zeros = 3210, accuracy = 0.182667 
1502, 1470 / 8000 matches; zeros = 4257, accuracy = 0.183750 
1865, 1843 / 10000 matches; zeros = 5327, accuracy = 0.184300 
2237, 2204 / 12000 matches; zeros = 6377, accuracy = 0.183667 
2613, 2588 / 14000 matches; zeros = 7440, accuracy = 0.184857 
2962 total matches


### Test predictive model with Test set

In [16]:
# Produce the submission: predict the next term of every test sequence.
output = []
for i in range(len(test)):
    seq = test.seq[i]
    # Prediction for the unknown next term, from the whole sequence.
    lastpredicted = predictLast(seq)
    
    
    # Sanity check: predict the known last term from the terms before it.
    # plast is a one-element slice, not a scalar.
    plast = seq[-1:]
    plastpredicted = predictLast(seq[:-1])
    if np.isnan(plastpredicted):
        plastpredicted = 0
    
    # If the model misses the known last term by more than 100, its
    # prediction is not trusted and the last known term is submitted instead.
    if np.abs(plast - plastpredicted) > 100:
        lastpredicted = plast
    
    
    # NOTE(review): lastpredicted is not checked for NaN in this cell;
    # int(NaN) raises ValueError, which the except clause below does not catch.
    try:
        output.append([test.id[i], int(lastpredicted)])
    except OverflowError:
        # Prediction cannot be converted to int: fall back to the last known term.
        output.append([test.id[i], int(seq[-1:])])
    # Progress line every 2000 sequences.
    if i>0 and i%2000==0:
        print("%d / %d done" % (i, len(test)))
              
# Write the predictions as a two-column CSV (Id, Last).
pd.DataFrame(output).to_csv("submission12.csv", header=["Id", "Last"], index=False, index_label=False)

2000 / 113845 done
4000 / 113845 done
6000 / 113845 done
8000 / 113845 done
10000 / 113845 done
12000 / 113845 done
14000 / 113845 done
16000 / 113845 done
18000 / 113845 done
20000 / 113845 done
22000 / 113845 done
24000 / 113845 done
26000 / 113845 done
28000 / 113845 done
30000 / 113845 done
32000 / 113845 done
34000 / 113845 done
36000 / 113845 done
38000 / 113845 done
40000 / 113845 done
42000 / 113845 done
44000 / 113845 done
46000 / 113845 done
48000 / 113845 done
50000 / 113845 done
52000 / 113845 done
54000 / 113845 done
56000 / 113845 done
58000 / 113845 done
60000 / 113845 done
62000 / 113845 done
64000 / 113845 done
66000 / 113845 done
68000 / 113845 done
70000 / 113845 done
72000 / 113845 done
74000 / 113845 done
76000 / 113845 done
78000 / 113845 done
80000 / 113845 done
82000 / 113845 done
84000 / 113845 done
86000 / 113845 done
88000 / 113845 done
90000 / 113845 done
92000 / 113845 done
94000 / 113845 done
96000 / 113845 done
98000 / 113845 done
100000 / 113845 done
102