# Study

## Series Analysis

### Example: Fibonacci

a(0) = 1

a(1) = 1

a(n) = a(n-2) + a(n-1)

a(n) = Xa(n-2) + Ya(n-1)

## Data load and training / test sets initialization

In [1]:
import numpy  as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from IPython.display import display, HTML
%matplotlib inline

def stoarray(data = [], sep = ','):
    return data.map(lambda x: np.array(x.split(sep), dtype=float))

# load the data
colna = ['id', 'seq']
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")
test.columns = colna
train.columns = colna
train['seq'], test['seq'] = stoarray(train['seq']), stoarray(test['seq'])
train.head()
display(pd.DataFrame(train.seq[0]))

Unnamed: 0,0
0,1.0
1,3.0
2,13.0
3,87.0
4,1053.0
5,28576.0
6,2141733.0
7,508147100.0
8,402135300000.0
9,1073376000000000.0


## Feature extraction of a determined sequence

In [63]:
# returns a test set matrix [Xi, X1, X2, X3... Xn-2, Xn-1, Xn, y]
def getTrainingSet(seq):
    xlen = seq.shape[0]
    x = np.array([])
    trainingSet = np.zeros([xlen, xlen+1])
    columns = []
    for i in range(xlen):
        xi = np.array([float(i)])
        x1 = np.concatenate((seq[:i], np.zeros(xlen-1-i)))
        y1 = seq[i:i+1]
        trainingSet[i] = np.concatenate((xi, x1, y1))
        columns.append("x" + str(i))
    columns.append("y")
    return trainingSet, columns

trainingSet, columns = getTrainingSet(train.seq[0][:-1])
X = pd.DataFrame(trainingSet[:,:-1].astype(int), columns=columns[:-1])
y = pd.DataFrame(trainingSet[:,-1:].astype(int), columns=columns[-1:])
display(X, y)

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12
0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,1,0,0,0,0,0,0,0,0,0,0,0
2,2,1,3,0,0,0,0,0,0,0,0,0,0
3,3,1,3,13,0,0,0,0,0,0,0,0,0
4,4,1,3,13,87,0,0,0,0,0,0,0,0
5,5,1,3,13,87,1053,0,0,0,0,0,0,0
6,6,1,3,13,87,1053,28576,0,0,0,0,0,0
7,7,1,3,13,87,1053,28576,2141733,0,0,0,0,0
8,8,1,3,13,87,1053,28576,2141733,508147108,0,0,0,0
9,9,1,3,13,87,1053,28576,2141733,508147108,402135275365,0,0,0


Unnamed: 0,y
0,1
1,3
2,13
3,87
4,1053
5,28576
6,2141733
7,508147108
8,402135275365
9,1073376057490373


## Fit a Linear Regression model

In [138]:
display(pd.DataFrame([
            ["x0", "x1", "x2", "x3", "y"],
            ["x00", "x01", "x02", "x03", "y0"],
            ["x10", "x02", "x03", "x13", "y1"],
            ["x20", "x12", "x13", "x23", "y1"],
        ]))
print("to transform m into shifted ngrams matrix that's idea...")

def getTrainingSetNgram(seq):
    m, columns = getTrainingSet(seq)
    n = int((m.shape[1]-2)/3) # n-gram n value
    mp = np.array([m[:,0], *tuple(np.zeros([n, m.shape[1]-1])), m[:,-1]]).transpose()
    for i in range(n, m.shape[0]):
        mp[i,1:1+n] = m[i,1-n+i:1+i]
        #mp[i-n,0] = i
        #mp[i-n,1:1+n] = m[i,1-n+i:1+i]
    #print(mp.shape, columns[:-(mp.shape[1]+2)])
    return mp, columns[:-(mp.shape[1]+2)]

trainingSet, columns = getTrainingSet(train.seq[0][:-1])
display(pd.DataFrame(trainingSet.astype(int), columns=columns))

print("@TODO complete diagonals - ngram matrix")
trainingSet, columns = getTrainingSetNgram(train.seq[0][:-1])
display(pd.DataFrame(trainingSet.astype(int), columns=columns))

Unnamed: 0,0,1,2,3,4
0,x0,x1,x2,x3,y
1,x00,x01,x02,x03,y0
2,x10,x02,x03,x13,y1
3,x20,x12,x13,x23,y1


to transform m into shifted ngrams matrix that's idea...


Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,y
0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,1,1,0,0,0,0,0,0,0,0,0,0,0,3
2,2,1,3,0,0,0,0,0,0,0,0,0,0,13
3,3,1,3,13,0,0,0,0,0,0,0,0,0,87
4,4,1,3,13,87,0,0,0,0,0,0,0,0,1053
5,5,1,3,13,87,1053,0,0,0,0,0,0,0,28576
6,6,1,3,13,87,1053,28576,0,0,0,0,0,0,2141733
7,7,1,3,13,87,1053,28576,2141733,0,0,0,0,0,508147108
8,8,1,3,13,87,1053,28576,2141733,508147108,0,0,0,0,402135275365
9,9,1,3,13,87,1053,28576,2141733,508147108,402135275365,0,0,0,1073376057490373


@TODO complete diagonals - ngram matrix


Unnamed: 0,x0,x1,x2,x3,x4,x5
0,0,0,0,0,0,1
1,1,0,0,0,0,3
2,2,0,0,0,0,13
3,3,0,0,0,0,87
4,4,1,3,13,87,1053
5,5,3,13,87,1053,28576
6,6,13,87,1053,28576,2141733
7,7,87,1053,28576,2141733,508147108
8,8,1053,28576,2141733,508147108,402135275365
9,9,28576,2141733,508147108,402135275365,1073376057490373


In [139]:
regresor = LinearRegression()
regresor.fit(X.as_matrix(), y.as_matrix())
inp = np.copy(X.as_matrix())
out = regresor.predict(inp)
print("train.....")
print(inp)
print(inp.shape)
print(out)


print("test.....")
inpFinal = np.copy(X[inp.shape[0]-1:inp.shape[1]+1].as_matrix())
# assing y1 as last train X1 to predict Xn+1
#inpFinal[:,-1] = out[-1:,0]


X0 = inpFinal[0,:1]
X2_N = inpFinal[0,2:]
XN = out[-1:,0]
inpFinal = np.concatenate((X0, X2_N, XN))
print(inpFinal)

outFinal = regresor.predict(inpFinal)
print(outFinal)

train.....
[[                   0                    0                    0
                     0                    0                    0
                     0                    0                    0
                     0                    0                    0
                     0]
 [                   1                    1                    0
                     0                    0                    0
                     0                    0                    0
                     0                    0                    0
                     0]
 [                   2                    1                    3
                     0                    0                    0
                     0                    0                    0
                     0                    0                    0
                     0]
 [                   3                    1                    3
                    13                    0                    0
       



## Predict Model function

In [140]:
def predictLast(seq):
    if seq.shape[0] == 0:
        return 0
    if seq.shape[0] <= 2:
        return seq[0]
    trainingSet, columns = getTrainingSetNgram(seq)
    X = pd.DataFrame(trainingSet[:,:-1].astype(int), columns=columns[:-1])
    y = pd.DataFrame(trainingSet[:,-1:].astype(int), columns=columns[-1:])
    display(X,y)
    regresor = LinearRegression()
    # train
    regresor.fit(X.as_matrix(), y.as_matrix())
    inp = np.copy(X.as_matrix())
    out = regresor.predict(inp)
    print("0000",inp.astype(int))
    # test
    inpFinal = np.copy(X[inp.shape[0]-1:inp.shape[1]+1].as_matrix())
    print("1111",inpFinal.astype(int))
    # assing y1 as last train X1 to predict Xn+1
    print('@in@', inpFinal[:,:-1])
    print('@ou@', out[-1:,0])
    inpFinal[:,-1] = out[-1:,0]
    print("2222",inpFinal.astype(int))
    outFinal = regresor.predict(inpFinal)
    return outFinal

### Test predictive model with Training set

In [127]:
matches = 0
zeros = 0
output = []
#for i in range(len(train)):
for i in range(5):
    seq = train.seq[i]
    last = int(seq[-1:])
    print(">>>",seq[:-1])
    lastpredicted = int(predictLast(seq[:-1]))  ## TODO here is the error
    print("lp>>>",lastpredicted)
    diff = last - lastpredicted
    output.append([train.id[i], lastpredicted])
    if diff == 0:
        matches += 1
    if lastpredicted == 0:
        zeros += 1
    if i>0 and i%2000==0:
        print("%d / %d matches; zeros = %d, accuracy = %f " % (matches, i, zeros, float(matches)/float(i)))
              
print("%d total matches" % matches)

pd.DataFrame(output).to_csv("train_submission.csv", header=["Id", "Last"], index=False, index_label=False)

>>> [  1.00000000e+00   3.00000000e+00   1.30000000e+01   8.70000000e+01
   1.05300000e+03   2.85760000e+04   2.14173300e+06   5.08147108e+08
   4.02135275e+11   1.07337606e+15   9.70038549e+18   2.98434347e+23
   3.14793601e+28]


Unnamed: 0,x0,x1,x2,x3,x4
0,0,0,0,0,0
1,1,0,0,0,0
2,2,0,0,0,0
3,3,0,0,0,0
4,4,1,3,13,87
5,5,3,13,87,1053
6,6,13,87,1053,28576
7,7,87,1053,28576,2141733
8,8,1053,28576,2141733,508147108
9,9,28576,2141733,508147108,402135275365


Unnamed: 0,x5
0,1
1,3
2,13
3,87
4,1053
5,28576
6,2141733
7,508147108
8,402135275365
9,1073376057490373


0000 [[                   0                    0                    0
                     0                    0]
 [                   1                    0                    0
                     0                    0]
 [                   2                    0                    0
                     0                    0]
 [                   3                    0                    0
                     0                    0]
 [                   4                    1                    3
                    13                   87]
 [                   5                    3                   13
                    87                 1053]
 [                   6                   13                   87
                  1053                28576]
 [                   7                   87                 1053
                 28576              2141733]
 [                   8                 1053                28576
               2141733            508147108]
 [   

ValueError: Found array with 0 sample(s) (shape=(0, 5)) while a minimum of 1 is required.

### Test predictive model with Test set

In [None]:
output = []
for i in range(len(test)):
    seq = test.seq[i]
    lastpredicted = predictLast(seq)
    try:
        output.append([test.id[i], int(lastpredicted)])
    except OverflowError:
        output.append([test.id[i], int(seq[-1:])])
    if i>0 and i%2000==0:
        print("%d / %d done" % (i, len(test)))
              
pd.DataFrame(output).to_csv("submission3.csv", header=["Id", "Last"], index=False, index_label=False)