# Study

## Series Analysis

### Example: Fibonacci

a(0) = 1

a(1) = 1

a(n) = a(n-2) + a(n-1)

a(n) = Xa(n-2) + Ya(n-1)

## Data load and training / test sets initialization

In [17]:
import numpy  as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from IPython.display import display, HTML
%matplotlib inline

def stoarray(data = [], sep = ','):
    return data.map(lambda x: np.array(x.split(sep), dtype=float))

# load the data
colna = ['id', 'seq']
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")
test.columns = colna
train.columns = colna
train['seq'], test['seq'] = stoarray(train['seq']), stoarray(test['seq'])
train.head()
display(pd.DataFrame(train.seq[1].astype(int)))

Unnamed: 0,0
0,1
1,2
2,1
3,5
4,5
5,1
6,11
7,16
8,7
9,1


## Feature extraction of a determined sequence

In [19]:
# returns a test set matrix [Xi, X1, X2, X3... Xn-2, Xn-1, Xn, y]
def getTrainingSet(seq):
    xlen = seq.shape[0]
    x = np.array([])
    trainingSet = np.zeros([xlen, xlen+1])
    columns = []
    for i in range(xlen):
        xi = np.array([float(i)])
        x1 = np.concatenate((seq[:i], np.zeros(xlen-1-i)))
        y1 = seq[i:i+1]
        trainingSet[i] = np.concatenate((xi, x1, y1))
        columns.append("x" + str(i))
    columns.append("y")
    return trainingSet, columns

trainingSet, columns = getTrainingSet(train.seq[1][:-1])
#X = pd.DataFrame(trainingSet[:,:-1].astype(int), columns=columns[:-1])
#y = pd.DataFrame(trainingSet[:,-1:].astype(int), columns=columns[-1:])
#display(X, y)
display(pd.DataFrame(trainingSet.astype(int), columns=columns))

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x47,x48,x49,x50,x51,x52,x53,x54,x55,y
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
2,2,1,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,3,1,2,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5
4,4,1,2,1,5,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5
5,5,1,2,1,5,5,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
6,6,1,2,1,5,5,1,0,0,0,...,0,0,0,0,0,0,0,0,0,11
7,7,1,2,1,5,5,1,11,0,0,...,0,0,0,0,0,0,0,0,0,16
8,8,1,2,1,5,5,1,11,16,0,...,0,0,0,0,0,0,0,0,0,7
9,9,1,2,1,5,5,1,11,16,7,...,0,0,0,0,0,0,0,0,0,1


## Fit a Linear Regression model

In [58]:
display(pd.DataFrame([
            ["x0", "x1", "x2", "x3", "y"],
            ["x00", "x01", "x02", "x03", "y0"],
            ["x10", "x02", "x03", "x13", "y1"],
            ["x20", "x12", "x13", "x23", "y1"],
        ]))
print("to transform m into shifted ngrams matrix that's idea...")

# performs dimensionality reduction using ngrams
def getTrainingSetNgram(seq):
    m, columns = getTrainingSet(seq)
    n = int((m.shape[1]-2)/3) # n-gram n value
    mp = np.array([m[:,0], *tuple(np.zeros([n, m.shape[1]-1])), m[:,-1]]).transpose()
    for i in range(n, m.shape[0]):
        mp[i,1:1+n] = m[i,1-n+i:1+i]
        #mp[i-n,0] = i
        #mp[i-n,1:1+n] = m[i,1-n+i:1+i]
    #print(mp.shape, columns[:mp.shape[1]])
    columns[mp.shape[1]-1] = "y"
    return mp, columns[:mp.shape[1]]

trainingSet, columns = getTrainingSet(train.seq[1][:-1])
display(pd.DataFrame(trainingSet.astype(int), columns=columns))

print("@TODO complete diagonals - ngram matrix")
trainingSet, columns = getTrainingSetNgram(train.seq[1][:-1])
display(pd.DataFrame(trainingSet.astype(int), columns=columns))

Unnamed: 0,0,1,2,3,4
0,x0,x1,x2,x3,y
1,x00,x01,x02,x03,y0
2,x10,x02,x03,x13,y1
3,x20,x12,x13,x23,y1


to transform m into shifted ngrams matrix that's idea...


Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x47,x48,x49,x50,x51,x52,x53,x54,x55,y
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
2,2,1,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,3,1,2,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5
4,4,1,2,1,5,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5
5,5,1,2,1,5,5,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
6,6,1,2,1,5,5,1,0,0,0,...,0,0,0,0,0,0,0,0,0,11
7,7,1,2,1,5,5,1,11,0,0,...,0,0,0,0,0,0,0,0,0,16
8,8,1,2,1,5,5,1,11,16,0,...,0,0,0,0,0,0,0,0,0,7
9,9,1,2,1,5,5,1,11,16,7,...,0,0,0,0,0,0,0,0,0,1


@TODO complete diagonals - ngram matrix


Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16,x17,x18,y
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2
2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5
4,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5
5,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
6,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,11
7,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,16
8,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7
9,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [4]:
regresor = LinearRegression()
regresor.fit(X.as_matrix(), y.as_matrix())
inp = np.copy(X.as_matrix())
out = regresor.predict(inp)
print("train.....")
print(inp)
print(inp.shape)
print(out)


print("test.....")
inpFinal = np.copy(X[inp.shape[0]-1:inp.shape[1]+1].as_matrix())
# assing y1 as last train X1 to predict Xn+1
#inpFinal[:,-1] = out[-1:,0]


X0 = inpFinal[0,:1]
X2_N = inpFinal[0,2:]
XN = out[-1:,0]
inpFinal = np.concatenate((X0, X2_N, XN))
print(inpFinal)

outFinal = regresor.predict(inpFinal)
print(outFinal)

train.....
[[                   0                    0                    0
                     0                    0                    0
                     0                    0                    0
                     0                    0                    0
                     0]
 [                   1                    1                    0
                     0                    0                    0
                     0                    0                    0
                     0                    0                    0
                     0]
 [                   2                    1                    3
                     0                    0                    0
                     0                    0                    0
                     0                    0                    0
                     0]
 [                   3                    1                    3
                    13                    0                    0
       



## Predict Model function

In [178]:
def predictLast(seq):
    if seq.shape[0] == 0:
        return 0
    if seq.shape[0] <= 2:
        return seq[0]
    
    trainingSet, columns = getTrainingSetNgram(seq)
    X = pd.DataFrame(trainingSet[:,:-1].astype(int), columns=columns[:-1])
    y = pd.DataFrame(trainingSet[:,-1:].astype(int), columns=columns[-1:])
    #display(pd.DataFrame(trainingSet.astype(int), columns=columns))
    #display(pd.DataFrame(X.astype(int)))

    testIndex = np.array([int(seq.shape[0])])
    testRow = trainingSet[-1:,2:][0].astype(int);
    testRowX = pd.DataFrame(np.array([np.concatenate((testIndex, testRow))]))
    #print(testIndex.shape, testRow.shape)
    #display(testRowX)
    
    regresor = LinearRegression()
    # train
    regresor.fit(X.as_matrix(), y.as_matrix())
    
    #inpFinal = np.copy(inp)
    outFinal = regresor.predict(testRowX.as_matrix())
    #print(len(seq), len(outFinal))
    #display(pd.DataFrame([seq.astype(int), outFinal.astype(int)]))
    return int(outFinal[-1:])

### Test predictive model with Training set

In [179]:
matches = 0
zeros = 0
output = []
#for i in range(len(train)):
for i in range(1,10):
    seq = train.seq[i]
    last = int(seq[-1:])
    lastpredicted = int(predictLast(seq[:-1]))  ## TODO here is the error
    diff = last - lastpredicted
    print('result = %d; expected = %d; diff = %d' % (lastpredicted, last, diff))
    output.append([train.id[i], lastpredicted])
    if diff == 0:
        matches += 1
    if lastpredicted == 0:
        zeros += 1
    if i>0 and i%2000==0:
        print("%d / %d matches; zeros = %d, accuracy = %f " % (matches, i, zeros, float(matches)/float(i)))
              
print("%d total matches" % matches)

pd.DataFrame(output).to_csv("train_submission.csv", header=["Id", "Last"], index=False, index_label=False)

result = 8239; expected = 7424; diff = -815
result = 2097156; expected = 2097152; diff = -4
result = 18610239426436420; expected = 18610239435360216; diff = 8923796
result = 36786596265090375680; expected = 28792920887348623835136; diff = 28756134291083533459456
result = 13; expected = 5; diff = -8
result = 61571; expected = 63240; diff = 1669
result = 83994600076709536; expected = 83994842745043328; diff = 242668333792
result = -33; expected = 1752; diff = 1785
result = 2719; expected = 2619; diff = -100
0 total matches


### Test predictive model with Test set

In [None]:
output = []
for i in range(len(test)):
    seq = test.seq[i]
    lastpredicted = predictLast(seq)
    try:
        output.append([test.id[i], int(lastpredicted)])
    except OverflowError:
        output.append([test.id[i], int(seq[-1:])])
    if i>0 and i%2000==0:
        print("%d / %d done" % (i, len(test)))
              
pd.DataFrame(output).to_csv("submission3.csv", header=["Id", "Last"], index=False, index_label=False)