# Study

## Series Analysis

### Example: Fibonacci

a(0) = 1

a(1) = 1

a(n) = a(n-2) + a(n-1)

More generally, a linear recurrence with coefficients X and Y:

a(n) = X·a(n-2) + Y·a(n-1)

## Data loading and training / test set initialization

In [1]:
import numpy  as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from IPython.display import display, HTML
%matplotlib inline

def stoarray(data=None, sep=','):
    """Parse delimited number strings into float arrays.

    Parameters
    ----------
    data : pandas.Series or iterable of str, optional
        Strings such as ``"1,3,13"``. Anything that is not already a Series
        is wrapped in one; ``None`` (the default) is treated as empty input.
    sep : str
        Delimiter between the numbers inside each string.

    Returns
    -------
    pandas.Series
        Same index as the input, each element a 1-D ``float`` ndarray.
    """
    # The previous default (`[]`) was a shared mutable object and, being a
    # list, had no `.map`, so calling stoarray() with no arguments crashed.
    if data is None:
        data = []
    strings = pd.Series(data, dtype=object)
    return strings.map(lambda text: np.array(text.split(sep), dtype=float))

# load the data
colna = ['id', 'seq']
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")
# both files get the same two column names
train.columns = colna
test.columns = colna
# turn every comma-separated sequence string into a float array
train['seq'] = stoarray(train['seq'])
test['seq'] = stoarray(test['seq'])
# A bare `train.head()` in the middle of a cell is evaluated and discarded,
# so the preview is passed to display() explicitly, next to the first sequence.
display(train.head(), pd.DataFrame(train['seq'].iloc[0]))

Unnamed: 0,0
0,1.0
1,3.0
2,13.0
3,87.0
4,1053.0
5,28576.0
6,2141733.0
7,508147100.0
8,402135300000.0
9,1073376000000000.0


## Feature extraction for a given sequence

In [2]:
def getTrainingSet(seq):
    """Build the regression matrix [Xi, X1, X2, ... Xn-1, y] for one sequence.

    Row ``i`` holds the row index ``i``, then the terms ``seq[:i]`` left-aligned
    and zero-padded, and finally ``seq[i]`` as the target in the last column.

    Returns a tuple ``(matrix, names)`` where ``matrix`` has shape
    ``(len(seq), len(seq) + 1)`` and ``names`` is ``["x0", "x1", ...]``, one
    label per feature column (every column except the last).
    """
    n_terms = len(seq)
    # One row per term; columns are [index, n_terms - 1 history slots, target].
    matrix = np.zeros([n_terms, n_terms + 1])
    for row in range(n_terms):
        matrix[row, 0] = float(row)
        matrix[row, 1:row + 1] = seq[:row]    # earlier terms; the rest stay 0
        matrix[row, -1:] = seq[row:row + 1]   # the term this row should predict
    names = ["x" + str(row) for row in range(n_terms)]
    return matrix, names

# Build the matrix from the first training sequence with its final term held out.
trainingSet, columns = getTrainingSet(train.seq[0][:-1])
# Every column except the last is a feature; the last column is the target.
X, y = (
    pd.DataFrame(trainingSet[:, :-1], columns=columns),
    pd.DataFrame(trainingSet[:, -1:], columns=["y"]),
)
display(X, y)

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2.0,1.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3.0,1.0,3.0,13.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4.0,1.0,3.0,13.0,87.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,5.0,1.0,3.0,13.0,87.0,1053.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,6.0,1.0,3.0,13.0,87.0,1053.0,28576.0,0.0,0.0,0.0,0.0,0.0,0.0
7,7.0,1.0,3.0,13.0,87.0,1053.0,28576.0,2141733.0,0.0,0.0,0.0,0.0,0.0
8,8.0,1.0,3.0,13.0,87.0,1053.0,28576.0,2141733.0,508147108.0,0.0,0.0,0.0,0.0
9,9.0,1.0,3.0,13.0,87.0,1053.0,28576.0,2141733.0,508147108.0,402135300000.0,0.0,0.0,0.0


Unnamed: 0,y
0,1.0
1,3.0
2,13.0
3,87.0
4,1053.0
5,28576.0
6,2141733.0
7,508147100.0
8,402135300000.0
9,1073376000000000.0


## Fit a Linear Regression model

In [4]:
regresor = LinearRegression()
# DataFrame.as_matrix() was removed in pandas 1.0; .values yields the same ndarray.
regresor.fit(X.values, y.values)
inp = np.copy(X.values)
out = regresor.predict(inp)
print("train.....")
print(inp)
print(inp.shape)
print(out)


print("test.....")
# X has as many rows as columns here, so this slice keeps only its last row.
inpFinal = np.copy(X[inp.shape[0]-1:inp.shape[1]+1].values)
# assign the last fitted y as the final feature of that row to predict Xn+1
inpFinal[:,-1] = out[-1:,0]
print(inpFinal)
outFinal = regresor.predict(inpFinal)
print(outFinal)

train.....
[[  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00]
 [  1.00000000e+00   1.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00]
 [  2.00000000e+00   1.00000000e+00   3.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00]
 [  3.00000000e+00   1.00000000e+00   3.00000000e+00   1.30000000e+01
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00]
 [  4.00000000e+00   1.00000000e+00   3.00000000e+00   1.30000000e+01

In [33]:
def predictLast(seq):
    """Predict the term that follows `seq` using a per-sequence linear regression.

    A fresh LinearRegression is fitted on the matrix from getTrainingSet(seq).
    The last training row is then reused as the query, with its final feature
    replaced by the model's fitted value for that row.

    Parameters
    ----------
    seq : sequence of numbers (a 1-D float ndarray in this notebook)

    Returns
    -------
    For ``len(seq) == 1`` the single term ``seq[0]`` is returned unchanged.
    Otherwise the result of ``regresor.predict`` for the one query row is
    returned as an array, so callers should extract the scalar themselves.
    An empty `seq` is not handled: ``seq[0]`` fails for it.
    """
    if len(seq) <= 1:
        return seq[0]
    trainingSet, _ = getTrainingSet(seq)
    # Slice the ndarray directly. The old DataFrame round-trip relied on
    # DataFrame.as_matrix(), which was removed in pandas 1.0.
    features = trainingSet[:, :-1]
    targets = trainingSet[:, -1:]   # kept 2-D so predictions stay 2-D as before
    regresor = LinearRegression()
    # train
    regresor.fit(features, targets)
    fitted = regresor.predict(features)
    # test: start from a copy of the last training row
    inpFinal = features[-1:].copy()
    # assign the last fitted y as the final feature to predict Xn+1
    inpFinal[:, -1] = fitted[-1:, 0]
    outFinal = regresor.predict(inpFinal)
    return outFinal

# Evaluate on the training data: hide the last term of each sequence, predict it
# from the remaining terms, and count predictions whose difference from the true
# term truncates to 0.
N_EVAL = 5  # number of training sequences to score; use len(train) for a full run
matches = 0
output = []
for i in range(N_EVAL):
    seq = train.seq[i]
    last = float(seq[-1])
    # predictLast returns either a scalar or an array, so pull the scalar out
    # explicitly instead of calling int() on an array.
    lastpredicted = float(np.ravel(predictLast(seq[:-1]))[0])
    diff = int(last - lastpredicted)
    output.append([train.id[i], int(lastpredicted)])
    if diff == 0:
        matches += 1
    # i + 1 sequences have been scored at this point; dividing by i was off by one.
    done = i + 1
    if done % 1000 == 0:
        print("%d / %d matches; accuracy = %f " % (matches, done, float(matches)/float(done)))

print("%d total matches" % matches)



# Predict the next term of every test sequence and write the submission file.
output = []
for i in range(len(test)):
    seq = test.seq[i]
    # predictLast returns either a scalar or an array; extract the scalar.
    lastpredicted = np.ravel(predictLast(seq))[0]
    try:
        prediction = int(lastpredicted)
    except (OverflowError, ValueError):
        # int() raises OverflowError for +/-inf and ValueError for NaN;
        # in both cases fall back to repeating the last known term.
        prediction = int(seq[-1])
    output.append([test.id[i], prediction])
    if i>0 and i%1000==0:
        print("%d / %d done" % (i, len(test)))

pd.DataFrame(output, columns=["Id", "Last"]).to_csv("submission3.csv", index=False)

0 total matches
1000 / 113845 done
2000 / 113845 done
3000 / 113845 done
4000 / 113845 done
5000 / 113845 done
6000 / 113845 done
7000 / 113845 done
8000 / 113845 done
9000 / 113845 done
10000 / 113845 done
11000 / 113845 done
12000 / 113845 done
13000 / 113845 done
14000 / 113845 done
15000 / 113845 done
16000 / 113845 done
17000 / 113845 done
18000 / 113845 done
19000 / 113845 done
20000 / 113845 done
21000 / 113845 done
22000 / 113845 done
23000 / 113845 done
24000 / 113845 done
25000 / 113845 done
26000 / 113845 done
27000 / 113845 done
28000 / 113845 done
29000 / 113845 done
30000 / 113845 done
31000 / 113845 done
32000 / 113845 done
33000 / 113845 done
34000 / 113845 done
35000 / 113845 done
36000 / 113845 done
37000 / 113845 done
38000 / 113845 done
39000 / 113845 done
40000 / 113845 done
41000 / 113845 done
42000 / 113845 done
43000 / 113845 done
44000 / 113845 done
45000 / 113845 done
46000 / 113845 done
47000 / 113845 done
48000 / 113845 done
49000 / 113845 done
50000 / 11384