In [20]:
import os

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sb
from scipy import stats

from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso, MultiTaskLasso
from sklearn.linear_model import BayesianRidge
from sklearn.linear_model import SGDRegressor # when attributes are big
from sklearn.kernel_ridge import KernelRidge # identical to support vector regression (SVR)
from sklearn.svm import SVR # use support vector regression (SVR)
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor

from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

from sklearn.dummy import DummyRegressor

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import learning_curve
from sklearn.model_selection import validation_curve
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, classification_report
from sklearn.metrics import roc_curve, auc

# Load prepared data

In [2]:
# Directory that holds the prepared fmap text files.
# The original machine-specific location stays the default so existing runs
# behave the same; set the TDCS_DATA_DIR environment variable to point the
# notebook at a different directory without editing this cell.
# os.path.join(..., "") guarantees a trailing separator, which the later
# `save_path + "<file name>"` concatenations rely on.
save_path = os.path.join(
    os.environ.get(
        "TDCS_DATA_DIR",
        "/Users/chikakoolsen/opt/python/thesis/code/tdcs_thesis/data/raw/",
    ),
    "",
)

## fmap means for 5 runs

In [3]:
# Path of the fmap mean table inside the data directory.
file_mean = f"{save_path}fmap_mean.txt"

# Column layout of the table: identifiers, five mean columns, then 'theory'.
columns_mean = (
    ['exp', 'mini_exp', 'i', 'j', 'k']
    + [f'mean{run}' for run in range(5)]
    + ['theory']
)

# Parse the whitespace-delimited text file into a 2-D float array.
data = np.loadtxt(file_mean)

In [4]:
# Wrap the raw array in a labelled frame and cast the identifier columns
# (loaded as floats by np.loadtxt) to integers in one chained expression.
df = (
    pd.DataFrame(data, columns=columns_mean)
    .astype(dict.fromkeys(("exp", "i", "j", "k", "mini_exp"), int))
)

In [5]:
# Show the loaded frame; the notebook renders a truncated preview of it.
df

Unnamed: 0,exp,mini_exp,i,j,k,mean0,mean1,mean2,mean3,mean4,theory
0,36,1,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
1,36,1,0,0,1,0.0,0.0,0.0,0.0,0.0,0.0
2,36,1,0,0,2,0.0,0.0,0.0,0.0,0.0,0.0
3,36,1,0,0,3,0.0,0.0,0.0,0.0,0.0,0.0
4,36,1,0,0,4,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
3244027,38,6,43,63,59,0.0,0.0,0.0,0.0,0.0,0.0
3244028,38,6,43,63,60,0.0,0.0,0.0,0.0,0.0,0.0
3244029,38,6,43,63,61,0.0,0.0,0.0,0.0,0.0,0.0
3244030,38,6,43,63,62,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Summarise column dtypes and memory footprint of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3244032 entries, 0 to 3244031
Data columns (total 11 columns):
 #   Column    Dtype  
---  ------    -----  
 0   exp       int64  
 1   mini_exp  int64  
 2   i         int64  
 3   j         int64  
 4   k         int64  
 5   mean0     float64
 6   mean1     float64
 7   mean2     float64
 8   mean3     float64
 9   mean4     float64
 10  theory    float64
dtypes: float64(6), int64(5)
memory usage: 272.3 MB


## 55 fmap means

In [None]:
# Build the 55 column labels (5 values in `arr` x 11 indices) for the
# "55 fmap means" table.
# The original loop appended an undefined name `a`, which raised NameError.
# NOTE(review): the "mean<value>_<index>" naming scheme is an assumption —
# confirm it against the layout of fmap_mean_55.txt.
arr = [4, 6, 8, 10, 12]
col = [f"mean{a}_{j}" for a in arr for j in range(1, 12)]

In [None]:
# Path of the 55-mean table inside the data directory.
file_mean_55 = save_path + "fmap_mean_55.txt"

# Column layout: identifiers, the 55 generated mean labels, then 'theory'.
# list.append() mutates in place and returns None, so the original assignment
# left columns_mean_55 as None (and append(['theory']) would have nested a
# list). Plain concatenation builds the flat list of labels instead.
columns_mean_55 = ['exp', 'mini_exp', 'i', 'j', 'k'] + col + ['theory']

# Parse the whitespace-delimited text file into a float array.
data = np.loadtxt(file_mean_55)

In [None]:
# Inspect the assembled list of column labels for the 55-mean table.
columns_mean_55

## new attributes for Linear Models

In [7]:
# Path of the table that carries the additional derived attributes.
file_mean_new = f"{save_path}fmap_mean_new.txt"

# Column layout: identifiers, mean0..mean4, 'theory', new0..new4, then the
# 'slope' and 'intercept' columns.
columns = (
    ['exp', 'mini_exp', 'i', 'j', 'k']
    + [f'mean{n}' for n in range(5)]
    + ['theory']
    + [f'new{n}' for n in range(5)]
    + ['slope', 'intercept']
)

# Parse the whitespace-delimited text file into a 2-D float array.
data = np.loadtxt(file_mean_new)

In [8]:
# Label the raw array and cast the identifier columns to integers in a
# single chained expression.
df_new = (
    pd.DataFrame(data, columns=columns)
    .astype(dict.fromkeys(("exp", "i", "j", "k", "mini_exp"), int))
)

In [9]:
# Per-column check for missing (NaN) values.
# NOTE(review): isnull() does not flag +/-inf; the describe() output of this
# notebook shows inf values in new0..new4, so those columns may still need
# cleaning before they are used as model inputs — confirm.
df_new.isnull().any()

exp          False
mini_exp     False
i            False
j            False
k            False
mean0        False
mean1        False
mean2        False
mean3        False
mean4        False
theory       False
new0         False
new1         False
new2         False
new3         False
new4         False
slope        False
intercept    False
dtype: bool

In [10]:
# Summary statistics (count, mean, std, quartiles, min/max) for every column.
df_new.describe()

Unnamed: 0,exp,mini_exp,i,j,k,mean0,mean1,mean2,mean3,mean4,theory,new0,new1,new2,new3,new4,slope,intercept
count,3244032.0,3244032.0,3244032.0,3244032.0,3244032.0,3244032.0,3244032.0,3244032.0,3244032.0,3244032.0,3244032.0,3244032.0,3244032.0,3244032.0,3244032.0,3244032.0,3244032.0,3244032.0
mean,37.0,3.5,21.5,31.5,31.5,-46.75177,-46.3227,-49.43398,-52.54767,-51.05728,2.070648,inf,-inf,-inf,-inf,inf,-1.076377,-46.92826
std,0.8164967,1.707825,12.69843,18.47296,18.47296,384.8085,392.6424,385.7237,387.2141,387.9891,8.492458,,,,,,18.74691,382.0577
min,36.0,1.0,0.0,0.0,0.0,-14223.62,-14619.19,-12524.48,-14251.34,-12398.63,-775.9955,-628978500.0,-inf,-inf,-inf,-628978500.0,-1758.125,-12719.16
25%,36.0,2.0,10.75,15.75,15.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,37.0,3.5,21.5,31.5,31.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,38.0,5.0,32.25,47.25,47.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,38.0,6.0,43.0,63.0,63.0,6884.447,7646.3,7653.812,6849.814,7587.812,769.7382,inf,96601160.0,1257957000.0,1257957000.0,inf,1810.604,7199.591


In [11]:
# Summarise column dtypes and memory footprint of the extended frame.
df_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3244032 entries, 0 to 3244031
Data columns (total 18 columns):
 #   Column     Dtype  
---  ------     -----  
 0   exp        int64  
 1   mini_exp   int64  
 2   i          int64  
 3   j          int64  
 4   k          int64  
 5   mean0      float64
 6   mean1      float64
 7   mean2      float64
 8   mean3      float64
 9   mean4      float64
 10  theory     float64
 11  new0       float64
 12  new1       float64
 13  new2       float64
 14  new3       float64
 15  new4       float64
 16  slope      float64
 17  intercept  float64
dtypes: float64(13), int64(5)
memory usage: 445.5 MB


# Split data

## Only mean

In [13]:
# Preview the slice used as features below: columns from position 2 up to,
# but not including, the last column (i, j, k and mean0..mean4).
df.iloc[:, 2:-1]

Unnamed: 0,i,j,k,mean0,mean1,mean2,mean3,mean4
0,0,0,1,0.0,0.0,0.0,0.0,0.0
1,0,1,1,0.0,0.0,0.0,0.0,0.0
2,0,2,1,0.0,0.0,0.0,0.0,0.0
3,0,3,1,0.0,0.0,0.0,0.0,0.0
4,0,4,1,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
3244027,63,59,6,0.0,0.0,0.0,0.0,0.0
3244028,63,60,6,0.0,0.0,0.0,0.0,0.0
3244029,63,61,6,0.0,0.0,0.0,0.0,0.0
3244030,63,62,6,0.0,0.0,0.0,0.0,0.0


In [14]:
# Features: i, j, k and mean0..mean4 (every column from position 2 up to,
# but excluding, the last one). Target: the 'theory' column.
X = df.iloc[:, 2:-1].to_numpy()
y = df.loc[:, 'theory'].to_numpy()

# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
    test_size=0.3,
)

In [15]:
# Report the shape of the full arrays and of each train/test partition,
# one shape per line.
print(
    *(part.shape for part in (X, y, X_train, y_train, X_test, y_test)),
    sep="\n",
)

(3244032, 8)
(3244032,)
(2270822, 8)
(2270822,)
(973210, 8)
(973210,)


## Non zero
Note: restricting the data to non-zero rows was not very effective.

In [None]:
# Index labels of the rows in which all five mean columns are exactly zero.
idx = df.loc[
    (df[['mean0', 'mean1', 'mean2', 'mean3', 'mean4']] == 0.0).all(axis=1)
].index

# New frame without those rows, renumbered from 0; `df` itself is untouched.
df_copy = df.drop(index=idx).reset_index(drop=True)
df_copy.shape

In [66]:
# Keep only the rows in which every one of the five mean columns is non-zero.
# This is stricter than dropping the all-zero rows: a row with a single zero
# mean is excluded as well.
df_nonzero = df.loc[
    (df[['mean0', 'mean1', 'mean2', 'mean3', 'mean4']] != 0.0).all(axis=1)
]

In [67]:
# Show the filtered frame; the notebook renders a truncated preview of it.
df_nonzero

Unnamed: 0,exp,i,j,k,mini_exp,mean0,mean1,mean2,mean3,mean4,theory
30760,36,7,32,40,1,118.874306,102.631088,73.720810,46.604973,35.842915,4.988159
30761,36,7,32,41,1,-69.809402,-88.836975,-115.809509,-147.981812,-157.407669,4.591202
30762,36,7,32,42,1,-501.965088,-518.938477,-546.379456,-559.395264,-578.390381,4.237324
30763,36,7,32,43,1,-729.152405,-748.379883,-771.094543,-794.678772,-810.996643,3.918999
30799,36,7,33,15,1,-1217.971191,-1236.205566,-1261.405762,-1284.039307,-1296.501587,3.371496
...,...,...,...,...,...,...,...,...,...,...,...
3213651,38,36,37,19,6,-75.107628,-68.905487,-75.427681,-82.513771,-79.595093,4.270157
3213652,38,36,37,20,6,76.162445,81.230690,75.100578,66.783051,69.944969,4.636876
3213653,38,36,37,21,6,196.146164,202.442993,194.505157,186.740524,189.227448,5.051056
3213660,38,36,37,28,6,749.112549,763.357300,750.066223,730.827637,739.551208,13.024664


In [68]:
# Features: columns from position 5 up to (excluding) the last one; target:
# 'theory'. Note that this rebinds X and y, replacing the arrays built from
# the full frame earlier in the notebook.
X = df_nonzero.iloc[:, 5:-1].to_numpy()
y = df_nonzero.loc[:, 'theory'].to_numpy()

# Reproducible 70/30 split, kept under separate names (X2_*, y2_*).
X2_train, X2_test, y2_train, y2_test = train_test_split(
    X,
    y,
    random_state=0,
    test_size=0.3,
)

## Split data by mini experiment

In [7]:
# Within experiment 36, mini experiment 6 is held out as the test set and
# every other mini experiment forms the training set.
df_train = df.loc[df['exp'].eq(36) & df['mini_exp'].ne(6)]
df_test = df.loc[df['exp'].eq(36) & df['mini_exp'].eq(6)]

In [8]:
# Preview columns 2 through second-to-last (i, j, k and mean0..mean4) of the
# training rows.
df_train.iloc[:, 2:-1]

Unnamed: 0,i,j,k,mean0,mean1,mean2,mean3,mean4
0,0,0,0,0.0,0.0,0.0,0.0,0.0
1,0,0,1,0.0,0.0,0.0,0.0,0.0
2,0,0,2,0.0,0.0,0.0,0.0,0.0
3,0,0,3,0.0,0.0,0.0,0.0,0.0
4,0,0,4,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
901115,43,63,59,0.0,0.0,0.0,0.0,0.0
901116,43,63,60,0.0,0.0,0.0,0.0,0.0
901117,43,63,61,0.0,0.0,0.0,0.0,0.0
901118,43,63,62,0.0,0.0,0.0,0.0,0.0


In [9]:
# Build feature/target arrays for the experiment-based split. Features are
# the columns from position 5 up to (excluding) the last one; the target is
# 'theory'. These assignments rebind X_train / X_test / y_train / y_test,
# replacing the arrays from the random split earlier in the notebook.
X_train, y_train = (
    df_train.iloc[:, 5:-1].to_numpy(),
    df_train.loc[:, 'theory'].to_numpy(),
)
X_test, y_test = (
    df_test.iloc[:, 5:-1].to_numpy(),
    df_test.loc[:, 'theory'].to_numpy(),
)

In [10]:
# Report the shapes of the experiment-based train/test arrays, one per line.
print(
    *(part.shape for part in (X_train, y_train, X_test, y_test)),
    sep="\n",
)

(901120, 5)
(901120,)
(180224, 5)
(180224,)


## Split new data by mini experiment (new attributes)

In [16]:
# Within experiment 38 of the extended frame, mini experiment 6 is held out
# as the test set and every other mini experiment forms the training set.
df_train_new = df_new.loc[df_new['exp'].eq(38) & df_new['mini_exp'].ne(6)]
df_test_new = df_new.loc[df_new['exp'].eq(38) & df_new['mini_exp'].eq(6)]

In [156]:
# Preview the five columns that sit before the last two (new0..new4).
df_new.iloc[:, -7:-2]

Unnamed: 0,new0,new1,new2,new3,new4
0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...
3244027,0.0,0.0,0.0,0.0,0.0
3244028,0.0,0.0,0.0,0.0,0.0
3244029,0.0,0.0,0.0,0.0,0.0
3244030,0.0,0.0,0.0,0.0,0.0


In [159]:
# Feature/target arrays for the new-attribute split. Features are the five
# columns positioned before the last two (new0..new4); the target is 'theory'.
X_train_new, y_train_new = (
    df_train_new.iloc[:, -7:-2].to_numpy(),
    df_train_new.loc[:, 'theory'].to_numpy(),
)
X_test_new, y_test_new = (
    df_test_new.iloc[:, -7:-2].to_numpy(),
    df_test_new.loc[:, 'theory'].to_numpy(),
)

In [160]:
# Report the shapes of the new-attribute train/test arrays, one per line.
print(
    *(part.shape for part in (X_train_new, y_train_new, X_test_new, y_test_new)),
    sep="\n",
)

(901120, 5)
(901120,)
(180224, 5)
(180224,)


# SVM

In [None]:
# Fit a support vector regressor with scikit-learn's default hyperparameters
# on whichever X_train / y_train arrays were assigned most recently.
# NOTE(review): the shapes printed earlier show about 900k training rows;
# kernel SVR training may be very slow at that size — confirm feasibility or
# subsample first.
regr = SVR()
regr.fit(X_train, y_train)

In [None]:
# Predict on the training split first, then on the test split, using the
# fitted regressor.
y_train_pred, y_test_pred = map(regr.predict, (X_train, X_test))

In [None]:
# R^2 of the fitted regressor on the training and test splits.
print(regr.score(X_train, y_train))
print(regr.score(X_test, y_test))
# Same test-set R^2 computed from the stored predictions. The original cell
# referenced `y_predict`, which is never defined; the test predictions are
# held in `y_test_pred`.
print(r2_score(y_test, y_test_pred))


# GridSearch

## Find out which kernel is best

In [None]:
# Candidate settings shared by the kernels that accept them.
param_gamma = ['auto', 'scale']
param_degree = [3, 5]

# One sub-grid per kernel so that only the hyperparameters relevant to that
# kernel are searched.
# The 'precomputed' kernel was removed: it requires a square kernel (Gram)
# matrix as input, whereas X_train is a plain feature matrix, so those fits
# would fail.
param_grid = [{'kernel': ['linear']},
              {'kernel': ['rbf'],
               'gamma': param_gamma},
              {'kernel': ['poly'],
               'gamma': param_gamma,
               'degree': param_degree},
              {'kernel': ['sigmoid'],
               'gamma': param_gamma}]

svr = SVR()

# 'accuracy' is a classification metric and is not defined for the
# continuous target of a regressor; use the coefficient of determination.
gs = GridSearchCV(estimator=svr,
                  param_grid=param_grid,
                  scoring='r2',
                  n_jobs=-1)
clf = gs.fit(X_train, y_train)

In [None]:
# Best mean cross-validated score and the parameter combination behind it.
print(clf.best_score_)
print(clf.best_params_)

# Estimator refitted with the best parameters.
result = clf.best_estimator_
# For a regressor, score() returns the coefficient of determination (R^2),
# not an accuracy, so the printed label says R^2.
print('Test R^2: %.3f' % result.score(X_test, y_test))