In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn import ensemble
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
from sklearn.model_selection import cross_val_score

In [2]:
def find_nearest(array,value):
    idx = (np.abs(array-value)).argmin()
    
    return array[idx]

In [3]:
def refine_data(arr1,arr2,arr3):
    refined_data = []
    for i in range(arr1.shape[0]):
        rt = find_nearest(arr1[i],arr3[i])
        refined_data.append(rt.round(3))
        
    refined_data = np.array(refined_data)
    
    return refined_data

In [4]:
def get_metrics(arr1,arr2):
    mse = mean_squared_error(arr1,arr2)
    r2 = r2_score(arr1,arr2)
    return mse,r2

In [5]:
d1 = pd.read_csv('X_data_t3',sep = ',')
d2 = pd.read_csv('y_data',sep = ',')
d3 = pd.concat([d1,d2],axis=1)

#data = d3.loc[d3['maxRT_ab'] < 9e6]
#d3 = shuffle(d3)

#X = np.array(d3[['maxRT_t','x_start_t','diff_start','diff_end']])
#X = np.array(d3[['maxRT_ab','maxRT_baseline','x_start_ab','x_end_ab']])
#X = np.array(d3[['maxRT_t','maxRT_ab','maxRT_baseline','x_start_ab','x_end_ab']])
#X = np.array(d3[['maxRT_t','maxRT_ab','maxRT_baseline','x_start_t','x_start_ab','diff_start','x_end_t','x_end_ab','diff_end']])

#X = np.array(d3[['rt','maxRT_t','maxRT_ab','maxRT_base','x_start_t','x_start_ab','diff_start','x_end_t',
#                'x_end_ab','diff_end','width']])

X = np.array(d3[['rt','maxRT_t','x_start_t','diff_start','x_end_t','diff_end','width']])

y = np.array(d3['y_left_t'])

time = np.array(pd.read_csv('time',sep = ',',header = None).dropna(axis = 'columns'))
abundance = np.array(pd.read_csv('abundance',sep = ',',header = None).dropna(axis = 'columns'))
baseline = np.array(pd.read_csv('baseline',sep = ',',header = None).dropna(axis = 'columns'))

In [6]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [7]:
scaler_train = MinMaxScaler()
#scaler_train = StandardScaler()
scaler_train.fit(X_train)
X_train =scaler_train.transform(X_train)

scaler_test = MinMaxScaler()
#scaler_test = StandardScaler()
scaler_test.fit(X_test)
X_test = scaler_test.transform(X_test)

In [8]:
n_estimators = 5000
max_depth = 5000
random_state = 500

params = {'n_estimators': n_estimators, 'max_depth': max_depth, 'min_samples_split': 10,
                 'learning_rate': 0.1, 'loss': 'lad'}

clf = ensemble.GradientBoostingRegressor(**params)
scores = cross_val_score(clf, X_train, y_train, cv=5)
print(scores)
clf.fit(X_train,y_train)
y_predict = clf.predict(X_test).round(3)
             
mse_1,r2_1 = get_metrics(y_test,y_predict)
        
print(mse_1,r2_1)



[0.14355469 0.2154058  0.30670447 0.17620917 0.43836956]
0.00024558823529411745 0.02390272776382507


In [9]:
y_test

array([0.98 , 0.99 , 1.003, 1.013, 1.   , 0.993, 0.98 , 1.007, 0.983,
       0.987, 0.98 , 1.013, 1.003, 1.017, 0.997, 0.99 , 0.953, 1.   ,
       0.98 , 0.983, 0.97 , 0.983, 1.017, 0.997, 0.99 , 1.013, 0.993,
       0.997, 0.973, 0.98 , 0.973, 0.997, 0.97 , 0.98 , 0.993, 1.007,
       0.977, 0.983, 0.977, 0.983, 0.983, 0.98 , 0.98 , 0.983, 1.   ,
       0.967, 0.997, 1.017, 0.957, 0.98 , 0.993, 1.   , 0.987, 0.983,
       0.997, 1.01 , 0.953, 1.007, 0.987, 0.983, 0.983, 0.983, 0.983,
       0.983, 1.02 , 0.987, 0.963, 0.98 , 1.017, 0.973, 0.987, 0.963,
       1.003, 1.007, 0.997, 0.937, 0.987, 0.987, 0.997, 0.993, 0.98 ,
       0.977, 1.013, 0.977, 0.987])

In [10]:
y_predict

array([0.987, 0.995, 0.987, 1.001, 1.004, 0.995, 0.98 , 1.001, 1.001,
       0.984, 0.987, 0.989, 1.004, 1.005, 0.981, 0.987, 1.001, 0.997,
       0.982, 0.995, 0.989, 0.98 , 0.995, 1.001, 0.981, 1.001, 0.991,
       1.001, 0.981, 0.981, 0.989, 0.987, 0.989, 0.987, 0.987, 1.001,
       0.982, 1.001, 0.977, 0.989, 0.981, 0.987, 1.001, 0.995, 0.995,
       0.989, 0.996, 1.004, 0.989, 0.989, 1.001, 0.995, 0.984, 0.989,
       0.995, 1.017, 0.989, 1.005, 0.981, 0.983, 0.989, 0.99 , 0.983,
       0.989, 1.014, 0.989, 0.995, 0.985, 0.987, 0.983, 0.995, 1.004,
       1.001, 0.995, 0.995, 1.001, 0.995, 0.995, 0.995, 0.995, 1.001,
       0.995, 0.987, 0.995, 0.995])

In [11]:
from sklearn.decomposition.pca import PCA
from sklearn.preprocessing import scale
pca = PCA()
X_reduced = pca.fit_transform(X_train)
scores = cross_val_score(clf, X_reduced, y_train, cv=5, scoring = 'neg_mean_squared_error')
print(scores)
clf.fit(X_reduced,y_train)
y_predict = clf.predict(X_test).round(3)
mse_1,r2_1 = get_metrics(y_test,y_predict)
        
print(mse_1,r2_1)


[-0.00033898 -0.00021808 -0.00012696 -0.00020934 -0.00015512]
0.00034392941176470434 -0.3669570134875322


In [12]:
y_predict

array([0.989, 1.001, 1.001, 1.001, 1.001, 1.001, 0.984, 1.001, 1.001,
       1.001, 0.986, 1.001, 0.989, 1.001, 1.001, 1.001, 1.001, 0.989,
       0.986, 1.001, 1.001, 0.984, 1.001, 1.001, 1.001, 1.001, 1.001,
       1.001, 0.985, 0.986, 1.001, 1.001, 1.001, 0.986, 1.001, 1.001,
       0.985, 1.001, 0.999, 1.001, 0.985, 0.986, 1.001, 1.001, 1.001,
       1.001, 0.991, 1.001, 1.001, 1.001, 1.001, 1.001, 0.984, 1.001,
       1.001, 0.986, 1.001, 1.001, 1.001, 0.985, 1.001, 1.003, 0.986,
       1.001, 1.001, 1.001, 1.001, 0.985, 1.001, 0.985, 1.001, 1.001,
       1.001, 1.001, 1.001, 1.001, 1.001, 1.001, 1.001, 1.001, 1.001,
       1.001, 1.001, 1.001, 1.001])

In [13]:
np.cumsum(np.round(pca.explained_variance_ratio_, decimals=4)*100)

array([ 95.72,  98.19, 100.  , 100.  , 100.  , 100.  , 100.  ])