In [None]:
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import root_mean_squared_error
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Load the raw signal; the code below reads its 'acoustic_data' and
# 'time_to_failure' columns.
data = pd.read_csv("train.csv")

WINDOW = 150000   # number of consecutive readings per sample
N_SAMPLES = 1000  # number of random windows to draw

# np.random.randint excludes `high`, so len(data) is the correct upper bound;
# len(data) + 1 could yield a position one past the last row (IndexError).
sample_points = np.random.randint(WINDOW, len(data), N_SAMPLES)

# Pull both columns out once as arrays. Building a DataFrame per reading
# (WINDOW * N_SAMPLES constructions) is what made the sampling loop so slow.
acoustic = data['acoustic_data'].to_numpy()
time_to_failure = data['time_to_failure'].to_numpy()

cont_sample_multidim = []
for point in sample_points:
    # Newest reading first: rows point, point - 1, ..., point - (WINDOW - 1).
    window = acoustic[point - WINDOW + 1:point + 1][::-1]
    # Cast to float so the stored values match what row-wise .iloc extraction
    # yields when the frame mixes int and float columns — TODO confirm dtypes.
    new_point_X = window.astype(float).tolist()
    new_point_y = float(time_to_failure[point])
    cont_sample_multidim.append([new_point_X, new_point_y])

X = [sample[0] for sample in cont_sample_multidim]
y = [sample[1] for sample in cont_sample_multidim]

# Persist the sample so later cells can reload it without re-sampling.
pd.DataFrame(X).to_csv("rand_mdim_sample_100_X", index=False)
pd.DataFrame(y).to_csv("rand_mdim_sample_100_y", index=False)

# RMSE is minimised, so the running best starts at +inf
# (np.infty was removed in NumPy 2.0; np.inf is the supported spelling).
best_score = np.inf

print("Done sampling")
print("\nStart KNN")

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

results = []

for k in [1, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 150, 200, 250, 300, 400, 500]:
    knn = KNeighborsRegressor(n_neighbors=k)
    knn.fit(X_train, y_train)
    y_pred_test = knn.predict(X_test)
    # scikit-learn metrics take (y_true, y_pred)
    score = root_mean_squared_error(y_test, y_pred_test)
    results.append([k, score])
    if score < best_score:
        best_score = score

print(results)

print("\nStart LR")
results = []

# NOTE(review): the same model is refitted on the same split each pass, so the
# repeats presumably report the same score — confirm whether they are needed.
for i in range(5):
    lr = LinearRegression()
    lr.fit(X_train, y_train)
    y_pred_test = lr.predict(X_test)
    score = root_mean_squared_error(y_test, y_pred_test)
    results.append([i, score])
    if score < best_score:
        best_score = score

print(results)

print("\nStart SVR")
results = []

for e in [0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 1]:
    for c in [0.001, 0.01, 0.1, 1, 10, 20, 50, 100, 1000]:
        regr = make_pipeline(StandardScaler(), SVR(C=c, epsilon=e))
        # Fit on the training split only; fitting on the full X, y would let
        # the model see the rows it is then scored on.
        regr.fit(X_train, y_train)
        y_pred_test = regr.predict(X_test)
        score = root_mean_squared_error(y_test, y_pred_test)
        results.append([e, c, score])
        if score < best_score:
            best_score = score

print(results)

print("best score: ", best_score)


In [None]:
print("\nStart SVR")
# RMSE is never negative, so a starting value of 0 could never be improved on;
# start the running minimum at +inf instead.
best_score = np.inf
results = []

# Grid over SVR epsilon (outer) and C (inner); every combination is fitted.
for e in [0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 1]:
    for c in [0.001, 0.01, 0.1, 1, 10, 20, 50, 100, 1000]:
        regr = make_pipeline(StandardScaler(), SVR(C=c, epsilon=e))
        # Train on the training split only so the test score is not inflated
        # by rows the model has already seen.
        regr.fit(X_train, y_train)
        y_pred_test = regr.predict(X_test)
        # root_mean_squared_error replaces mean_squared_error(..., squared=False)
        score = root_mean_squared_error(y_test, y_pred_test)
        results.append([e, c, score])
        if score < best_score:
            best_score = score

print(results)

print("best score: ", best_score)

In [None]:
# Reload the sampled feature windows (X) and their targets (y) from the CSV
# files written by the sampling cell.
X, y = (
    pd.read_csv(csv_name, index_col=False)
    for csv_name in ("rand_mdim_sample_100_X", "rand_mdim_sample_100_y")
)

In [None]:

print("Start Program")


def _evaluate_models(X_train, X_test, y_train, y_test, knn_ks, svr_epsilons, svr_cs):
    """Score KNN, linear-regression and SVR models on one train/test split.

    Prints one results list per model family and tracks the lowest test RMSE
    seen across all three families.

    Parameters
    ----------
    X_train, X_test, y_train, y_test
        Output of ``train_test_split`` for one feature representation.
    knn_ks : iterable of int
        ``n_neighbors`` values to try.
    svr_epsilons, svr_cs : iterable of float
        SVR ``epsilon`` and ``C`` values; every combination is fitted.

    Returns
    -------
    tuple
        ``(best_score, results)`` where ``best_score`` is the lowest RMSE seen
        and ``results`` is the SVR list of ``[epsilon, C, rmse]`` rows.
    """
    # RMSE is minimised, so start at +inf (np.infty was removed in NumPy 2.0).
    best = np.inf

    print("\nStart KNN")
    results = []
    for k in knn_ks:
        knn = KNeighborsRegressor(n_neighbors=k)
        knn.fit(X_train, y_train)
        # root_mean_squared_error replaces mean_squared_error(..., squared=False)
        score = root_mean_squared_error(y_test, knn.predict(X_test))
        results.append([k, score])
        best = min(best, score)
    print(results)

    print("\nStart LR")
    results = []
    # NOTE(review): the same model is refitted on the same split each pass, so
    # the repeats presumably report the same score — confirm they are needed.
    for i in range(5):
        lr = LinearRegression()
        lr.fit(X_train, y_train)
        score = root_mean_squared_error(y_test, lr.predict(X_test))
        results.append([i, score])
        best = min(best, score)
    print(results)

    print("\nStart SVR")
    results = []
    for e in svr_epsilons:
        for c in svr_cs:
            regr = make_pipeline(StandardScaler(), SVR(C=c, epsilon=e))
            regr.fit(X_train, y_train)
            score = root_mean_squared_error(y_test, regr.predict(X_test))
            results.append([e, c, score])
            best = min(best, score)
    print(results)

    return best, results


# Three representations of every sampled window:
#   X_sum     -> [sum of the row, value at column 149999]
#   X_avg     -> [mean of the row, value at column 149999]
#   X_150kdim -> the full row
X_150kdim = []
X_sum = []
X_avg = []

for i in range(len(X)):
    row = X.iloc[i].to_list()  # convert once instead of on every access
    # NOTE(review): the sampling cell stores windows newest-first, so column
    # 149999 is the oldest reading in the window — confirm this is the
    # intended second feature.
    last_value = row[150000 - 1]
    total = sum(row)
    X_sum.append([total, last_value])
    # Divide by this row's own length (previously measured on row 1, which
    # raises when X holds a single row).
    X_avg.append([total / len(row), last_value])
    X_150kdim.append(row)

print("Done making arrays")

# Hyper-parameter grids for the two cheap, two-feature representations.
FULL_KS = [1, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 150, 200, 250, 300, 400, 500]
FULL_EPSILONS = [0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 1]
FULL_CS = [0.001, 0.01, 0.1, 1, 10, 20, 50, 100, 1000]

X_train, X_test, y_train, y_test = train_test_split(X_sum, y, test_size=0.20, random_state=0)
best_score, results = _evaluate_models(
    X_train, X_test, y_train, y_test, FULL_KS, FULL_EPSILONS, FULL_CS
)
print("best score sum method: ", best_score)

X_train, X_test, y_train, y_test = train_test_split(X_avg, y, test_size=0.20, random_state=0)
best_score, results = _evaluate_models(
    X_train, X_test, y_train, y_test, FULL_KS, FULL_EPSILONS, FULL_CS
)
print("best score avg method: ", best_score)

# The full-width representation uses the reduced grids from the original sweep.
X_train, X_test, y_train, y_test = train_test_split(X_150kdim, y, test_size=0.20, random_state=0)
best_score, results = _evaluate_models(
    X_train, X_test, y_train, y_test,
    [100, 150, 200, 250, 300, 400, 500],
    [0.000001, 0.00001, 0.01, 0.1, 1],
    [0.001, 1, 10, 20, 100],
)
print("best score 150kdim method: ", best_score)

In [None]:
# Reload the saved feature sample under a scratch name for ad-hoc inspection.
temp = pd.read_csv("rand_mdim_sample_100_X", index_col=False)

In [None]:
# Number of columns in the reloaded sample. Reading it from .shape avoids
# building a list of every value in row 0 and also works on an empty frame.
temp.shape[1]

In [None]:
# Row count of the frame loaded from train.csv.
len(data)

In [None]:
# Peek at the final rows of the frame loaded from train.csv.
data.tail()

In [None]:
# Split the X_avg feature pairs into one list per plotted feature:
# element 0 is the row average, element 1 is the single reading kept with it.
first_column = [row[0] for row in X_avg]
second_column = [row[1] for row in X_avg]


import matplotlib.pyplot as plt
import numpy as np

ax = plt.figure().add_subplot(projection='3d')

y = pd.read_csv("rand_mdim_sample_100_y", index_col=False)
# The targets load as a single-column DataFrame. Flatten a copy to 1-D so each
# target pairs element-wise with the feature lists, however the plotting call
# broadcasts a 2-D input. `y` itself is left as the DataFrame.
time_to_failure = np.asarray(y).ravel()

# With zdir='y' the second positional argument is drawn along the z axis and
# the third along the y axis, which is what the axis labels below assume.
ax.scatter(first_column, time_to_failure, second_column, zdir='y', label='points in (x, z)', s=3)
ax.set_title("3d Scatter Plot of Mod. (Avg) Sample Data Set")

# Axis limits and labels
ax.set_xlim(-10, 10)
ax.set_ylim(-25, 25)
ax.set_zlim(0, 14)
ax.set_xlabel('Avg of Previous Signals')
ax.set_ylabel('Acoustic Signal')
ax.set_zlabel('Time to Failure')

# Fixed camera angle for the rendered figure
ax.view_init(elev=20., azim=-35, roll=0)

plt.show()

In [None]:
# Smallest target value. Builtin min() iterates a DataFrame's column labels,
# so it would return a label rather than a value once y has been reloaded from
# CSV; going through an array works for both the list and DataFrame forms of y.
np.asarray(y).min()