### Importing necessary libraries

In [3]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy as sp
import seaborn as sns
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.metrics import mean_squared_error
import random
import gc

#### Importing the train dataset

In [None]:

#train_data = pd.read_csv(r'C:\Users\lscon\Desktop\AA\projeto\the-three-body-problem\mlNOVA\X_train.csv')
train_data = pd.read_csv(r'C:\Users\duart\OneDrive\Ambiente_de_Trabalho\Master_Analysis_Engineering_Big_Data\23-24\1st_semester\AA_ML\Kaggle_challenges\3_body_problem\3_body_problem\X_train.csv')
#train_data = pd.read_csv(r'C:\Users\lscon\Desktop\AA\projeto\the-three-body-problem\mlNOVA\X_train.csv')
#test_data = pd.read_csv(r'C:/Users/duart/OneDrive/Ambiente_de_Trabalho/Master_Analysis_Engineering_Big_Data/23-24/1st_semester/AA_ML/Kaggle_challenges/3_body_problem/3_body_problem/X_test.csv')

print(train_data.shape)
np.linalg.matrix_rank(train_data)

#### Dropping the velocities

In [None]:
# Identify faulty rows based on the criterion (all values = 0.0 except for Id)
zero_rows = train_data[(train_data.drop('Id', axis=1) == 0).all(axis=1)]

# Remove the faulty rows from the DataFrame
train_data_preprocessed = train_data[~train_data.index.isin(zero_rows.index)]
train_data_preprocessed.reset_index(drop=True, inplace=True)
#train_data_preprocessed.to_csv('train_preprocessed.csv', index=False)

# np.linalg.matrix_rank(train_data_preprocessed)

In [None]:
# #gives the stats for the preprocessed data (without the rows with zeros)
# summary_stats_filtered = train_data_preprocessed.describe(include='all')

# #gives the stats for the nonprocessed data
# summary_stats = train_data.describe(include="all")

#### Correlations Matrix

In [None]:
# Calculate the correlation matrix and plot it as a heatmap
corr_matrix = train_data_preprocessed.drop(train_data_preprocessed.columns[13],
                                       axis=1).corr()
corr_matrix.to_excel('corr_matrix_train_processed.xlsx')

plt.figure(figsize=(20, 16), dpi=800)
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.4f',
            linewidths=0.5)
plt.title('Correlation Matrix Heatmap')
plt.savefig('Corr_matrix_heatmap.jpg', dpi=800)
plt.show()

# Create a pairwise scatter plot matrix
scatter_matrix = pd.plotting.scatter_matrix(train_data_preprocessed,
                                            figsize=(20, 20))
plt.show()


In [None]:
#create pairwise plots of correlation between variables

rows = 50000
x1_rows = x1.head(n=rows)
partial_train_data = train_data_preprocessed.drop(columns=['v_x_1','v_x_2','v_y_1', 
                                                           'v_y_2', 'v_x_3', 'v_y_3']).head(n=rows)


_= sns.pairplot(partial_train_data, kind="reg", diag_kind="kde", plot_kws={'line_kws':{'color':'red'}})
plt.title('Pairwise plots t vs velocity components')
plt.savefig('pairwisetv_50000')

## Creating our Feature Matrix

In [None]:
#create feature label matrices
#we're not going to use the velocity components as features
train_data_without_velocity = train_data_preprocessed.drop(columns=['Id','v_x_1','v_x_2','v_y_1', 
                                                           'v_y_2', 'v_x_3', 'v_y_3'])
#divide by simulations
list_of_times = [values for values in train_data_without_velocity['t']]
time_index_tuples = list(enumerate(list_of_times))
zeros_indexes = list(filter(lambda value: value[1] == 0, time_index_tuples))
zeros_indexes = [value[0] for value in zeros_indexes] 
list_of_simulations = []
lower_bound = 0
for i in range(len(zeros_indexes)-1):
    simulation = train_data_without_velocity.iloc[lower_bound:zeros_indexes[i+1]]
    list_of_simulations.append(simulation)
    lower_bound = zeros_indexes[i+1]
from tqdm import tqdm

#add label and put the starting position at every row
#x1
for simulation in tqdm(list_of_simulations):
    first_row_values = simulation.head(1)
    simulation.loc[:,'x_1_label'] = simulation.loc[:, 'x_1']
    simulation.loc[:,'y_1_label'] = simulation.loc[:, 'y_1']
    simulation.loc[:,'x_2_label'] = simulation.loc[:, 'x_2']
    simulation.loc[:,'y_2_label'] = simulation.loc[:, 'y_2']
    simulation.loc[:,'x_3_label'] = simulation.loc[:, 'x_3']
    simulation.loc[:,'y_3_label'] = simulation.loc[:, 'y_3']
    for index, row in simulation.iterrows():
        simulation.at[index, 'x_1'] = first_row_values['x_1']
        simulation.at[index, 'y_1'] = first_row_values['y_1']
        simulation.at[index, 'x_2'] = first_row_values['x_2']
        simulation.at[index, 'y_2'] = first_row_values['y_2']
        simulation.at[index, 'x_3'] = first_row_values['x_3']
        simulation.at[index, 'y_3'] = first_row_values['y_3']

list_of_simulations_copy = list_of_simulations.copy()
random.shuffle(list_of_simulations_copy) #we shuffle the data here so we are only shuffling different simulations and not amongst them
all_simulations = pd.concat(list_of_simulations_copy, ignore_index=True)
all_simulations.to_csv('feature_matrix.csv', index=False)


### Load the feature matrix from here after it is done

In [None]:
#load the feature matrix file
total_data = np.genfromtxt('feature_matrix.csv', delimiter=',')
total_data = total_data[1:] 


### Splitting the dataset into training and validation

In [4]:

data_train, data_temp = train_test_split(total_data, train_size=0.7, shuffle=False)
data_vali, data_test = train_test_split(data_temp, test_size=0.5, shuffle=False)


### Splitting the labels from the features

In [None]:
features_train = data_train[:, :7]  
labels_train = data_train[:, 7:] 

features_vali = data_vali[:, :7]  
labels_vali = data_vali[:, 7:] 

features_test = data_test[:, :7]  
labels_test = data_test[:, 7:]

np.linalg.matrix_rank(features_train)

##### Loading a feature matrix and dropping the x1 and y1 columns - splitting in sets and labels from features

In [29]:
feat_matrix_redux = pd.read_csv('feature_matrix.csv')
feat_matrix_redux = feat_matrix_redux.drop(columns=['x_1', 'y_1'])

In [30]:
feat_matrix_redux.to_csv('feat_matrix_redux.csv', index=False)
feat_matrix_redux = feat_matrix_redux.to_numpy()

In [31]:
data_redux_train, data_redux_temp = train_test_split(feat_matrix_redux, train_size=0.7, shuffle=False)
data_redux_vali, data_redux_test = train_test_split(data_redux_temp, test_size=0.5, shuffle=False)

data_redux_train.shape

(762673, 11)

In [33]:
#split the labels from the features
feat_redux_train = data_redux_train[:, :5]  
label_redux_train = data_redux_train[:, 5:] 

feat_redux_vali = data_redux_vali[:, :5]  
label_redux_vali = data_redux_vali[:, 5:] 

feat_redux_test = data_redux_test[:, :5]  
label_redux_test = data_redux_test[:, 5:]


(762673, 6)


In [None]:
# plot the points (isto e porque acho que era interessante termos no deck of slides)

In [7]:
# polynomial features - let's check the best polynomial + ridge regression
# escolher com base no menor MSE com o VALIDATION SET
# entretanto descobri que o sklearn tem uma crossvalidation feature que e capaz de ser bem util no calculo do MSE (literalmente transformar
# aquilo em duas linhas o que e fixolas)
#https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_val_score.html#sklearn.model_selection.cross_val_score

# Create a pipeline with PolynomialFeatures take calculates the MSE
# We can use it to find the 1st polynomial degree which overfits the data (MSE train =0)
# When that happens, we then take that model and do ridge regression for that polynomial features' degree 

# pipelines = [make_pipeline(StandardScaler(), PolynomialFeatures(6), LinearRegression())] #for degree in range(5, 8)]
# # Define the hyperparameters and their respective values to search
# alphas = [1e-15, 1e-10, 1e-8, 1e-5, 1e-3, 1e-2, 0.1, 1.0, 10.0, 20, 30, 35, 40, 50, 60, 75, 80, 100]

# # pipelines = [make_pipeline(StandardScaler(), PolynomialFeatures(degree), Ridge(alpha)) for degree in range(1, 15) for alpha in alphas]

# for pipe in pipelines:
#     pipe.fit(features_train, labels_train)
#     labels_pred_train = pipe.predict(features_train)
#     labels_pred_vali = pipe.predict(features_vali)
#     mse_train = mean_squared_error(labels_train, labels_pred_train)
#     print(f"MSE Train:\t{mse_train}")
#     mse_vali = mean_squared_error(labels_vali, labels_pred_vali)
#     print(f"MSE Vali:\t{mse_vali}")
#     poly_output_feat = pipe[1].n_output_features_
#     print(f"Polynomial Features:\t{poly_output_feat}")

# err=[]
# for k, pipe in enumerate(pipelines):
    
#     err.append[mean_squared_error(labels_vali, labels_pred)]


# # Calculate RMSE for the best model
# labels_train_pred = best_model.predict(features_train)
# rmse = sqrt(mean_squared_error(labels_train, labels_train_pred))
# print("Root Mean Square Error (RMSE) for the best model:", rmse)


In [None]:
#depois no final grafico para o melhor claro e calculamos sqrt(mse) para vermos o quao off estamos - test set
#depois seria implementar isto tudo para as outras matrizes que faltam
#sugeria depois transformar isto do modelo numa funcao para ser mais simples e nao repetirmos codigo
# o mesmo poderia ser feito para a criaçao das matrizes

In [35]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge

# Create a pipeline object
pipeline_redux_ridge = Pipeline([
    ('scaler', StandardScaler()),
    ('polynomial_features', PolynomialFeatures(6)),
    ('regressor', Ridge(100))
])

# Fit the pipeline model to the training data
pipeline_redux_ridge.fit(feat_redux_train, label_redux_train)

# Make predictions on the validation data using the pipeline model
labels_pred_vali_redux = pipeline_redux_ridge.predict(feat_redux_vali)

# Evaluate the performance of the pipeline model on the validation data
mse_vali = mean_squared_error(label_redux_vali, labels_pred_vali_redux, squared=False)
print(f"RMSE Vali:\t{mse_vali}")

# Make predictions on the test data using the pipeline model
labels_pred_test_redux = pipeline_redux_ridge.predict(feat_redux_test)

# Evaluate the performance of the pipeline model on the test data
mse_test = mean_squared_error(label_redux_test, labels_pred_test_redux, squared=False)
print(f"RMSE Test:\t{mse_test}")

RMSE Vali:	1.1511185968083442
RMSE Test:	1.5047333023544216


# Model with Ridge and Grid Search

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.model_selection import GridSearchCV

# Define the range of alpha values to evaluate
alpha_values = np.linspace(0.1, 10, 10)
alphas = [0.5, 0.9]

# Create a pipeline object
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('polynomial_features', PolynomialFeatures(5)),
    ('regressor', Ridge())
])

# Create a grid search
grid_search = GridSearchCV(pipeline, {'regressor__alpha': alphas}, n_jobs=-1)

# Fit the grid search object to the training data
grid_search.fit(features_train, labels_train)

# Get the best model from the grid search
best_model = grid_search.best_estimator_

# Make predictions on the validation data using the best model
labels_pred_vali = best_model.predict(features_vali)

# Evaluate the performance of the best model on the validation data
mse_vali = mean_squared_error(labels_vali, labels_pred_vali, squared=False)
print(f"RMSE Vali:\t{mse_vali}")

# Make predictions on the test data using the best model
labels_pred_test = best_model.predict(features_test)

# Evaluate the performance of the best model on the test data
mse_test = mean_squared_error(labels_test, labels_pred_test, squared=False)
print(f"RMSE Test:\t{mse_test}")


# Model with Ridge Regression without GridSearch

In [2]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge

# Create a pipeline object
pipeline_ridge = Pipeline([
    ('scaler', StandardScaler()),
    ('polynomial_features', PolynomialFeatures(6)),
    ('regressor', Ridge(100))
])

# Fit the pipeline model to the training data
pipeline_ridge.fit(features_train, labels_train)

# Make predictions on the validation data using the pipeline model
labels_pred_vali = pipeline_ridge.predict(features_vali)

# Evaluate the performance of the pipeline model on the validation data
mse_vali = mean_squared_error(labels_vali, labels_pred_vali, squared=False)
print(f"RMSE Vali:\t{mse_vali}")

# Make predictions on the test data using the pipeline model
labels_pred_test = pipeline_ridge.predict(features_test)

# Evaluate the performance of the pipeline model on the test data
mse_test = mean_squared_error(labels_test, labels_pred_test, squared=False)
print(f"RMSE Test:\t{mse_test}")


NameError: name 'StandardScaler' is not defined

# Model with LASSO regression without GridSearch

In [39]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Lasso

# Create a pipeline object
pipeline_lasso = Pipeline([
    ('scaler', StandardScaler()),
    ('polynomial_features', PolynomialFeatures(6)),
    ('regressor', Lasso(alpha=0.1))
])

# Fit the pipeline model to the training data
pipeline_lasso.fit(features_train, labels_train)

# Make predictions on the validation data using the pipeline model
labels_pred_vali = pipeline_lasso.predict(features_vali)

# Evaluate the performance of the pipeline model on the validation data
mse_vali = mean_squared_error(labels_vali, labels_pred_vali, squared=False)
print(f"RMSE Vali:\t{mse_vali}")

# Make predictions on the test data using the pipeline model
labels_pred_test = pipeline_lasso.predict(features_test)

# Evaluate the performance of the pipeline model on the test data
mse_test = mean_squared_error(labels_test, labels_pred_test, squared=False)
print(f"RMSE Test:\t{mse_test}")


RMSE Vali:	1.33016875390813
RMSE Test:	1.1591774716969916


# Trying on real world data - Creating Predictions

In [38]:
# Read the real-world dataset into a Pandas DataFrame and drop the Id column - RUN ONLY ONCE PER TRIALRUN
X_realworld = pd.read_csv(r'C:/Users/duart/OneDrive/Ambiente_de_Trabalho/Master_Analysis_Engineering_Big_Data/23-24/1st_semester/AA_ML/Kaggle_challenges/3_body_problem/3_body_problem/X_test.csv')
id_column = X_realworld['Id']
print(id_column.head())
X_realworld.drop('Id', axis=1, inplace=True)
X_realworld.to_csv('X_realworld.csv', index=False)


0    0
1    1
2    2
3    3
4    4
Name: Id, dtype: int64


## CHANGE TO THE FULL NAME OF THE PIPELINE YOU WANT TO CALL!

In [1]:
# # Preprocess the data using the pipeline
# # Preprocess the real-world data
# X_realworld_processed = pipeline.named_steps['scaler'].transform(X_realworld)
# X_realworld_processed = pipeline.named_steps['polynomial_features'].transform(X_realworld_processed)


# Make predictions on the preprocessed real-world data
predictions_realworld = pipeline_ridge.predict(X_realworld)


# Create a new Pandas DataFrame with the predictions
df_predictions = pd.DataFrame(predictions_realworld)
df_predictions.insert(loc=0, column='Id', value = id_column)
df_predictions.columns=['Id', 'x_1', 'y_1', 'x_2', 'y_2', 'x_3', 'y_3']

# Submit the Pandas DataFrame to the challenge creator
df_predictions.to_csv('predictions.csv', index=False)

NameError: name 'pipeline_ridge' is not defined

In [31]:
print(df_predictions.shape)

(1041621, 7)
