In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import PolynomialFeatures, PowerTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, HuberRegressor
from sklearn.metrics import r2_score, mean_absolute_error, root_mean_squared_error, make_scorer
from sklearn.decomposition import PCA

In [2]:
# Paths of the three input CSV files.
path_archivo = "../input_data/combi_500_100.csv"
path_rand1 = "../input_data/uni_52500_1.csv"
path_rand2 = "../input_data/uni_52500_2.csv"
# Read them in declaration order; each name receives its own DataFrame.
datos_combi, datos_rand1, datos_rand2 = (
    pd.read_csv(ruta) for ruta in (path_archivo, path_rand1, path_rand2)
)

In [3]:
# Order rows by eta, then beta, then phi_K, and renumber the index from 0.
# NOTE(review): graficar_todas slices frames in consecutive chunks of N_phi_K rows;
# presumably this ordering is what makes each chunk one (eta, beta) pair - confirm.
datos_combi = datos_combi.sort_values(by=["eta", "beta", "phi_K"]).reset_index(drop=True)

In [4]:
N_eig = 5
target = "phi_K"


def preprocesar_datos(d_frame, N_eig, target):
    """Build the modelling table from a raw data frame.

    Works on a copy, so ``d_frame`` is not modified. Columns ``eig_1`` ..
    ``eig_<N_eig>`` are replaced by the natural logarithm of a cumulative
    reciprocal ratio: ``eig_1 -> log(1 / eig_1)`` and, for ``k > 1``,
    ``eig_k -> log(ratio_{k-1} / eig_k)``, where ``ratio_{k-1}`` is the
    un-logged value computed for the previous column.

    Rows are then removed when they contain a NaN in any column or a
    non-finite value in any numeric column. Non-numeric columns (for
    example a text column) are ignored by the finiteness check instead of
    making it fail.

    Parameters
    ----------
    d_frame : pandas.DataFrame
        Must contain ``target``, ``eig_0``, ``eta``, ``beta`` and
        ``eig_1`` .. ``eig_<N_eig>``.
    N_eig : int
        Number of eigenvalue columns (starting at ``eig_1``) to transform.
    target : str
        Name of the target column.

    Returns
    -------
    pandas.DataFrame
        Columns ``[target, "eig_0", "eta", "beta", "eig_1", ..., "eig_<N_eig>"]``
        for the surviving rows, keeping their original index labels.

    Raises
    ------
    KeyError
        If any required column is missing from ``d_frame``.
    """
    features_eig = ["eig_" + str(k) for k in range(1, N_eig + 1)]
    features_tot = [target, "eig_0", "eta", "beta"] + features_eig
    dat_copy = d_frame.copy()

    # Each ratio is built from the previous un-logged ratio; only the log is stored.
    ratio_previo = 1
    for col in features_eig:
        ratio_previo = ratio_previo / dat_copy[col]
        # Zero or negative ratios give inf/NaN here; those rows are dropped
        # below, so the corresponding floating-point warnings are just noise.
        with np.errstate(divide="ignore", invalid="ignore"):
            dat_copy[col] = np.log(ratio_previo)

    dat_copy = dat_copy.dropna()

    # Check finiteness on numeric columns only: np.isfinite raises TypeError on
    # object/string columns. A positional boolean mask is used so that
    # duplicated index labels cannot cause valid rows to be dropped.
    numericas = dat_copy.select_dtypes(include=[np.number])
    filas_finitas = np.isfinite(numericas.to_numpy(dtype=float)).all(axis=1)
    dat_copy = dat_copy.loc[filas_finitas]

    return dat_copy[features_tot]

In [5]:
# Run the same preprocessing on every dataset and rebind the original names.
# NOTE(review): re-running this cell applies the transform to already
# transformed data, because the inputs are overwritten.
datos_combi, datos_rand1, datos_rand2 = (
    preprocesar_datos(tabla, N_eig, target)
    for tabla in (datos_combi, datos_rand1, datos_rand2)
)

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


In [6]:
# Number of rows left in each dataset after preprocessing.
metadata_temporal = dict(
    combi=len(datos_combi),
    rand1=len(datos_rand1),
    rand2=len(datos_rand2),
)
print(metadata_temporal)

{'combi': 52500, 'rand1': 52494, 'rand2': 52493}


In [7]:
# Chunk length (rows) used by graficar_todas when slicing a frame.
N_phi_K = 500
N_datos = len(datos_combi)
# Number of complete N_phi_K-row chunks contained in datos_combi.
N_partes = N_datos // N_phi_K
print(N_partes)

105


In [8]:
def graficar_lambdas(data_frame, target, n_eig, preds = {}, op = lambda x: x, nombre = "lineal/", nombre_feat = "lambda_moño"):
    """Plot columns ``pca0`` .. ``pca<n_eig-1>`` against ``target`` and save a PNG.

    Parameters:
        data_frame: frame with ``eta``, ``beta``, ``target`` and the ``pca<k>`` columns.
        target: name of the column used for the x axis.
        n_eig: number of ``pca<k>`` columns to draw.
        preds: when non-empty, ``preds["y"]`` is used as a second x axis for the
            same curves and the output goes under ``pred/``. Never mutated here.
        op: callable applied to the ``pca<k>`` columns before plotting.
        nombre: sub-path inserted after ``pictures/`` in the output file name.
        nombre_feat: y-axis label.

    The file name and title use the first row's eta and beta divided by pi,
    cut to four characters. The figure is closed after saving.
    """
    hay_preds = len(preds) != 0
    cols_eig = ["pca" + str(k) for k in range(n_eig)]
    cols_preds = ["pred_" + col for col in cols_eig] if hay_preds else []

    # Labels come from the first row only.
    eta = str(np.array(data_frame["eta"])[0]/np.pi)[:4] + "pi"
    beta = str(np.array(data_frame["beta"])[0]/np.pi)[:4] + "pi"

    x = data_frame[target]
    ys = op(data_frame[cols_eig])

    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(x, ys)
    if hay_preds:
        # NOTE(review): this draws the same ys against preds["y"]; confirm that
        # is the intended "prediction" curve.
        ax.plot(preds["y"], ys)
        nombre = "pred/" + nombre

    ax.set_xlabel(target)
    ax.set_ylabel(nombre_feat)
    ax.set_title("".join(["eta=", eta, ", beta=", beta]))
    ax.legend(cols_eig + cols_preds)

    ruta_salida = "".join(["pictures/", nombre, "eta=", eta, ",beta=", beta, "_", ".png"])
    plt.savefig(ruta_salida)
    plt.close()

In [9]:
def graficar_todas(d_frame, n_eig, target, op = lambda x: x, preds = {}, nombre = "lineal/", nombre_feat = "l_moño"):
    """Call ``graficar_lambdas`` once per consecutive chunk of ``d_frame``.

    The frame is cut positionally into ``N_partes`` chunks of ``N_phi_K`` rows
    (both read from module scope). When ``preds`` is non-empty, ``preds["y"]``
    and ``preds["X"]`` are sliced with the same bounds and passed along;
    otherwise an empty dict is passed.
    """
    hay_preds = len(preds) != 0
    for n in range(N_partes):
        tramo = slice(N_phi_K*n, N_phi_K*(n+1))
        if hay_preds:
            preds_fin = {"y": preds["y"][tramo], "X": preds["X"].iloc[tramo]}
        else:
            preds_fin = {}
        graficar_lambdas(d_frame.iloc[tramo], target, n_eig, preds_fin, op, nombre, nombre_feat)

In [10]:
def get_metrics(X, y, model):
    """Score ``model`` on ``(X, y)``.

    Calls ``model.predict(X)`` once and returns a dict with the keys
    ``"R2"``, ``"RMSE"`` and ``"MAE"`` computed against ``y``.
    """
    y_gorro = model.predict(X)
    return {
        "R2": r2_score(y, y_gorro),
        "RMSE": root_mean_squared_error(y, y_gorro),
        "MAE": mean_absolute_error(y, y_gorro),
    }

class TransformadorDeDatos(BaseEstimator, TransformerMixin):
    """Pipeline step that expands selected columns into polynomial features.

    Parameters
    ----------
    cols_tr : list of str
        Columns of the incoming DataFrame to expand.
    deg : int, default 3
        Polynomial degree passed to ``PolynomialFeatures``.
    """

    def __init__(self, cols_tr, deg = 3):
        self.deg = deg
        self.cols_tr = cols_tr

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X):
        """Return the polynomial expansion of ``X[cols_tr]`` as a DataFrame.

        The result has one row per row of ``X`` and keeps ``X``'s index, so it
        can be aligned with the original rows or with the target. Column names
        come from ``PolynomialFeatures.get_feature_names_out``; no bias column
        is included. ``X`` is not modified.
        """
        cols_transformar = self.cols_tr
        # Built on every call from the stored hyper-parameters, so transform
        # also works on an instance that was never fitted.
        poly = PolynomialFeatures(degree=self.deg, include_bias=False)
        poly_data = poly.fit_transform(X[cols_transformar])
        poly_feature_names = poly.get_feature_names_out(input_features=cols_transformar)
        # Keep the caller's index instead of silently resetting it to 0..n-1.
        return pd.DataFrame(poly_data, columns=poly_feature_names, index=X.index)

def crear_pipeline(modelo, grado, nombre_modelo, cols_tr):
    """Build a two-step Pipeline: polynomial expansion, then ``modelo``.

    Parameters:
        modelo: estimator used as the final step.
        grado: polynomial degree given to ``TransformadorDeDatos``.
        nombre_modelo: step name under which ``modelo`` is registered.
        cols_tr: columns handed to ``TransformadorDeDatos``.
    """
    pasos = [
        ('custom-transformer', TransformadorDeDatos(cols_tr, deg = grado)),
        # Disabled step, kept for reference:
        #('power-transformer', PowerTransformer(method = "box-cox", standardize = True)),
        (nombre_modelo, modelo),
    ]
    return Pipeline(pasos)

In [11]:
# Degree of the polynomial expansion.
orden_pol = 4
# Model inputs: eta, beta and eig_1 .. eig_N_eig (eig_0 is not used as a feature).
features = ["eta", "beta"] + ["eig_" + str(k) for k in range(1, N_eig + 1)]
pipeline = crear_pipeline(LinearRegression(), orden_pol, "re_lineal_simple", features)

In [12]:
# Split every dataset into its feature matrix and its target column.
X_combi, y_combi = datos_combi[features], datos_combi[target]
X_rand1, y_rand1 = datos_rand1[features], datos_rand1[target]
X_rand2, y_rand2 = datos_rand2[features], datos_rand2[target]

In [13]:
# Training set = combi rows followed by rand2 rows (index labels are kept, so
# they may repeat); rand1 is not included.
X_train = pd.concat([X_combi, X_rand2], axis=0)
y_train = pd.concat([y_combi, y_rand2], axis=0)

In [14]:
# Fit the polynomial-feature + regression pipeline on the training set.
pipeline.fit(X_train, y_train)

In [15]:
# Score the fitted pipeline on the training data and on the rand1 set.
metrs_train, metrs_test = (
    get_metrics(X_eval, y_eval, pipeline)
    for X_eval, y_eval in ((X_train, y_train), (X_rand1, y_rand1))
)

In [16]:
# One metrics dict per line: training first, then rand1.
print(metrs_train, metrs_test, sep="\n")

{'R2': 0.9037131260236186, 'RMSE': np.float64(0.140476163548656), 'MAE': np.float64(0.09567159649359941)}
{'R2': 0.6399157027955809, 'RMSE': np.float64(0.2721914333311182), 'MAE': np.float64(0.10518574928896558)}


In [17]:
path_archivo_kg = "../input_data/KG_random.csv"


def str_to_ndarray(x):
    """Parse a space-separated string of numbers into a 1-D NumPy array."""
    return np.fromstring(x, sep=' ')


# The 'eigvals' column is converted from text to arrays while reading.
datos_kg_random = pd.read_csv(path_archivo_kg, converters={'eigvals': str_to_ndarray})

In [18]:
# Keep only the rows whose shape is "parallelepiped", as an independent copy.
datos_kg_random = datos_kg_random.loc[datos_kg_random["shape"] == "parallelepiped"].copy()

In [19]:
# Remove the "rho" column in place.
# NOTE(review): running this cell a second time raises KeyError because the
# column is already gone.
del datos_kg_random["rho"]

In [20]:
# Display the filtered table (eigvals still holds one array per row).
datos_kg_random

Unnamed: 0,K,G,dx,dy,dz,shape,eigvals
98304,1.137564,0.925310,0.345791,0.722980,0.454774,parallelepiped,"[-1.5547410733937762e-13, -6.69847392884328e-1..."
98305,5.500412,5.321621,0.402060,0.999638,0.807374,parallelepiped,"[-2.1667717054444154e-12, -2.0543006395375965e..."
98306,3.375617,3.980662,0.429638,0.851665,0.529495,parallelepiped,"[-4.817232318550446e-13, -3.323750112763421e-1..."
98307,1.448420,4.074601,0.151318,0.942633,0.328977,parallelepiped,"[-2.6737693556924537e-12, -2.351597437975512e-..."
98308,0.351677,5.407791,0.446276,0.792512,0.989446,parallelepiped,"[-1.6759891759031827e-13, -3.9777497738663696e..."
...,...,...,...,...,...,...,...
131067,0.603325,3.002608,0.720661,0.439132,0.152189,parallelepiped,"[-2.9206283664231715e-12, -7.439679690463981e-..."
131068,2.302202,2.474801,0.132153,0.607512,0.605882,parallelepiped,"[-3.9535462097488675e-13, -1.163763376545102e-..."
131069,4.789391,3.091901,0.174914,0.402842,0.230977,parallelepiped,"[-5.3840655925037205e-12, -2.069314763230146e-..."
131070,0.920026,0.420177,0.185685,0.716565,0.322597,parallelepiped,"[-4.557497223449767e-13, -3.4356261396269377e-..."


In [21]:
# Spread the eigenvalue arrays into scalar columns eig_0 .. eig_N_eig.
# Array entries 0-5 are skipped: column eig_q takes entry 6 + q.
# NOTE(review): presumably the first six entries are the near-zero values seen
# in the table preview - confirm.
for q in range(N_eig + 1):
    datos_kg_random['eig_' + str(q)] = datos_kg_random['eigvals'].apply(
        lambda arr, pos=6 + q: arr[pos]
    )

# The array column is no longer needed once the scalars are extracted.
datos_kg_random = datos_kg_random.drop(columns='eigvals')

In [22]:
# Preview the first rows with the new eig_0 .. eig_5 columns.
datos_kg_random.head()

Unnamed: 0,K,G,dx,dy,dz,shape,eig_0,eig_1,eig_2,eig_3,eig_4,eig_5
98304,1.137564,0.92531,0.345791,0.72298,0.454774,parallelepiped,3.344897,4.253522,5.383594,10.070057,10.958655,12.371888
98305,5.500412,5.321621,0.40206,0.999638,0.807374,parallelepiped,6.564257,10.320303,18.541555,20.077801,24.790483,25.277627
98306,3.375617,3.980662,0.429638,0.851665,0.529495,parallelepiped,10.756621,12.849225,15.328886,29.027839,33.703187,38.74468
98307,1.44842,4.074601,0.151318,0.942633,0.328977,parallelepiped,1.65911,5.366767,5.584904,10.40584,17.398203,22.070203
98308,0.351677,5.407791,0.446276,0.792512,0.989446,parallelepiped,2.555409,4.226163,4.688969,7.011962,7.698858,8.124265


In [23]:
# Target angle from the two moduli: phi_K = arctan(G / K).
datos_kg_random["phi_K"] = np.arctan(datos_kg_random["G"]/datos_kg_random["K"])

# Index-label bounds that were previously iterated one by one. The computation
# below covers every row of the frame, so this list is no longer read here; it
# is only kept in case a later cell uses it.
indexes_kg_random = [98304, 131071]

# Sort each row's (dx, dy, dz) in ascending order: column 0 is the smallest
# dimension and column 2 the largest.
dims_ordenadas = np.sort(datos_kg_random[["dx", "dy", "dz"]].to_numpy(dtype=float), axis=1)
d_min, d_med, d_max = dims_ordenadas[:, 0], dims_ordenadas[:, 1], dims_ordenadas[:, 2]

# Same formulas as the former per-row loop, evaluated for all rows at once.
# Arrays are assigned positionally, so the frame's index labels are irrelevant.
r = (d_min**2 + d_med**2 + d_max**2)**0.5
datos_kg_random["eta"] = 2*np.arccos(d_max/r)
datos_kg_random["beta"] = 4*np.arctan(d_min/d_med)

datos_kg_random = datos_kg_random.copy()

In [24]:
# Apply the same feature preprocessing used for the training data; the result
# is stored under a new name, so datos_kg_random itself is left as it was.
datos_kg_random_copy = preprocesar_datos(datos_kg_random, N_eig, target)

In [25]:
# Display the preprocessed KG table (target, eig_0, eta, beta, log features).
datos_kg_random_copy

Unnamed: 0,phi_K,eig_0,eta,beta,eig_1,eig_2,eig_3,eig_4,eig_5
98304,0.682866,3.344897,1.337486,2.600392,-1.447747,-3.131104,-5.440670,-7.834799,-10.350226
98305,0.768879,6.564257,1.468138,1.848136,-2.334113,-5.254128,-8.253742,-11.464202,-14.694122
98306,0.867462,10.756621,1.350259,2.726644,-2.553284,-5.283023,-8.651278,-12.168870,-15.825864
98307,1.229252,1.659110,0.733530,1.724437,-1.680226,-3.400293,-5.742660,-8.599027,-11.693256
98308,1.505856,2.555409,1.486674,2.051426,-1.441294,-2.986507,-4.934125,-6.975197,-9.070052
...,...,...,...,...,...,...,...,...,...
131067,1.372503,2.463035,1.145569,1.334456,-1.533379,-3.704920,-6.377112,-9.076059,-11.823566
131068,0.821514,2.507252,1.591347,0.859013,-1.665130,-3.466072,-6.018688,-8.574453,-11.812814
131069,0.573257,36.155819,1.247024,2.592575,-3.765774,-7.789314,-12.513577,-17.378115,-22.303093
131070,0.428413,1.034104,0.958174,2.089119,-0.240687,-0.942454,-2.556530,-4.173563,-5.809895


In [26]:
# Feature matrix and target for the KG evaluation set.
X_test3, y_test3 = datos_kg_random_copy[features], datos_kg_random_copy[target]

In [27]:
# Score the already fitted pipeline on the KG set (no refitting).
metrs3 = get_metrics(X_test3, y_test3, pipeline)

In [28]:
# The recorded output shows R2 of about -7009 and RMSE of about 30.7 on this
# set, far worse than on the train and rand1 sets.
print(metrs3)

{'R2': -7009.65404772543, 'RMSE': np.float64(30.74504221337483), 'MAE': np.float64(13.418588050670364)}
