# PREPROCESSING

In [25]:
import pandas as pd
import numpy as np
import ast

from glob import glob
import pickle as pkl

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import RadiusNeighborsRegressor
from sklearn.svm import SVR
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

from processing.parsing import *
from processing.encodings import *

In [2]:
df_ = pd.read_parquet('processed_data/tatarabuela.parquet')
df_ = df_[['price', 'lat', 'lng', 'characteristics', 'agency', 'updated', 'timestamp', 'province']].dropna()
df_churro = freeChurro(df_)
df = pd.concat([df_.reset_index(drop=True), df_churro], axis = 1)
df.drop('characteristics', axis = 1, inplace = True)

In [3]:
def tryParseM2(obj):
    try:
        if 'm²' in obj:
            return float(obj[:-2])
        return float(''.join(obj.split('.')))
    except:
        return np.nan

# df['Superficie construida'].apply(lambda x : x if pd.isna(x) else tryParseM2(x))

In [4]:
df.Baños = df.Baños.astype(float)
df['Superficie construida'] = df['Superficie construida'].apply(lambda x : x if pd.isna(x) else tryParseM2(x))

df['bathrooms'] = df['Baños']
df['surface'] = df['Superficie construida']

df.drop(['Superficie construida', 'Baños'], axis=1, inplace=True)

In [5]:
df = df[['price', 'lat', 'lng', 'Habitaciones', 'Jardín', 'Gastos de comunidad', 'Antigüedad', 'Superficie útil', 'Ascensor', 'Garaje', 'Conservación', 'agency', 'updated', 'timestamp', 'surface', 'bathrooms', 'province']]

In [6]:
df

Unnamed: 0,price,lat,lng,Habitaciones,Jardín,Gastos de comunidad,Antigüedad,Superficie útil,Ascensor,Garaje,Conservación,agency,updated,timestamp,surface,bathrooms,province
0,75000.0,38.628784,-0.761397,2,,,,72 m²,Ascensor,,,Inmuebles de Topbrokers,1.673392e+09,1.699187e+15,76.0,1.0,alicante
1,56000.0,38.535113,-0.821114,3,,,Entre 20 y 30 años,96 m²,,,A reformar,Inmuebles de CICLOACTIVOS S.L.,1.673997e+09,1.699187e+15,114.0,2.0,alicante
2,39800.0,38.631641,-0.860810,2,,,,61 m²,,,,Inmuebles de ALTAMIRA,1.673133e+09,1.699187e+15,82.0,1.0,alicante
3,119500.0,38.537183,-0.817027,5,,,,220 m²,,,En buen estado,Inmuebles de Grupo Ideas Sax,1.674170e+09,1.699187e+15,220.0,3.0,alicante
4,103000.0,38.631503,-0.765994,4,,,,150 m²,,,,Inmuebles de Quo Real Estate,1.672701e+09,1.699187e+15,170.0,2.0,alicante
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
276087,69000.0,41.599972,-1.281389,10,,,,290 m²,,1,,Inmuebles de ZARADELTA,1.674947e+09,1.699336e+15,302.0,3.0,zaragoza
276088,256800.0,41.482034,-1.373726,4,,,,215 m²,,,,Inmuebles de GTI ZARAGOZA,1.672960e+09,1.699336e+14,236.0,6.0,zaragoza
276089,105000.0,41.578195,-1.116482,,,,,,,,A estrenar,Inmuebles de Aliseda Inmobiliaria,1.674343e+09,1.699336e+15,76.0,,zaragoza
276090,89000.0,41.483506,-1.371575,4,,,,100 m²,,,,Inmuebles de INSERCONS EJEA DE LOS CABALLEROS,1.673997e+09,1.699336e+15,113.0,1.0,zaragoza


In [7]:
df['rooms'] = df['Habitaciones']
df = df.drop('Habitaciones', axis = 1)

df['garden'] = df['Jardín']
df = df.drop('Jardín', axis = 1)

df['community_expenses'] = df['Gastos de comunidad']
df = df.drop('Gastos de comunidad', axis = 1)

df['age'] = df['Antigüedad']
df = df.drop('Antigüedad', axis = 1)

df['useful_surface'] = df['Superficie útil']
df = df.drop('Superficie útil', axis = 1)

df['elevator'] = df['Ascensor']
df = df.drop('Ascensor', axis = 1)

df['garage'] = df['Garaje']
df = df.drop('Garaje', axis = 1)

df['state'] = df['Conservación']
df = df.drop('Conservación', axis = 1)

In [8]:
df = df.drop(['agency', 'updated', 'timestamp', 'community_expenses'], axis = 1)

In [9]:
df.rooms = df.rooms.astype(float)

In [10]:
df.garden = df.garden.fillna(False).apply(lambda x: True if x else False)

In [11]:
age_col = df['age'].unique()
ages = {' Menos de 5 años': 0,
                ' Entre 5 y 10 años' : 5,
                ' Entre 10 y 20 años' : 10,
                ' Entre 20 y 30 años' : 20,
                ' Entre 30 y 50 años' : 30,
                ' Más de 50 años': 50,}

In [12]:
df['age'] = df['age'].replace(ages)

In [13]:
df['useful_surface'] = df['useful_surface'].apply(lambda x : x if pd.isna(x) else tryParseM2(x))

In [14]:
df.elevator = df.elevator.fillna(False).apply(lambda x: True if x else False)

In [15]:
df.garage = df.garage.fillna(False).apply(lambda x: True if x else False)

In [16]:
df_sample = df.sample(round(df.shape[0]*0.15), random_state=42)
df_ = df.reset_index()[~df.reset_index()['index'].isin(df_sample.reset_index()['index'])].drop(['index'], axis=1)

In [17]:
criba = []
total_provinces = len(df.province.unique())
for i in range(1,20_000,1000):
    provinces = [province for province in df.province.unique() if len(df[df['province'] == province]) > i]
    percentage = 1 - df[df['province'].isin(provinces)].shape[0] / df.shape[0]
    n_models = len(provinces)/total_provinces
    criba.append(tuple([percentage, n_models]))

In [18]:
import plotly.express as px
px.line(criba)

In [19]:
provinces_to_train = [province for province in df.province.unique() if len(df[df['province'] == province]) > 4000]

In [None]:
for province in provinces_to_train:
    df_province = df[df['province'] == province].drop('province', axis = 1)
    df_province.to_parquet(f'./processed_data/provinces/data_{province}.parquet')

In [20]:
df[~df['province'].isin(provinces_to_train)].to_parquet(f'./processed_data/provinces/data_25.parquet')

In [21]:
df_25 = pd.read_parquet('./processed_data/provinces/data_25.parquet')
df_25

Unnamed: 0,price,lat,lng,surface,bathrooms,province,rooms,garden,age,useful_surface,elevator,garage,state
104110,385000.0,42.834200,-2.788629,160.0,3.0,alava_araba,4.0,False,10.0,150.0,False,False,En buen estado
104111,46100.0,42.672399,-2.839510,426.0,2.0,alava_araba,4.0,False,50.0,293.0,False,False,
104112,155000.0,42.875177,-3.147517,400.0,1.0,alava_araba,5.0,True,50.0,300.0,False,False,A reformar
104113,850000.0,42.801684,-2.898858,370.0,4.0,alava_araba,6.0,True,10.0,360.0,False,True,
104114,450000.0,42.728258,-2.860074,350.0,3.0,alava_araba,4.0,False,,300.0,False,True,En buen estado
...,...,...,...,...,...,...,...,...,...,...,...,...,...
276087,69000.0,41.599972,-1.281389,302.0,3.0,zaragoza,10.0,False,,290.0,False,True,
276088,256800.0,41.482034,-1.373726,236.0,6.0,zaragoza,4.0,False,,215.0,False,False,
276089,105000.0,41.578195,-1.116482,76.0,,zaragoza,,False,,,False,False,A estrenar
276090,89000.0,41.483506,-1.371575,113.0,1.0,zaragoza,4.0,False,,100.0,False,False,


In [143]:
df_prueba = pd.read_parquet('./processed_data/provinces/data_25.parquet')

In [144]:
df_prueba

Unnamed: 0,price,lat,lng,surface,bathrooms,province,rooms,garden,age,useful_surface,elevator,garage,state
104110,385000.0,42.834200,-2.788629,160.0,3.0,alava_araba,4.0,False,10.0,150.0,False,False,En buen estado
104111,46100.0,42.672399,-2.839510,426.0,2.0,alava_araba,4.0,False,50.0,293.0,False,False,
104112,155000.0,42.875177,-3.147517,400.0,1.0,alava_araba,5.0,True,50.0,300.0,False,False,A reformar
104113,850000.0,42.801684,-2.898858,370.0,4.0,alava_araba,6.0,True,10.0,360.0,False,True,
104114,450000.0,42.728258,-2.860074,350.0,3.0,alava_araba,4.0,False,,300.0,False,True,En buen estado
...,...,...,...,...,...,...,...,...,...,...,...,...,...
276087,69000.0,41.599972,-1.281389,302.0,3.0,zaragoza,10.0,False,,290.0,False,True,
276088,256800.0,41.482034,-1.373726,236.0,6.0,zaragoza,4.0,False,,215.0,False,False,
276089,105000.0,41.578195,-1.116482,76.0,,zaragoza,,False,,,False,False,A estrenar
276090,89000.0,41.483506,-1.371575,113.0,1.0,zaragoza,4.0,False,,100.0,False,False,


# Pos-procesamiento

In [None]:
df['age'] = df['age'].fillna(df['age'].mean()) # PROBAR IMPUTAR CON KNN

In [53]:
df.state = df.state.fillna("En buen estado")

In [54]:
df_num = df._get_numeric_data()
df_cat = df.drop(df_num.columns, axis = 1)
df_cat, encodings = targetEncoding(df_cat, df_num['price'], 'target_median')
df = pd.concat([df_num, df_cat], axis = 1)

In [56]:
df_ = outliersFilter(df = df_, min_price = 50_000, max_price = 5_000_000, max_baths = 15, max_surface = 1_000)

In [57]:
# df = df_.copy()

# type encoding

In [58]:
# # df_type, encodings_type = targetEncoding(df[['type']], df['price'], condition = 'target_mean')
# df_type, encodings_type = frequencyEncoding(df[['type']])
# df['type'] = df_type['type']

# garage, lift, garden and publisher encoding

In [59]:
# df = binaryEncoding(df)

# condition encoding

In [60]:
# df_condition, encodings_condition = frequencyEncoding(df[['condition']])
# df['condition'] = df_condition['condition']

In [61]:
# df_surface = df[['surface', 'net_surface']].dropna()
# np.corrcoef(df_surface['surface'], df_surface['net_surface'])

In [62]:
df_.dropna().shape[0]/df_.shape[0]

0.6636716520689183

# TRAINING

In [63]:
df_ = df_[~df_['price'].isna()].dropna()

X = df_.drop(['price'], axis = 1)
y = df_[['price']]

In [64]:
imputer = KNNImputer(n_neighbors=3)

# X = imputer.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

model = RandomForestRegressor(n_jobs = -1, random_state = 42, n_estimators = 100)

model.fit(X_train, y_train)

yhat = model.predict(X_test)

pd.DataFrame([[r2_score(y_test, yhat),
              mean_absolute_error(y_test, yhat),
              mean_squared_error(y_test, yhat)]],
             columns = ['r2', 'mae', 'mse'])

  return fit_method(estimator, *args, **kwargs)


Unnamed: 0,r2,mae,mse
0,0.737474,81036.039059,35650860000.0


# Validation

In [65]:
# df_sample['type'] = df_sample['type'].replace(encodings_type['type'])

# df_sample = binaryEncoding(df)

# df_sample['condition'] = df_sample['condition'].replace(encodings_condition['condition'])

df_sample['state'].replace(encodings['state'])

df_validation = df_sample.dropna()

In [142]:
encodings

{'state': state
  A estrenar        272000.0
  A reformar        132530.0
  En buen estado    197000.0
  Reformado         200000.0
 En buen estado     190000.0
 Name: price, dtype: float64}

In [66]:
df_validation

Unnamed: 0,price,lat,lng,surface,bathrooms,rooms,garden,age,useful_surface,elevator,garage,state
171709,210000.0,41.681706,2.792384,153.0,2.0,3.0,False,25.1211,120.0,True,False,190000.0
274407,200000.0,41.655024,-0.888262,140.0,1.0,4.0,False,25.1211,125.0,True,False,197000.0
109220,139000.0,37.392931,-1.945951,110.0,2.0,3.0,False,25.1211,100.0,True,True,197000.0
11536,84000.0,38.357757,-0.484963,85.0,1.0,2.0,False,25.1211,80.0,False,False,197000.0
51355,350000.0,37.094700,-4.387500,140.0,2.0,8.0,True,20.0000,140.0,False,True,200000.0
...,...,...,...,...,...,...,...,...,...,...,...,...
634,280000.0,38.078408,-0.654891,80.0,2.0,2.0,True,25.1211,65.0,True,False,197000.0
119298,50000.0,43.204895,-5.791440,86.0,1.0,3.0,False,30.0000,86.0,False,False,190000.0
183262,85000.0,37.158049,-3.531664,77.0,1.0,3.0,False,25.1211,67.0,True,False,190000.0
181573,99000.0,37.219500,-3.688100,128.0,1.0,4.0,False,25.1211,105.0,False,False,190000.0


In [67]:
X_val = df_validation.drop(['price'], axis = 1)
y_val = df_validation[['price']]

yhat = model.predict(X_val)

pd.DataFrame([[r2_score(y_val, yhat),
              mean_absolute_error(y_val, yhat),
              mean_squared_error(y_val, yhat)]],
             columns = ['r2', 'mae', 'mse'])

Unnamed: 0,r2,mae,mse
0,0.61052,85603.197519,80600640000.0


In [77]:
print(type(X), X.shape)
print(type(X_val), X_val.shape)
print(type(y), y.shape)
print(type(y_val), y_val.shape)

<class 'pandas.core.frame.DataFrame'> (140711, 11)
<class 'pandas.core.frame.DataFrame'> (26089, 11)
<class 'pandas.core.frame.DataFrame'> (140711, 1)
<class 'pandas.core.frame.DataFrame'> (26089, 1)


In [69]:
import pickle as pkl

with open('./model.pkl', 'bw') as file:
    pkl.dump(model, file)

In [78]:
# X = X.to_numpy().astype(np.float32)
# y = y.to_numpy().astype(np.float32)
# X_val = X_val.to_numpy().astype(np.float32)
# y_val = y_val.to_numpy().astype(np.float32)

In [82]:
# import tensorflow as tf
# from keras.models import Sequential
# from keras.layers import Input, Dense
# from keras.metrics import R2Score, MeanSquaredLogarithmicError, MeanSquaredError, MeanAbsoluteError

# with tf.device("/GPU:0"):

#     model = Sequential()
    
#     model.add(Input(shape=(None, 11)))
#     model.add(Dense(128*8, activation='relu', kernel_initializer='he_normal'))
#     model.add(Dense(64*8, activation='relu', kernel_initializer='he_normal'))
#     model.add(Dense(64*2, activation='relu', kernel_initializer='he_normal'))
#     model.add(Dense(1, kernel_initializer='he_normal'))

#     model.compile(optimizer = 'adam', loss = 'mse', metrics = [R2Score(), MeanSquaredLogarithmicError(), MeanAbsoluteError()])

#     history = model.fit(X, y, epochs = 50, validation_data=(X_val, y_val), batch_size=64, validation_batch_size=64)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
 311/2199 [===>..........................] - ETA: 26s - loss: 113705975808.0000 - r2_score: 0.2347 - mean_squared_logarithmic_error: 0.4849 - mean_absolute_error: 169808.4375

KeyboardInterrupt: 

In [None]:
# import plotly.express as px
# fig = px.line(pd.DataFrame(history.history))

In [None]:
# fig

# Entrenamiento y validación de los 20 modelos
- 1 modelo -> 25% resto de España (tiene que entrenarse aparte de los demás porque tiene una columna extra)
- 19 modelos -> las provincias con más de 4.000 publicaciones

In [26]:
df_journal = pd.DataFrame(columns=['file', 'stage', 'r2', 'mae', 'mse', 'with_outliers']) # registro de las métricas para revisarlas a posterior

In [27]:
with_outliers = [True, False]
files = glob('./processed_data/provinces/*.parquet') # lista de direcciones de todos los archivos para tenerlos desde el inicio
# files

In [28]:
# Entrenamos con y sin outliers usando el archivo 'data_25.parquet'

for boolean in with_outliers:
  df_25 = pd.read_parquet(files[0])
  file_name = files[0].split('\\')[-1]

  df_25_validation = getSample(df_25, 0.15)
  df_25_train = df_25.reset_index()[~df_25.reset_index()['index'].isin(df_25_validation.reset_index()['index'])].drop(['index'], axis=1)

  if boolean == False:
      df_25_train = outliersFilter(df = df_25_train, min_price = 50_000, max_price = 5_000_000, max_baths = 15, max_surface = 1_000)

  df_25_cat, encodings = frequencyEncoding(df_25_train[['province', 'state']])
  df_25_train = pd.concat([df_25_train._get_numeric_data(), df_25_cat], axis = 1)

  df_25_train = df_25_train[~df_25_train['price'].isna()].dropna()

  X = df_25_train.drop(['price'], axis = 1)
  y = df_25_train[['price']]

  imputer = KNNImputer(n_neighbors=3)
  X = imputer.fit_transform(X)
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
  model = RandomForestRegressor(n_jobs = -1, random_state = 42, n_estimators = 100)
  model.fit(X_train, y_train)
  yhat = model.predict(X_test)
  df_journal = pd.concat([df_journal, pd.DataFrame([[file_name,
                                                    'train',
                                                    r2_score(y_test, yhat),
                                                    mean_absolute_error(y_test, yhat),
                                                    mean_squared_error(y_test, yhat),
                                                    boolean]],
                                                  columns = ['file', 'stage', 'r2', 'mae', 'mse', 'with_outliers'])])
  
  df_25_validation['state'] = df_25_validation['state'].replace(encodings['state'])
  df_25_validation['province'] = df_25_validation['province'].replace(encodings['province'])
  df_25_validation = df_25_validation.dropna()

  X_val = df_25_validation.drop(['price'], axis = 1)
  y_val = df_25_validation[['price']]

  yhat = model.predict(X_val)

  df_journal = pd.concat([df_journal, pd.DataFrame([[file_name,
                                                    'validation',
                                                    r2_score(y_val, yhat),
                                                    mean_absolute_error(y_val, yhat),
                                                    mean_squared_error(y_val, yhat),
                                                    boolean]],
                                                  columns = ['file', 'stage', 'r2', 'mae', 'mse', 'with_outliers'])])
  
  if boolean == True:
    with open('./models/model_25.pkl', 'bw') as file:
      pkl.dump(model, file)
    with open('./models/model_25_encodings.pkl', 'bw') as file:
      pkl.dump(encodings, file)
  else:
    with open('./models/model_25_no_outliers.pkl', 'bw') as file:
      pkl.dump(model, file)
    with open('./models/model_25_no_outliers_encodings.pkl', 'bw') as file:
      pkl.dump(encodings, file)

  df_sample = pd.concat([df_sample, df_prov_sample], axis=0)
  return fit_method(estimator, *args, **kwargs)
  df_journal = pd.concat([df_journal, pd.DataFrame([[file_name,
  df_sample = pd.concat([df_sample, df_prov_sample], axis=0)
  return fit_method(estimator, *args, **kwargs)


In [29]:
df_journal

Unnamed: 0,file,stage,r2,mae,mse,with_outliers
0,data_25.parquet,train,0.756483,57072.287207,14228600000.0,True
0,data_25.parquet,validation,0.112116,132650.232596,50065680000.0,True
0,data_25.parquet,train,0.726868,59433.322089,13061830000.0,False
0,data_25.parquet,validation,0.489979,101971.031031,28758860000.0,False


In [30]:
# Entrenamos con y sin outliers usando el resto de archivos de la carpeta 'provinces'

for boolean in with_outliers:    
    for file in files[1:]:

        df = pd.read_parquet(file)
        file_name = file.split('\\')[-1]

        df_validation = df.sample(frac=0.15)
        df_train = df.reset_index()[~df.reset_index()['index'].isin(df_validation.reset_index()['index'])].drop(['index'], axis=1)

        if boolean == False:
            df_train = outliersFilter(df = df_train, min_price = 50_000, max_price = 5_000_000, max_baths = 15, max_surface = 1_000)

        df_cat, encodings = frequencyEncoding(df_train[['state']])
        df_train = pd.concat([df_train._get_numeric_data(), df_cat], axis = 1)

        df_train = df_train[~df_train['price'].isna()].dropna()

        X = df_train.drop(['price'], axis = 1)
        y = df_train[['price']]

        imputer = KNNImputer(n_neighbors=3)
        X = imputer.fit_transform(X)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
        model = RandomForestRegressor(n_jobs = -1, random_state = 42, n_estimators = 100)
        model.fit(X_train, y_train)
        yhat = model.predict(X_test)
        df_journal = pd.concat([df_journal, pd.DataFrame([[file_name,
                                                        'train',
                                                        r2_score(y_test, yhat),
                                                        mean_absolute_error(y_test, yhat),
                                                        mean_squared_error(y_test, yhat),
                                                        boolean]],
                                                        columns = ['file', 'stage', 'r2', 'mae', 'mse', 'with_outliers'])])
        
        df_validation['state'] = df_validation['state'].replace(encodings['state'])
        df_validation = df_validation.dropna()

        X_val = df_validation.drop(['price'], axis = 1)
        y_val = df_validation[['price']]

        yhat = model.predict(X_val)

        df_journal = pd.concat([df_journal, pd.DataFrame([[file_name,
                                                        'validation',
                                                        r2_score(y_val, yhat),
                                                        mean_absolute_error(y_val, yhat),
                                                        mean_squared_error(y_val, yhat),
                                                        boolean]],
                                                        columns = ['file', 'stage', 'r2', 'mae', 'mse', 'with_outliers'])])
        
        model_name = file_name.split('.')[0][5:]
        
        if boolean == True:
            with open(f'./models/model_{model_name}.pkl', 'bw') as file:
                pkl.dump(model, file)
            with open(f'./models/model_{model_name}_encodings.pkl', 'bw') as file:
                pkl.dump(encodings, file)
        else:
            with open(f'./models/model_{model_name}_no_outliers.pkl', 'bw') as file:
                pkl.dump(model, file)
            with open(f'./models/model_{model_name}_no_outliers_encodings.pkl', 'bw') as file:
                pkl.dump(encodings, file)

  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **

In [31]:
df_journal = df_journal.reset_index(drop=True)
df_journal

Unnamed: 0,file,stage,r2,mae,mse,with_outliers
0,data_25.parquet,train,0.756483,57072.287207,1.422860e+10,True
1,data_25.parquet,validation,0.112116,132650.232596,5.006568e+10,True
2,data_25.parquet,train,0.726868,59433.322089,1.306183e+10,False
3,data_25.parquet,validation,0.489979,101971.031031,2.875886e+10,False
4,data_alicante.parquet,train,-0.106710,100342.989022,1.087810e+11,True
...,...,...,...,...,...,...
75,data_tarragona.parquet,validation,0.684231,51476.919781,7.313855e+09,False
76,data_toledo.parquet,train,0.619892,42641.748919,3.750951e+09,False
77,data_toledo.parquet,validation,0.266832,60948.372243,1.519207e+10,False
78,data_valencia.parquet,train,0.502215,73675.352724,4.998829e+10,False


In [32]:
df_journal[df_journal['file']=='data_madrid.parquet']

Unnamed: 0,file,stage,r2,mae,mse,with_outliers
24,data_madrid.parquet,train,0.806927,119261.379792,83732020000.0,True
25,data_madrid.parquet,validation,0.722236,153009.245024,184679800000.0,True
62,data_madrid.parquet,train,0.871971,93867.218049,37404310000.0,False
63,data_madrid.parquet,validation,0.821289,110950.715544,71862320000.0,False


In [33]:
df_journal.to_csv('./models/journal.csv', sep=',', index=False)