In [59]:
import pandas as pd
import numpy as np
pd.options.plotting.backend = "plotly"
pd.set_option('display.max_columns', None) 

import warnings
warnings.filterwarnings('ignore')

import plotly.express as px

from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.model_selection import RandomizedSearchCV

from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import r2_score as r2

from tqdm import tqdm

In [60]:
X_train = pd.read_csv('../data/transform/X_train_transform.csv')
X_train_no_cluster = X_train.drop(columns=['4_cluster','5_cluster'])
X_train_5c =  X_train.drop(columns=['4_cluster'])
X_train_4c =  X_train.drop(columns=['5_cluster'])

X_test = pd.read_csv('../data/transform/X_test_transform.csv')
X_test_no_cluster = X_test.drop(columns=['4_cluster','5_cluster'])
X_test_5c =  X_test.drop(columns=['4_cluster'])
X_test_4c =  X_test.drop(columns=['5_cluster'])

y_train = pd.read_csv('../data/transform/y_train.csv')

In [61]:
def corr_plot(X,y):
    # Creamos matriz de correlación
    corr = pd.concat([X,y],axis=1).corr()

    # Creamos máscara
    mask = np.triu(np.ones_like(corr,dtype=bool))
    data = corr[~pd.DataFrame(mask,index=corr.index,columns=corr.columns)]

    # Incializamos figura
    return px.imshow(abs(data).round(4),text_auto=True,color_continuous_scale='reds').update_layout(width=900,height=700)

In [62]:
corr_plot(X_train_no_cluster,y_train)

In [63]:
def train(X,y):  
    n_estimators=[int(x) for x in np.linspace(200, 800, 10)]  # nº de arboles del bosque

    max_features=['auto']   # calculo maximo carac sqrt

    min_samples_split=[2, 3, 4, 5]
    bootstrap=[True, False]  # con o sin reemplazamiento

    params={'n_estimators': n_estimators,
            'max_features': max_features,
            'min_samples_split':min_samples_split,
            'bootstrap': bootstrap}

    rfr = RFR()

    rf_random = RandomizedSearchCV(estimator=rfr,
                                param_distributions=params,
                                n_iter=80,
                                cv=5,
                                n_jobs=-1,
                               verbose=0,
                              scoring='neg_root_mean_squared_error')

    return rf_random.fit(X,y)

In [11]:
rf = train(X_train_no_cluster,y_train)


The total space of parameters 30 is smaller than n_iter=200. Running 30 iterations. For exhaustive searches, use GridSearchCV.



Fitting 4 folds for each of 30 candidates, totalling 120 fits



A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



In [13]:
rf.best_estimator_

RandomForestRegressor(n_estimators=600)

In [14]:
y_hat = rf.best_estimator_.predict(X_train_no_cluster)

In [19]:
r2(y_hat,y_train)

0.9789582070399523

In [15]:
r2(y_hat,y_train)

0.9794798797503215

In [22]:
mse(y_hat,y_train,squared=False)

869609.2461044459

In [16]:
mse(y_hat,y_train,squared=False)

860610.8843972833

In [18]:
y_hat_test = rf.best_estimator_.predict(X_test_no_cluster)

In [19]:
muestra = pd.read_csv('../data/muestra.csv')

In [20]:
muestra.head()

Unnamed: 0,id,Price
0,0,2759145
1,1,10926785
2,2,20212944
3,3,46541311
4,4,46335226


In [21]:
muestra['Price'] = y_hat_test

In [22]:
muestra.head()

Unnamed: 0,id,Price
0,0,7315534.0
1,1,13797580.0
2,2,10467380.0
3,3,11531260.0
4,4,4637131.0


In [23]:
muestra.to_csv('../data/muestra_2.csv',index=False)

Combinaciones:
- Con y sin outliers
- sin cluster con clusters (4 y 5)
- Quitando columna version y no version

In [6]:
outliers = pd.read_csv('../data/transform/outliers_index.csv')

In [64]:
%%time

modelos = []
for out in range(2):
    for cluster in [0,4,5]:
        for version in range(2):
            if not out:
                index = outliers.iloc[:,0]
            else:
                index = []
            if cluster == 0:
                X = X_train_no_cluster.drop(index=index)
                X_t = X_test_no_cluster
            elif cluster == 4:
                X = X_train_4c.drop(index=index)
                X_t = X_test_4c
            else:
                X = X_train_5c.drop(index=index)
                X_t = X_test_5c
            if not version:
                X = X.drop(columns=['Operating_System_Version_10 S',
                                    'Operating_System_Version_7',
                                    'Operating_System_Version_Android',
                                    'Operating_System_Version_Chrome OS',
                                    'Operating_System_Version_No OS',
                                    'Operating_System_Version_Ubuntu',
                                    'Operating_System_Version_X'                                   
                                   ])
                X_t = X_t.drop(columns=['Operating_System_Version_10 S',
                                        'Operating_System_Version_7',
                                        'Operating_System_Version_Android',
                                        'Operating_System_Version_Chrome OS',
                                        'Operating_System_Version_No OS',
                                        'Operating_System_Version_Ubuntu',
                                        'Operating_System_Version_X'                                   
                                       ])
            print(out,cluster,version)
            modelo = train(X,y_train.drop(index=index))
            print('modelo ok')
            y_hat = modelo.best_estimator_.predict(X)
            print('y_hat ok')
            score = r2(y_hat,y_train.drop(index=index))
            print('r2 ok')
            mse_value = mse(y_hat,y_train.drop(index=index),squared=False)
            print('mse ok')
            
            modelos.append({'outliers':out,
                            'cluster':cluster,
                            'version':version,
                            'modelo': modelo,
                            'X':X,
                            'y':y_train.drop(index=index),
                            'X_test':X_t,
                            'R2':score,
                            'mse':mse_value,
                            'y_hat':y_hat
                           })

0 0 0
modelo ok
y_hat ok
r2 ok
mse ok
0 0 1
modelo ok
y_hat ok
r2 ok
mse ok
0 4 0
modelo ok
y_hat ok
r2 ok
mse ok
0 4 1
modelo ok
y_hat ok
r2 ok
mse ok
0 5 0
modelo ok
y_hat ok
r2 ok
mse ok
0 5 1
modelo ok
y_hat ok
r2 ok
mse ok
1 0 0
modelo ok
y_hat ok
r2 ok
mse ok
1 0 1
modelo ok
y_hat ok
r2 ok
mse ok
1 4 0
modelo ok
y_hat ok
r2 ok
mse ok
1 4 1
modelo ok
y_hat ok
r2 ok
mse ok
1 5 0
modelo ok
y_hat ok
r2 ok
mse ok
1 5 1
modelo ok
y_hat ok
r2 ok
mse ok
CPU times: total: 42.4 s
Wall time: 55min 8s


In [66]:
modelos[0].keys()

dict_keys(['outliers', 'cluster', 'version', 'modelo', 'X', 'y', 'X_test', 'R2', 'mse', 'y_hat'])

In [80]:
df_models = pd.DataFrame(columns=['Outliers','Cluster','version','R2','mse'])
for index,m in enumerate(modelos):
    df_models.loc[index]=[m['outliers'],m['cluster'],m['version'],m['R2'],m['mse']]
    muestra = pd.read_csv('../data/muestra.csv')
    y_hat_test = m['modelo'].best_estimator_.predict(m['X_test'])
    muestra['Price'] = y_hat_test
    muestra.to_csv(f'../data/muestra{m["outliers"]}_{m["cluster"]}_{m["version"]}.csv',index=False)
    
    

In [81]:
df_models

Unnamed: 0,Outliers,Cluster,version,R2,mse
0,0.0,0.0,0.0,0.976735,755969.018577
1,0.0,0.0,1.0,0.977759,739773.487695
2,0.0,4.0,0.0,0.977526,742915.688039
3,0.0,4.0,1.0,0.977768,739730.901528
4,0.0,5.0,0.0,0.968001,880399.169371
5,0.0,5.0,1.0,0.973802,799216.851028
6,1.0,0.0,0.0,0.979321,864220.493039
7,1.0,0.0,1.0,0.979753,855306.198935
8,1.0,4.0,0.0,0.979413,862522.681484
9,1.0,4.0,1.0,0.980012,851279.362074


In [82]:
px.scatter(df_models,x='Cluster',y='R2',color='Outliers',symbol='version')

In [58]:
mse = mse(y_hat,y_train.drop(index=index),squared=False)

TypeError: 'numpy.float64' object is not callable

In [39]:
modelos_2 = modelos[:]

In [34]:
modelos[0]

{'outliers': 0,
 'cluster': 0,
 'version': 0,
 'modelo': RandomizedSearchCV(cv=4, estimator=RandomForestRegressor(), n_iter=200,
                    n_jobs=-1,
                    param_distributions={'bootstrap': [True, False],
                                         'max_features': ['auto'],
                                         'min_samples_split': [2, 5, 10],
                                         'n_estimators': [200, 300, 400, 500,
                                                          600]},
                    scoring='neg_root_mean_squared_error', verbose=100)}

In [35]:
for modelo in modelos:
    print(modelo['modelo'].best_estimator_)
    print(modelo['modelo'].best_params_)

RandomForestRegressor(n_estimators=600)
{'n_estimators': 600, 'min_samples_split': 2, 'max_features': 'auto', 'bootstrap': True}
RandomForestRegressor(n_estimators=300)
{'n_estimators': 300, 'min_samples_split': 2, 'max_features': 'auto', 'bootstrap': True}
RandomForestRegressor(n_estimators=500)
{'n_estimators': 500, 'min_samples_split': 2, 'max_features': 'auto', 'bootstrap': True}
RandomForestRegressor(n_estimators=600)
{'n_estimators': 600, 'min_samples_split': 2, 'max_features': 'auto', 'bootstrap': True}
RandomForestRegressor(n_estimators=500)
{'n_estimators': 500, 'min_samples_split': 2, 'max_features': 'auto', 'bootstrap': True}
RandomForestRegressor(n_estimators=600)
{'n_estimators': 600, 'min_samples_split': 2, 'max_features': 'auto', 'bootstrap': True}
RandomForestRegressor(n_estimators=300)
{'n_estimators': 300, 'min_samples_split': 2, 'max_features': 'auto', 'bootstrap': True}
RandomForestRegressor(n_estimators=400)
{'n_estimators': 400, 'min_samples_split': 2, 'max_featur