### Clasificación
##### **Pregunta:** Revise atentamente el bloque de código entregado.  Después de efectuado el procedimiento, se pide utilizar validación cruzada, con cv=10 y aplicar árboles de decisión ¿En qué rango se encuentra el accuracy del modelo?

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder


from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif

from sklearn.metrics import mean_squared_error, r2_score, accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('ML_Houses_dataset.csv', sep=',', encoding='utf-8')


In [3]:
# Obtener las columnas numéricas y categóricas
numeric_columns = df.select_dtypes(include=['int64', 'float64']).columns
categorical_columns = df.select_dtypes(include=['object']).columns


df[categorical_columns] = df[categorical_columns].astype(str)

# Definir las transformaciones para las columnas numéricas
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Definir las transformaciones para las columnas categóricas
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_columns),
        ('cat', categorical_transformer, categorical_columns)
    ])

# Crear la pipeline, debi utilizar label encoder pero por tiempos lo manejo así
pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

transformed_df = pipeline.fit_transform(df)

In [4]:
# Obtener las columnas numéricas y categóricas después de la transformación
numeric_columns_transformed = pipeline.named_steps['preprocessor'].transformers_[0][1]['scaler'].get_feature_names_out(input_features=numeric_columns)
categorical_columns_transformed = pipeline.named_steps['preprocessor'].transformers_[1][1]['onehot'].get_feature_names_out(input_features=categorical_columns)


all_columns_transformed = np.concatenate([numeric_columns_transformed, categorical_columns_transformed])

# Convertir el array transformado a un DataFrame con los nuevos nombres de columnas
transformed_df_2 = pd.DataFrame(transformed_df.toarray(), columns=all_columns_transformed)
transformed_df_2

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,Pesos,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,-1.719199,0.066079,-0.242809,-0.220904,0.637740,-0.513528,1.052509,0.890104,0.490914,0.332154,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,-1.716832,-0.881362,0.424035,-0.099164,-0.077179,2.179050,0.155552,-0.418209,-0.566134,-0.000681,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,-1.714464,0.066079,-0.109440,0.075506,0.637740,-0.513528,0.986068,0.841648,0.307548,0.517063,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,-1.712096,0.302939,-0.465091,-0.104457,0.637740,-0.513528,-1.870907,-0.708945,-0.566134,-0.512260,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
4,-1.709729,0.066079,0.601861,0.394148,1.352660,-0.513528,0.952847,0.744736,1.321451,0.843734,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1755,1.728000,-0.881362,0.646317,0.279289,-0.077179,0.383998,0.221993,0.163264,0.075645,0.350645,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1756,-1.442192,-0.881362,0.157297,-0.211800,-0.792098,-0.513528,1.152171,1.083929,-0.566134,-0.327352,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1757,-1.122569,0.066079,-0.242809,-0.229797,-0.077179,-0.513528,1.085730,0.938560,-0.566134,-0.105462,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1758,-1.250418,0.421369,0.957511,-0.531077,-0.077179,0.383998,-1.970569,-1.678066,-0.566134,-0.956040,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0


In [5]:
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split, cross_val_predict, cross_val_score
import numpy as np
# Definir las features (X) y la variable objetivo (y)
X = transformed_df_2[['GrLivArea', 'BedroomAbvGr', 'KitchenAbvGr', 'OverallCond', 'Pesos', 'Alley_Grvl', 'Alley_Pave', 'Alley_nan', 'WallMat_Concrete', 'WallMat_Wood', 'WallMat_nan']]
y = transformed_df_2['SalePrice']

In [8]:
# create a cross validation with cv=10
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor()
scores = cross_val_score(regressor, X, y, cv=10)
print("Scores:", scores)
print("Mean:", scores.mean())
print("Standard Deviation:", scores.std())

Scores: [0.73352635 0.99996166 0.99990785 0.9999518  0.99963782 0.99977878
 0.99986722 0.99976772 0.99999454 1.        ]
Mean: 0.973239374061334
Standard Deviation: 0.07990441841292838


In [9]:
from sklearn.metrics import make_scorer, mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor

# Assuming X and y are already defined
regressor = DecisionTreeRegressor()

# For Mean Absolute Error
mae_scores = cross_val_score(regressor, X, y, cv=10, scoring=make_scorer(mean_absolute_error))
print("MAE Scores:", mae_scores)
print("MAE Mean:", mae_scores.mean())
print("MAE Standard Deviation:", mae_scores.std())

# For Mean Squared Error
mse_scores = cross_val_score(regressor, X, y, cv=10, scoring=make_scorer(mean_squared_error))
print("\nMSE Scores:", mse_scores)
print("MSE Mean:", mse_scores.mean())
print("MSE Standard Deviation:", mse_scores.std())

# For R-squared
r2_scores = cross_val_score(regressor, X, y, cv=10, scoring='r2')
print("\nR2 Scores:", r2_scores)
print("R2 Mean:", r2_scores.mean())
print("R2 Standard Deviation:", r2_scores.std())

MAE Scores: [7.94232138e-02 2.29999031e-03 2.87246248e-03 2.88969061e-03
 3.33803719e-03 2.28027136e-03 2.70647737e-03 5.03884827e-03
 8.09168722e-04 1.78400539e-17]
MAE Mean: 0.010165816014740316
MAE Standard Deviation: 0.02312187697148405

MSE Scores: [1.99848011e-01 2.18612744e-04 1.01923942e-04 8.32467555e-05
 3.29952662e-04 1.62190122e-04 1.34490638e-04 6.78436615e-04
 1.29407918e-05 2.00478638e-33]
MSE Mean: 0.020156980556826264
MSE Standard Deviation: 0.05989730192927092

R2 Scores: [0.73350491 0.99978125 0.99991543 0.99993003 0.99961516 0.99918457
 0.99989164 0.99913412 0.99999448 1.        ]
R2 Mean: 0.9730951586852843
R2 Standard Deviation: 0.07986398660396238
