In [1]:
import extractor, explorer, cleaner, transformer, spliter, scaler, dim_reductor, trainer, loader
import pandas as pd


def extract():
    """Read the raw train and test datasets.

    Returns:
        A ``(train, test)`` pair with whatever ``extractor.extract_csv``
        yields for 'cars_train.csv' and 'cars_test.csv' respectively.
    """
    raw_train = extractor.extract_csv('cars_train.csv')
    raw_test = extractor.extract_csv('cars_test.csv')
    return raw_train, raw_test


def explore(train):
    """Run the exploratory analysis on the training data.

    The ANOVA step supplies the value that is returned; the remaining
    explorer calls are invoked only for their reporting/plotting output
    (their return values are discarded).

    Args:
        train: Training dataset, passed unchanged to every explorer call.

    Returns:
        Whatever ``explorer.get_anova_conclusions`` returns -- used by the
        caller as the list of insignificant columns.
    """
    anova_rejects = explorer.get_anova_conclusions(train)
    # While experimenting, the ANOVA call above was sometimes swapped for a
    # hard-coded list; that shortcut must stay disabled once testing is over:
    # anova_rejects = ['manufacturer', 'condition', 'fuel', 'title_status', 'drive',
    #                  'size', 'type', 'paint_color', 'state_name']

    # Reporting-only steps, kept in their original order.
    explorer.get_nulls_conclusions(train)
    explorer.get_correlation_conclusions(train)
    explorer.plot_price_outliers(train)

    return anova_rejects


def clean(train, test, insignificant_cols):
    """Drop unwanted columns and deal with missing values.

    Train and test are treated differently on purpose: the train set goes
    through ``cleaner.remove_nans`` while the test set goes through
    ``cleaner.fill_nans`` (presumably so that no test row is lost -- confirm
    against the cleaner module).

    Args:
        train: Raw training dataset.
        test: Raw test dataset.
        insignificant_cols: Columns forwarded to ``cleaner.remove_cols``.

    Returns:
        A ``(clean_train, clean_test)`` pair.
    """
    trimmed_train, trimmed_test = cleaner.remove_cols(
        train, test, insignificant_cols
    )
    train_without_nans = cleaner.remove_nans(trimmed_train)
    test_with_filled_nans = cleaner.fill_nans(trimmed_test)
    return train_without_nans, test_with_filled_nans


def transform(train, test):
    """Encode the cleaned datasets so they can be fed to the models.

    Steps, in order:
      1. Best-effort ``transformer.transform_cylinders`` on both datasets.
      2. ``transformer.one_hot_encode`` on each dataset.
      3. Best-effort ``transformer.unify_dimension`` on the encoded test set.

    Both best-effort steps report a failure with a printed message and carry
    on, exactly as before. Two things differ from the previous version:

    * ``except Exception`` replaces the bare ``except`` so that
      ``KeyboardInterrupt`` and ``SystemExit`` are no longer swallowed.
    * The cylinder transformation is only committed when it succeeds for
      BOTH datasets. Previously a failure on ``test`` left ``train``
      transformed and ``test`` untouched, so the two were encoded
      inconsistently.
      NOTE(review): this assumes ``transform_cylinders`` returns a new object
      rather than mutating its argument in place -- confirm in transformer.

    Args:
        train: Cleaned training dataset.
        test: Cleaned test dataset.

    Returns:
        A ``(final_train, final_test)`` pair of encoded datasets.
    """
    try:
        cyl_train = transformer.transform_cylinders(train)
        cyl_test = transformer.transform_cylinders(test)
    except Exception:
        print("\nNo hay cylinders en el dataframe.")
    else:
        # Commit only when both calls succeeded, keeping train/test in sync.
        train, test = cyl_train, cyl_test

    final_train = transformer.one_hot_encode(train)
    final_test = transformer.one_hot_encode(test)

    try:
        # Receives the pre-encoding frames plus the encoded test set.
        final_test = transformer.unify_dimension(train, test, final_test)
    except Exception:
        print("\nNo es necesario unificar la dimensión.")

    return final_train, final_test
    

def train_split(train):
    """Separate the training dataset into features and target.

    Args:
        train: Encoded training dataset.

    Returns:
        The ``(X_train, y_train)`` pair produced by ``spliter.split_train``.
    """
    features, target = spliter.split_train(train)
    return features, target


def rescale(X_train, X_test):
    """Rescale the train and test feature sets with ``scaler.standard_scaler``.

    Rescaling the variables is generally recommended before training a
    model. The author nevertheless observed that it hurt the model in their
    latest training run, so whether to invoke this step is left to the
    caller.

    NOTE(review): each set is handed to ``scaler.standard_scaler`` in its own
    call; whether the test set reuses the statistics fitted on the train set
    cannot be told from here -- confirm in the scaler module.

    Args:
        X_train: Training features.
        X_test: Test features.

    Returns:
        A ``(scaled_X_train, scaled_X_test)`` pair.
    """
    print("\n\n===== Se rescalan X_train, X_test ======\n")
    # One rescaling method has to be chosen; the same one is applied to both
    # sets (train first, then test).
    train_rescaled = scaler.standard_scaler(X_train)
    test_rescaled = scaler.standard_scaler(X_test)
    return train_rescaled, test_rescaled


def reduce_dimension(X_train, X_test):
    """Project both feature sets through ``dim_reductor.data_to_pca``.

    Args:
        X_train: Training features.
        X_test: Test features.

    Returns:
        The ``(X_train_pc, X_test_pc)`` pair returned by the reducer.
    """
    train_components, test_components = dim_reductor.data_to_pca(X_train, X_test)
    return train_components, test_components


def train_and_predict(X_train, y_train, X_test):
    """Fit every enabled model and collect what each trainer returns.

    Only the linear regression and the random forest are enabled; the
    decision tree, k-neighbors and gradient boosting trainers were left
    switched off.

    Args:
        X_train: Training features.
        y_train: Training target.
        X_test: Test features to predict on.

    Returns:
        A list with one trainer result per enabled model, in call order.
    """
    print("\n\n===== Modelos de entrenamiento ======\n")
    # List elements are evaluated left to right, so the models are still
    # trained in the same order: linear regression, then random forest.
    return [
        trainer.linear_regression(X_train, y_train, X_test),
        trainer.random_forest(X_train, y_train, X_test),
        # trainer.decision_tree(X_train, y_train, X_test),
        # trainer.kneighbors(X_train, y_train, X_test),
        # trainer.gradient_boosting(X_train, y_train, X_test),
    ]

    
def load(model_results, test):
    """Write one submission CSV per model result.

    Args:
        model_results: Iterable of indexable results where position 0 holds
            the predictions and position 1 the model name (a string).
        test: Test dataset, forwarded to ``loader.create_submission``.
    """
    for outcome in model_results:
        predictions, model_name = outcome[0], outcome[1]
        # File name pattern: cars_submission_<model name>.csv
        target_file = "".join(("cars_submission_", model_name, ".csv"))
        loader.load_csv(loader.create_submission(predictions, test), target_file)


    

def main():
    """Run the full pipeline end to end.

    Order of the stages: extract -> explore -> clean -> transform ->
    split -> rescale -> train/predict -> write the submission files.
    """
    raw_train, raw_test = extract()
    cols_to_drop = explore(raw_train)
    cleaned_train, cleaned_test = clean(raw_train, raw_test, cols_to_drop)
    encoded_train, X_test = transform(cleaned_train, cleaned_test)
    X_train, y_train = train_split(encoded_train)

    # Rescaling is not always worthwhile: apart from the dummy columns there
    # are very few numeric variables.
    X_train, X_test = rescale(X_train, X_test)

    # PCA stays disabled: it does not work for this X, which has few numeric
    # variables.
    # if cleaned_train.shape[1] > 3:
    #     X_train, X_test = reduce_dimension(X_train, X_test)

    predictions_per_model = train_and_predict(X_train, y_train, X_test)
    # The raw test set is the one handed to the submission builder.
    load(predictions_per_model, raw_test)
    print("\n\n===== The submission files are ready in the outputs folder! =====")

    
# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()

===== Calculando ANOVAs =====

price ~ C(year)
price ~ C(manufacturer)
price ~ C(condition)
price ~ C(cylinders)
price ~ C(fuel)
price ~ C(title_status)
price ~ C(transmission)
price ~ C(drive)
price ~ C(size)
price ~ C(type)
price ~ C(paint_color)
price ~ C(state_fips)
price ~ C(state_code)
price ~ C(state_name)
price ~ C(weather)


===== Fin de los cálculos. Resultados =====

        variable       p-value
0           year  1.644301e-50
3      cylinders  8.900004e-16
6   transmission  7.959975e-04
14       weather  6.983536e-03
13    state_name  1.932749e-02
11    state_fips  2.947323e-02
12    state_code  2.947323e-02
10   paint_color  2.422719e-01
7          drive  3.956268e-01
8           size  4.551301e-01
4           fuel  5.519853e-01
1   manufacturer  6.991262e-01
9           type  9.213114e-01
2      condition  9.737462e-01
5   title_status  9.786889e-01

    

===== Conclusiones de los ANOVA =====

    - year, cylinders, transmission, weather, state_name, state_fips, state_code [...]

<Figure size 640x480 with 2 Axes>



===== Boxplot de la variable price =====


<Figure size 500x1000 with 1 Axes>



===== Resumen de eliminación de variables de cara al modelo =====


        Eliminamos:
        - las variables no significativas resultantes del ANOVA
        - 'state_code', 'state_fips' ya que son redundantes con state_name
        - variables con muchos nulls
        - variables categóricas con muchos valores únicos
        - 'lat', 'long' no son determinantes la una sin la otra

        
        En definitiva, se eliminan las siguientes variables:

        ['Id','paint_color', 'drive', 'size', 'fuel', 'manufacturer', 'type', 'condition', 'title_status',

        'city', 'make', 'odometer', 'lat', 'long', 'county_fips', 'county_name', 'state_code', 'state_fips']
        



Se ha rescalado con StandardScaler
Se ha rescalado con StandardScaler



Se ha entrenado el modelo LinearRegression
Se ha entrenado el modelo RandomForestRegressor


===== The submission files are ready in the outputs folder! =====
