In [1]:
# Tratamiento de datos
# ==============================================================================
import numpy as np
import pandas as pd

# Gráficos
# ==============================================================================
import matplotlib.pyplot as plt
import seaborn as sns
# Select matplotlib's 'default' style sheet.
plt.style.use('default')

In [2]:
# Load the preprocessed training data and show its column index.
train_csv_path = r'datos/train_tratado.csv'
train = pd.read_csv(train_csv_path)
train.columns

Index(['Unnamed: 0', 'id', 'Company', 'Product', 'TypeName', 'Inches',
       'ScreenResolution', 'Cpu', 'Ram', 'Memory', 'Gpu', 'OpSys', 'Weight',
       'Price_euros', '2 in 1 Convertible', 'Gaming', 'Netbook', 'Notebook',
       'Ultrabook', 'Workstation', 'Touchscreen', 'High_resolucion',
       'Chrome OS', 'Linux', 'Mac OS X', 'No OS', 'Windows 10', 'Windows 10 S',
       'Windows 7', 'macOS', 'Intel Core i3', 'Intel Core i5', 'Intel Core i7',
       'Intel Celeron Dual Core', 'Intel Pentium Quad Core', 'A4', 'A6', 'A8',
       'A10', 'A12'],
      dtype='object')

In [3]:
# Define the feature matrix `X` and the target `y`.
# Only the columns listed here are used as predictors; the remaining
# (non-numeric) columns of `train` are left out.
feature_columns = [
    'Inches', 'Ram', 'Memory', 'Weight',
    '2 in 1 Convertible', 'Gaming', 'Netbook', 'Notebook',
    'Ultrabook', 'Workstation', 'Touchscreen', 'High_resolucion',
    'Chrome OS', 'Linux', 'Mac OS X', 'No OS', 'Windows 10', 'Windows 10 S',
    'Windows 7', 'macOS', 'Intel Core i3', 'Intel Core i5', 'Intel Core i7',
    'Intel Celeron Dual Core', 'Intel Pentium Quad Core', 'A4', 'A6', 'A8',
    'A10', 'A12',
]

X = train[feature_columns]
y = train['Price_euros']

In [4]:
# Hold out 20% of the rows as a test split; the seed is fixed so the
# partition is the same on every run.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=0
)

# Print the shape of each of the four pieces, in the order returned above.
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

(729, 30)
(183, 30)
(729,)
(183,)


# Probamos con regresión polinómica de grado 2

In [5]:
from sklearn.preprocessing import PolynomialFeatures

# Expand the training features into polynomial terms up to degree 2.
# The expander is fitted on the training split and applied to it in one step.
poly_reg_2 = PolynomialFeatures(degree=2)
X_poly_train = poly_reg_2.fit_transform(X_train)

In [6]:
from sklearn.linear_model import LinearRegression

# Linear regression on the degree-2 polynomial features.
# `fit` returns the estimator itself, so the bare name on the last line
# displays the fitted model as the cell output.
pol_reg_2 = LinearRegression().fit(X_poly_train, y_train)
pol_reg_2

LinearRegression()

In [7]:
from sklearn import metrics

# Evaluate the degree-2 model on the training and hold-out splits.
# The hold-out features are produced with `transform` on the expander that was
# already fitted on the training split (never re-fit a transformer on test
# data), and each prediction vector is computed once and reused by every metric.
X_poly_test = poly_reg_2.transform(X_test)
train_pred = pol_reg_2.predict(X_poly_train)
test_pred = pol_reg_2.predict(X_poly_test)

print('Train MAE:', metrics.mean_absolute_error(y_train, train_pred))
print('Test MAE:', metrics.mean_absolute_error(y_test, test_pred))
print("")
print('Train MAPE:', metrics.mean_absolute_percentage_error(y_train, train_pred))
print('Test MAPE:', metrics.mean_absolute_percentage_error(y_test, test_pred))
print("")
print('Train MSE:', metrics.mean_squared_error(y_train, train_pred))
print('Test MSE:', metrics.mean_squared_error(y_test, test_pred))
print("")
# RMSE is the square root of the MSE, so it is in the same units as the target.
print('Train RMSE:', np.sqrt(metrics.mean_squared_error(y_train, train_pred)))
print('Test RMSE:', np.sqrt(metrics.mean_squared_error(y_test, test_pred)))
print("")
# `score` is the estimator's own scoring method, evaluated on each split.
print('Train score', pol_reg_2.score(X_poly_train, y_train))
print('Test score', pol_reg_2.score(X_poly_test, y_test))

Train MAE: 184.63779053794985
Test MAE: 323.55968907963126

Train MAPE: 0.17387136638656642
Test MAPE: 0.37364845412025405

Train MSE: 72514.50020583298
Test MSE: 300352.02976840397

Train RMSE: 269.2851652167883
Test RMSE: 548.0438210293078

Train score 0.8594311524605812
Test score 0.38996109088903785


# Probamos con regresión polinómica de grado 3

In [8]:
# Degree-3 polynomial expansion, fitted on and applied to the training split.
poly_reg = PolynomialFeatures(degree=3)
X_poly_train = poly_reg.fit_transform(X_train)

# Linear regression on the expanded features; `fit` returns the estimator,
# and the bare name on the last line displays it as the cell output.
pol_reg = LinearRegression().fit(X_poly_train, y_train)
pol_reg

LinearRegression()

In [9]:
# Evaluamos el modelo
print('Train MAE:', metrics.mean_absolute_error(y_train, pol_reg.predict(X_poly_train)))
print('Test MAE:', metrics.mean_absolute_error(y_test, pol_reg.predict(poly_reg.fit_transform(X_test))))
print("")
print('Train MAPE:', metrics.mean_absolute_percentage_error(y_train, pol_reg.predict(X_poly_train)))
print('Test MAPE:', metrics.mean_absolute_percentage_error(y_test, pol_reg.predict(poly_reg.fit_transform(X_test))))
print("")
print('Train MSE:', metrics.mean_squared_error(y_train, pol_reg.predict(X_poly_train)))
print('Test MSE:', metrics.mean_squared_error(y_test, pol_reg.predict(poly_reg.fit_transform(X_test))))
print("")
print('Train RMSE:', np.sqrt(metrics.mean_squared_error(y_train, pol_reg.predict(X_poly_train))))
print('Test RMSE:', np.sqrt(metrics.mean_squared_error(y_test, pol_reg.predict(poly_reg.fit_transform(X_test)))))
print("")
print('Train score', pol_reg.score(X_poly_train, y_train))
print('Test score', pol_reg.score(poly_reg.fit_transform(X_test), y_test))

Train MAE: 111.45933232575288
Test MAE: 13908.11817806836

Train MAPE: 0.09846627165831684
Test MAPE: 9.798124613441336

Train MSE: 34399.73285420277
Test MSE: 7855856846.964483

Train RMSE: 185.47164973171175
Test RMSE: 88633.27167020568

Train score 0.933316360324439
Test score -15954.871331215616


# Probamos con regresión polinómica de grado 4

In [10]:
# Degree-4 polynomial expansion, fitted on and applied to the training split.
poly_reg = PolynomialFeatures(degree=4)
X_poly_train = poly_reg.fit_transform(X_train)

# Linear regression on the expanded features; `fit` returns the estimator,
# and the bare name on the last line displays it as the cell output.
pol_reg = LinearRegression().fit(X_poly_train, y_train)
pol_reg

LinearRegression()

In [11]:
# Evaluamos el modelo
print('Train MAE:', metrics.mean_absolute_error(y_train, pol_reg.predict(X_poly_train)))
print('Test MAE:', metrics.mean_absolute_error(y_test, pol_reg.predict(poly_reg.fit_transform(X_test))))
print("")
print('Train MAPE:', metrics.mean_absolute_percentage_error(y_train, pol_reg.predict(X_poly_train)))
print('Test MAPE:', metrics.mean_absolute_percentage_error(y_test, pol_reg.predict(poly_reg.fit_transform(X_test))))
print("")
print('Train MSE:', metrics.mean_squared_error(y_train, pol_reg.predict(X_poly_train)))
print('Test MSE:', metrics.mean_squared_error(y_test, pol_reg.predict(poly_reg.fit_transform(X_test))))
print("")
print('Train RMSE:', np.sqrt(metrics.mean_squared_error(y_train, pol_reg.predict(X_poly_train))))
print('Test RMSE:', np.sqrt(metrics.mean_squared_error(y_test, pol_reg.predict(poly_reg.fit_transform(X_test)))))
print("")
print('Train score', pol_reg.score(X_poly_train, y_train))
print('Test score', pol_reg.score(poly_reg.fit_transform(X_test), y_test))

Train MAE: 125.97884605875143
Test MAE: 670301.556880342

Train MAPE: 0.14102413748833473
Test MAPE: 891.6359754455698

Train MSE: 31569.05759166268
Test MSE: 17073752352854.852

Train RMSE: 177.67683470746175
Test RMSE: 4132039.7327294485

Train score 0.9388036043692044
Test score -34678150.72681743


# Predicción con el polinomio de grado 4 (nota: según las métricas de arriba, el mejor resultado en test es el de grado 2; los grados 3 y 4 sobreajustan)

In [12]:
# Load the competition set and keep the same predictor columns, in the same
# order, as the ones selected for training.
pred_feature_columns = [
    'Inches', 'Ram', 'Memory', 'Weight',
    '2 in 1 Convertible', 'Gaming', 'Netbook', 'Notebook',
    'Ultrabook', 'Workstation', 'Touchscreen', 'High_resolucion',
    'Chrome OS', 'Linux', 'Mac OS X', 'No OS', 'Windows 10', 'Windows 10 S',
    'Windows 7', 'macOS', 'Intel Core i3', 'Intel Core i5', 'Intel Core i7',
    'Intel Celeron Dual Core', 'Intel Pentium Quad Core', 'A4', 'A6', 'A8',
    'A10', 'A12',
]

X_pred = pd.read_csv("Datos/test_tratado.csv")
X_pred_1 = X_pred[pred_feature_columns]

In [13]:
# Expand the competition features with the degree-4 expander that was fitted
# on the training split (`poly_reg`), rather than fitting a brand-new
# transformer on the prediction data. This keeps the feature layout tied to
# the one the regression model `pol_reg` was trained on.
X_poly_pred = poly_reg.transform(X_pred_1)

In [14]:
# Predict prices for the competition rows and wrap them in a DataFrame;
# the single resulting column gets the default label 0.
predictions = pd.DataFrame(pol_reg.predict(X_poly_pred))

In [15]:
# Assemble the submission frame: the competition ids plus the predicted price.
submission = X_pred[['id']].copy()
submission = submission.assign(Price_euros=predictions[0])

In [16]:
# Sanity check: (rows, columns) of the submission frame.
submission.shape

(391, 2)

In [17]:
# Preview the first rows of the submission.
submission.head(20)

Unnamed: 0,id,Price_euros
0,1184,-374.9384
1,815,1331.75
2,1244,298526.9
3,1121,1231.492
4,443,-135827.3
5,16,978.9871
6,262,694.1809
7,152,808.8987
8,336,-8454074.0
9,773,1415.384


# Pasamos el check

In [18]:
import urllib.request
from PIL import Image

In [19]:
# Reference submission file; `chequeator` reads this module-level `sample`
# to validate the shape, column names and ids of a submission.
sample = pd.read_csv("Datos/sample_submission.csv")

In [20]:
def chequeator(df_to_submit, reference=None):
    """
    Validate a submission frame against the sample submission and save it.

    The checks run in this order and stop at the first failure, printing a
    message that says what to fix:

    1. same shape as the reference,
    2. same column names in the same order,
    3. same ``id`` values in the same row order.

    If every check passes, the frame is written to ``submission.csv``
    (without the index) and a celebratory image is downloaded and shown.
    Showing the image is best-effort: a download or display failure is
    reported but does not undo the already-saved CSV.

    Parameters
    ----------
    df_to_submit : pandas.DataFrame
        The frame to validate and save.
    reference : pandas.DataFrame, optional
        Frame to validate against. Defaults to the module-level ``sample``.

    Returns
    -------
    None
    """
    expected = sample if reference is None else reference

    if df_to_submit.shape != expected.shape:
        print("Check the number of rows and/or columns and try again")
        print("\nMensaje secreto de Clara: No me puedo creer que después de todo este notebook hayas hecho algún cambio en las filas de `diamonds_test.csv`. Lloro.")
        return

    # Compare the labels themselves; `columns.all() == columns.all()` only
    # compared two truthiness flags and accepted any column names.
    if list(df_to_submit.columns) != list(expected.columns):
        print("Check the names of the columns and try again")
        return

    # Compare ids position by position on the raw values so that differing
    # row indexes cannot raise or mask a mismatch.
    ids_match = (df_to_submit["id"].to_numpy() == expected["id"].to_numpy()).all()
    if not ids_match:
        print("Check the ids and try again")
        return

    print("You're ready to submit!")
    # Save the frame that was actually validated (not a global), and drop the
    # index so the file contains only the expected columns.
    df_to_submit.to_csv("submission.csv", index=False)

    try:
        urllib.request.urlretrieve("https://i.kym-cdn.com/photos/images/facebook/000/747/556/27a.jpg", "gfg.png")
        with Image.open("gfg.png") as img:
            img.show()
    except OSError as err:
        # Network and image-decoding errors both derive from OSError; the
        # submission file is already on disk, so just report and move on.
        print("submission.csv was saved, but the image could not be shown:", err)


In [92]:
# chequeator(submission)

You're ready to submit!
