In [35]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [36]:
# Load the training CSV (`train_tratado.csv`) and list its columns.
# NOTE(review): this cell reads from a lowercase `datos/` folder while later
# cells read from `Datos/` -- confirm which casing is right before running on a
# case-sensitive filesystem.
train_csv_path = r'datos/train_tratado.csv'
train = pd.read_csv(train_csv_path)
train.columns

Index(['Unnamed: 0', 'id', 'Company', 'Product', 'TypeName', 'Inches',
       'ScreenResolution', 'Cpu', 'Ram', 'Memory', 'Gpu', 'OpSys', 'Weight',
       'Price_euros', '2 in 1 Convertible', 'Gaming', 'Netbook', 'Notebook',
       'Ultrabook', 'Workstation', 'Touchscreen', 'High_resolucion',
       'Chrome OS', 'Linux', 'Mac OS X', 'No OS', 'Windows 10', 'Windows 10 S',
       'Windows 7', 'macOS', 'Intel Core i3', 'Intel Core i5', 'Intel Core i7',
       'Intel Celeron Dual Core', 'Intel Pentium Quad Core', 'A4', 'A6', 'A8',
       'A10', 'A12'],
      dtype='object')

# Simplificamos el modelo

En primer lugar, vemos que no hay overfitting; por tanto, ni la regularización ni la carga polinómica mejoran el modelo. Por ende, veremos si hay underfitting y si podemos mejorar el modelo.

Hay cierta multicolinealidad entre dos columnas ('2 in 1 Convertible' y 'Touchscreen'). Por ende, como '2 in 1 Convertible' tiene menor correlación con nuestra target, la eliminamos del dataset.  

# Vamos a supersimplificar

El punto óptimo de menor error en test se alcanza cuando consideramos únicamente las variables cuya correlación con la target supera el 9% en valor absoluto (|r| > 0.09), que según la matriz de correlación son:
- Ram (0.74)
- Notebook (-0.53)
- Gaming (0.37)
- High_resolucion (0.36)
- Ultrabook (0.24)
- Weight (0.24)
- Workstation (0.23)
- Touchscreen (0.21)
- Memory (0.14)
- Netbook (-0.12)
- Chrome OS (-0.13)
- Linux (-0.16)
- No OS (-0.17)
- Windows 10 (0.12)
- Windows 7 (0.14)
- macOS (0.091)

In [37]:
# Reduced feature set: the sixteen columns enumerated in the markdown list above.
# NOTE: the next cell reassigns `X` with these columns plus the CPU indicator
# columns, so this selection is overwritten when the notebook is run top to bottom.
reduced_features = [
    'Ram', 'Memory', 'Weight',
    'Gaming', 'Netbook', 'Notebook',
    'Ultrabook', 'Workstation', 'Touchscreen', 'High_resolucion',
    'Chrome OS', 'Linux', 'No OS', 'Windows 10',
    'Windows 7', 'macOS',
]
X = train[reduced_features]

In [38]:
# Imports used by this cell, grouped at the top so its dependencies are visible at a glance.
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Feature matrix: the reduced feature set plus the CPU indicator columns.
X = train[['Ram', 'Memory', 'Weight',
       'Gaming', 'Netbook', 'Notebook',
       'Ultrabook', 'Workstation', 'Touchscreen', 'High_resolucion',
       'Chrome OS', 'Linux', 'No OS', 'Windows 10',
       'Windows 7', 'macOS', 'Intel Core i3', 'Intel Core i5', 'Intel Core i7',
       'Intel Celeron Dual Core', 'Intel Pentium Quad Core', 'A4', 'A6', 'A8',
       'A10', 'A12']]

# Target column.
y = train['Price_euros']

# Split into train and test partitions (80/20, fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.20,
                                                    random_state=0)

# Create and fit the linear regression model.
model = LinearRegression()
model.fit(X_train, y_train)

# Predict once per partition and reuse the result for every metric below,
# instead of calling `model.predict` again for each printed line.
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# (label, metric function) pairs, printed in this order for train and test.
error_metrics = [
    ('MAE', metrics.mean_absolute_error),
    ('MAPE', metrics.mean_absolute_percentage_error),
    ('MSE', metrics.mean_squared_error),
    ('RMSE', lambda y_true, y_pred: np.sqrt(metrics.mean_squared_error(y_true, y_pred))),
]

# Compute and report the errors.
for label, metric_fn in error_metrics:
    print(f'Train {label}:', metric_fn(y_train, y_train_pred))
    print(f'Test {label}:', metric_fn(y_test, y_test_pred))
    print("")

# Value returned by `model.score` on each partition.
print('Train score', model.score(X_train, y_train))
print('Test score', model.score(X_test, y_test))

Train MAE: 267.8734113108484
Test MAE: 264.95371728384

Train MAPE: 0.26046521718589616
Test MAPE: 0.27629252239254387

Train MSE: 151926.15162693337
Test MSE: 143743.86350683094

Train RMSE: 389.7770537460272
Test RMSE: 379.13567954866886

Train score 0.7054922259040936
Test score 0.7080447575043263


# Generamos las predicciones con los datos del dataset de test

In [39]:
# Load the test CSV that the submission is built from.
X_pred = pd.read_csv("Datos/test_tratado.csv")

# Same feature columns, in the same order, as the matrix the model was fitted on.
prediction_features = [
    'Ram', 'Memory', 'Weight',
    'Gaming', 'Netbook', 'Notebook',
    'Ultrabook', 'Workstation', 'Touchscreen', 'High_resolucion',
    'Chrome OS', 'Linux', 'No OS', 'Windows 10',
    'Windows 7', 'macOS', 'Intel Core i3', 'Intel Core i5', 'Intel Core i7',
    'Intel Celeron Dual Core', 'Intel Pentium Quad Core', 'A4', 'A6', 'A8',
    'A10', 'A12',
]
X_pred_1 = X_pred[prediction_features]

# Predicted prices wrapped in a one-column DataFrame (the column is labelled 0).
predictions = pd.DataFrame(model.predict(X_pred_1))

# Submission frame: the `id` column of the test data plus the predicted price.
submission = X_pred[['id']].copy()
submission['Price_euros'] = predictions[0].copy()

submission.shape


(391, 2)

# Pasamos el check

In [40]:
import urllib.request
from PIL import Image

# Reference submission file; `chequeator` below compares a candidate submission's
# shape, columns and ids against this frame.
sample = pd.read_csv("Datos/sample_submission.csv")

In [41]:
def chequeator(df_to_submit, sample_df=None, output_path="submission.csv"):
    """
    Validate a submission against the sample submission and save it if it matches.

    The submission passes when it has the same shape, the same column names in
    the same order, and the same `id` values in the same order as the reference
    frame. On success `df_to_submit` is written to `output_path` without its
    index, and a celebratory image is downloaded and shown on a best-effort
    basis. On failure a hint about what to fix is printed and nothing is written.

    Parameters
    ----------
    df_to_submit : pandas.DataFrame
        Candidate submission.
    sample_df : pandas.DataFrame, optional
        Reference frame to compare against. Defaults to the notebook-level
        `sample` variable.
    output_path : str, optional
        Destination of the CSV written on success.

    Returns
    -------
    None
    """
    reference = sample if sample_df is None else sample_df

    if df_to_submit.shape != reference.shape:
        print("Check the number of rows and/or columns and try again")
        print("\nMensaje secreto de Clara: No me puedo creer que después de todo este notebook hayas hecho algún cambio en las filas de `diamonds_test.csv`. Lloro.")
        return

    # Compare the labels themselves; `.all()` on each side only yields a
    # truthiness value, so comparing those accepts mismatching column names.
    if list(df_to_submit.columns) != list(reference.columns):
        print("Check the names of the columns and try again")
        return

    # Same reasoning for the ids: compare them element by element, in order.
    if df_to_submit["id"].tolist() != reference["id"].tolist():
        print("Check the ids and try again")
        return

    print("You're ready to submit!")
    # Write the frame that was validated, not a global variable.
    df_to_submit.to_csv(output_path, index=False)  # index=False is essential here

    # The image is purely decorative: a network or image-decoding failure must
    # not raise after the CSV has already been saved.
    try:
        urllib.request.urlretrieve("https://i.kym-cdn.com/photos/images/facebook/000/747/556/27a.jpg", "gfg.png")
        img = Image.open("gfg.png")
        img.show()
    except OSError as err:
        print(f"(Could not show the celebration image: {err})")

In [42]:
# chequeator(submission)

You're ready to submit!
