In [1]:
import numpy as np
import pandas as pd

from joblib import dump, load

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

import scipy.stats as stats

In [2]:
# Cap printed frames at 50 rows so previews stay short...
pd.set_option('display.max_rows', 50)
# ...but never hide columns: None removes the column limit for wide frames.
pd.set_option('display.max_columns', None)

# 1. Carga de los datos

In [3]:
import os

# Print the kernel's current working directory; the relative './data/...'
# paths used by the loading cells are resolved against it.
ruta = os.getcwd()
print(ruta)

/Users/duman/Documents/universidad/7semestre/BI/BI-202302/Lab2


In [11]:
# Read the labeled training set (comma-separated, UTF-8) into a DataFrame.
df_laptops_train = pd.read_csv(
    './data/laptop_data_train.csv',
    sep=',',
    encoding='utf-8',
)

In [12]:
# (rows, columns) of the training frame as loaded, before any cleaning.
df_laptops_train.shape

(1216, 14)

In [13]:
# Preview five randomly chosen training rows. The fixed random_state makes
# the preview show the same rows on every fresh run of the notebook.
df_laptops_train.sample(5, random_state=42)

Unnamed: 0,id,Company,TypeName,Ram,Weight,Price,TouchScreen,Ips,Ppi,Cpu_brand,HDD,SSD,Gpu_brand,Os
672,1155,HP,Notebook,4,1.91,10.147032,0,0,141.211998,AMD Processor,0.0,256,AMD,Windows
209,154,HP,Ultrabook,4,1.48,10.962127,0,0,157.350512,Intel Core i5,0.0,256,Intel,Windows
167,196,Razer,Gaming,32,3.49,12.691441,1,0,254.671349,Intel Core i7,0.0,1000,Nvidia,Windows
682,864,Dell,Ultrabook,16,1.29,11.689792,1,0,276.05353,Intel Core i7,0.0,512,Intel,Windows
234,405,Asus,Ultrabook,8,1.1,11.510858,0,0,157.350512,Intel Core i7,0.0,512,Intel,Windows


In [17]:
# Read the unlabeled test set (comma-separated, UTF-8) into a DataFrame.
df_laptops_unlabeled = pd.read_csv(
    './data/laptop_data_test_unlabeled.csv',
    sep=',',
    encoding='utf-8',
)

In [18]:
# (rows, columns) of the unlabeled test frame as loaded.
df_laptops_unlabeled.shape

(64, 14)

In [19]:
# Preview five randomly chosen unlabeled rows. The fixed random_state makes
# the preview show the same rows on every fresh run of the notebook.
df_laptops_unlabeled.sample(5, random_state=42)

Unnamed: 0,id,Company,TypeName,Ram,Weight,TouchScreen,Ips,Ppi,Cpu_brand,HDD,SSD,Gpu_brand,Os,CalculatedPrice
16,1154,Dell,Notebook,8,2.04,1,1,282.423996,Intel Core i5,0.0,256,Nvidia,Windows,
29,1137,Dell,Notebook,8,2.5,0,0,100.45467,Intel Core i5,1000.0,0,AMD,Windows,
5,342,HP,Notebook,8,2.1,0,1,141.211998,Intel Core i3,1000.0,0,Nvidia,Windows,
24,329,Dell,Notebook,32,2.06,1,0,282.423996,Intel Core i7,0.0,1000,Nvidia,Windows,
13,43,Acer,Notebook,4,2.2,0,1,141.211998,Intel Core i5,0.0,256,Intel,Windows,


# 2. Entendimiento y limpieza de los datos

In [20]:
# Columns chosen as relevant predictors for estimating a laptop's price.
# The order of this list is kept as-is because it is reused to select
# columns in later cells.
features = [
    'Company', 'TypeName',
    'Ram',
    'TouchScreen', 'Ips', 'Ppi',
    'Cpu_brand',
    'HDD', 'SSD',
    'Gpu_brand',
    'Os',
]

In [21]:
# Fraction of missing (null) values in Price and in each selected feature.
# The mean of the boolean null mask is the null count divided by the row count.
df_laptops_train[['Price'] + features].isna().mean()

Price          0.012336
Company        0.000000
TypeName       0.000000
Ram            0.000000
TouchScreen    0.000000
Ips            0.000000
Ppi            0.009868
Cpu_brand      0.000000
HDD            0.013158
SSD            0.000000
Gpu_brand      0.000000
Os             0.000000
dtype: float64

In [22]:
# Before dropping incomplete rows, save a report of the rows that will be
# removed: every row where Price or any selected feature is missing.
# (.isna() and .isnull() are aliases of each other.)
# NOTE: this must run while df_laptops_train still holds the raw data; once
# the frame has been reassigned to its dropna() result, the report is empty.
entradas_a_eliminar = df_laptops_train.loc[
    df_laptops_train[['Price'] + features].isna().any(axis='columns')
]
entradas_a_eliminar.to_csv(
    './data/entradas_a_eliminar.csv',
    sep=',',
    encoding='utf-8',
    index=False,
)

# Shape of the report, i.e. how many rows are going to be removed
entradas_a_eliminar.shape

(36, 14)

In [23]:
# Drop rows with missing data.
# Training rows must have both Price and every selected feature.
df_laptops_train = df_laptops_train.dropna(
    axis='index',
    subset=['Price', *features],
)
# The unlabeled set is filtered on the features only; the result is stored
# under a new name, so df_laptops_unlabeled itself is left unchanged.
df_laptops_test_unlabeled = df_laptops_unlabeled.dropna(
    axis='index',
    subset=features,
)

In [24]:
# (rows, columns) of the training frame after dropping incomplete rows.
df_laptops_train.shape

(1180, 14)