# Laptops market 💻
# Precio Portátiles 💻
## Veamos cómo de buenos somos prediciendo el valor de los productos.

### Overview
Nuestro jefe estaba buscando un@s maquinas para obtener datos de la competencia y poder aplicarle los conocimientos obtenidos para asignar precios a nuestra tienda de "MERIMARKT".
Lamentablemente se habían ido de vacaciones y nos lo ha pedido a nosotr@s …
💥🪓🔪

Nos toca arremangarnos las mangas y aplicar los conocimientos obtenidos en ML para obtener un modelo de predicción de precios de portátiles en función de sus marcas y prestaciones para poder lanzarlos a un precio competitivo al mercado.

### Evaluation
En esta tarea, utilizaremos el error absoluto medio (MAE) para evaluar la eficacia del modelo.

---
---

In [663]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error, root_mean_squared_error


#### DATASET

In [664]:
data = pd.read_csv('./data/train.csv')

In [665]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 912 entries, 0 to 911
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   ID                912 non-null    int64  
 1   Company           912 non-null    object 
 2   Product           912 non-null    object 
 3   TypeName          912 non-null    object 
 4   Inches            912 non-null    float64
 5   ScreenResolution  912 non-null    object 
 6   Cpu               912 non-null    object 
 7   Ram               912 non-null    object 
 8   Memory            912 non-null    object 
 9   Gpu               912 non-null    object 
 10  OpSys             912 non-null    object 
 11  Weight            912 non-null    object 
 12  Price_euros       912 non-null    float64
dtypes: float64(2), int64(1), object(10)
memory usage: 92.8+ KB


In [666]:
data.describe()

Unnamed: 0,ID,Inches,Price_euros
count,912.0,912.0,912.0
mean,666.192982,15.011404,1108.122873
std,384.873846,1.411744,714.597741
min,1.0,10.1,174.0
25%,330.5,14.0,589.0
50%,673.5,15.6,949.0
75%,998.5,15.6,1458.5
max,1320.0,18.4,6099.0


In [667]:
data.describe(include='all')

Unnamed: 0,ID,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price_euros
count,912.0,912,912,912,912.0,912,912,912,912,912,912,912,912.0
unique,,19,475,6,,35,104,8,36,91,9,158,
top,,Lenovo,Inspiron 3567,Notebook,,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,Windows 10,2.2kg,
freq,,208,21,522,,346,142,434,293,199,746,86,
mean,666.192982,,,,15.011404,,,,,,,,1108.122873
std,384.873846,,,,1.411744,,,,,,,,714.597741
min,1.0,,,,10.1,,,,,,,,174.0
25%,330.5,,,,14.0,,,,,,,,589.0
50%,673.5,,,,15.6,,,,,,,,949.0
75%,998.5,,,,15.6,,,,,,,,1458.5


In [668]:
data.head()

Unnamed: 0,ID,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price_euros
0,1002,Dell,Inspiron 5567,Notebook,15.6,1366x768,Intel Core i7 7500U 2.7GHz,8GB,1TB HDD,AMD Radeon R7 M445,Windows 10,2.36kg,749.0
1,867,Asus,X541NA (N4200/4GB/1TB/W10),Notebook,15.6,1366x768,Intel Pentium Quad Core N4200 1.1GHz,4GB,1TB HDD,Intel HD Graphics 505,Windows 10,2kg,449.0
2,966,Toshiba,Portege Z30-C-1CW,Notebook,13.3,Full HD 1920x1080,Intel Core i5 6200U 2.3GHz,8GB,256GB SSD,Intel HD Graphics 520,Windows 7,1.2kg,1460.0
3,767,Dell,Alienware 17,Gaming,15.6,IPS Panel 4K Ultra HD 3840x2160,Intel Core i7 7700HQ 2.8GHz,16GB,256GB SSD + 1TB HDD,Nvidia GeForce GTX 1070,Windows 10,4.42kg,2868.99
4,1241,Dell,Latitude E7270,Ultrabook,12.5,Full HD / Touchscreen 1920x1080,Intel Core i5 6300U 2.4GHz,8GB,256GB SSD,Intel HD Graphics 520,Windows 7,1.26kg,1713.37


#### DATA CLEANING

In [669]:
data = data.rename(columns=str.lower)

In [670]:

data = data.rename(columns={'price_euros' : 'price'})

In [671]:
data = data.drop('id', axis=1)

In [672]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 912 entries, 0 to 911
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   company           912 non-null    object 
 1   product           912 non-null    object 
 2   typename          912 non-null    object 
 3   inches            912 non-null    float64
 4   screenresolution  912 non-null    object 
 5   cpu               912 non-null    object 
 6   ram               912 non-null    object 
 7   memory            912 non-null    object 
 8   gpu               912 non-null    object 
 9   opsys             912 non-null    object 
 10  weight            912 non-null    object 
 11  price             912 non-null    float64
dtypes: float64(2), object(10)
memory usage: 85.6+ KB


In [673]:
# pd.get_dummies(data['company'], dtype=int)

In [674]:
data_object = 'company'
data = pd.concat([data, pd.get_dummies(data[data_object], prefix=data_object, dtype=int)], axis=1)
data.drop(columns=[data_object], inplace=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 912 entries, 0 to 911
Data columns (total 30 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   product            912 non-null    object 
 1   typename           912 non-null    object 
 2   inches             912 non-null    float64
 3   screenresolution   912 non-null    object 
 4   cpu                912 non-null    object 
 5   ram                912 non-null    object 
 6   memory             912 non-null    object 
 7   gpu                912 non-null    object 
 8   opsys              912 non-null    object 
 9   weight             912 non-null    object 
 10  price              912 non-null    float64
 11  company_Acer       912 non-null    int64  
 12  company_Apple      912 non-null    int64  
 13  company_Asus       912 non-null    int64  
 14  company_Chuwi      912 non-null    int64  
 15  company_Dell       912 non-null    int64  
 16  company_Fujitsu    912 non

In [675]:
data_object = 'product'
data.drop(columns=[data_object], inplace=True)
data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 912 entries, 0 to 911
Data columns (total 29 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   typename           912 non-null    object 
 1   inches             912 non-null    float64
 2   screenresolution   912 non-null    object 
 3   cpu                912 non-null    object 
 4   ram                912 non-null    object 
 5   memory             912 non-null    object 
 6   gpu                912 non-null    object 
 7   opsys              912 non-null    object 
 8   weight             912 non-null    object 
 9   price              912 non-null    float64
 10  company_Acer       912 non-null    int64  
 11  company_Apple      912 non-null    int64  
 12  company_Asus       912 non-null    int64  
 13  company_Chuwi      912 non-null    int64  
 14  company_Dell       912 non-null    int64  
 15  company_Fujitsu    912 non-null    int64  
 16  company_Google     912 non

In [676]:
data_object = 'typename'
data = pd.concat([data, pd.get_dummies(data[data_object], prefix=data_object, dtype=int)], axis=1)
data.drop(columns=[data_object], inplace=True)
data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 912 entries, 0 to 911
Data columns (total 34 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   inches                       912 non-null    float64
 1   screenresolution             912 non-null    object 
 2   cpu                          912 non-null    object 
 3   ram                          912 non-null    object 
 4   memory                       912 non-null    object 
 5   gpu                          912 non-null    object 
 6   opsys                        912 non-null    object 
 7   weight                       912 non-null    object 
 8   price                        912 non-null    float64
 9   company_Acer                 912 non-null    int64  
 10  company_Apple                912 non-null    int64  
 11  company_Asus                 912 non-null    int64  
 12  company_Chuwi                912 non-null    int64  
 13  company_Dell        

In [677]:
def process_screen_resolution(resolution):
    result = {
        'screen_4K': 0,  
        'screen_HD': 0,
        'screen_Touchscreen': 0,  
        'screen_Retina': 0,
        'screen_Ultra': 0,
        'screen_width': None,  
        'screen_height': None
    }

    if '4K' in resolution:
        result['screen_4K'] = 1  
    if 'Full HD' in resolution or 'HD' in resolution:
        result['screen_HD'] = 1
    if 'Retina' in resolution:
        result['screen_Retina'] = 1
    
    match = re.search(r'(\d{3,4})x(\d{3,4})', resolution)
    if match:
        width, height = match.groups()
        result['screen_width'] = int(width)
        result['screen_height'] = int(height)

    return result

data_screen = data['screenresolution'].apply(process_screen_resolution)

data_screen = pd.DataFrame(data_screen.tolist())

data = pd.concat([data, data_screen], axis=1)

data.drop(columns=['screenresolution'], inplace=True)

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 912 entries, 0 to 911
Data columns (total 40 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   inches                       912 non-null    float64
 1   cpu                          912 non-null    object 
 2   ram                          912 non-null    object 
 3   memory                       912 non-null    object 
 4   gpu                          912 non-null    object 
 5   opsys                        912 non-null    object 
 6   weight                       912 non-null    object 
 7   price                        912 non-null    float64
 8   company_Acer                 912 non-null    int64  
 9   company_Apple                912 non-null    int64  
 10  company_Asus                 912 non-null    int64  
 11  company_Chuwi                912 non-null    int64  
 12  company_Dell                 912 non-null    int64  
 13  company_Fujitsu     

In [678]:
def extract_cpu_info(cpu_string):
    cpu_intel = 0
    cpu_amd = 0
    cpu_ghz = None

    if 'Intel' in cpu_string:
        cpu_intel = 1
        match = re.search(r'(\d+\.?\d*)GHz', cpu_string)
        if match:
            cpu_ghz = float(match.group(1))
    
    if 'AMD' in cpu_string:
        cpu_amd = 1
        match = re.search(r'(\d+\.?\d*)GHz', cpu_string)
        if match:
            cpu_ghz = float(match.group(1))

    return pd.Series([cpu_intel, cpu_amd, cpu_ghz], index=['cpu_intel', 'cpu_amd', 'cpu_ghz'])

# Suponiendo que 'data' es un DataFrame y 'cpu' es la columna que contiene la información de la CPU.
data[['cpu_intel', 'cpu_amd', 'cpu_ghz']] = data['cpu'].apply(extract_cpu_info)

data['cpu_intel'] = data['cpu_intel'].astype(int)
data['cpu_amd'] = data['cpu_amd'].astype(int)

data.drop(columns=['cpu'], inplace=True)

data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 912 entries, 0 to 911
Data columns (total 42 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   inches                       912 non-null    float64
 1   ram                          912 non-null    object 
 2   memory                       912 non-null    object 
 3   gpu                          912 non-null    object 
 4   opsys                        912 non-null    object 
 5   weight                       912 non-null    object 
 6   price                        912 non-null    float64
 7   company_Acer                 912 non-null    int64  
 8   company_Apple                912 non-null    int64  
 9   company_Asus                 912 non-null    int64  
 10  company_Chuwi                912 non-null    int64  
 11  company_Dell                 912 non-null    int64  
 12  company_Fujitsu              912 non-null    int64  
 13  company_Google      

In [679]:
data.describe()


Unnamed: 0,inches,price,company_Acer,company_Apple,company_Asus,company_Chuwi,company_Dell,company_Fujitsu,company_Google,company_HP,...,screen_4K,screen_HD,screen_Touchscreen,screen_Retina,screen_Ultra,screen_width,screen_height,cpu_intel,cpu_amd,cpu_ghz
count,912.0,912.0,912.0,912.0,912.0,912.0,912.0,912.0,912.0,912.0,...,912.0,912.0,912.0,912.0,912.0,912.0,912.0,912.0,912.0,912.0
mean,15.011404,1108.122873,0.082237,0.01864,0.118421,0.002193,0.226974,0.003289,0.002193,0.207237,...,0.037281,0.691886,0.0,0.015351,0.0,1890.050439,1067.899123,0.95614,0.04386,2.287895
std,1.411744,714.597741,0.274876,0.135325,0.323283,0.046804,0.419105,0.057291,0.046804,0.405549,...,0.189553,0.461967,0.0,0.123011,0.0,506.007321,290.338307,0.204895,0.204895,0.513277
min,10.1,174.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1366.0,768.0,0.0,0.0,0.9
25%,14.0,589.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1366.0,768.0,1.0,0.0,2.0
50%,15.6,949.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1920.0,1080.0,1.0,0.0,2.5
75%,15.6,1458.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1920.0,1080.0,1.0,0.0,2.7
max,18.4,6099.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,0.0,1.0,0.0,3840.0,2160.0,1.0,1.0,3.6


In [680]:
def extract_ram(ram_string):
    match = re.search(r'(\d+)GB', ram_string)
    if match:
        return int(match.group(1))
    return None

data['ram_numeric'] = data['ram'].apply(extract_ram)

data.drop(columns=['ram'], inplace=True)

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 912 entries, 0 to 911
Data columns (total 42 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   inches                       912 non-null    float64
 1   memory                       912 non-null    object 
 2   gpu                          912 non-null    object 
 3   opsys                        912 non-null    object 
 4   weight                       912 non-null    object 
 5   price                        912 non-null    float64
 6   company_Acer                 912 non-null    int64  
 7   company_Apple                912 non-null    int64  
 8   company_Asus                 912 non-null    int64  
 9   company_Chuwi                912 non-null    int64  
 10  company_Dell                 912 non-null    int64  
 11  company_Fujitsu              912 non-null    int64  
 12  company_Google               912 non-null    int64  
 13  company_HP          

In [681]:
# data_object = 'memory'
# # data = pd.concat([data, pd.get_dummies(data[data_object], prefix=data_object, dtype=int)], axis=1)
# data.drop(columns=[data_object], inplace=True)
# data.info()


In [682]:
def extract_memory_info(memory_string):
    memory_types = {
        'memory_HDD': 0,
        'memory_SSD': 0,
        'memory_Flash': 0
    }
    memory_sizes = {    
        'memory_HDD_GB': 0,
        'memory_SSD_GB': 0,
        'memory_Flash_GB': 0
    }
    
    matches = re.findall(r'(\d+)([A-Za-z ]+)', memory_string)
    
    for match in matches:
        size = int(match[0])
        unit = match[1].strip().upper()
        
        if 'TB' in unit:
            size *= 1000  
        
        if 'HDD' in unit:
            memory_types['memory_HDD'] = 1
            memory_sizes['memory_HDD_GB'] += size
        elif 'SSD' in unit:
            memory_types['memory_SSD'] = 1
            memory_sizes['memory_SSD_GB'] += size
        elif 'FLASH STORAGE' in unit:
            memory_types['memory_Flash'] = 1
            memory_sizes['memory_Flash_GB'] += size
    
    result = {**memory_types, **memory_sizes}
    return pd.Series(result)

data[['memory_HDD', 'memory_SSD', 'memory_Flash', 'memory_HDD_GB', 'memory_SSD_GB', 'memory_Flash_GB']] = data['memory'].apply(extract_memory_info)

# Eliminar la columna 'Memory'
data.drop(columns=['memory'], inplace=True)

data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 912 entries, 0 to 911
Data columns (total 47 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   inches                       912 non-null    float64
 1   gpu                          912 non-null    object 
 2   opsys                        912 non-null    object 
 3   weight                       912 non-null    object 
 4   price                        912 non-null    float64
 5   company_Acer                 912 non-null    int64  
 6   company_Apple                912 non-null    int64  
 7   company_Asus                 912 non-null    int64  
 8   company_Chuwi                912 non-null    int64  
 9   company_Dell                 912 non-null    int64  
 10  company_Fujitsu              912 non-null    int64  
 11  company_Google               912 non-null    int64  
 12  company_HP                   912 non-null    int64  
 13  company_Huawei      

In [683]:
data.describe()

Unnamed: 0,inches,price,company_Acer,company_Apple,company_Asus,company_Chuwi,company_Dell,company_Fujitsu,company_Google,company_HP,...,cpu_intel,cpu_amd,cpu_ghz,ram_numeric,memory_HDD,memory_SSD,memory_Flash,memory_HDD_GB,memory_SSD_GB,memory_Flash_GB
count,912.0,912.0,912.0,912.0,912.0,912.0,912.0,912.0,912.0,912.0,...,912.0,912.0,912.0,912.0,912.0,912.0,912.0,912.0,912.0,912.0
mean,15.011404,1108.122873,0.082237,0.01864,0.118421,0.002193,0.226974,0.003289,0.002193,0.207237,...,0.95614,0.04386,2.287895,8.188596,0.4375,0.64364,0.059211,400.254386,180.074561,5.22807
std,1.411744,714.597741,0.274876,0.135325,0.323283,0.046804,0.419105,0.057291,0.046804,0.405549,...,0.204895,0.204895,0.513277,4.899827,0.496351,0.479186,0.236148,503.985613,184.580513,34.250715
min,10.1,174.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.9,2.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,14.0,589.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,2.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,15.6,949.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,2.5,8.0,0.0,1.0,0.0,0.0,180.0,0.0
75%,15.6,1458.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,2.7,8.0,1.0,1.0,0.0,1000.0,256.0,0.0
max,18.4,6099.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,3.6,32.0,1.0,1.0,1.0,2000.0,1024.0,512.0


In [684]:
data.head()

Unnamed: 0,inches,gpu,opsys,weight,price,company_Acer,company_Apple,company_Asus,company_Chuwi,company_Dell,...,cpu_intel,cpu_amd,cpu_ghz,ram_numeric,memory_HDD,memory_SSD,memory_Flash,memory_HDD_GB,memory_SSD_GB,memory_Flash_GB
0,15.6,AMD Radeon R7 M445,Windows 10,2.36kg,749.0,0,0,0,0,1,...,1,0,2.7,8,1,0,0,1000,0,0
1,15.6,Intel HD Graphics 505,Windows 10,2kg,449.0,0,0,1,0,0,...,1,0,1.1,4,1,0,0,1000,0,0
2,13.3,Intel HD Graphics 520,Windows 7,1.2kg,1460.0,0,0,0,0,0,...,1,0,2.3,8,0,1,0,0,256,0
3,15.6,Nvidia GeForce GTX 1070,Windows 10,4.42kg,2868.99,0,0,0,0,1,...,1,0,2.8,16,1,1,0,1000,256,0
4,12.5,Intel HD Graphics 520,Windows 7,1.26kg,1713.37,0,0,0,0,1,...,1,0,2.4,8,0,1,0,0,256,0


In [685]:
data_object = 'gpu'
# data = pd.concat([data, pd.get_dummies(data[data_object], prefix=data_object, dtype=int)], axis=1)
data.drop(columns=[data_object], inplace=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 912 entries, 0 to 911
Data columns (total 46 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   inches                       912 non-null    float64
 1   opsys                        912 non-null    object 
 2   weight                       912 non-null    object 
 3   price                        912 non-null    float64
 4   company_Acer                 912 non-null    int64  
 5   company_Apple                912 non-null    int64  
 6   company_Asus                 912 non-null    int64  
 7   company_Chuwi                912 non-null    int64  
 8   company_Dell                 912 non-null    int64  
 9   company_Fujitsu              912 non-null    int64  
 10  company_Google               912 non-null    int64  
 11  company_HP                   912 non-null    int64  
 12  company_Huawei               912 non-null    int64  
 13  company_LG          

In [686]:
data_object = 'opsys'
data = pd.concat([data, pd.get_dummies(data[data_object], prefix=data_object, dtype=int)], axis=1)
data.drop(columns=[data_object], inplace=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 912 entries, 0 to 911
Data columns (total 54 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   inches                       912 non-null    float64
 1   weight                       912 non-null    object 
 2   price                        912 non-null    float64
 3   company_Acer                 912 non-null    int64  
 4   company_Apple                912 non-null    int64  
 5   company_Asus                 912 non-null    int64  
 6   company_Chuwi                912 non-null    int64  
 7   company_Dell                 912 non-null    int64  
 8   company_Fujitsu              912 non-null    int64  
 9   company_Google               912 non-null    int64  
 10  company_HP                   912 non-null    int64  
 11  company_Huawei               912 non-null    int64  
 12  company_LG                   912 non-null    int64  
 13  company_Lenovo      

In [687]:
def extract_weight(weight_string):
    match = re.search(r'(\d+(\.\d+)?)', weight_string)
    if match:
        return float(match.group(1))
    return None

data['weight_kg'] = data['weight'].apply(extract_weight)

data.drop(columns=['weight'], inplace=True)

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 912 entries, 0 to 911
Data columns (total 54 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   inches                       912 non-null    float64
 1   price                        912 non-null    float64
 2   company_Acer                 912 non-null    int64  
 3   company_Apple                912 non-null    int64  
 4   company_Asus                 912 non-null    int64  
 5   company_Chuwi                912 non-null    int64  
 6   company_Dell                 912 non-null    int64  
 7   company_Fujitsu              912 non-null    int64  
 8   company_Google               912 non-null    int64  
 9   company_HP                   912 non-null    int64  
 10  company_Huawei               912 non-null    int64  
 11  company_LG                   912 non-null    int64  
 12  company_Lenovo               912 non-null    int64  
 13  company_MSI         

In [688]:
data.head()

Unnamed: 0,inches,price,company_Acer,company_Apple,company_Asus,company_Chuwi,company_Dell,company_Fujitsu,company_Google,company_HP,...,opsys_Android,opsys_Chrome OS,opsys_Linux,opsys_Mac OS X,opsys_No OS,opsys_Windows 10,opsys_Windows 10 S,opsys_Windows 7,opsys_macOS,weight_kg
0,15.6,749.0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,2.36
1,15.6,449.0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,2.0
2,13.3,1460.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1.2
3,15.6,2868.99,0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,4.42
4,12.5,1713.37,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,1.26


In [689]:
columns = list(data.columns)

index_inches = columns.index('inches')
index_price = columns.index('price')

columns[index_inches], columns[index_price] = columns[index_price], columns[index_inches]

data = data[columns]

In [690]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 912 entries, 0 to 911
Data columns (total 54 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   price                        912 non-null    float64
 1   inches                       912 non-null    float64
 2   company_Acer                 912 non-null    int64  
 3   company_Apple                912 non-null    int64  
 4   company_Asus                 912 non-null    int64  
 5   company_Chuwi                912 non-null    int64  
 6   company_Dell                 912 non-null    int64  
 7   company_Fujitsu              912 non-null    int64  
 8   company_Google               912 non-null    int64  
 9   company_HP                   912 non-null    int64  
 10  company_Huawei               912 non-null    int64  
 11  company_LG                   912 non-null    int64  
 12  company_Lenovo               912 non-null    int64  
 13  company_MSI         

#### TRAIN


In [691]:
X = data.drop('price', axis=1)
y = data['price']


In [692]:
X.describe()

Unnamed: 0,inches,company_Acer,company_Apple,company_Asus,company_Chuwi,company_Dell,company_Fujitsu,company_Google,company_HP,company_Huawei,...,opsys_Android,opsys_Chrome OS,opsys_Linux,opsys_Mac OS X,opsys_No OS,opsys_Windows 10,opsys_Windows 10 S,opsys_Windows 7,opsys_macOS,weight_kg
count,912.0,912.0,912.0,912.0,912.0,912.0,912.0,912.0,912.0,912.0,...,912.0,912.0,912.0,912.0,912.0,912.0,912.0,912.0,912.0,912.0
mean,15.011404,0.082237,0.01864,0.118421,0.002193,0.226974,0.003289,0.002193,0.207237,0.001096,...,0.002193,0.02193,0.051535,0.006579,0.048246,0.817982,0.004386,0.035088,0.012061,2.032112
std,1.411744,0.274876,0.135325,0.323283,0.046804,0.419105,0.057291,0.046804,0.405549,0.033113,...,0.046804,0.146535,0.221208,0.080888,0.214402,0.386071,0.066117,0.184103,0.10922,0.653772
min,10.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.69
25%,14.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.5
50%,15.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.04
75%,15.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.3
max,18.4,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,4.6


In [693]:
# scaler = StandardScaler()
# X = scaler.fit_transform(X)

# X_df = pd.DataFrame(X, columns=data.columns[1:])
# X_df.describe()

In [694]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [695]:
print('X_train:', X_train.shape)
print('X_test:', X_test.shape)
print('y_train:', y_train.shape)
print('y_test:', y_test.shape)


X_train: (729, 53)
X_test: (183, 53)
y_train: (729,)
y_test: (183,)


In [696]:
model = LinearRegression()

model.fit(X_train, y_train)



In [697]:
y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
print(f'Mean Absolute Error: {mae}')

mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

rmse = root_mean_squared_error(y_test, y_pred)
print(f'Root Mean Squared Error: {rmse}')

r2 = r2_score(y_test, y_pred)
print(f'R-squared: {r2}')


Mean Absolute Error: 229.04283228237324
Mean Squared Error: 111451.61174639546
Root Mean Squared Error: 333.8436935848803
R-squared: 0.7419540702740595


In [698]:
predicciones = model.predict(X_test)

print(predicciones)

[1622.20603703 1111.08381443  955.35573813 1015.42435113 1685.56596486
  337.30938699  394.39926738  318.76893756  487.73931166  838.77384596
  263.51560814 2477.3327303   702.26100271 1118.40422523  397.55077316
 1300.09203103  462.21806141 1329.50881677  136.85774997  882.80617651
  600.24497711  984.37752852 1616.85496187  604.82736123 1432.3510181
  618.03801518 1077.02666168  942.04474321 1425.33188803 1142.20410888
  861.30084602 2087.22714189  420.72650296  989.06361486  328.07566194
  561.49496793  -81.96074107 2086.25799899 1222.53564846 1995.70043732
  857.10652155  919.97056515 2121.83734683  192.17020334  571.04764684
  303.70545074 1384.35203563  635.6749334  1071.95293172 1114.28323438
 1840.0742171  1811.67603237  804.5763954  1267.023131   1203.04653111
 1036.92597497  257.64049152  441.54393142  726.96708959 1244.0579302
 1271.27083677  533.29203752 1154.102471   1103.70852637 1864.4048187
  893.3575403  1251.45946344  522.51772251 1013.64839663  712.40602481
  305.520

#### PREDICT

In [699]:
import csv
import os
from datetime import datetime

# Get the current date and time
current_datetime = datetime.now()

# Construct the filename
filename = f"submission_dgerwig_{current_datetime.strftime('%Y_%m_%d__%H_%M')}.csv"

# Directory where the file will be saved
directory = "submissions"

# Create the directory if it doesn't exist
if not os.path.exists(directory):
    os.makedirs(directory)

# Full path for the file
filepath = os.path.join(directory, filename)

# Data for the CSV (example)
# data = [
#     {"ID": 1, "Price_euros": 100.0},
#     {"ID": 2, "Price_euros": 150.5},
#     {"ID": 3, "Price_euros": 200.75}
# ]

# CSV fields
fields = ["ID", "Price_euros"]

# Create the CSV file
with open(filepath, mode='w', newline='') as csv_file:
    csv_writer = csv.DictWriter(csv_file, fieldnames=fields)

    # Write the header
    csv_writer.writeheader()

    # Write the data
    csv_writer.writerows(data)

print(f"✅ File '{filepath}' generated successfully.")


AttributeError: 'str' object has no attribute 'keys'

#### EVALUATE


#### SUBMMIT

In [None]:
import os
import glob

def get_most_recent_file(directory):
    # Ensure the directory path uses the correct separator
    directory = os.path.abspath(directory)
    files = glob.glob(os.path.join(directory, "*"))
    if not files:
        return None
    return max(files, key=os.path.getmtime)

directory = "./submissions"
most_recent_file = get_most_recent_file(directory)

if most_recent_file is None:
    print("No files found in the submissions directory.")
else:
    print(f"Most recent file: {most_recent_file}")
