# Laptops market 💻
# Precio Portátiles 💻
## Veamos cómo de buenos somos prediciendo el valor de los productos.

### Overview
Nuestro jefe estaba buscando un@s máquinas para obtener datos de la competencia y poder aplicarles los conocimientos obtenidos para asignar precios en nuestra tienda "MERIMARKT".
Lamentablemente se habían ido de vacaciones y nos lo ha pedido a nosotr@s …
💥🪓🔪

Nos toca arremangarnos y aplicar los conocimientos adquiridos en ML para construir un modelo de predicción de precios de portátiles en función de sus marcas y prestaciones, y así poder lanzarlos al mercado a un precio competitivo.

### Evaluation
En esta tarea, utilizaremos el error absoluto medio (MAE) para evaluar la eficacia del modelo.

---
---

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error, root_mean_squared_error


#### DATASET

In [None]:
# Load the training set; the path is relative to the notebook's working directory.
data = pd.read_csv('./data/train.csv')

In [None]:
data.info()

In [None]:
data.describe()

In [None]:
data.describe(include='all')

In [None]:
data.head()

#### DATA CLEANING

In [None]:
# Lower-case every column name so the later cells can use one spelling.
data = data.rename(columns=str.lower)

In [None]:

# Shorter name for the target column.
data = data.rename(columns={'price_euros' : 'price'})

In [None]:
# The row identifier is not used as a feature.
data = data.drop('id', axis=1)

In [None]:
data.info()

In [None]:
# pd.get_dummies(data['company'], dtype=int)

In [None]:
# One-hot encode the manufacturer and swap the indicators in for the text column.
data_object = 'company'
company_dummies = pd.get_dummies(data[data_object], prefix=data_object, dtype=int)
data = pd.concat([data, company_dummies], axis=1).drop(columns=[data_object])
data.info()

In [None]:
# The product name is discarded without being encoded.
data_object = 'product'
data = data.drop(columns=[data_object])
data.info()


In [None]:
# One-hot encode the laptop type and swap the indicators in for the text column.
data_object = 'typename'
typename_dummies = pd.get_dummies(data[data_object], prefix=data_object, dtype=int)
data = pd.concat([data, typename_dummies], axis=1).drop(columns=[data_object])
data.info()


In [None]:
def process_screen_resolution(resolution):
    """Turn a free-text screen description into numeric features.

    Args:
        resolution: Description such as
            ``"IPS Panel Full HD / Touchscreen 1920x1080"``.

    Returns:
        dict with the binary flags ``screen_4K``, ``screen_HD``,
        ``screen_Touchscreen``, ``screen_Retina`` and ``screen_Ultra`` (1 when
        the keyword occurs in the text, else 0), followed by ``screen_width``
        and ``screen_height`` in pixels (``None`` when no ``<W>x<H>`` pair of
        3-4 digit numbers is present).
    """
    # Plain substring checks, so 'HD' also fires for 'Full HD', 'Quad HD+' and
    # '4K Ultra HD'. The key order is kept stable for whoever expands these
    # dicts into columns.
    keywords = {
        'screen_4K': '4K',
        'screen_HD': 'HD',
        'screen_Touchscreen': 'Touchscreen',
        'screen_Retina': 'Retina',
        'screen_Ultra': 'Ultra',
    }
    result = {flag: int(word in resolution) for flag, word in keywords.items()}
    result['screen_width'] = None
    result['screen_height'] = None

    match = re.search(r'(\d{3,4})x(\d{3,4})', resolution)
    if match:
        width, height = match.groups()
        result['screen_width'] = int(width)
        result['screen_height'] = int(height)

    return result

# Parse every resolution string, expand the resulting dicts into columns and
# swap them in for the raw text column.
data_screen = pd.DataFrame(data['screenresolution'].apply(process_screen_resolution).tolist())

data = pd.concat([data, data_screen], axis=1).drop(columns=['screenresolution'])

data.info()

In [None]:
def extract_cpu_info(cpu_string):
    """Extract vendor flags and clock speed from a CPU description.

    Args:
        cpu_string: Text such as ``"Intel Core i5 7200U 2.5GHz"``.

    Returns:
        pandas.Series indexed by ``cpu_intel``, ``cpu_amd`` (1 when the vendor
        name occurs in the text, else 0) and ``cpu_ghz`` (the number directly
        preceding ``GHz``; missing when the text has no such number).
    """
    cpu_intel = int('Intel' in cpu_string)
    cpu_amd = int('AMD' in cpu_string)

    # The clock speed is parsed for every vendor: tying it to the Intel/AMD
    # checks left CPUs from other manufacturers without a frequency.
    match = re.search(r'(\d+\.?\d*)GHz', cpu_string)
    cpu_ghz = float(match.group(1)) if match else None

    return pd.Series([cpu_intel, cpu_amd, cpu_ghz], index=['cpu_intel', 'cpu_amd', 'cpu_ghz'])

# Spread the parsed CPU features over three new columns.
cpu_columns = ['cpu_intel', 'cpu_amd', 'cpu_ghz']
data[cpu_columns] = data['cpu'].apply(extract_cpu_info)

# Store the two vendor flags as integers.
data = data.astype({'cpu_intel': int, 'cpu_amd': int})

data = data.drop(columns=['cpu'])

data.info()


In [None]:
data.describe()


In [None]:
def extract_ram(ram_string):
    """Return the integer that precedes ``GB`` in *ram_string*, or None if absent."""
    found = re.search(r'(\d+)GB', ram_string)
    return int(found.group(1)) if found else None

# Replace the textual RAM column with its numeric size.
data = data.assign(ram_numeric=data['ram'].apply(extract_ram)).drop(columns=['ram'])

data.info()

In [None]:
def extract_memory_info(memory_string):
    """Summarise a storage description by drive type and capacity.

    Args:
        memory_string: Text such as ``"128GB SSD +  1TB HDD"`` or
            ``"1.0TB HDD"``.

    Returns:
        pandas.Series with the presence flags ``memory_HDD``, ``memory_SSD``,
        ``memory_Flash`` and the summed capacities ``memory_HDD_GB``,
        ``memory_SSD_GB``, ``memory_Flash_GB`` (terabytes counted as 1000 GB).
        Drives whose description matches none of the three types (for example
        a bare "Hybrid") are ignored.
    """
    memory_types = {
        'memory_HDD': 0,
        'memory_SSD': 0,
        'memory_Flash': 0
    }
    memory_sizes = {
        'memory_HDD_GB': 0,
        'memory_SSD_GB': 0,
        'memory_Flash_GB': 0
    }

    # The number may carry a decimal part ('1.0TB HDD'); matching digits only
    # would split it at the dot and record the drive as 0 GB.
    matches = re.findall(r'(\d+(?:\.\d+)?)([A-Za-z ]+)', memory_string)

    for number, description in matches:
        size = float(number)
        unit = description.strip().upper()

        if 'TB' in unit:
            size *= 1000
        # Capacities stay integers, as they were for whole-number inputs.
        size = int(round(size))

        if 'HDD' in unit:
            memory_types['memory_HDD'] = 1
            memory_sizes['memory_HDD_GB'] += size
        elif 'SSD' in unit:
            memory_types['memory_SSD'] = 1
            memory_sizes['memory_SSD_GB'] += size
        elif 'FLASH STORAGE' in unit:
            memory_types['memory_Flash'] = 1
            memory_sizes['memory_Flash_GB'] += size

    result = {**memory_types, **memory_sizes}
    return pd.Series(result)

# Attach the six storage features and drop the raw description.
memory_columns = [
    'memory_HDD', 'memory_SSD', 'memory_Flash',
    'memory_HDD_GB', 'memory_SSD_GB', 'memory_Flash_GB',
]
data[memory_columns] = data['memory'].apply(extract_memory_info)

data = data.drop(columns=['memory'])

data.info()


In [None]:
data.describe()

In [None]:
data.head()

In [None]:
data_object = 'gpu'
# One-hot encoding of the GPU model (pd.get_dummies) is disabled; the column is simply dropped.
data.drop(columns=[data_object], inplace=True)
data.info()

In [None]:
# One-hot encode the operating system and swap the indicators in for the text column.
data_object = 'opsys'
opsys_dummies = pd.get_dummies(data[data_object], prefix=data_object, dtype=int)
data = pd.concat([data, opsys_dummies], axis=1).drop(columns=[data_object])
data.info()

In [None]:
def extract_weight(weight_string):
    """Return the first number found in *weight_string* as a float, or None if there is none."""
    found = re.search(r'(\d+(\.\d+)?)', weight_string)
    if found is None:
        return None
    return float(found.group(1))

# Replace the textual weight column with its numeric value.
data = data.assign(weight_kg=data['weight'].apply(extract_weight)).drop(columns=['weight'])

data.info()

In [None]:
data.head()

In [None]:
# Exchange the positions of 'inches' and 'price' in the column order.
columns = data.columns.tolist()

index_inches, index_price = (columns.index(name) for name in ('inches', 'price'))

columns[index_price], columns[index_inches] = columns[index_inches], columns[index_price]

data = data[columns]

In [None]:
data.info()

#### TRAIN


In [None]:
# Indicator columns removed before training.
# NOTE(review): presumably these categories do not occur in the test set — confirm.
features = ['company_Fujitsu', 'company_Mediacom', 'opsys_Android']
data = data.drop(features, axis=1)

In [None]:
data.info()

In [None]:
# Target vector first, then every remaining column as the feature matrix.
y = data['price']
X = data.drop(columns=['price'])


In [None]:
X.info()

In [None]:
X.describe()

In [None]:
# scaler = StandardScaler()
# X = scaler.fit_transform(X)

# X_df = pd.DataFrame(X, columns=data.columns[1:])
# X_df.describe()

In [None]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Report the shape of each piece of the split.
for label, part in (
    ('X_train', X_train),
    ('X_test', X_test),
    ('y_train', y_train),
    ('y_test', y_test),
):
    print(f'{label}:', part.shape)


In [None]:
# Ordinary least-squares baseline with default settings.
model = LinearRegression()

# Fit on the training part of the split.
model.fit(X_train, y_train)



In [None]:
# Score the model on the held-out rows.
y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Absolute Error: {mae}')
print(f'Mean Squared Error: {mse}')
print(f'Root Mean Squared Error: {rmse}')
print(f'R-squared: {r2}')


In [None]:
# Predictions for the held-out rows.
predicciones = model.predict(X_test)

print(predicciones)

#### PREDICT

In [None]:
# Load the rows to be scored for the submission.
data_test = pd.read_csv('./data/test.csv')

In [None]:
# Lower-case the column names, as was done for the training data.
data_test = data_test.rename(columns=str.lower)

In [None]:
data_test.info()

In [None]:
# One-hot encode the manufacturer of the test rows.
data_object = 'company'
company_dummies_test = pd.get_dummies(data_test[data_object], prefix=data_object, dtype=int)
data_test = pd.concat([data_test, company_dummies_test], axis=1).drop(columns=[data_object])
data_test.info()

In [None]:
# The product name is discarded without being encoded.
data_object = 'product'
data_test = data_test.drop(columns=[data_object])
data_test.info()

In [None]:
# One-hot encode the laptop type of the test rows.
data_object = 'typename'
typename_dummies_test = pd.get_dummies(data_test[data_object], prefix=data_object, dtype=int)
data_test = pd.concat([data_test, typename_dummies_test], axis=1).drop(columns=[data_object])
data_test.info()

In [None]:
def process_screen_resolution(resolution):
    """Turn a free-text screen description into numeric features.

    Args:
        resolution: Description such as
            ``"IPS Panel Full HD / Touchscreen 1920x1080"``.

    Returns:
        dict with the binary flags ``screen_4K``, ``screen_HD``,
        ``screen_Touchscreen``, ``screen_Retina`` and ``screen_Ultra`` (1 when
        the keyword occurs in the text, else 0), followed by ``screen_width``
        and ``screen_height`` in pixels (``None`` when no ``<W>x<H>`` pair of
        3-4 digit numbers is present).
    """
    # Plain substring checks, so 'HD' also fires for 'Full HD', 'Quad HD+' and
    # '4K Ultra HD'. The key order is kept stable for whoever expands these
    # dicts into columns.
    keywords = {
        'screen_4K': '4K',
        'screen_HD': 'HD',
        'screen_Touchscreen': 'Touchscreen',
        'screen_Retina': 'Retina',
        'screen_Ultra': 'Ultra',
    }
    result = {flag: int(word in resolution) for flag, word in keywords.items()}
    result['screen_width'] = None
    result['screen_height'] = None

    match = re.search(r'(\d{3,4})x(\d{3,4})', resolution)
    if match:
        width, height = match.groups()
        result['screen_width'] = int(width)
        result['screen_height'] = int(height)

    return result

# Parse every resolution string of the test rows, expand the dicts into
# columns and swap them in for the raw text column.
data_screen = pd.DataFrame(data_test['screenresolution'].apply(process_screen_resolution).tolist())

data_test = pd.concat([data_test, data_screen], axis=1).drop(columns=['screenresolution'])

data_test.info()

In [None]:
def extract_cpu_info(cpu_string):
    """Extract vendor flags and clock speed from a CPU description.

    Args:
        cpu_string: Text such as ``"Intel Core i5 7200U 2.5GHz"``.

    Returns:
        pandas.Series indexed by ``cpu_intel``, ``cpu_amd`` (1 when the vendor
        name occurs in the text, else 0) and ``cpu_ghz`` (the number directly
        preceding ``GHz``; missing when the text has no such number).
    """
    cpu_intel = int('Intel' in cpu_string)
    cpu_amd = int('AMD' in cpu_string)

    # The clock speed is parsed for every vendor: tying it to the Intel/AMD
    # checks left CPUs from other manufacturers without a frequency.
    match = re.search(r'(\d+\.?\d*)GHz', cpu_string)
    cpu_ghz = float(match.group(1)) if match else None

    return pd.Series([cpu_intel, cpu_amd, cpu_ghz], index=['cpu_intel', 'cpu_amd', 'cpu_ghz'])

# Spread the parsed CPU features over three new columns of the test frame.
cpu_columns = ['cpu_intel', 'cpu_amd', 'cpu_ghz']
data_test[cpu_columns] = data_test['cpu'].apply(extract_cpu_info)

# Store the two vendor flags as integers.
data_test = data_test.astype({'cpu_intel': int, 'cpu_amd': int})

data_test = data_test.drop(columns=['cpu'])

data_test.info()

In [None]:
def extract_ram(ram_string):
    """Return the integer that precedes ``GB`` in *ram_string*, or None if absent."""
    found = re.search(r'(\d+)GB', ram_string)
    return int(found.group(1)) if found else None

# Replace the textual RAM column of the test rows with its numeric size.
data_test = data_test.assign(ram_numeric=data_test['ram'].apply(extract_ram)).drop(columns=['ram'])

data_test.info()

In [None]:
def extract_memory_info(memory_string):
    """Summarise a storage description by drive type and capacity.

    Args:
        memory_string: Text such as ``"128GB SSD +  1TB HDD"`` or
            ``"1.0TB HDD"``.

    Returns:
        pandas.Series with the presence flags ``memory_HDD``, ``memory_SSD``,
        ``memory_Flash`` and the summed capacities ``memory_HDD_GB``,
        ``memory_SSD_GB``, ``memory_Flash_GB`` (terabytes counted as 1000 GB).
        Drives whose description matches none of the three types (for example
        a bare "Hybrid") are ignored.
    """
    memory_types = {
        'memory_HDD': 0,
        'memory_SSD': 0,
        'memory_Flash': 0
    }
    memory_sizes = {
        'memory_HDD_GB': 0,
        'memory_SSD_GB': 0,
        'memory_Flash_GB': 0
    }

    # The number may carry a decimal part ('1.0TB HDD'); matching digits only
    # would split it at the dot and record the drive as 0 GB.
    matches = re.findall(r'(\d+(?:\.\d+)?)([A-Za-z ]+)', memory_string)

    for number, description in matches:
        size = float(number)
        unit = description.strip().upper()

        if 'TB' in unit:
            size *= 1000
        # Capacities stay integers, as they were for whole-number inputs.
        size = int(round(size))

        if 'HDD' in unit:
            memory_types['memory_HDD'] = 1
            memory_sizes['memory_HDD_GB'] += size
        elif 'SSD' in unit:
            memory_types['memory_SSD'] = 1
            memory_sizes['memory_SSD_GB'] += size
        elif 'FLASH STORAGE' in unit:
            memory_types['memory_Flash'] = 1
            memory_sizes['memory_Flash_GB'] += size

    result = {**memory_types, **memory_sizes}
    return pd.Series(result)

# Attach the six storage features to the test frame and drop the raw description.
memory_columns = [
    'memory_HDD', 'memory_SSD', 'memory_Flash',
    'memory_HDD_GB', 'memory_SSD_GB', 'memory_Flash_GB',
]
data_test[memory_columns] = data_test['memory'].apply(extract_memory_info)

data_test = data_test.drop(columns=['memory'])

data_test.info()

In [None]:
data_object = 'gpu'
# One-hot encoding of the GPU model (pd.get_dummies) is disabled; the column is simply dropped.
data_test.drop(columns=[data_object], inplace=True)
data_test.info()

In [None]:
# One-hot encode the operating system of the test rows.
data_object = 'opsys'
opsys_dummies_test = pd.get_dummies(data_test[data_object], prefix=data_object, dtype=int)
data_test = pd.concat([data_test, opsys_dummies_test], axis=1).drop(columns=[data_object])
data_test.info()

In [None]:
def extract_weight(weight_string):
    """Return the first number found in *weight_string* as a float, or None if there is none."""
    found = re.search(r'(\d+(\.\d+)?)', weight_string)
    if found is None:
        return None
    return float(found.group(1))

# Replace the textual weight column of the test rows with its numeric value.
data_test = data_test.assign(weight_kg=data_test['weight'].apply(extract_weight)).drop(columns=['weight'])

data_test.info()

In [None]:
# Feature frame without the identifier; `data_test` itself keeps `id`, which
# the submission cell reads later.
data_test_copy = data_test.drop('id', axis=1)
# NOTE(review): a disabled snippet here added zero-filled 'company_Fujitsu',
# 'company_Mediacom' and 'opsys_Android' columns when absent from the test
# frame. Those columns are dropped from the training data, so re-enabling it
# would not make the two frames match.

In [None]:
data_test_copy.info()

In [None]:
# Train and test were one-hot encoded separately, so their indicator columns
# can differ. Rebuild the test matrix with exactly the columns (and order) the
# model was fitted on: categories missing from the test set become all-zero
# columns and categories unseen during training are discarded.
X_test = data_test_copy.reindex(columns=model.feature_names_in_, fill_value=0)

# Dropping incomplete rows would leave some ids without a prediction, so any
# missing value is filled with the median of that feature in the training split.
X_test = X_test.fillna(X_train.median())

X_test.info()

X_test.describe()

predicciones = model.predict(X_test)


In [None]:
print(predicciones)


In [None]:
# Pair each prediction with the id of the row it was computed from.
# `predicciones` follows the row order of `X_test`, so the ids are looked up
# through X_test's index; if rows were removed from X_test, the remaining
# prices still line up with their own ids.
df = pd.DataFrame({
    'ID': data_test.loc[X_test.index, 'id'].to_numpy(),
    'Price_euros': predicciones,
})
df.describe()

In [None]:
import csv
import os
from datetime import datetime

# Timestamp used to label this submission
current_datetime = datetime.now()

# File name carries the date and time down to the minute
filename = f"submission_dgerwig_{current_datetime.strftime('%Y_%m_%d__%H_%M')}.csv"

# Output directory, relative to the working directory
directory = "submissions"

# Create the directory if it doesn't exist
if not os.path.exists(directory):
    os.makedirs(directory)

# Full path for the file
filepath = os.path.join(directory, filename)


# Write the submission without the DataFrame index column
df.to_csv(filepath, index=False)


print(f"✅ File '{filepath}' generated successfully.")


#### EVALUATE


#### SUBMIT

In [None]:
import os
import glob

def get_most_recent_file(directory):
    """Return the most recently modified regular file directly inside *directory*.

    Args:
        directory: Path of the folder to inspect (relative or absolute).

    Returns:
        Absolute path of the newest file by modification time, or None when
        the folder is missing or holds no regular files.
    """
    # Ensure the directory path uses the correct separator
    directory = os.path.abspath(directory)
    # The wildcard also matches sub-directories; keep regular files only so a
    # folder is never reported as the latest "file".
    files = [
        path
        for path in glob.glob(os.path.join(directory, "*"))
        if os.path.isfile(path)
    ]
    if not files:
        return None
    return max(files, key=os.path.getmtime)

# Look up the newest submission and report it.
directory = "./submissions"
most_recent_file = get_most_recent_file(directory)

if most_recent_file is not None:
    print(f"Most recent file: {most_recent_file}")
else:
    print("No files found in the submissions directory.")
