In [200]:

import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt

import joblib
from sklearn.preprocessing import LabelEncoder


from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression


from sklearn.metrics import (r2_score, mean_absolute_error, mean_squared_error, root_mean_squared_error)



In [201]:
#I read my csv file using encoding="latin1" to decode it
df = pd.read_csv('../dataset/laptop_price.csv', encoding="latin1")

In [202]:
# Drop columns
df = df.drop(['laptop_ID'], axis=1)
# Dropping laptop_ID because it's similar to the index.
print(df.columns.tolist())
# data.head(1)

['Company', 'Product', 'TypeName', 'Inches', 'ScreenResolution', 'Cpu', 'Ram', 'Memory', 'Gpu', 'OpSys', 'Weight', 'Price_euros']


In [203]:
df['Ram'] = df['Ram'].str.replace('GB', '').astype(int)
df['Ram'].head()

0     8
1     8
2     8
3    16
4     8
Name: Ram, dtype: int64

In [204]:
df['Weight'] = df['Weight'].str.replace('kg', '').astype(float)
df['Weight'].head()

0    1.37
1    1.34
2    1.86
3    1.83
4    1.37
Name: Weight, dtype: float64

In [205]:
# Create new columns with default 0
df['SSD'] = 0
df['HDD'] = 0
df['Hybrid'] = 0
df['Flash_Storage'] = 0

import re

# Function to convert memory strings to numbers.
def convert_memory(mem):
    mem = str(mem)
    ssd = hdd = hybrid = flash = 0  # Start with 0 for all storage types

    # Split by '+'
    parts = mem.split('+')
    for part in parts:
        part = part.strip()

        # Extract numeric size
        size_match = re.search(r'(\d+)', part)
        size = int(size_match.group(1)) if size_match else 0

        # Convert TB → GB
        if "TB" in part:
            size *= 1024

        # Assign to storage type
        if "SSD" in part:
            ssd += size
        elif "HDD" in part:
            hdd += size
        elif "Hybrid" in part:
            hybrid += size
        elif "Flash" in part or "Flash Storage" in part:
            flash += size

    return pd.Series([ssd, hdd, hybrid, flash])

# Apply function
df[['SSD', 'HDD', 'Hybrid', 'Flash_Storage']] = df['Memory'].apply(convert_memory)
df = df.drop('Memory', axis=1)
df.head()

Unnamed: 0,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Gpu,OpSys,Weight,Price_euros,SSD,HDD,Hybrid,Flash_Storage
0,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8,Intel Iris Plus Graphics 640,macOS,1.37,1339.69,128,0,0,0
1,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8,Intel HD Graphics 6000,macOS,1.34,898.94,0,0,0,128
2,HP,250 G6,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8,Intel HD Graphics 620,No OS,1.86,575.0,256,0,0,0
3,Apple,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16,AMD Radeon Pro 455,macOS,1.83,2537.45,512,0,0,0
4,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8,Intel Iris Plus Graphics 650,macOS,1.37,1803.6,256,0,0,0


In [206]:
# Touchscreen column
df['Touchscreen'] = df['ScreenResolution'].apply(lambda x: 1 if 'Touchscreen' in x else 0)

# Extract X and Y resolution
df['X_res'] = df['ScreenResolution'].str.split('x').str[0].str.extract("(\d+)").astype(int)
df['Y_res'] = df['ScreenResolution'].str.split('x').str[1].str.extract("(\d+)").astype(int)

# Drop original ScreenResolution column
df = df.drop('ScreenResolution', axis=1)
df.head()

  df['X_res'] = df['ScreenResolution'].str.split('x').str[0].str.extract("(\d+)").astype(int)
  df['Y_res'] = df['ScreenResolution'].str.split('x').str[1].str.extract("(\d+)").astype(int)


Unnamed: 0,Company,Product,TypeName,Inches,Cpu,Ram,Gpu,OpSys,Weight,Price_euros,SSD,HDD,Hybrid,Flash_Storage,Touchscreen,X_res,Y_res
0,Apple,MacBook Pro,Ultrabook,13.3,Intel Core i5 2.3GHz,8,Intel Iris Plus Graphics 640,macOS,1.37,1339.69,128,0,0,0,0,2560,1600
1,Apple,Macbook Air,Ultrabook,13.3,Intel Core i5 1.8GHz,8,Intel HD Graphics 6000,macOS,1.34,898.94,0,0,0,128,0,1440,900
2,HP,250 G6,Notebook,15.6,Intel Core i5 7200U 2.5GHz,8,Intel HD Graphics 620,No OS,1.86,575.0,256,0,0,0,0,1920,1080
3,Apple,MacBook Pro,Ultrabook,15.4,Intel Core i7 2.7GHz,16,AMD Radeon Pro 455,macOS,1.83,2537.45,512,0,0,0,0,2880,1800
4,Apple,MacBook Pro,Ultrabook,13.3,Intel Core i5 3.1GHz,8,Intel Iris Plus Graphics 650,macOS,1.37,1803.6,256,0,0,0,0,2560,1600


In [207]:
# CPU brand
df['Cpu_brand'] = df['Cpu'].apply(lambda x: x.split()[0])
df = df.drop('Cpu', axis=1)

# GPU brand
df['Gpu_brand'] = df['Gpu'].apply(lambda x: x.split()[0])
df = df.drop('Gpu', axis=1)

df.head()

Unnamed: 0,Company,Product,TypeName,Inches,Ram,OpSys,Weight,Price_euros,SSD,HDD,Hybrid,Flash_Storage,Touchscreen,X_res,Y_res,Cpu_brand,Gpu_brand
0,Apple,MacBook Pro,Ultrabook,13.3,8,macOS,1.37,1339.69,128,0,0,0,0,2560,1600,Intel,Intel
1,Apple,Macbook Air,Ultrabook,13.3,8,macOS,1.34,898.94,0,0,0,128,0,1440,900,Intel,Intel
2,HP,250 G6,Notebook,15.6,8,No OS,1.86,575.0,256,0,0,0,0,1920,1080,Intel,Intel
3,Apple,MacBook Pro,Ultrabook,15.4,16,macOS,1.83,2537.45,512,0,0,0,0,2880,1800,Intel,AMD
4,Apple,MacBook Pro,Ultrabook,13.3,8,macOS,1.37,1803.6,256,0,0,0,0,2560,1600,Intel,Intel


In [208]:
categorical_cols = ['Company', 'Product', 'TypeName', 'OpSys', 'Cpu_brand', 'Gpu_brand']
numerical_cols = ['Inches','Ram','Weight','SSD','HDD','Hybrid','Flash_Storage','Touchscreen','X_res','Y_res']
target_col = 'Price_euros'


In [209]:
from sklearn.preprocessing import OneHotEncoder

# Initialize encoder
encoder = OneHotEncoder(sparse_output=False, drop='first')  # drop='first' avoids dummy variable trap

# Fit and transform categorical columns
encoded_cats = encoder.fit_transform(df[categorical_cols])

# Get the new column names
encoded_col_names = encoder.get_feature_names_out(categorical_cols)

# Convert to DataFrame
encoded_df = pd.DataFrame(encoded_cats, columns=encoded_col_names)

# Combine with numerical columns
final_df = pd.concat([df[numerical_cols], encoded_df], axis=1)

# Target
y = df[target_col]


In [210]:
df['Price_euros'] = pd.to_numeric(df['Price_euros'], errors='coerce')
df = df.dropna()  # Drop rows with missing values
df.head()

Unnamed: 0,Company,Product,TypeName,Inches,Ram,OpSys,Weight,Price_euros,SSD,HDD,Hybrid,Flash_Storage,Touchscreen,X_res,Y_res,Cpu_brand,Gpu_brand
0,Apple,MacBook Pro,Ultrabook,13.3,8,macOS,1.37,1339.69,128,0,0,0,0,2560,1600,Intel,Intel
1,Apple,Macbook Air,Ultrabook,13.3,8,macOS,1.34,898.94,0,0,0,128,0,1440,900,Intel,Intel
2,HP,250 G6,Notebook,15.6,8,No OS,1.86,575.0,256,0,0,0,0,1920,1080,Intel,Intel
3,Apple,MacBook Pro,Ultrabook,15.4,16,macOS,1.83,2537.45,512,0,0,0,0,2880,1800,Intel,AMD
4,Apple,MacBook Pro,Ultrabook,13.3,8,macOS,1.37,1803.6,256,0,0,0,0,2560,1600,Intel,Intel


In [211]:
df.to_csv("../dataset/laptops_cleaned.csv", index=False)