In [173]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, root_mean_squared_error, r2_score

import matplotlib.pyplot as plt
import seaborn as sns

## Load and Explore the data

In [174]:
df = pd.read_csv('https://github.com/Mitko208/PythonTraining/raw/main/Lecture40/Homework/laptop_price.csv', encoding='latin1')
df.set_index('laptop_ID', inplace=True)
df.head(2)

Unnamed: 0_level_0,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price_euros
laptop_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37kg,1339.69
2,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,898.94


In [175]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1303 entries, 1 to 1320
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Company           1303 non-null   object 
 1   Product           1303 non-null   object 
 2   TypeName          1303 non-null   object 
 3   Inches            1303 non-null   float64
 4   ScreenResolution  1303 non-null   object 
 5   Cpu               1303 non-null   object 
 6   Ram               1303 non-null   object 
 7   Memory            1303 non-null   object 
 8   Gpu               1303 non-null   object 
 9   OpSys             1303 non-null   object 
 10  Weight            1303 non-null   object 
 11  Price_euros       1303 non-null   float64
dtypes: float64(2), object(10)
memory usage: 132.3+ KB


In [176]:
df.shape

(1303, 12)

In [177]:
# check for missing values
df.isna().sum().sum()

0

In [178]:
print('Numerical columns: ', df.select_dtypes('number').columns)
print('Categorical columns: ', df.select_dtypes('object').columns)

Numerical columns:  Index(['Inches', 'Price_euros'], dtype='object')
Categorical columns:  Index(['Company', 'Product', 'TypeName', 'ScreenResolution', 'Cpu', 'Ram',
       'Memory', 'Gpu', 'OpSys', 'Weight'],
      dtype='object')


In [179]:
# stats about all numerical data
df.describe()

Unnamed: 0,Inches,Price_euros
count,1303.0,1303.0
mean,15.017191,1123.686992
std,1.426304,699.009043
min,10.1,174.0
25%,14.0,599.0
50%,15.6,977.0
75%,15.6,1487.88
max,18.4,6099.0


In [180]:
# stats about categorical data
df.describe(include=[object]).T.sort_values(by='unique')

Unnamed: 0,count,unique,top,freq
TypeName,1303,6,Notebook,727
Ram,1303,9,8GB,619
OpSys,1303,9,Windows 10,1072
Company,1303,19,Dell,297
Memory,1303,39,256GB SSD,412
ScreenResolution,1303,40,Full HD 1920x1080,507
Gpu,1303,110,Intel HD Graphics 620,281
Cpu,1303,118,Intel Core i5 7200U 2.5GHz,190
Weight,1303,179,2.2kg,121
Product,1303,618,XPS 13,30


### Plot correlations

In [181]:
df[['Inches','Price_euros']].corr()

Unnamed: 0,Inches,Price_euros
Inches,1.0,0.068197
Price_euros,0.068197,1.0


## Feature Engineering 

### Convert Categorical => Numerical

In [182]:
# use only numerical for featiures
y = df['Price_euros']
X = df[['Inches','Ram']]
X.head(2)

Unnamed: 0_level_0,Inches,Ram
laptop_ID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,13.3,8GB
2,13.3,8GB


In [183]:
X.loc[:,'Ram']

laptop_ID
1        8GB
2        8GB
3        8GB
4       16GB
5        8GB
        ... 
1316     4GB
1317    16GB
1318     2GB
1319     6GB
1320     4GB
Name: Ram, Length: 1303, dtype: object

In [184]:
# TODO: fix warning
X['Ram'] = X['Ram'].str.replace('GB', '').astype(np.int8)
X.head(2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Ram'] = X['Ram'].str.replace('GB', '').astype(np.int8)


Unnamed: 0_level_0,Inches,Ram
laptop_ID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,13.3,8
2,13.3,8


In [185]:
# One Hot Encode 'TypeName'
type_encoded = pd.get_dummies(df['TypeName'])
types_df = type_encoded.astype(np.int_)

print(X.shape)
print(types_df.shape)
X = pd.concat([X, types_df], axis=1)
X.head(2)

(1303, 2)
(1303, 6)


Unnamed: 0_level_0,Inches,Ram,2 in 1 Convertible,Gaming,Netbook,Notebook,Ultrabook,Workstation
laptop_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,13.3,8,0,0,0,0,1,0
2,13.3,8,0,0,0,0,1,0


In [186]:
# 'OpSys': => 'Is_Windows_10
# 'Windows 10' => 1
# 'Non Windows 10'=>0
X['Is_Windows_10'] = df['OpSys'].apply(lambda x: x=='Windows 10')
X.head(3)

Unnamed: 0_level_0,Inches,Ram,2 in 1 Convertible,Gaming,Netbook,Notebook,Ultrabook,Workstation,Is_Windows_10
laptop_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,13.3,8,0,0,0,0,1,0,False
2,13.3,8,0,0,0,0,1,0,False
3,15.6,8,0,0,0,1,0,0,False


## Split test/train data

In [187]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
print(f'X_train shape: {X_train.shape}')
print(f'X_test shape: {X_test.shape}')
print(f'y_train shape: {y_train.shape}')
print(f'y_test shape: {y_test.shape}')

X_train shape: (977, 9)
X_test shape: (326, 9)
y_train shape: (977,)
y_test shape: (326,)


## Fit the model

In [188]:
from sklearn.linear_model import LinearRegression

In [189]:
lm = LinearRegression().fit(X_train,y_train)

In [190]:
print(lm.intercept_)
print(lm.coef_)

# y = lm.intercept_ + lm.coef_*X

827.5648542600959
[ -22.19148497   92.41327311  -36.29591507 -100.77916254 -380.55745794
 -362.17762772   84.88815589  794.92200738   68.68704271]


In [191]:
# df.head(2)

## Plot regression line

In [192]:
# fig, ax = plt.subplots()
# ax.scatter(x=X_train['Inches'], y=y_train)
# y = lm.intercept_ + lm.coef_*X_train['Inches']

# ax.plot(X_train['Inches'], y)

In [193]:
# sns.regplot(x='Inches', y='Price_euros', data=train_df)

# # Display the plot
# plt.title('Regression Line with Scatter for Inches vs. Price')
# plt.xlabel('Inches')
# plt.ylabel('Price ($)')
# plt.show()

In [194]:
y_pred = lm.predict(X_test)

In [195]:
# Predictions on training data
y_train_pred = lm.predict(X_train)

# Predictions on testing data
y_test_pred = lm.predict(X_test)

# Calculate metrics for training data
train_mae = mean_absolute_error(y_train, y_train_pred)
train_mse = mean_squared_error(y_train, y_train_pred)
train_rmse = np.sqrt(train_mse)
train_r2 = r2_score(y_train, y_train_pred)

# Calculate metrics for testing data
test_mae = mean_absolute_error(y_test, y_test_pred)
test_mse = mean_squared_error(y_test, y_test_pred)
test_rmse = np.sqrt(test_mse)
test_r2 = r2_score(y_test, y_test_pred)

# Print metrics
print("Training Data Metrics:")
print(f"MAE: {train_mae:.2f}")
print(f"MSE: {train_mse:.2f}")
print(f"RMSE: {train_rmse:.2f}")
print(f"R-squared: {train_r2:.2f}")

print("\nTesting Data Metrics:")
print(f"MAE: {test_mae:.2f}")
print(f"MSE: {test_mse:.2f}")
print(f"RMSE: {test_rmse:.2f}")
print(f"R-squared: {test_r2:.2f}")

Training Data Metrics:
MAE: 286.67
MSE: 154224.53
RMSE: 392.71
R-squared: 0.67

Testing Data Metrics:
MAE: 308.12
MSE: 197350.58
RMSE: 444.24
R-squared: 0.63


In [196]:
# Calculate the Range and Mean of the Target Variable:
print(f'MAE: {test_mae}')

y_true = df['Price_euros']

# Calculate the range and mean of the target variable
target_range = np.max(y_true) - np.min(y_true)
target_mean = np.mean(y_true)

# Relative MAE compared to range
relative_mae_range = test_mae / target_range
print(f"Relative MAE to range: {relative_mae_range:.4f}")

# Relative MAE compared to mean
relative_mae_mean = test_mae / target_mean
print(f"Relative MAE to mean: {relative_mae_mean:.4f}")

# Evaluate against a baseline model (mean predictor)
baseline_prediction = np.mean(y_true)
baseline_mae = mean_absolute_error(y_true, np.full_like(y_true, baseline_prediction))
print(f"Baseline MAE: {baseline_mae:.4f}")

# Comparing MAE with baseline
improvement_over_baseline = (baseline_mae - test_mae) / baseline_mae
print(f"Improvement over baseline: {improvement_over_baseline:.4%}")

MAE: 308.12021962573357
Relative MAE to range: 0.0520
Relative MAE to mean: 0.2742
Baseline MAE: 534.8909
Improvement over baseline: 42.3957%
