In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [None]:
# Load dataset
path="/content/drive/MyDrive/Dataset/laptop.csv"
df = pd.read_csv(path)
df

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Company,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price
0,0,0.0,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37kg,71378.6832
1,1,1.0,Apple,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,47895.5232
2,2,2.0,HP,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,1.86kg,30636.0000
3,3,3.0,Apple,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,1.83kg,135195.3360
4,4,4.0,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37kg,96095.8080
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1298,1298,1298.0,Lenovo,2 in 1 Convertible,14,IPS Panel Full HD / Touchscreen 1920x1080,Intel Core i7 6500U 2.5GHz,4GB,128GB SSD,Intel HD Graphics 520,Windows 10,1.8kg,33992.6400
1299,1299,1299.0,Lenovo,2 in 1 Convertible,13.3,IPS Panel Quad HD+ / Touchscreen 3200x1800,Intel Core i7 6500U 2.5GHz,16GB,512GB SSD,Intel HD Graphics 520,Windows 10,1.3kg,79866.7200
1300,1300,1300.0,Lenovo,Notebook,14,1366x768,Intel Celeron Dual Core N3050 1.6GHz,2GB,64GB Flash Storage,Intel HD Graphics,Windows 10,1.5kg,12201.1200
1301,1301,1301.0,HP,Notebook,15.6,1366x768,Intel Core i7 6500U 2.5GHz,6GB,1TB HDD,AMD Radeon R5 M330,Windows 10,2.19kg,40705.9200


In [None]:
# Display basic info and first few rows
display(df.info())
display(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1303 entries, 0 to 1302
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Unnamed: 0.1      1303 non-null   int64  
 1   Unnamed: 0        1273 non-null   float64
 2   Company           1273 non-null   object 
 3   TypeName          1273 non-null   object 
 4   Inches            1273 non-null   object 
 5   ScreenResolution  1273 non-null   object 
 6   Cpu               1273 non-null   object 
 7   Ram               1273 non-null   object 
 8   Memory            1273 non-null   object 
 9   Gpu               1273 non-null   object 
 10  OpSys             1273 non-null   object 
 11  Weight            1273 non-null   object 
 12  Price             1273 non-null   float64
dtypes: float64(2), int64(1), object(10)
memory usage: 132.5+ KB


None

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Company,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price
0,0,0.0,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37kg,71378.6832
1,1,1.0,Apple,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,47895.5232
2,2,2.0,HP,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,1.86kg,30636.0
3,3,3.0,Apple,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,1.83kg,135195.336
4,4,4.0,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37kg,96095.808


In [24]:
# Handle missing values
df.dropna(inplace=True)

In [25]:
# Encode categorical variables
label_encoders = {}
for column in df.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    label_encoders[column] = le

In [26]:
# Define features and target
X = df.drop(columns=['Price'])  # Assuming 'Price' is the target variable
y = df['Price']

In [27]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [28]:
# Standardize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [29]:
# Train models
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(n_estimators=100, random_state=42)
}

results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    results[name] = {"MAE": mae, "MSE": mse, "R2": r2}

In [30]:
# Display results
display(pd.DataFrame(results))

Unnamed: 0,Linear Regression,Random Forest,Gradient Boosting
MAE,20287.76,10135.3,10618.75
MSE,763627700.0,292273500.0,256918000.0
R2,0.4736067,0.7985264,0.8228981


In [31]:
# Hyperparameter tuning for best model (Random Forest as an example)
param_grid = {"n_estimators": [50, 100, 200], "max_depth": [None, 10, 20]}
grid_search = GridSearchCV(RandomForestRegressor(random_state=42), param_grid, cv=5, scoring='r2')
grid_search.fit(X_train, y_train)

In [32]:
# Display best parameters and best score
best_model = grid_search.best_estimator_
display(grid_search.best_params_)
display(grid_search.best_score_)

{'max_depth': None, 'n_estimators': 50}

0.7888811894230755

In [33]:
# Final evaluation
y_final_pred = best_model.predict(X_test)
final_r2 = r2_score(y_test, y_final_pred)
display(f"Final R2 Score: {final_r2}")

'Final R2 Score: 0.7945459004469102'

In [34]:
# Answering key questions

# 1. Which features have the most significant impact on laptop prices?
feature_importances = best_model.feature_importances_
feature_names = X.columns
feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances}).sort_values(by='Importance', ascending=False)
display(feature_importance_df.head())

Unnamed: 0,Feature,Importance
7,Ram,0.339885
6,Cpu,0.264443
11,Weight,0.124501
9,Gpu,0.061402
5,ScreenResolution,0.040515


In [35]:
# 2. Can the model accurately predict the prices of laptops from lesser-known brands?
y_pred_unknown = best_model.predict(X_test)
display("Performance on lesser-known brands:", mean_absolute_error(y_test, y_pred_unknown))

'Performance on lesser-known brands:'

10279.910688

In [36]:
# 3. Does the brand of the laptop significantly influence its price?
brand_importance = feature_importance_df[feature_importance_df['Feature'].str.contains("Brand", case=False)]
display("Brand impact on price:", brand_importance)

'Brand impact on price:'

Unnamed: 0,Feature,Importance


In [37]:
# 4. How well does the model perform on laptops with high-end specifications compared to budget laptops?
high_end_threshold = np.percentile(y, 75)
budget_threshold = np.percentile(y, 25)
y_high_end_pred = best_model.predict(X_test[y_test > high_end_threshold])
y_budget_pred = best_model.predict(X_test[y_test < budget_threshold])
display("High-end Laptop Performance:", mean_absolute_error(y_test[y_test > high_end_threshold], y_high_end_pred))
display("Budget Laptop Performance:", mean_absolute_error(y_test[y_test < budget_threshold], y_budget_pred))

'High-end Laptop Performance:'

19181.589779478254

'Budget Laptop Performance:'

4048.7756742295087

In [38]:
# 5. What are the limitations and challenges in predicting laptop prices accurately?
display("Limitations include variations in pricing due to promotions, regional differences, and new technological advancements not captured in historical data.")

'Limitations include variations in pricing due to promotions, regional differences, and new technological advancements not captured in historical data.'

In [39]:
# 6. How does the model perform when predicting the prices of newly released laptops not present in the training dataset?
new_laptop_features = np.random.rand(1, X_train.shape[1])  # Simulating new laptop data
y_new_pred = best_model.predict(new_laptop_features)
display("Predicted price for a new laptop:", y_new_pred)

'Predicted price for a new laptop:'

array([57003.536736])