In [1]:
import pandas as pd
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
warnings.filterwarnings('ignore')

In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [None]:
df = pd.read_csv('./Data/laptop_dataset.csv')
df.head(5)

In [None]:
df.shape

In [None]:
df[df.duplicated()]

In [6]:
df.drop_duplicates(inplace= True)

In [None]:
df['Price (Rs)'].describe()

In [None]:
df.isnull().mean()[df.isnull().mean() > 0.3] * 100

In [None]:
(df.isnull().mean()[df.isnull().mean() > 0.3] * 100).count()

In [10]:
# Share of missing values per column; columns above 30% are considered too sparse to keep.
missing_share = df.isnull().mean()
sparse_columns = missing_share[missing_share > 0.3]
col_name = sparse_columns.index
df2 = df.drop(columns=col_name)

In our dataset, 33 columns have more than 30% missing values. These columns primarily contain optional or rarely filled-in specs (like VGA ports, rear cameras, or lockports), which are not critical for predicting laptop price. Hence, we will drop them to simplify the dataset.

However, there's an exception:

-Graphics Memory

--Although this column has more than 60% missing values, in this case, NaN means that the laptop does not have a dedicated GPU.

--So, instead of dropping this column, we will replace all NaN values with 0, indicating no graphics memory.

In [None]:
df2.info()

In [None]:
target_value_null = df2[df2['Price (Rs)'].isna()]
len(target_value_null)

In [13]:
df2 = df2.dropna(subset=['Price (Rs)'])

There are 21 rows in our dataset where the target variable Price (Rs) is missing. These entries are not useful for training a machine learning model, as we don't know what we're trying to predict.

However, they can still be useful later (e.g. for making predictions on unknown data). So we’ll:

- Save them separately in a variable called target_value_null.

- Drop them from the main training dataset.

##########################################################################
# EDA For OS

In [None]:
df2['Operating System'].isna().sum()

In [None]:
df2['Operating System'].nunique()

In [None]:
df2['Operating System'].unique()

In [17]:
def operating_system(os):
    """Collapse a detailed operating-system label into a coarse OS family.

    Parameters
    ----------
    os : str or NaN
        Raw value of the 'Operating System' column, e.g. 'Windows 11 Home Basic'.

    Returns
    -------
    str
        One of "Win 10", "Win 11", "DOS", "Linux", "Chrome OS", "MAC OS",
        or "Other" for any value that is not listed below (including missing values).
    """
    # Labels are matched verbatim, so any spelling or case variant that is not listed
    # here falls through to "Other".
    os_families = {
        "Win 10": (
            'Windows 10 Home Basic', 'Windows 10 Professional',
            'Windows 10', 'Windows 10 Home Premium',
        ),
        "Win 11": (
            'Windows 11 Home Basic', 'Windows 11',
            'Windows 11 Professional', 'Windows 11 Home Premium',
        ),
        "DOS": ('DOS', 'DOS Professional', 'DOS Home Basic'),
        "Linux": ('Linux', 'Ubuntu'),
        # Previously returned the misspelled label "Chorme OS".
        "Chrome OS": ('Google Chrome', 'Google Chrome Home Basic'),
        "MAC OS": (
            'macOS Sonoma', 'macOS Catalina', 'macOS Big Sur', 'macOS Mojave',
            'macOS Monterey Home Basic', 'macOS Monterey', 'macOS Sequoia',
            'macOS Ventura', 'macOS High Sierra', 'macOS Sierra',
            'MAC OS X El Capitan',
        ),
    }
    for family, labels in os_families.items():
        if os in labels:
            return family
    return "Other"

In [18]:
# Derive the coarse, lower-cased OS family, then retire the verbose source column.
os_family = df2['Operating System'].apply(operating_system)
df2['OS'] = os_family.str.lower()
df2 = df2.drop(columns=['Operating System'])

In [None]:
plt.figure(figsize=(10, 5))
sns.boxplot(data=df2, x='OS', y='Price (Rs)')
plt.xticks(rotation=45)
plt.title('Price Distribution by Operating System')
plt.ylabel('Price (Rs)')
plt.xlabel('Operating System')
plt.tight_layout()
plt.show()

Although the original Operating System column had no missing values, it contained excessively detailed version info (e.g., Windows 10 Home Basic, Windows 11 Pro, etc.) that isn't typically a major concern for general buyers. To simplify:

We created a new column called OS, which groups similar entries under clean categories like:

Win 10, Win 11, MAC OS, DOS, Linux, etc.

This allows us to better analyze price trends by OS type.

Insights from the Boxplot:

Windows 10 and Windows 11 show a very wide price range, from entry-level laptops to premium models—this reflects their dominance across different price segments.

MAC OS generally commands a higher average price, with less variation.

Linux, Chrome OS, and DOS laptops tend to cluster at the lower end of the price spectrum.

# OS EDA End

#############################################################
# EDA For Weights

In [None]:
df2.Weight.isnull().sum()

In [21]:
df2['Weight'] = df2['Weight'].str.split(' ').str[0]
df2['Weight'] = df2['Weight'].astype('float32')

In [None]:
df2['Weight'].describe()

In [None]:
plt.hist(df2['Weight'])
plt.show()

In [None]:
sns.boxplot(df2['Weight'])

In [25]:
# Impute missing weights with the column median.
# The result is assigned back to the column instead of calling fillna(inplace=True) on
# df2['Weight']: the in-place form is a chained assignment, which pandas deprecates and
# which leaves df2 unchanged when Copy-on-Write is enabled.
df2['Weight'] = df2['Weight'].fillna(df2['Weight'].median())

In [None]:
df2.Weight.isnull().sum()

The Weight column contains 306 missing values. Since there are a few extreme outliers (very heavy laptops), using the mean could be misleading.

Instead, we chose to use the median to impute missing weights. The median is more robust to outliers and better reflects the central tendency of the data.

Also, from a user’s perspective, purchase decisions are typically based more on model configuration and performance rather than small differences in weight, making median imputation a reasonable choice.

# EDA Weight End

####################################################
# EDA for Display

In [None]:
df2['Display Size'].isna().sum()

In [None]:
df2[df2['Display Size'].isna()]

In [29]:
df2['Display Size'] = df2['Display Size'].str.split(" ").str[0]
df2['Display Size'] = df2['Display Size'].astype('float32')

In [30]:
df2.loc[[797, 1426, 7974], 'Display Size'] = 15.6

In [None]:
df2['Display Size'].isna().sum()

We found 3 rows with missing values in the Display Size column. Instead of using statistical imputation (like mean or median), we manually annotated these values after verifying the actual display sizes from reliable web sources.

Additionally, we cleaned the Display Size column by extracting only the numeric value, removing the "Inches" and any accompanying text. This ensures that the feature is ready for numerical analysis and modeling.

# EDA Display End

#################################################################
# EDA For RAM Capacity

In [None]:
df2['Capacity'].isna().sum()

In [33]:
df2['Capacity'] = df2['Capacity'].str.split(' ').str[0]
df2['Capacity'] = df2['Capacity'].astype('int32')

In [34]:
df2.rename(columns={'Capacity': 'RAM Capacity'}, inplace=True)

In [None]:
plt.figure(figsize=(10, 6))
sns.countplot(x='RAM Capacity', data=df2, palette='viridis', order=df2['RAM Capacity'].value_counts().sort_values(ascending=False).index)
plt.title('Distribution of RAM Capacity')
plt.xlabel('RAM Capacity (in GB)')
plt.ylabel('Number of Laptops')
plt.grid(axis='y')
plt.tight_layout()
plt.show()

In [None]:
plt.figure(figsize=(8, 6))
sns.boxplot(x=df2['RAM Capacity'], y=df2['Price (Rs)'], color='lightgreen')
plt.title('Box Plot of RAM Capacity')
plt.xlabel('RAM Capacity (in GB)')
plt.grid(axis='x')
plt.tight_layout()
plt.show()

No missing values are present in the RAM Capacity column (previously named Capacity, renamed for better readability). The most common RAM configurations are 8GB and 16GB.

As RAM capacity increases, the price of laptops tends to rise, indicating that higher RAM is associated with more premium or high-performance devices. Outliers are observed in higher RAM categories (like 32GB, 64GB, 128GB), likely representing gaming laptops or professional workstations.

# EDA for RAM Capacity End

###############################################################
# EDA for Processor

In [None]:
df2['Processor'].unique()[:5]

In [38]:
df2['Processor_Brand'] = df2['Processor'].str.extract(r'(Intel|AMD|Apple|MediaTek|Mediatek|Qualcomm|intel|inte|Microsoft|Inte)', expand=False)
df2.loc[df2['Processor'].str.contains('APU', case=False, na=False), 'Processor_Brand'] = 'AMD'

In [None]:
df2['Processor_Brand'].isna().sum()

In [None]:
df2[df2['Processor_Brand'].isna()]

In [41]:
df2['Processor_Brand'] = df2['Processor_Brand'].str.lower()

In [42]:
# Normalise the truncated spelling captured by the extraction regex.
# The column was lower-cased in the previous cell, so at this point only the 'inte' key
# can still match; the 'Inte' and 'intel' entries are no-ops.
df2['Processor_Brand'] = df2['Processor_Brand'].replace({'inte': 'intel', 'Inte': 'intel', 'intel': 'intel'})

In [None]:
df2['Processor_Brand'].unique()

During the data exploration process, we initially identified three major processor brands: Intel, AMD, and Apple. To ensure completeness, we continued inspecting the 'Processor_Brand' column for missing values. As we iteratively handled and filled these values, the count of null entries gradually reduced. Once the number of missing values reached zero, it confirmed that no additional processor brands were present in the dataset beyond the ones already captured.

In [44]:
# Pull a coarse processor-series label out of the free-text 'Processor' column.
# The first alternative that matches at the earliest position wins; rows that match none
# of the alternatives get NaN.
# NOTE(review): 'Xenon' may be intended as Intel 'Xeon' — confirm against the raw values.
df2['Processor_Series'] = df2['Processor'].str.extract(
    r'(Quad\sCore|Quad-Core|Octa Core|Kompanio|Athlon|Pentium|Core 5|Core 7|Core 3|Core Ultra 5|Celeron|Dual\sCore|Dual-Core|Ultra 7|Ultra 9|Ryzen AI 9|Snapdragon|Xenon|m3|' +
    r'i[3579]|I[3579]|' +
    r'Ryzen\s[3579]|' +
    r'M\d(?:\s(?:Pro|Max|Ultra))?)',
    expand=False
)

In [45]:
# Manual corrections for two rows, written by index label.
# 'Processor_Brand' has already been lower-cased by this point, so the brand is written in
# lower case; the previous 'Intel' created a second category next to 'intel'.
# 'Processor_Series' is lower-cased in a later cell, so its casing here does not matter.
df2.loc[1588, 'Processor_Series'] = 'Quad Core'
df2.loc[3709, 'Processor_Brand'] = 'intel'
df2.loc[3709, 'Processor_Series'] = 'i5'

In [None]:
df2['Processor_Series'].isna().sum()

In [None]:
df2[df2['Processor_Series'].isna()]

In [48]:
df2 = df2[df2['Processor_Series'].notna()]

In [None]:
df2['Processor_Series'].unique()

In [None]:
# Canonical spellings for series labels that the extraction regex captures in several variants.
mapping = {
    'Dual-Core': 'Dual Core',
    'Quad-Core': 'Quad Core',
    'Core Ultra 5': 'Ultra 5',
    'Core 7': 'i7',
    # Added for consistency: 'Core 3' and 'Core 7' were folded into i3 / i7 while
    # 'Core 5' was left behind as a separate category.
    'Core 5': 'i5',
    'Core 3': 'i3'
}

df2['Processor_Series'] = df2['Processor_Series'].replace(mapping)
df2['Processor_Series'] = df2['Processor_Series'].str.lower()
df2['Processor_Series'].unique()

To extract the Processor Series, we applied a similar approach as used for the Processor Brand. Both of these columns are derived through feature extraction from the original Processor column. Additionally, we performed manual annotation by researching reliable sources to correctly assign the processor brand and series where the automated extraction failed. General preprocessing steps were applied to ensure consistency across the dataset.

We also dropped the 4 rows for which no processor series could be extracted (their `Processor_Series` was still missing after the manual fixes). On inspection, these rows either contained outdated HDD storage or had SSD capacities below 128GB, which fall outside the scope of modern laptop configurations considered in our analysis.

# EDA for Processor End

##################################################################################
# EDA For RAM

In [None]:
df2['RAM Type'].isna().sum()

In [None]:
df2['RAM Type'].unique()

In [None]:
avg_price_by_ram = df2.groupby('RAM Type')['Price (Rs)'].mean().sort_values(ascending=False)
avg_price_by_ram

In [None]:
plt.figure(figsize=(10, 6))
avg_price_by_ram.plot(kind='bar')
plt.title('Average Laptop Price by RAM Type')
plt.xlabel('RAM Type')
plt.ylabel('Average Price (₹)')
plt.xticks(rotation=45)
plt.grid(axis='y')
plt.tight_layout()
plt.show()

In [55]:
def infer_ram_type_by_price(price):
    """Guess a RAM type from a laptop price using fixed price bands.

    Parameters
    ----------
    price : float
        Laptop price in rupees.

    Returns
    -------
    str
        The label of the highest band whose lower bound is <= ``price``.
        Anything below 40000 — and any value for which every comparison is
        false, such as NaN — yields 'DDR3'.

    NOTE(review): this derives a feature from the prediction target, so imputed
    rows carry price information into 'RAM Type' — confirm this is acceptable.
    """
    # (inclusive lower bound, label), ordered from the most to the least expensive band.
    price_floors = (
        (135000, 'DDR5'),
        (120000, 'LPDDR5X'),
        (105000, 'LPDDR3'),
        (95000, 'Unified Memory'),
        (75000, 'LPDDR5'),
        (60000, 'LPDDR4X'),
        (50000, 'DDR4'),
        (40000, 'LPDDR4'),
    )
    for floor, ram_type in price_floors:
        if price >= floor:
            return ram_type
    return 'DDR3'

In [56]:
# Keep every known RAM type; where it is missing, fall back to the price-band guess.
inferred_ram_type = df2['Price (Rs)'].apply(infer_ram_type_by_price)
ram_type_known = df2['RAM Type'].notna()
df2['RAM Type'] = df2['RAM Type'].where(ram_type_known, inferred_ram_type)

In [None]:
df2['RAM Type'] = df2['RAM Type'].str.lower()
df2['RAM Type'].isna().sum()

We observed 170 missing values in the 'RAM Type' column. To impute these, we used the average laptop price corresponding to each RAM type as a reference.

# EDA RAM End

############################################
# EDA For Graphics

In [None]:
# Count plot of the ten most frequent graphic processors.
# The chart is drawn on its own figure: with the inline backend a figure is rendered when
# its cell finishes, so a subplot grid started here cannot be completed by a later cell.
top10_gpu = df2['Graphic Processor'].value_counts().nlargest(10).index

fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(data=df2[df2['Graphic Processor'].isin(top10_gpu)],
              y='Graphic Processor', order=top10_gpu, ax=ax)
ax.set_title('Top 10 Graphic Processors')
ax.set_xlabel('Count')
fig.tight_layout()
plt.show()

In [59]:
df2['Graphics Memory'] = df['Graphics Memory']

In [None]:
# Count plot of the ten most frequent raw graphics-memory values (before cleaning).
# Drawn on its own figure rather than as panel 2 of a grid opened in an earlier cell:
# that earlier figure is no longer current when this cell runs, so plt.subplot(1, 2, 2)
# would start a new, half-empty figure.
top10_mem = df2['Graphics Memory'].value_counts().nlargest(10).index

fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(data=df2[df2['Graphics Memory'].isin(top10_mem)],
              y='Graphics Memory', order=top10_mem, ax=ax)
ax.set_title('Top 10 Graphics Memory Sizes (Before EDA)')
ax.set_xlabel('Count')
fig.tight_layout()
plt.show()

In [None]:
df2[pd.isna(df2['Graphics Memory']) & pd.isna(df2['Graphic Processor'])].shape

In [None]:
df2[pd.isna(df2['Graphics Memory']) & pd.notna(df2['Graphic Processor'])].shape

In [None]:
df2[pd.notna(df2['Graphics Memory']) & pd.notna(df2['Graphic Processor'])].shape

In [64]:
def get_graphics_type(row):
    """Classify a laptop's graphics setup from two possibly-missing fields.

    Parameters
    ----------
    row : mapping
        Must provide 'Graphics Memory' and 'Graphic Processor'.

    Returns
    -------
    str
        'No Graphics' - both fields missing
        'Integrated'  - processor present, memory missing
        'Dedicated'   - both fields present
        'Unknown'     - memory present but processor missing
    """
    has_memory = pd.notna(row['Graphics Memory'])
    has_processor = pd.notna(row['Graphic Processor'])

    if has_processor:
        return 'Dedicated' if has_memory else 'Integrated'
    return 'Unknown' if has_memory else 'No Graphics'

df2['graphics_type'] = df2.apply(get_graphics_type, axis=1)

The above function is designed to construct a new feature based on two existing features: 'Graphics Memory' and 'Graphic Processor'.

In [65]:
# Missing graphics memory is filled with '0 GB' so that every row can be parsed as a number.
df2['Graphics Memory'] = df2['Graphics Memory'].fillna('0 GB')
# Keep only the text before the first space (the leading number).
# NOTE(review): the unit suffix is discarded — if any value is expressed in MB rather than
# GB, the resulting integers are not comparable; confirm against the raw data.
df2['Graphics Memory'] = df2['Graphics Memory'].str.split(' ').str[0]
df2['Graphics Memory'] = df2['Graphics Memory'].astype('int32')

In [66]:
df2['Graphic Processor'] = df2['Graphic Processor'].fillna('No Graphics')
df2['Graphic Processor'] = df2['Graphic Processor'].str.lower()

In [None]:
df2['Graphic Processor'].unique()[:5]

In the 'Graphic Processor' and 'Graphics Memory' columns, null values indicate the absence of a dedicated graphics system. These are therefore replaced with 'No Graphics' and '0 GB' respectively.

# Graphics EDA End

#####################################################
# EDA For Series

In [None]:
df2.head(2)

In [None]:
df2['Series'].isna().sum()

In [70]:
df2['Series'] = df2['Series'].str.split(' ').str[0]
df2['Series'] = df2['Series'].str.lower()

In [71]:
def extract_series_from_model(model_val):
    """Return the leading token of a model name, used as a stand-in series name.

    The token ends at the first space or hyphen, whichever comes first. A model
    name containing neither is returned unchanged, and a missing value yields NaN.

    Parameters
    ----------
    model_val : str or NaN
        Value of the 'Model' column.

    Returns
    -------
    str or float
        The prefix of ``model_val`` before the first ' ' or '-', or ``np.nan``.
    """
    if pd.isna(model_val):
        return np.nan

    # str.find returns -1 when the separator is absent; keep only the real positions.
    cut_points = [pos for pos in (model_val.find(' '), model_val.find('-')) if pos != -1]
    if not cut_points:
        return model_val
    return model_val[:min(cut_points)]

# Fill missing series names with the leading token of the model name.
# Existing 'Series' values were lower-cased in the previous cell, while tokens taken from
# 'Model' keep their original casing; lower-case the combined result so the same series
# does not end up as two categories (e.g. 'Pavilion' vs 'pavilion').
df2['Series'] = df2.apply(
    lambda row: extract_series_from_model(row['Model']) if pd.isna(row['Series']) else row['Series'],
    axis=1
).str.lower()

In [None]:
df2.Series.isna().sum()

In [None]:
df2.head(2)

In [None]:
cross_tab = pd.crosstab(df2['Brand'], df2['Series'])
top10_series = df2['Series'].value_counts().nlargest(10).index
cross_tab = cross_tab[top10_series]

cross_tab = cross_tab.loc[cross_tab.any(axis=1)]

mask = cross_tab == 0

plt.figure(figsize=(12, 6))
sns.heatmap(cross_tab, annot=True, fmt='d', cmap='YlGnBu', mask=mask, cbar=True, linewidths=0.5, linecolor='gray')
plt.title('Brand vs Series (Top 10 Series)')
plt.xlabel('Series')
plt.ylabel('Brand')
plt.tight_layout()
plt.show()

Initially, the 'series' column contained 1,087 null values. We first cleaned the column by retaining only the series names, and then imputed the missing values by extracting series names from the 'model' column using the extract_series_from_model function.

# Series EDA End

############################################################################
# EDA for Pixel Density

In [None]:
df2['Pixel Density'].isna().sum()

In [None]:
df2['Display Resolution'].isna().sum()

In [None]:
df2[(df2['Display Resolution'].isna()) & (df2['Pixel Density'].isna())].shape

In [None]:
df2[(pd.notna(df2['Display Resolution'])) & (df2['Pixel Density'].isna())]

In [None]:
# PPI = (((width)^2 + (height)^2)^0.5) / screen size (in inches) ---- Formula for PPI

print(((2560**2 + 1440**2)**0.5) / 16.1)
print(((1920**2 + 1080**2)**0.5) / 15.6)

In [80]:
# Hand-computed pixel densities for two rows, keyed by index label.
# They are written as strings because the column is still text here and is converted to
# float in the next cell.
manual_ppi = {761: '182.43', 797: '141.21'}
for row_label, ppi in manual_ppi.items():
    df2.loc[row_label, 'Pixel Density'] = ppi

In [81]:
df2['Pixel Density'] = df2['Pixel Density'].str.split(' ').str[0]
df2['Pixel Density'] = df2['Pixel Density'].astype('float64')

In [None]:
df2['Pixel Density'].isna().sum()

In [None]:
plt.figure(figsize=(10, 5))
sns.histplot(df2['Pixel Density'], kde=True, bins=30, color='skyblue')
plt.title('Distribution of Pixel Density (PPI)')
plt.xlabel('Pixel Density (PPI)')
plt.ylabel('Count')
plt.tight_layout()
plt.show()

In [None]:
plt.figure(figsize=(14, 6))
sns.boxplot(data=df2, x='Brand', y='Pixel Density', palette='Set2')
plt.title('Pixel Density by Brand')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

Imputation of null values in the 'Pixel Density' column was not possible in most cases, as the corresponding 'Display Resolution' values were also missing. However, two entries had enough context to infer the values, so they were manually imputed.

# EDA Pixel Density End

############################################################
# EDA For SSD

In [None]:
df2['SSD Capacity'].isna().sum()

In [86]:
df2['SSD Capacity'] = df2['SSD Capacity'].str.split(' ').str[0]

In [87]:
bins = [0, 40000, 60000, 80000, 100000, 150000, np.inf]
labels = ['0-40K', '40K-60K', '60K-80K', '80K-100K', '100K-150K', '150K+']
df2['Price_Bin'] = pd.cut(df2['Price (Rs)'], bins=bins, labels=labels)

In [88]:
def smart_fill_ssd(row):
    """Return the row's SSD capacity, imputing it from similar laptops when missing.

    A present value is returned unchanged. For a missing value, the most frequent
    known capacity among rows of the module-level ``df2`` is used, trying
    progressively looser matches:

    1. same Brand, RAM Capacity and Price_Bin
    2. same Brand and RAM Capacity
    3. same Brand

    If none of these groups has a known capacity, '256GB' is returned.

    Parameters
    ----------
    row : mapping
        Must provide 'SSD Capacity'; 'Brand', 'RAM Capacity' and 'Price_Bin'
        are read only when the capacity is missing.
    """
    current = row['SSD Capacity']
    if not pd.isna(current):
        return current

    # Only rows with a known capacity can vote.
    known = df2[df2['SSD Capacity'].notna()]
    fallback_keys = (
        ('Brand', 'RAM Capacity', 'Price_Bin'),
        ('Brand', 'RAM Capacity'),
        ('Brand',),
    )
    for keys in fallback_keys:
        peers = known
        for key in keys:
            peers = peers[peers[key] == row[key]]
        if not peers.empty:
            # mode() returns its values sorted, so ties resolve to the smallest one.
            return peers['SSD Capacity'].mode().iloc[0]

    return '256GB'


In [89]:
df2['SSD Capacity'] = df2.apply(smart_fill_ssd, axis=1)

In [None]:
df2['SSD Capacity'].unique()

In [None]:
df2[df2['SSD Capacity'].isin(['64', '16', '32','8'])].shape

In [92]:
# Drop laptops whose SSD capacity is one of the sub-128 GB sizes.
# An explicit copy is taken because later cells assign columns on df3; without it those
# writes target a slice of df2 and raise SettingWithCopyWarning.
df3 = df2[~df2['SSD Capacity'].isin(['8', '16', '32', '64'])].copy()

In [93]:
# The values '1', '2' and '4' are treated as terabyte sizes and converted to gigabytes.
# 1 TB = 1024 GB, so 2 TB and 4 TB are 2048 and 4096 (previously mistyped as 2024 / 4024).
df3['SSD Capacity'] = df3['SSD Capacity'].replace({'1': '1024', '2': '2048', '4': '4096'})

In [None]:
df3['SSD Capacity'].unique()

In [95]:
df3['SSD Capacity'] = df3['SSD Capacity'].replace('256GB', '256')
df3['SSD Capacity'] = df3['SSD Capacity'].astype('int32')

In [None]:
avg_price_ssd = df3.groupby('SSD Capacity')['Price (Rs)'].mean().sort_index()

plt.figure(figsize=(10, 5))
sns.barplot(x=avg_price_ssd.index, y=avg_price_ssd.values, palette='magma')
plt.title('Average Price by SSD Capacity')
plt.xlabel('SSD Capacity (GB)')
plt.ylabel('Average Price (Rs)')
plt.tight_layout()
plt.show()


In [None]:
plt.figure(figsize=(12, 6))
sns.boxplot(data=df3, x='SSD Capacity', y='Price (Rs)', palette='coolwarm')
plt.title('Price Distribution by SSD Capacity')
plt.xlabel('SSD Capacity (GB)')
plt.ylabel('Price (Rs)')
plt.tight_layout()
plt.show()


There were 1,329 missing values in the 'SSD Capacity' column, which were imputed using the smart_fill_ssd function. This function fills missing values by selecting the most frequent SSD capacity among similar entries based on brand, RAM, and price bin. If no suitable match is found, it defaults to '256GB'. Additionally, laptops with SSD storage less than 128GB were removed, as these devices primarily relied on HDDs or optical drives. Since most laptops manufactured after 2020 come with SSDs, such entries were excluded to ensure the model is trained on relevant and modern configurations.

# SSD EDA End

####################################################
# EDA for Display Touchscreen

In [None]:
df3['Display Touchscreen'].isna().sum()

In [None]:
df3.groupby('Display Touchscreen')['Price (Rs)'].mean().plot(kind='bar', figsize=(6, 4))
plt.title('Average Laptop Price by Touchscreen Display')
plt.xlabel('Touchscreen')
plt.ylabel('Average Price (Rs)')
plt.xticks(rotation=0)
plt.grid(axis='y')
plt.tight_layout()
plt.show()

In [None]:
df3.loc[df3['Display Touchscreen'].isna(), ['Brand', 'Series', ]].drop_duplicates()

In [101]:
# Index labels of the laptops identified as touchscreen models by manual review.
# Labels no longer present in df3 are skipped: a scalar .loc write to a missing label
# would silently append a new, mostly empty row.
touchscreen_rows = [label for label in (2442, 4611, 6661) if label in df3.index]
df3.loc[touchscreen_rows, 'Display Touchscreen'] = 'Yes'

# Every remaining missing value is treated as non-touchscreen. The filled column is
# assigned back rather than using fillna(inplace=True) on the column selection, which is
# chained assignment and does not update df3 under Copy-on-Write.
touch_flag = df3['Display Touchscreen'].fillna('No').str.lower()
# Encode as 0/1; any value other than 'no'/'yes' becomes NaN.
df3['Display Touchscreen'] = touch_flag.map({'no': 0, 'yes': 1})

There were 81 missing values in the 'Display Touchscreen' column. After manually reviewing the data, we identified 3 laptops as touchscreen models, while the remaining were confirmed to be non-touchscreen.

In [None]:
plt.figure(figsize=(6,4))
sns.countplot(data=df3, x='Display Touchscreen', palette=["#e32f2f", '#1f78b4'])
plt.title('How many laptops are touchscreens?')
plt.xticks([0,1], ['No', 'Yes'])
plt.ylabel('Number of laptops')
plt.xlabel('')
plt.tight_layout()
plt.show()

In [None]:
plt.figure(figsize=(8,5))
sns.boxplot(data=df3, x='Display Touchscreen', y='Price (Rs)',
            palette=["#f83434",'#1f78b4'])
plt.xticks([0,1], ['No','Yes'])
plt.title('Price distribution by touchscreen presence')
plt.xlabel('Has Touchscreen?'); plt.ylabel('Price (Rs)')
plt.tight_layout()
plt.show()

# Display Touchscreen End

In [104]:
df3.reset_index(drop=True, inplace=True)

In [105]:
df3 = df3[['Brand', 'Series', 'Weight', 'Display Size', 'Pixel Density',
           'Display Touchscreen', 'Graphic Processor', 'RAM Capacity', 'RAM Type',
           'SSD Capacity', 'OS', 'Processor_Brand', 'Price (Rs)',
           'Processor_Series', 'Graphics Memory']]

During my time (Gaurav Pandey) working as a laptop salesperson at Vijay Sales (a well-known electronics retail store in India), I observed that when customers walk in to purchase a laptop, their primary considerations typically include the brand, RAM, storage, operating system, weight, and laptop series. However, for customers with specific needs—such as gamers, coders, video editors, or graphic designers—the decision-making process becomes more detailed. These customers often inquire about additional specifications like the graphics processor, graphics memory, pixel density, RAM type, processor brand, and display size.

In [None]:
for col in (df3.columns):
    print(col, df3[col].isna().sum())

In [107]:
df4 = df3.dropna(subset=['Pixel Density'])

In [None]:
for col in (df4.columns):
    print(col, df4[col].isna().sum())

In [None]:
df4.duplicated().sum()

In [110]:
# Remove rows that became exact duplicates after the column selection above.
df4 = df4.drop_duplicates()

In [111]:
df4.to_csv('./Data/processed_laptop_data.csv', index=False)

In [112]:
df_for_correlation = df4.copy()

In [113]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Label-encode every text column so that DataFrame.corr() can include it, then show each
# column's correlation with the price, sorted from highest to lowest.
# NOTE(review): label codes are arbitrary integers, so the correlation reported for a
# nominal column (e.g. Brand) depends on the code order — treat it as a rough guide only.
categorical_cols = df_for_correlation.select_dtypes(include='object').columns
print(categorical_cols)
for col in categorical_cols:
    # Values are cast to str first, so a missing value is encoded as its own category.
    df_for_correlation[col] = LabelEncoder().fit_transform(df_for_correlation[col].astype(str))

correlation = df_for_correlation.corr()
sns.heatmap(correlation[['Price (Rs)']].sort_values(by='Price (Rs)', ascending=False), annot=True)
plt.show()

In [115]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, RobustScaler
from sklearn.metrics import r2_score,mean_absolute_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR

In [116]:
# Reload the processed dataset written to disk above.
# NOTE(review): this rebinds `df`, which earlier cells use for the raw dataset (for example
# when copying 'Graphics Memory' back into df2); re-running those cells after this one
# will read the processed frame instead.
df=pd.read_csv('Data//processed_laptop_data.csv')

In [117]:
X = df.drop(columns= ['Price (Rs)'])
y = np.log(df['Price (Rs)'])

In [118]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [119]:
cat_col = ['Brand', 'Series', 'Graphic Processor', 'RAM Type', 'OS', 'Processor_Brand', 'Processor_Series']
num_col = [col for col in X_train.columns if col not in cat_col]

In [None]:
# Baseline model: one-hot encode the categorical columns, standardise the numeric ones,
# then fit an ordinary least-squares regression.
# NOTE(review): drop='first' combined with handle_unknown='ignore' presumably makes an
# unseen category indistinguishable from the dropped first category — confirm against the
# OneHotEncoder documentation for the installed scikit-learn version.
step1 = ColumnTransformer(transformers=[
    ('col_tnf',OneHotEncoder(sparse_output=False,drop='first',handle_unknown='ignore'), cat_col),
    ('num', StandardScaler(), num_col)
],remainder='passthrough')

step2 = LinearRegression()

pipe_lr = Pipeline([
    ('step1',step1),
    ('step2',step2)
])

pipe_lr.fit(X_train,y_train)

y_pred = pipe_lr.predict(X_test)

# The target is log(price), so both metrics below are measured on the log scale.
print('R2 score',r2_score(y_test,y_pred))
print('MAE',mean_absolute_error(y_test,y_pred))

In [None]:
# Same categorical encoding as the linear model, but numeric columns go through
# RobustScaler and the estimator is an RBF-kernel SVR with fixed hyper-parameters.
# Note: this rebinds step1 and step2 from the linear-regression cell.
step1 = ColumnTransformer(transformers=[
    ('col_tnf',OneHotEncoder(sparse_output=False,drop='first',handle_unknown='ignore'), cat_col),
    ('num', RobustScaler(), num_col)
],remainder='passthrough')

step2 = SVR(kernel='rbf',C=10000,epsilon=0.1)

pipe_svr = Pipeline([
    ('step1',step1),
    ('step2',step2)
])

pipe_svr.fit(X_train,y_train)

y_pred = pipe_svr.predict(X_test)

# The target is log(price), so both metrics below are measured on the log scale.
print('R2 score',r2_score(y_test,y_pred))
print('MAE',mean_absolute_error(y_test,y_pred))

In [None]:
# Hyper-parameter search for the SVR pipeline.
# Note: pipe_svr is rebound here, replacing the fixed-parameter pipeline fitted earlier.
svr_step1 = ColumnTransformer(transformers=[
    ('col_tnf', OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore'), cat_col),
    ('num', RobustScaler(), num_col)
], remainder='passthrough')

svr_step2 = SVR(kernel='rbf')

pipe_svr = Pipeline([
    ('step1', svr_step1),
    ('step2', svr_step2)
])

# Keys use the '<pipeline step>__<parameter>' form so they reach the SVR inside the pipeline.
param_grid = {
    'step2__C': [100, 1000, 10000],
    'step2__epsilon': [0.01, 0.1, 0.5],
    'step2__gamma': ['scale', 'auto']
}

# 3 x 3 x 2 = 18 candidates, each scored by R2 with 2-fold cross-validation on the training split.
grid_search = GridSearchCV(pipe_svr, 
                          param_grid, 
                          cv=2,  
                          scoring='r2',
                          n_jobs=-1,
                          verbose=1)

print("\nPerforming grid search...")
grid_search.fit(X_train, y_train)

best_svr = grid_search.best_estimator_

# The scores below are computed on X_test / y_test (log-price scale), the same held-out
# split used to score the earlier models.
y_pred = best_svr.predict(X_test)
print('\nSVR Validation Scores (after tuning):')
print('R2 score:', r2_score(y_test, y_pred))
print('MAE:', mean_absolute_error(y_test, y_pred))



In [None]:
input_df = pd.DataFrame([{
    'Brand': 'HP',
    'Series': 'pavilion',
    'Weight': 1.50,
    'Display Size': 14.0,
    'Pixel Density': 157.0,
    'Display Touchscreen': 0,
    'Graphic Processor': 'intel uhd',
    'RAM Capacity': 8,
    'RAM Type': 'ddr4',
    'SSD Capacity': 512,
    'OS': 'win 11',
    'Processor_Brand': 'intel',
    'Processor_Series': 'i3',
    'Graphics Memory': 0
}])

In [None]:
# Price the hand-built example with each fitted model.
model_dict = dict([
    ("Linear Regression", pipe_lr),
    ("SVR", best_svr),
])

for name in model_dict:
    # The models were trained on log(price); exponentiate to get rupees back.
    log_price = model_dict[name].predict(input_df)[0]
    print(f"{name:25}: ₹ {np.exp(log_price):,.2f}")