## 1. Import Libraries and Data

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib auto

### a. Read Data

In [None]:
# Load the laptop dataset from the CSV file in the working directory.
df = pd.read_csv('laptop_data.csv')
# Preview the first rows to confirm the load.
df.head()

In [None]:
# List the column names of the freshly loaded frame.
df.columns

In [None]:
# Remove the "Unnamed: 0" column in place
# (presumably the row index saved with the CSV — TODO confirm).
df.drop(columns=["Unnamed: 0"], inplace=True)

In [None]:
# Preview the frame after the drop.
df.head()

In [None]:
# Count missing values per column.
df.isnull().sum()

In [None]:
# Count fully duplicated rows.
df.duplicated().sum()

In [None]:
# Show dtypes and non-null counts for every column.
df.info()

In [None]:
# Print the distinct values of every column, each followed by a 30-dash divider.
divider = "-" * 30
for column in df.columns:
    distinct_values = list(df[column].unique())
    print(f"Column {column} after removing duplicate values: ", distinct_values)
    print(divider)

In [None]:
# Inspect the distinct Ram values before cleaning the column.
df['Ram'].unique()

In [None]:
# Strip the 'GB' suffix from the Ram strings and store the size as an integer.
# The source column must be 'Ram' itself: reading from 'Weight' here would
# overwrite every RAM size with the laptop's weight.
df['Ram'] = df['Ram'].str.replace('GB', '', regex=False).astype('int32')

In [None]:
# Preview the frame after the Ram conversion.
df.head()

In [None]:
# Inspect the distinct Weight values.
df['Weight'].unique()

In [None]:
# Inspect the distinct Ram values.
df['Ram'].unique()

In [None]:
# Make sure 'Ram' ends up as int32. Series.replace only swaps whole cell
# values, so a 'GB' suffix has to be removed through the .str accessor; the
# dtype guard keeps this cell safe to run when the column is already numeric.
if not pd.api.types.is_numeric_dtype(df['Ram']):
    df['Ram'] = df['Ram'].str.replace('GB', '', regex=False)
df['Ram'] = df['Ram'].astype('int32')

In [None]:
# Preview the frame after the Ram cast.
df.head()

In [None]:
# Inspect the distinct Weight values before cleaning the column.
df['Weight'].unique()

In [None]:
# Remove the literal 'kg' suffix from Weight and store the values as float32.
df['Weight'] = (
    df['Weight']
    .str.replace('kg', '', regex=False)
    .astype('float32')
)

In [None]:
# Preview the frame after the Weight conversion.
df.head()

In [None]:
# Split the column names by dtype: object columns are treated as categorical,
# the listed integer/float widths as numeric.
numeric_dtypes = ['int32', 'int64', 'float32', 'float64']
catvars = df.select_dtypes(include=['object']).columns
numvars = df.select_dtypes(include=numeric_dtypes).columns

catvars, numvars

In [None]:
# Re-check dtypes and non-null counts after the cleaning steps.
df.info()

## 2. Exploratory Data Analysis

### 2.1. Price Distribution

In [None]:
# Kernel-density plot of the Price column.
sns.displot(df['Price'], kind="kde")
plt.show()

### 2.2. Distribution of number of laptops in different companies

In [None]:
# Count of rows per Company; x tick labels are rotated 45 degrees.
sns.countplot(data=df,x="Company")
plt.xticks(rotation=45)
plt.show()

In [None]:
# The same counts as a table.
df["Company"].value_counts()

### 2.3. Distribution of number of laptops for different types

In [None]:
# Number of rows per TypeName, as a table.
df["TypeName"].value_counts()

In [None]:
# The same counts as a plot; x tick labels are rotated 45 degrees.
sns.countplot(data=df, x="TypeName")
plt.xticks(rotation=45)
plt.show()

### 2.4. Distribution of number of laptops for different operating systems 

In [None]:
# Count of rows per OpSys; x tick labels are rotated 45 degrees.
sns.countplot(data=df, x="OpSys")
plt.xticks(rotation=45)
plt.show()

In [None]:
# Preview the first three rows.
df.head(3)

### 2.5. Company vs. Price

In [None]:
# Bar plot of Price by Company. The frame is passed as `data=` (keyword), as in
# the count plots earlier in this notebook; a positional frame is only accepted
# by newer seaborn releases.
sns.barplot(data=df, x="Company", y="Price")
plt.xticks(rotation=45)
plt.show()

### 2.6. Laptop Types vs. Price

In [None]:
# Bar plot of Price by TypeName. The frame is passed as `data=` (keyword), as
# in the count plots earlier in this notebook; a positional frame is only
# accepted by newer seaborn releases.
sns.barplot(data=df, x="TypeName", y="Price")
plt.xticks(rotation=45)
plt.show()

### 2.7. Laptop size vs. Price

In [None]:
# Scatter plot of Price against Inches. `data=` is passed by keyword because a
# positional frame is only accepted by newer seaborn releases.
sns.scatterplot(data=df, x='Inches', y='Price')

### 2.8. Screen resolution traits extraction

#### 2.8.1 TouchScreen

In [None]:
# 1 when the ScreenResolution text contains 'Touchscreen', otherwise 0.
df['TouchScreen'] = df['ScreenResolution'].apply(lambda text: int('Touchscreen' in text))
# Show five random rows to spot-check the new column.
df.sample(5)

In [None]:
# Count of rows per TouchScreen flag. `data=` is passed by keyword because a
# positional frame is only accepted by newer seaborn releases.
sns.countplot(data=df, x='TouchScreen')

In [None]:
# Bar plot of Price by TouchScreen flag. `data=` is passed by keyword because a
# positional frame is only accepted by newer seaborn releases.
sns.barplot(data=df, x='TouchScreen', y='Price')

#### 2.8.2 IPS

In [None]:
# 1 when the ScreenResolution text contains 'IPS', otherwise 0.
df['IPS'] = df['ScreenResolution'].apply(lambda text: int('IPS' in text))
# Show five random rows to spot-check the new column.
df.sample(5)

In [None]:
# Count of rows per IPS flag. `data=` is passed by keyword because a positional
# frame is only accepted by newer seaborn releases.
sns.countplot(data=df, x='IPS')

In [None]:
# Bar plot of Price by IPS flag. `data=` is passed by keyword because a
# positional frame is only accepted by newer seaborn releases.
sns.barplot(data=df, x='IPS', y='Price')

#### 2.8.3 Extract X and Y resolution

In [None]:
# Inspect the distinct ScreenResolution strings before parsing them.
df['ScreenResolution'].unique()

In [None]:
import re

In [None]:
# Two digit runs separated by a lowercase "x", captured as groups 1 and 2.
pattern = r"(\d+)x(\d+)"

In [None]:
# Compiled once; matches the "<width>x<height>" pair inside a resolution string.
# Kept private to this helper so that reassigning a notebook-level `pattern`
# variable elsewhere cannot change what is parsed here.
_RESOLUTION_RE = re.compile(r"(\d+)x(\d+)")


def get_xy_res(row):
    """Return the (x, y) resolution parsed from a row's "ScreenResolution" text.

    Args:
        row: Mapping-like object (for example a DataFrame row) whose
            "ScreenResolution" entry is a string containing a
            "<digits>x<digits>" pair.

    Returns:
        A tuple ``(x_res, y_res)`` of ints taken from the first such pair.

    Raises:
        ValueError: If the text contains no "<digits>x<digits>" pair.
    """
    text = row["ScreenResolution"]
    match = _RESOLUTION_RE.search(text)
    if match is None:
        # Fail with the offending value instead of an AttributeError on None.
        raise ValueError(f"No '<width>x<height>' resolution found in {text!r}")
    x_res, y_res = match.groups()
    return int(x_res), int(y_res)

In [None]:
# Run get_xy_res on every row; result_type="expand" spreads the returned pair
# over the two new columns X_res and Y_res.
df[["X_res","Y_res"]] = df.apply(get_xy_res, axis=1, result_type="expand")

In [None]:
# Preview the first three rows with the new resolution columns.
df.head(3)

In [None]:
# Re-check dtypes and non-null counts.
df.info()

In [None]:
# Annotated heatmap of the pairwise correlations between numeric columns.
sns.heatmap(df.corr(numeric_only=True),annot=True)

In [None]:
# Correlation of each numeric column with Price, sorted ascending.
df.corr(numeric_only=True)['Price'].sort_values()

#### 2.8.4 Construct PPI

As the heatmap above shows, X_res and Y_res are very highly correlated. I will therefore construct a new variable called PPI (pixels per inch) from X_res, Y_res, and Inches. After that, I will drop X_res, Y_res, and Inches to preserve model training accuracy later on.

The formula to calculate PPI is below:

## $PPI = \frac{\sqrt{X_{res}^2+Y_{res}^2}}{Inches}$

In [None]:
# PPI = diagonal resolution in pixels divided by the Inches column.
diagonal_px = (df['X_res'] ** 2 + df['Y_res'] ** 2) ** 0.5
df['PPI'] = (diagonal_px / df['Inches']).astype(float)
df.head()

In [None]:
# Pairwise correlations between numeric columns, now including PPI.
df.corr(numeric_only=True)

In [None]:
# Correlation of each numeric column with Price, sorted ascending.
df.corr(numeric_only=True)['Price'].sort_values()

In [None]:
# Record the distinct "<X_res>X<Y_res>" labels under the "ScreenResolution" key
# of data_enums.
resolution_labels = df.apply(
    lambda row: "%dX%d" % (row["X_res"], row["Y_res"]),
    axis=1,
)
data_enums = {"ScreenResolution": list(resolution_labels.unique())}

In [None]:
# Drop the raw resolution text and the three columns PPI was computed from.
df.drop(columns=['ScreenResolution','Inches','X_res','Y_res'],inplace=True)
df.head()

### 2.9 CPU Management

In [None]:
# Number of rows per distinct Cpu string.
df['Cpu'].value_counts()

In [None]:
def get_cpu_name(cpu_text):
    """Map a raw CPU description to one of six coarse CPU labels.

    The Core i7 / i5 / i3 families are checked first, in that order; any other
    text containing "Intel" becomes "Intel Other", text containing "AMD"
    becomes "AMD", and everything else becomes "Other".
    """
    for family in ("Intel Core i7", "Intel Core i5", "Intel Core i3"):
        if family in cpu_text:
            return family
    if "Intel" in cpu_text:
        return "Intel Other"
    return "AMD" if "AMD" in cpu_text else "Other"

In [None]:
# Add CPU_name by applying get_cpu_name to every Cpu string.
df['CPU_name']=df['Cpu'].map(get_cpu_name)
df.head()

In [None]:
# Count of rows per CPU_name. `data=` is passed by keyword because a positional
# frame is only accepted by newer seaborn releases.
sns.countplot(data=df, x='CPU_name')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Bar plot of Price by CPU_name. `data=` is passed by keyword because a
# positional frame is only accepted by newer seaborn releases.
sns.barplot(data=df, x='CPU_name', y='Price')
plt.xticks(rotation=45)
plt.show()

In [None]:
# The raw Cpu text has been summarised in CPU_name, so drop it in place.
df.drop(columns=['Cpu'], inplace=True)
df.head()

In [None]:
# Number of rows per distinct Memory string.
df['Memory'].value_counts()

Based on the value counts above, the disk drives fall into four major types: SSD, HDD, Hybrid, and Storage. We can split them into four columns.

In [None]:
def get_clean_number(value):
    """Return the first run of digits found in *value* as an int.

    Only the digits are read; whatever follows them (for example a unit suffix)
    is ignored, so "512GB SSD " gives 512 and "1TB HDD" gives 1.

    Raises:
        IndexError: If *value* contains no digits.
    """
    # Raw string: "\d" inside a normal literal is an invalid escape sequence.
    pattern = r"\d+"
    result = re.findall(pattern, value)
    return int(result[0])

get_clean_number("512GB SSD ")