In [1]:
import pandas as pd
lap_data = pd.read_csv('C:/Users/HAI/conda projects/conda projects/laptop_price.csv',encoding = "ISO-8859-1")

In [2]:
lap_data.head()

Unnamed: 0,laptop_ID,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price_euros
0,1,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37kg,1339.69
1,2,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,898.94
2,3,HP,250 G6,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,1.86kg,575.0
3,4,Apple,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,1.83kg,2537.45
4,5,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37kg,1803.6


In [3]:
# DATA PROCESSING

# Droping unneccesary column 'laptop_id'

lap_data=lap_data.drop(['laptop_ID'], axis = 1)

#function converts euro doller into rupees(india)

def priceconvt(row):
    row.Price_euros = row.Price_euros*91.19
    return row
lap_data= lap_data.apply(priceconvt,axis = 1)

# changing data types of columns
# renaming column

lap_data = lap_data.rename(columns={'Price_euros':'Price_INR'})

# changing ram column to integer

lap_data['Ram'] = lap_data['Ram'].str.replace('GB','')
lap_data['Ram'] = lap_data['Ram'].astype(int)

# changing weight column to float
lap_data['Weight'] = lap_data['Weight'].str.replace('kg','')
lap_data['Weight'] = lap_data['Weight'].astype(float)

# making more details in screen resolution column

import numpy as np
import re

# Convert object column to string column
lap_data['ScreenResolution'] = lap_data['ScreenResolution'].astype(str)

# Create touch_screen column indicating whether screen_resolution contains "touch screen"
lap_data['TouchScreen'] = lap_data['ScreenResolution'].str.contains('touchscreen', case=False).astype(int)

# Define function to extract and multiply integers
def multiply_resolution(text):
    # Extract integers from text using regular expression
    numbers = [int(s) for s in re.findall(r'\d+', text)]
    # Check if two integers were extracted
    if len(numbers) == 2:
        # Multiply integers and return result
        return numbers[0] * numbers[1]
    else:
        # Return default value
        return np.nan

# Apply function to screen_resolution column and overwrite it with the result
lap_data['ScreenResolution'] = lap_data['ScreenResolution'].apply(multiply_resolution)

# Fill NaN values in ScreenResolution column with most frequent value
most_frequent_value = lap_data['ScreenResolution'].mode().iloc[0]
lap_data['ScreenResolution'] = lap_data['ScreenResolution'].fillna(most_frequent_value)


# setting data_type for ScreenResolution column
lap_data['ScreenResolution'] = lap_data['ScreenResolution'].astype(np.int64)


In [4]:
# Defining predictors and targets
# target contains large integers so we taking log
y = np.log(lap_data['Price_INR'])
# droping price(predictor) column in X and Company and product has no relation on price so we drop that too
X = lap_data.drop(['Price_INR','Company','Product'],axis = 1)

#categorical cols
cat_cols = [col for col in X.columns if X[col].dtype == 'object']

#integer cols
num_cols = list(set(X.columns)-set(cat_cols))

# spliting data train and validation
from sklearn.model_selection import train_test_split
X_train,X_val,y_train,y_val = train_test_split(X,y,test_size = 0.1,random_state = 1)


In [5]:
# preprocessing pipe lines
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer

# numerical transformer
num_transformer = SimpleImputer(strategy= 'mean')

# categorical_transformer
cat_transformer = Pipeline(steps = [('imp',SimpleImputer(strategy='most_frequent')),('onehot',OneHotEncoder(handle_unknown='ignore'))])

# preprocessor
preprocessor = ColumnTransformer(transformers=[('num',num_transformer,num_cols),('cat',cat_transformer,cat_cols)])

In [6]:
#final_pipeline
from sklearn.ensemble import RandomForestRegressor

my_pipe = Pipeline(steps = [('prepe',preprocessor),('rfr',RandomForestRegressor(n_estimators=100,criterion = "absolute_error",random_state=7))])

#fitting pipe line

my_pipe.fit(X_train,y_train)

# predictions

preds = my_pipe.predict(X_val)

# error validation
from sklearn.metrics import mean_absolute_error,r2_score

print("MAE:",mean_absolute_error(y_val,preds))
print("\nr2:",r2_score(y_val,preds))

MAE: 0.14418731976213947

r2: 0.8841417855593144


In [10]:
from sklearn.model_selection import cross_val_score
# splitting data as 5 folds and validating perfomence on each folds

mae_score = -1 * cross_val_score(my_pipe,X,y,cv = 5,scoring = 'neg_mean_absolute_error',n_jobs = 4)
print("cross validation MAE :",mae_score.mean())

cross validation MAE : 0.18638418643539933
