In [1]:
# assignment2_model.py
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error

class LaptopPriceModel:
    def __init__(self, csv_path):
        self.df = pd.read_csv(csv_path)  # should be original or preprocessed CSV
        self.model = None
        print("✅ Data Loaded for Model")

    def preprocess(self):
        df = self.df.copy()

        # Example conversions - adapt if your columns differ
        if 'Ram' in df.columns:
            df['Ram'] = df['Ram'].astype(str).str.replace('GB', '', regex=False).astype(int)

        if 'Weight' in df.columns:
            df['Weight'] = df['Weight'].astype(str).str.replace('kg', '', regex=False).astype(float)

        if 'Cpu' in df.columns:
            # try to extract first two tokens as brand (adjust to dataset specifics)
            df['Cpu Brand'] = df['Cpu'].apply(lambda x: ' '.join(str(x).split()[:2]))

        if 'Memory' in df.columns:
            m = df['Memory'].astype(str)
            m = m.str.replace('Flash Storage', 'SSD', regex=False)
            m = m.str.replace('TB', '000', regex=False)  # e.g. 1TB -> 1000GB
            m = m.str.replace('GB', '', regex=False)
            # extract first number
            df['Memory'] = m.str.extract('(\d+)').astype(float)

        # Drop columns that are not needed or that break model training
        for col in ['Unnamed: 0', 'ScreenResolution', 'Cpu', 'Gpu']:
            if col in df.columns:
                df.drop(col, axis=1, inplace=True)

        # Dummy encode categorical variables
        df = pd.get_dummies(df, drop_first=True)

        # Ensure Price exists
        if 'Price' not in df.columns:
            raise ValueError("Price column not found in the dataset")

        self.X = df.drop('Price', axis=1)
        self.y = df['Price']
        print("✅ Preprocessing Complete")

        # Optionally save the preprocessed CSV for reuse
        df.to_csv('preprocessed_laptop_data.csv', index=False)
        print("✅ Preprocessed data saved as preprocessed_laptop_data.csv")

    def train_model(self):
        X_train, X_test, y_train, y_test = train_test_split(self.X, self.y, test_size=0.2, random_state=42)
        self.model = RandomForestRegressor(n_estimators=100, random_state=42)
        self.model.fit(X_train, y_train)
        self.X_test = X_test
        self.y_test = y_test
        print("✅ Model Training Complete")

    def evaluate_model(self):
        if self.model is None:
            raise ValueError("Model not trained or loaded.")
        y_pred = self.model.predict(self.X_test)
        r2 = r2_score(self.y_test, y_pred)
        mae = mean_absolute_error(self.y_test, y_pred)
        print(f"🎯 R2 Score: {r2:.4f}")
        print(f"📉 MAE: {mae:.2f}")

    def save_model(self, filename='laptop_price_model.pkl'):
        if self.model is None:
            raise ValueError("No model to save.")
        with open(filename, 'wb') as f:
            pickle.dump(self.model, f)
        print(f"✅ Model saved as {filename}")

    def load_model(self, filename='laptop_price_model.pkl'):
        with open(filename, 'rb') as f:
            self.model = pickle.load(f)
        print(f"✅ Model loaded from {filename}")

# Example usage:
if __name__ == "__main__":
    # Point to the same CSV used in Assignment 1 OR to the preprocessed CSV.
    m = LaptopPriceModel('laptop_data.csv')
    m.preprocess()       # runs preprocessing and saves preprocessed_laptop_data.csv
    m.train_model()
    m.evaluate_model()
    m.save_model('laptop_price_model.pkl')


  df['Memory'] = m.str.extract('(\d+)').astype(float)


✅ Data Loaded for Model
✅ Preprocessing Complete
✅ Preprocessed data saved as preprocessed_laptop_data.csv
✅ Model Training Complete
🎯 R2 Score: 0.7845
📉 MAE: 10862.11
✅ Model saved as laptop_price_model.pkl
