# Importing Libraries

In [1]:
# Import necessary libraries for data manipulation, preprocessing, and feature scaling
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer


# Load Data Function

In [16]:
def load_data(train_path, test_path):
    """
    Read the training and test CSV files into pandas DataFrames.

    Parameters:
    train_path (str): Location of the training data CSV file.
    test_path (str): Location of the test data CSV file.

    Returns:
    tuple: ``(train_df, test_df)`` -- the parsed training frame followed by
        the parsed test frame.
    """
    # Parse the files in a fixed order so the training frame always comes first.
    return tuple(pd.read_csv(csv_path) for csv_path in (train_path, test_path))

# Example usage: read both CSV splits, then preview the first training rows.
train_path, test_path = "data/train - train.csv", "data/test - test.csv"
train_df, test_df = load_data(train_path=train_path, test_path=test_path)
train_df.head()


Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1.0,0.0,7.0,0.6,188.0,2.0,...,20.0,756.0,2549.0,9.0,7.0,19,0,0,1,1
1,1021,1,0.5,1,0.0,1.0,53.0,0.7,136.0,3.0,...,905.0,1988.0,2631.0,17.0,3.0,7,1,1,0,2
2,563,1,0.5,1,2.0,1.0,41.0,0.9,145.0,5.0,...,1263.0,1716.0,2603.0,11.0,2.0,9,1,1,0,2
3,615,1,2.5,0,0.0,0.0,10.0,0.8,131.0,6.0,...,1216.0,1786.0,2769.0,16.0,8.0,11,1,0,0,2
4,1821,1,1.2,0,13.0,1.0,44.0,0.6,141.0,2.0,...,1208.0,1212.0,1411.0,8.0,2.0,15,1,1,0,1


# Handle Missing Values Function

In [17]:
def handle_missing_values(df):
    """
    Fill missing values in each column with that column's mean.

    A new ``SimpleImputer(strategy='mean')`` is fitted on ``df`` itself on
    every call, so the fill values are computed from the frame being imputed.

    Parameters:
    df (DataFrame): Input DataFrame with potential missing values.
        Presumably all columns are numeric, since the mean strategy is used.

    Returns:
    DataFrame: New DataFrame holding the imputed values, carrying the same
        column labels and the same row index as ``df``.
    """
    # Using SimpleImputer to fill missing values
    imputer = SimpleImputer(strategy='mean')
    imputed_values = imputer.fit_transform(df)

    # fit_transform hands back a bare array, so the labels must be reattached.
    # Passing the original index (not just the columns) keeps every row
    # aligned with the input when the frame does not use the default 0..n-1
    # index, e.g. after rows were filtered out upstream.
    return pd.DataFrame(imputed_values, columns=df.columns, index=df.index)

# Example usage: impute the training frame first, then the test frame
# (each call fits its own imputer on the frame it receives).
train_df, test_df = map(handle_missing_values, (train_df, test_df))
train_df.head()


Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842.0,0.0,2.2,0.0,1.0,0.0,7.0,0.6,188.0,2.0,...,20.0,756.0,2549.0,9.0,7.0,19.0,0.0,0.0,1.0,1.0
1,1021.0,1.0,0.5,1.0,0.0,1.0,53.0,0.7,136.0,3.0,...,905.0,1988.0,2631.0,17.0,3.0,7.0,1.0,1.0,0.0,2.0
2,563.0,1.0,0.5,1.0,2.0,1.0,41.0,0.9,145.0,5.0,...,1263.0,1716.0,2603.0,11.0,2.0,9.0,1.0,1.0,0.0,2.0
3,615.0,1.0,2.5,0.0,0.0,0.0,10.0,0.8,131.0,6.0,...,1216.0,1786.0,2769.0,16.0,8.0,11.0,1.0,0.0,0.0,2.0
4,1821.0,1.0,1.2,0.0,13.0,1.0,44.0,0.6,141.0,2.0,...,1208.0,1212.0,1411.0,8.0,2.0,15.0,1.0,1.0,0.0,1.0


# Feature Engineering Function

In [18]:
def feature_engineering(df):
    """
    Add engineered features and return the result as a new DataFrame.

    Currently one feature is added: ``px_density``, computed as
    ``px_width * px_height``. Note that, despite the name, this value is the
    total pixel count of the screen; nothing is divided by the screen size.
    The column name is kept so the schema of the prepared data is unchanged.

    Parameters:
    df (DataFrame): Input DataFrame for feature engineering. Must contain
        the 'px_width' and 'px_height' columns.

    Returns:
    DataFrame: Copy of ``df`` with the ``px_density`` column appended (or
        overwritten if it already exists). The input frame is not modified.

    Raises:
    KeyError: If 'px_width' or 'px_height' is missing from ``df``.
    """
    # assign() builds a new frame instead of writing into the caller's object,
    # so a frame that was displayed or reused elsewhere is not silently changed.
    return df.assign(px_density=df['px_width'] * df['px_height'])

# Example usage: add the engineered column to the training frame, then the test frame.
train_df, test_df = (feature_engineering(frame) for frame in (train_df, test_df))
train_df.head()


Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range,px_density
0,842.0,0.0,2.2,0.0,1.0,0.0,7.0,0.6,188.0,2.0,...,756.0,2549.0,9.0,7.0,19.0,0.0,0.0,1.0,1.0,15120.0
1,1021.0,1.0,0.5,1.0,0.0,1.0,53.0,0.7,136.0,3.0,...,1988.0,2631.0,17.0,3.0,7.0,1.0,1.0,0.0,2.0,1799140.0
2,563.0,1.0,0.5,1.0,2.0,1.0,41.0,0.9,145.0,5.0,...,1716.0,2603.0,11.0,2.0,9.0,1.0,1.0,0.0,2.0,2167308.0
3,615.0,1.0,2.5,0.0,0.0,0.0,10.0,0.8,131.0,6.0,...,1786.0,2769.0,16.0,8.0,11.0,1.0,0.0,0.0,2.0,2171776.0
4,1821.0,1.0,1.2,0.0,13.0,1.0,44.0,0.6,141.0,2.0,...,1212.0,1411.0,8.0,2.0,15.0,1.0,1.0,0.0,1.0,1464096.0


# Scale Features Function

In [19]:
def scale_features(train_df, test_df):
    """
    Scale numerical features using StandardScaler.

    The scaler is fitted on the training frame only and the same fitted
    transform is then applied to the test frame. Every training column except
    'price_range' is treated as a feature; 'price_range' is left untouched.

    Parameters:
    train_df (DataFrame): Training data DataFrame.
    test_df (DataFrame): Test data DataFrame. Must contain every feature
        column present in ``train_df``.

    Returns:
    tuple: ``(train_scaled, test_scaled)`` -- new DataFrames with the feature
        columns replaced by their scaled values. The input frames are not
        modified.

    Raises:
    KeyError: If ``test_df`` lacks one of the training feature columns.
    """
    scaler = StandardScaler()
    features = train_df.columns.difference(['price_range'])

    # Work on copies: writing the scaled values into the caller's frames would
    # silently change data the caller may still be displaying or reusing.
    train_scaled = train_df.copy()
    test_scaled = test_df.copy()

    # Fit on the training split only, then reuse those statistics for test.
    train_scaled[features] = scaler.fit_transform(train_df[features])
    test_scaled[features] = scaler.transform(test_df[features])
    return train_scaled, test_scaled

# Example usage: scale both splits with a scaler fitted on the training split.
train_df, test_df = scale_features(train_df=train_df, test_df=test_df)
train_df.head()


Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range,px_density
0,-0.902597,-0.99005,0.830779,-1.019184,-0.764629,-1.044861,-1.382405,0.339276,1.350676,-1.101463,...,-1.146424,0.391659,-0.784666,0.28325,1.462493,-1.786861,-1.006018,0.986097,1.0,-1.074563
1,-0.495139,1.010051,-1.253064,0.981177,-0.995615,0.959464,1.156334,0.686381,-0.120727,-0.664034,...,1.70524,0.467272,1.115452,-0.635188,-0.734267,0.559641,0.994018,-1.014099,2.0,1.081776
2,-1.537686,1.010051,-1.253064,0.981177,-0.533642,0.959464,0.494054,1.380591,0.133939,0.210825,...,1.075652,0.441453,-0.309636,-0.864797,-0.36814,0.559641,0.994018,-1.014099,2.0,1.52678
3,-1.419319,1.010051,1.198517,-1.019184,-0.995615,-1.044861,-1.216835,1.033486,-0.262208,0.648255,...,1.237678,0.594525,0.877937,0.51286,-0.002014,0.559641,-1.006018,-1.014099,2.0,1.53218
4,1.325906,1.010051,-0.395011,-1.019184,2.007209,0.959464,0.659624,0.339276,0.020754,-1.101463,...,-0.090938,-0.657712,-1.02218,-0.864797,0.73024,0.559641,0.994018,-1.014099,1.0,0.67681


# Main Data Preparation Function

In [10]:
def prepare_data(train_path, test_path):
    """
    Run the full preparation pipeline: load, impute, engineer, scale, split.

    Parameters:
    train_path (str): Path to the training data CSV file.
    test_path (str): Path to the test data CSV file.

    Returns:
    tuple: ``(X_train, y_train, test_df)`` -- the training features (all
        columns except 'price_range'), the training target ('price_range'),
        and the fully processed test frame.
    """
    # Each split goes through imputation and feature engineering on its own;
    # the training frame is processed first, then the test frame.
    processed = [
        feature_engineering(handle_missing_values(frame))
        for frame in load_data(train_path, test_path)
    ]

    # Scaling is the only step that looks at both splits together.
    scaled_train, scaled_test = scale_features(*processed)

    # Separate the target column from the training features.
    target = scaled_train['price_range']
    predictors = scaled_train.drop(columns=['price_range'])

    return predictors, target, scaled_test

# Example usage: run the whole pipeline from the raw CSV files and preview
# the prepared training features.
train_path, test_path = "data/train - train.csv", "data/test - test.csv"
X_train, y_train, X_test = prepare_data(train_path=train_path, test_path=test_path)
X_train.head()


Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,px_density
0,-0.902597,-0.99005,0.830779,-1.019184,-0.764629,-1.044861,-1.382405,0.339276,1.350676,-1.101463,...,-1.410683,-1.146424,0.391659,-0.784666,0.28325,1.462493,-1.786861,-1.006018,0.986097,-1.074563
1,-0.495139,1.010051,-1.253064,0.981177,-0.995615,0.959464,1.156334,0.686381,-0.120727,-0.664034,...,0.587958,1.70524,0.467272,1.115452,-0.635188,-0.734267,0.559641,0.994018,-1.014099,1.081776
2,-1.537686,1.010051,-1.253064,0.981177,-0.533642,0.959464,0.494054,1.380591,0.133939,0.210825,...,1.396447,1.075652,0.441453,-0.309636,-0.864797,-0.36814,0.559641,0.994018,-1.014099,1.52678
3,-1.419319,1.010051,1.198517,-1.019184,-0.995615,-1.044861,-1.216835,1.033486,-0.262208,0.648255,...,1.290305,1.237678,0.594525,0.877937,0.51286,-0.002014,0.559641,-1.006018,-1.014099,1.53218
4,1.325906,1.010051,-0.395011,-1.019184,2.007209,0.959464,0.659624,0.339276,0.020754,-1.101463,...,1.272238,-0.090938,-0.657712,-1.02218,-0.864797,0.73024,0.559641,0.994018,-1.014099,0.67681


# Save Prepared Data

In [20]:
# Write each prepared object to its own CSV file, without the row index.
for prepared, destination in (
    (X_train, "data/X_train_prepared.csv"),
    (y_train, "data/y_train_prepared.csv"),
    (X_test, "data/X_test_prepared.csv"),
):
    prepared.to_csv(destination, index=False)
print("Done with data preparation")


Done with data preparation
