# Smartphone Price Prediction

Vishakha Joshi (22070126132)  
Yash Chandak (22070126134)  
Girish Mahale (23070126504)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import pickle


# Dataset Description

In [None]:
# Load the pre-processed smartphone dataset (path is relative to the working
# directory) and preview the first rows.
df = pd.read_csv('prep_smartphones.csv')
df.head()


In [None]:
# (rows, columns) of the loaded frame.
df.shape


In [None]:
# Summary statistics of the dataset's columns.
df.describe()


In [None]:
# Column dtypes and non-null counts.
df.info()


In [None]:
# Total number of missing values across the whole frame.
df.isna().sum().sum()


In [None]:
# Missing-value count per column.
df.isna().sum()


In [None]:
# Columns that are treated as categorical for the rest of the analysis.
category_cols = [
    'Brand',
    'Color',
    'SIM Type',
    'Hybrid Sim Slot',
    'Resolution Type',
    'Display Type',
    'Processor Type',
    'Quick Charging',
    'Processor Core',
    'Audio Jack',
    'Network Type',
]

# Cast them to the pandas category dtype, then split the frame by dtype so the
# numeric and categorical views can be analysed separately.
df[category_cols] = df[category_cols].astype('category')
numeric = df.select_dtypes(include='number')
categoric = df.select_dtypes(include='category')


In [None]:
# Pairwise correlation matrix of the numeric columns; the displayed output is
# each column's correlation with Price, sorted ascending.
corr_matrix = numeric.corr()
corr_matrix['Price'].sort_values()


In [None]:
# Annotated heatmap of the numeric correlation matrix computed above.
plt.figure(figsize=(12, 10))
sns.set(font_scale=1)
sns.set_style('whitegrid')
# Each cell shows the correlation rounded to two decimals.
heatmap = sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', square=True, fmt='.2f', linewidths=0.5, linecolor='gray')

plt.title('Correlation Matrix Heatmap (Numeric Variables)', fontsize=16)
plt.tight_layout()
plt.show()


  Inference:

  Based on the heatmap, and since the goal is price prediction, we will use the features most highly correlated with Price, such as Internal Storage.

In [None]:
from scipy.stats import kruskal

# Test whether Price differs across the levels of each categorical variable.
# The Kruskal-Wallis H test is used because these variables have more than two
# levels: point-biserial correlation is only defined for a dichotomous
# variable, and the reported statistic is labelled "H-statistic".
corrs = []
for col in categoric:
    # One array of prices per observed category level.
    price_groups = [
        prices.to_numpy()
        for _, prices in df.groupby(col, observed=True)['Price']
    ]
    if len(price_groups) < 2:
        # The test needs at least two groups; report the column as undefined
        # instead of aborting the whole cell.
        h_statistic, p_value = np.nan, np.nan
    else:
        h_statistic, p_value = kruskal(*price_groups)
    corr = {
        'Categorical Variable': col,
        'H-statistic': h_statistic,
        'p-value': p_value
    }
    corrs.append(corr)

# Smallest p-value (strongest evidence of association with Price) first.
corr_df = pd.DataFrame(corrs, columns=['Categorical Variable', 'H-statistic', 'p-value'])
corr_df_sorted = corr_df.sort_values(by='p-value')
corr_df_sorted


In [None]:
# Horizontal bar chart of the p-value obtained for each categorical variable.
plt.figure(figsize=(10, 6))
sns.barplot(x='p-value', y='Categorical Variable', data=corr_df_sorted, palette='coolwarm')
# Log scale so that very small p-values remain distinguishable.
plt.xscale('log')
plt.xlabel('p-value', fontsize=12)
plt.ylabel('Categorical Variable', fontsize=12)
plt.title('Correlation of Price with Categorical Variables', fontsize=14)
plt.tight_layout()
plt.grid(axis='x', linestyle='--', alpha=0.6)
plt.xticks(rotation=45)
plt.show()


In [None]:
# 20-bin histograms of the frame's columns, laid out on a 4x4 grid.
df.hist(bins=20, color='teal', alpha=0.7, edgecolor='black', grid=False, layout=(4, 4), figsize=(15, 15))
plt.suptitle('Histograms of Numeric Variables', fontsize=16)
plt.tight_layout()
plt.show()


Inferences:

The histograms show that the variables are not normally distributed; their distributions are skewed.

In [None]:
# Scatter every numeric column against Price on a fixed 4x4 grid.
nrows, ncols = 4, 4
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(15, 15))

# Pair each axis with a column instead of indexing numeric.columns by grid
# position: positional indexing raises IndexError when there are fewer than
# nrows * ncols numeric columns. zip() stops at whichever sequence ends first,
# so at most nrows * ncols columns are drawn.
for ax, col in zip(axes.flat, numeric.columns):
    sns.scatterplot(x=col, y='Price', data=df, ax=ax, alpha=0.7, edgecolor='black')
    ax.set_xlabel(col, fontsize=10)
    ax.set_ylabel('Price', fontsize=10)
    ax.set_title(f'{col} vs Price', fontsize=12)

# Remove grid cells that did not receive a plot.
for unused_ax in axes.flat[len(numeric.columns):]:
    unused_ax.remove()

plt.suptitle('Scatter Plot of Numeric Variables', fontsize=16)
plt.tight_layout()
plt.show()


Inferences:

The scatter plots against Price show widely dispersed points whose spread is not constant, i.e. the data are not homoscedastic.

In [None]:
# Box plot of Price for every numeric column on a fixed 4x4 grid.
nrows, ncols = 4, 4
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(15, 15))

# Pair each axis with a column instead of indexing numeric.columns by grid
# position, which raises IndexError when there are fewer than nrows * ncols
# numeric columns.
for ax, col in zip(axes.flat, numeric.columns):
    sns.boxplot(x=col, y="Price", data=df, ax=ax, palette='rainbow')
    ax.set_xlabel(col, fontsize=10)
    ax.set_ylabel('Price', fontsize=10)
    ax.set_title(f'{col} vs Price', fontsize=12)

# Remove grid cells that did not receive a plot.
for unused_ax in axes.flat[len(numeric.columns):]:
    unused_ax.remove()

# Lay the figure out once, after all panels are drawn, rather than once per panel.
plt.tight_layout()
plt.show()


Inferences:

The box plots are hard to read, but they suggest that many groups contain few observations and that there are many outliers.

In [None]:
# Full-size box plot of Price by Internal Storage. The column is named
# explicitly so the cell does not depend on a loop variable left behind by
# another cell (hidden kernel state).
sns.boxplot(x='Internal Storage', y="Price", data=df, palette='rainbow')
plt.tight_layout()
plt.show()


In [None]:
# Violin plot of Price for every numeric column on a fixed 4x4 grid.
nrows, ncols = 4, 4
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(30, 10))

# Pair each axis with a column instead of indexing numeric.columns by grid
# position, which raises IndexError when there are fewer than nrows * ncols
# numeric columns.
for ax, col in zip(axes.flat, numeric.columns):
    sns.violinplot(x=col, y="Price", data=df, ax=ax, palette='rainbow')
    ax.set_xlabel(col, fontsize=10)
    ax.set_ylabel('Price', fontsize=10)
    ax.set_title(f'{col} vs Price', fontsize=12)

# Remove grid cells that did not receive a plot.
for unused_ax in axes.flat[len(numeric.columns):]:
    unused_ax.remove()

# Lay the figure out once, after all panels are drawn, rather than once per panel.
plt.tight_layout()
plt.show()


Inference:

In the violin plots, the longer the violin, the wider the price range and the more thinly the data are spread across it.

In [None]:
# Full-size violin plot of Price by Internal Storage.
sns.violinplot(x='Internal Storage', y="Price", data=df,palette='rainbow')
plt.tight_layout()
plt.show()


In [None]:
# Joint plot of Internal Storage against Price, drawn with kind='reg'.
sns.jointplot(x='Internal Storage',y='Price',data=df,kind='reg')


# Model Building

In [None]:
# Restrict the frame to the modelling columns: the target (Price), three
# numeric predictors and three categorical predictors.
numeric_features = ['Price', 'RAM', 'Internal Storage', 'Battery Capacity']
categoric_features = ['Resolution Type', 'Processor Type', 'Processor Core']
features = numeric_features + categoric_features
# NOTE: `df` is rebound to the reduced frame here, so columns outside
# `features` are no longer reachable through `df` in later cells.
df = df[features]
# Refresh the per-dtype views so they match the reduced frame.
numeric = df.select_dtypes(include='number')
categoric = df.select_dtypes(include='category')


In [None]:
from sklearn.preprocessing import FunctionTransformer

# Apply log1p to the numeric columns. The transformer object is kept as a
# notebook global so it can be persisted afterwards.
transform = FunctionTransformer(func=np.log1p)
transformed_df = transform.fit_transform(numeric)

# Re-attach the categorical columns unchanged, in their original order.
transformed_df = transformed_df.assign(
    **{name: df[name] for name in categoric.columns}
)

transformed_df.head()


In [None]:
# Persist the fitted log transformer to the working directory.
with open('log_transform.pkl', 'wb') as f:
    pickle.dump(transform, f)


In [None]:
# 20-bin histograms of the transformed frame, for comparison with the
# histograms drawn before the log transform.
transformed_df.hist(bins=20, alpha=0.7, edgecolor='black', grid=False, layout=(4, 4), figsize=(15, 15))
plt.suptitle('Histograms of Numeric Variables (Log Transformed)', fontsize=16)
plt.tight_layout()
plt.show()


In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every column of transformed_df (the target Price included) with
# MinMaxScaler, then wrap the result back into a DataFrame that carries the
# original column names.
# NOTE(review): the scaler is fitted on the full frame before the train/test
# split performed further down, so test rows influence the scaling. Consider
# fitting on the training rows only.
scaler = MinMaxScaler()
scaled = scaler.fit_transform(transformed_df)
scaled_df = pd.DataFrame(scaled, columns=transformed_df.columns)
scaled_df.head()


In [None]:
# Persist the fitted scaler to the working directory.
with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)


In [None]:
from sklearn.model_selection import train_test_split

# Separate the target from the predictors, then hold out 20% of the rows for
# testing with a fixed seed so the split is repeatable.
y = scaled_df['Price']
X = scaled_df.drop(columns='Price')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Test-set predictors, with columns in `features` order (Price excluded), for
# use by the residual diagnostics.
cols = list(features)
cols.remove('Price')
covariates = pd.DataFrame(X_test, columns=cols)


In [None]:
from sklearn.linear_model import LinearRegression

# Fit the linear-regression model on the training rows and predict the test rows.
lr_model = LinearRegression().fit(X_train, y_train)
lr_pred = lr_model.predict(X_test)


In [None]:
from sklearn.gaussian_process.kernels import RBF, WhiteKernel, ConstantKernel
from sklearn.gaussian_process import GaussianProcessRegressor

# Kernel: a scaled RBF term plus a white-noise term. The tuples are the bounds
# within which the constant and the RBF length scale may be optimised.
kernel = ConstantKernel(1.0, (1e-3, 1e3)) * RBF(1.0, (1e-2, 1e2)) + WhiteKernel(1e-1)

# The optimiser is restarted 10 times from randomly drawn hyperparameters, so
# the seed is fixed (same value as the train/test split) to make the fitted
# model reproducible under Restart & Run All.
gpr_model = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=10, random_state=42)
gpr_model.fit(X_train, y_train)
gpr_pred = gpr_model.predict(X_test)


In [None]:
def theils_u(y_true, y_pred):
    """Compute Theil's inequality coefficients U1 and U2.

    With ``e = sqrt(sum((y_true - y_pred) ** 2))``:

    * ``U1 = e / (sqrt(sum(y_true ** 2)) + sqrt(sum(y_pred ** 2)))``,
      which lies in [0, 1] (0 is a perfect fit).
    * ``U2 = e / sqrt(sum(y_true ** 2))``, the error relative to the size of
      the actual values.

    Parameters
    ----------
    y_true : array-like
        Observed values.
    y_pred : array-like
        Predicted values, same length as ``y_true``.

    Returns
    -------
    tuple of float
        ``(U1, U2)``. A coefficient whose denominator is zero is returned as
        ``nan`` rather than ``inf``.
    """
    # Work on plain float arrays so lists and pandas Series (whatever their
    # index) are compared position by position.
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)

    error_norm = np.sqrt(np.sum((actual - predicted) ** 2))
    actual_norm = np.sqrt(np.sum(actual ** 2))
    predicted_norm = np.sqrt(np.sum(predicted ** 2))

    u1_denominator = actual_norm + predicted_norm
    U1 = error_norm / u1_denominator if u1_denominator > 0 else np.nan
    U2 = error_norm / actual_norm if actual_norm > 0 else np.nan
    return U1, U2


In [None]:
def index_of_agreement(y_true, y_pred):
    """Return the index of agreement between observed and predicted values.

    Computed as one minus the ratio of the summed squared error to the summed
    squared "potential error", where the potential error of a point is
    ``|y_pred - mean(y_true)| + |y_true - mean(y_true)|``.
    """
    observed_mean = np.mean(y_true)
    squared_error = np.sum((y_true - y_pred) ** 2)
    potential_error = np.sum(
        (np.abs(y_pred - observed_mean) + np.abs(y_true - observed_mean)) ** 2
    )
    return 1 - (squared_error / potential_error)


In [None]:
import statsmodels.api as sm
from statsmodels.stats.diagnostic import het_breuschpagan

def breusch_pagan_test(residuals, covariates):
    """Run the Breusch-Pagan heteroscedasticity test on the residuals.

    A constant column is added to ``covariates`` via ``sm.add_constant``, and
    both inputs are cut to their common length before being passed to
    ``het_breuschpagan``.

    Returns the four values produced by ``het_breuschpagan``:
    ``(lm, lm_p_value, fvalue, f_p_value)``.
    """
    exog = sm.add_constant(covariates)
    # Keep only the leading rows that both inputs have.
    n_common = min(len(residuals), len(exog))
    lm_stat, lm_pval, f_stat, f_pval = het_breuschpagan(
        residuals[:n_common], exog[:n_common]
    )
    return lm_stat, lm_pval, f_stat, f_pval


In [None]:
from statsmodels.stats.stattools import durbin_watson

def durbin_watson_test(residuals):
    """Return the Durbin-Watson statistic of the given residuals."""
    return durbin_watson(residuals)


In [None]:
def plot_residual_histogram(residuals, model_name):
    """Show a 20-bin histogram of the residuals, titled with the model name."""
    sns.set_style("white")
    plt.hist(residuals, bins=20, edgecolor='k')
    # Label the axes that plt.hist has just drawn on.
    current_ax = plt.gca()
    current_ax.set_xlabel('Residuals')
    current_ax.set_ylabel('Frequency')
    current_ax.set_title(f'Histogram of Residuals - {model_name.title()}')
    plt.show()


In [None]:
def plot_residual_vs_covariates(residuals, covariates, model_name):
    """Scatter the residuals against each covariate on a two-column grid.

    Parameters
    ----------
    residuals : array-like
        Residuals, one per row of ``covariates``.
    covariates : pandas.DataFrame
        One panel is drawn per column.
    model_name : str
        Used (title-cased) in the figure title.
    """
    num_covariates = len(covariates.columns)
    num_cols = 2
    # Ceiling division: enough rows to hold every covariate.
    num_rows = (num_covariates + num_cols - 1) // num_cols

    # The style must be set before the axes are created to affect them.
    sns.set_style("white")
    # squeeze=False keeps `axes` two-dimensional even when there is a single
    # row, so the [row, col] indexing below works for one or two covariates.
    fig, axes = plt.subplots(num_rows, num_cols, figsize=(12, 10), squeeze=False)
    fig.suptitle(f"{model_name.title()} - Residuals vs. Covariates", fontsize=16)

    for i, cov_name in enumerate(covariates.columns):
        ax = axes[i // num_cols, i % num_cols]
        ax.scatter(covariates[cov_name], residuals, alpha=0.5)
        ax.set_xlabel(cov_name)
        ax.set_ylabel("Residuals")

    # Remove the trailing empty panel when the number of covariates is odd.
    for unused_ax in axes.flatten()[num_covariates:]:
        fig.delaxes(unused_ax)

    plt.tight_layout()
    plt.show()


In [None]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

def calculate_metrics(y_true, y_pred, covariates, model_name):
    """Build a one-row DataFrame of fit metrics and residual diagnostics.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, same length as ``y_true``.
    covariates : pandas.DataFrame
        Predictors used for the Breusch-Pagan test.
    model_name : str
        Label stored in the ``Model`` column.

    Returns
    -------
    pandas.DataFrame
        One row with R-squared, MAE, MSE, MAPE, nRMSE, Theil's U1/U2, index of
        agreement, the Durbin-Watson statistic, and the Breusch-Pagan
        statistics plus a Yes/No heteroscedasticity flag at the 5% level.

    Notes
    -----
    All metrics are computed on the values exactly as passed in; no inverse
    transformation is applied.
    """
    residuals = y_true - y_pred

    r_squared = r2_score(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)

    # Plain arrays for the hand-written metrics below.
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)

    # MAPE is undefined where the actual value is exactly 0 (which happens,
    # for example, with a min-max scaled target). Those points are skipped so
    # a single zero does not turn the whole metric into inf.
    nonzero = actual != 0
    if nonzero.any():
        mape = np.mean(np.abs((actual[nonzero] - predicted[nonzero]) / actual[nonzero])) * 100
    else:
        mape = np.nan

    # RMSE normalised by the range of the observed values; undefined for a
    # constant target.
    value_range = actual.max() - actual.min()
    nrmse = np.sqrt(mse) / value_range if value_range > 0 else np.nan

    u1, u2 = theils_u(y_true, y_pred)
    ioa = index_of_agreement(y_true, y_pred)

    # Durbin-Watson: autocorrelation among residuals.
    dw_stat = durbin_watson_test(residuals)

    # Breusch-Pagan: heteroscedasticity of residuals with respect to the covariates.
    lm, lm_p_value, fvalue, f_p_value = breusch_pagan_test(residuals, covariates)
    heteroscedastic = "Yes" if lm_p_value < 0.05 else "No"

    metrics_df = pd.DataFrame({
        'Model': [model_name],
        'R-squared': [r_squared],
        'MAE': [mae],
        'MSE': [mse],
        'MAPE': [mape],
        'nRMSE': [nrmse],
        "Theil's U1": [u1],
        "Theil's U2": [u2],
        'Index of Agreement': [ioa],
        'Durbin-Watson Statistic': [dw_stat],
        'Heteroscedasticity': [heteroscedastic],
        'Breusch-Pagan LM Statistic': [lm],
        'Breusch-Pagan LM P-Value': [lm_p_value],
        'Breusch-Pagan F-Stat': [fvalue],
        'Breusch-Pagan F P-Value': [f_p_value],
    })

    return metrics_df


In [None]:
# Evaluate the linear-regression predictions on the test split.
# `residuals` and `model_name` are notebook globals that the plotting cells
# directly below read.
residuals = y_test - lr_pred
model_name = 'Linear Regression'
lr_metrics = calculate_metrics(y_test, lr_pred, covariates, model_name)
# Transposed so the metrics are displayed one per row.
lr_metrics.T


In [None]:
# Histogram of the residuals for the model currently held in `residuals` / `model_name`.
plot_residual_histogram(residuals, model_name)


In [None]:
# Scatter plots of the current residuals against each test-set covariate.
plot_residual_vs_covariates(residuals, covariates, model_name)


In [None]:
# Evaluate the Gaussian-process predictions on the test split.
# `residuals` and `model_name` are rebound here, so the plotting cells
# directly below now describe the Gaussian-process model.
residuals = y_test - gpr_pred
model_name = 'Gaussian Process'
gpr_metrics = calculate_metrics(y_test, gpr_pred, covariates, model_name)
# Transposed so the metrics are displayed one per row.
gpr_metrics.T


In [None]:
# Histogram of the residuals for the model currently held in `residuals` / `model_name`.
plot_residual_histogram(residuals, model_name)


In [None]:
# Scatter plots of the current residuals against each test-set covariate.
plot_residual_vs_covariates(residuals, covariates, model_name)


In [None]:
# Stack the two one-row metric frames into a single comparison table (one row per model).
combined_metrics = pd.concat([lr_metrics, gpr_metrics], ignore_index=True)
combined_metrics


In [None]:
# Persist the fitted Gaussian-process model to the working directory.
with open('model.pkl', 'wb') as f:
    pickle.dump(gpr_model, f)
