In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import kagglehub
# Fetch the dataset through kagglehub and keep the returned value in a *_path
# variable (presumably the local download directory - TODO confirm).
dewangmoghe_mobile_phone_price_prediction_path = kagglehub.dataset_download('dewangmoghe/mobile-phone-price-prediction')

print('Data source import complete.')


# I - Overview

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re

This dataset was collected by scraping data from online websites.
The Columns are as follows.

Name: This column contains the name of the mobile phone.

Rating: This column contains the rating given to the phone. The minimum rating is 0 and the maximum rating is 5.

Spec_score: This column contains the score given to the phone on the basis of its specifications. The minimum value is 0 and the maximum value is 100.

No_of_sim: This column contains whether the phone is dual sim, has 3g, 4g, 5g, volte.

RAM: This column contains the information about the RAM of the phone

Battery: This column provides information about battery specification of the phone.

Display: This column contains information about the screen size of the phone.

Camera: This column provides information about the camera, rear and front.

External_Memory: This column indicates whether the device supports external memory and, if so,
how much.

Android_version: This column tells us about the android version of the phone.

Price: Price of the phone.

Company: The company to which the phone belongs.

Inbuilt_memory: This column provides information about the inbuilt memory of the phone.

fast_charging: This column indicates whether the device supports fast charging and, if so, at what wattage.

Screen_resolution: This describes the screen resolution of the phone.

Processor: This column gives information about the processor of the phone.

Processor_name: This column describes the name of the processor.

In [None]:
# Read the CSV from the directory returned by kagglehub in the first cell, so
# the notebook also runs outside Kaggle. If that cell was deleted (its banner
# allows it), fall back to the standard Kaggle input mount.
from pathlib import Path

DATA_FILE_NAME = "mobile phone price prediction.csv"
try:
    data_dir = Path(dewangmoghe_mobile_phone_price_prediction_path)
except NameError:
    data_dir = Path("/kaggle/input/mobile-phone-price-prediction")

df = pd.read_csv(data_dir / DATA_FILE_NAME)

In [None]:
pd.set_option("display.max_columns", None)
# DataFrame.info() prints its report itself and returns None; wrapping it in
# display() only added a stray "None" to the output.
df.info()
display(df.describe())
display(df.head())
display(df.nunique())

# II - Data cleaning

##   a - Basic & functions used

In [None]:
# Lower-case every column name (no other renaming is applied)
df.columns = df.columns.str.lower()

# Drop the 'unnamed: 0' column (its name once lower-cased)
df = df.drop(columns=['unnamed: 0'])

In [None]:
#### Function that reports how many values are missing along one axis
def get_most_missing_data(df, axis, missing_pourcentage_threshold=0, missing_count_threshold=0):
    """Summarise missing values per column (axis=0) or per row (axis=1).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    axis : int
        0 counts the missing values of each column, 1 those of each row.
    missing_pourcentage_threshold : float, default 0
        Keep only entries whose missing percentage is at least this value.
    missing_count_threshold : int, default 0
        Keep only entries whose missing count is at least this value.

    Returns
    -------
    pandas.DataFrame
        Columns 'missing_count' and 'percent_missing', sorted by
        'percent_missing' in descending order.
    """
    null_counts = df.isnull().sum(axis=axis)
    # A row is measured against the number of columns, a column against the number of rows
    denominator = len(df.columns) if axis == 1 else len(df)

    summary = pd.DataFrame({
        'missing_count': null_counts,
        'percent_missing': 100 * null_counts / denominator,
    })

    # Both thresholds must be satisfied
    enough_percent = summary['percent_missing'] >= missing_pourcentage_threshold
    enough_count = summary['missing_count'] >= missing_count_threshold

    return summary[enough_percent & enough_count].sort_values(by='percent_missing', ascending=False)

In [None]:
#### Function that applies a sequence of clean-up steps to the given columns
def normalize_text_column(df, columns, methods):
    """Apply each named clean-up method, in order, to each listed column.

    The frame is modified in place and also returned.

    Supported methods:
      - "lowercase": lower-case the text
      - "space_removal": strip leading/trailing whitespace
      - "string_removal": delete every character that is not a digit or a dot
      - "converts_into_integer": pd.to_numeric with errors='coerce'
        (unparseable values become NaN; no integer cast is forced)
      - "converts_into_float": same, with downcast='float'
      - "removes_duplicate_spaces": collapse whitespace runs into one space

    Raises
    ------
    ValueError
        When a method name is not one of the above. Steps that ran before the
        unknown method was reached have already been applied.
    """
    transforms = {
        "lowercase": lambda values: values.str.lower(),
        "space_removal": lambda values: values.str.strip(),
        "string_removal": lambda values: values.replace(r'[^0-9.]', '', regex=True),
        "converts_into_integer": lambda values: pd.to_numeric(values, errors='coerce'),
        "converts_into_float": lambda values: pd.to_numeric(values, errors='coerce', downcast='float'),
        "removes_duplicate_spaces": lambda values: values.replace(r'\s+', ' ', regex=True),
    }

    for column in columns:
        for method in methods:
            try:
                transform = transforms[method]
            except (KeyError, TypeError):
                raise ValueError("Unknown method: " + str(method)) from None
            df[column] = transform(df[column])

    return df

#### Function that fills missing value(s) with the median of their group
def calculate_median_from_other_columns(df, column_to_replace, columns_where_calculate_median):
    """Fill NaN in `column_to_replace` with the median of the rows that share
    the same values in `columns_where_calculate_median`.

    The frame is modified in place and also returned. Values that are already
    present are left untouched; a row whose group has no median stays NaN.
    """
    group_medians = df.groupby(columns_where_calculate_median)[column_to_replace].transform('median')
    df[column_to_replace] = df[column_to_replace].fillna(group_medians)
    return df

def filter_by_occurrence(df, columns, min_occurrences):
    """Return a copy of `df` restricted to frequently occurring values.

    For each column in turn, only the rows whose value occurs STRICTLY more
    than `min_occurrences` times (counted on the rows still remaining) are
    kept. `columns` may be a single column name or a list of names. The input
    frame itself is not modified.
    """
    # Accept a single column name as well as a list of names
    column_names = [columns] if isinstance(columns, str) else columns

    remaining = df.copy()
    for name in column_names:
        frequency = remaining[name].value_counts()
        frequent_values = frequency[frequency > min_occurrences].index
        remaining = remaining[remaining[name].isin(frequent_values)]

    return remaining

#### Function that swaps col1 and col2 wherever col1 > col2, so that col1 <= col2 afterwards
def swap_columns_if_greater(df, col1, col2):
    """Exchange the two values on every row where `col1` exceeds `col2`.

    The frame is modified in place and also returned.
    """
    needs_swap = df[col1] > df[col2]
    # Read the pair in reversed order as a plain array so that the assignment
    # is positional rather than aligned on column labels
    reversed_pairs = df.loc[needs_swap, [col2, col1]].to_numpy()
    df.loc[needs_swap, [col1, col2]] = reversed_pairs
    return df


In [None]:
# Lower-case, trim and collapse repeated whitespace in every object (text) column
df = normalize_text_column(df, list(df.select_dtypes(include='object')),['lowercase','space_removal','removes_duplicate_spaces'])


## b. general anomalies

#### Rows 264 and 1246 have their values shifted into the wrong columns. Row 1239 will be cleaned later together with the rest of the data

In [None]:
# Missing-value summary per column (axis=0)
display(get_most_missing_data(df,0))
pd.set_option('display.max_colwidth', 30)
# Show a hand-picked set of rows, selected by index label
display((df[df.index.isin([264,1246,1239,3,11,15,1256])]))

In [None]:
# Manually overwrite the shifted fields of the rows labelled 264 and 1246.
# The values are hard-coded by the author and are written in the already
# normalised (lower-case) form; each list follows the order of the column list.
df.loc[264, ['ram', 'battery', 'display', 'camera', 'external_memory', 'inbuilt_memory', 'fast_charging', 'screen_resolution', 'processor_name']] = [
    '6 gb ram',  # ram
    '4300 mah battery',  # battery
    '6.5 inches',  # display
    '48 mp quad rear & 16 mp front camera',  # camera
    'memory card supported, upto 1 tb',  # external memory
    '64 gb inbuilt',  # inbuilt memory
    '30w fast charging',  # fast charging
    '1080 x 2400 px with punch hole',  # screen resolution
    'helio g90t'  # processor name
]

# Row 1246 additionally gets its 'processor' field rewritten.
df.loc[1246, ['ram', 'battery', 'display', 'camera', 'external_memory', 'inbuilt_memory', 'fast_charging', 'screen_resolution', 'processor', 'processor_name']] = [
    '4 gb ram',  # ram
    '6000 mah battery with 22.5w fast charging',  # battery
    '6.75 inches',  # display
    '48 mp + 5 mp + 2 mp triple rear & 8 mp front camera',  # camera
    'memory card (hybrid)',  # external memory
    '128 gb inbuilt',  # inbuilt memory
    '22.5w fast charging',  # fast charging
    '720 x 1600 px display',  # screen resolution
    'octa core',  # processor
    'helio g35'  # processor name
]

## c. display - ok

In [None]:
# Frequency of every raw 'display' value, NaN included, printed in full
print(df['display'].value_counts(dropna=False).to_string())

#### There are some high values, but they correspond to genuinely larger phones. Overall the data is quite clean.

In [None]:
# Keep only digits and dots, then parse the result as a float (one call, two steps)
df = normalize_text_column(df, ['display'], ['string_removal', 'converts_into_float'])

print(df['display'].value_counts(dropna=False).to_string())

##   e - spec_score - ok

In [None]:
# Frequency of every 'spec_score' value (NaN excluded by value_counts' default)
print(df['spec_score'].value_counts().to_string())

## h. processor_name

In [None]:
print(df['processor_name'].value_counts().to_string())
# Number of distinct processor names. Series.nunique copes with NaN in an
# object column, where np.unique raises a TypeError (it cannot order str
# against float). dropna=False still counts NaN as one distinct value.
print(df['processor_name'].nunique(dropna=False))

In [None]:
# Use one spelling for the "plus" variants ("... plus" -> "... +").
# regex=False makes the literal replacement explicit on every pandas version.
df['processor_name'] = df['processor_name'].str.replace('plus', '+', regex=False)

# Flag the "+" variants before the model name is collapsed into a family;
# a missing name counts as "no plus" instead of producing NaN
df['processor_name_contains_plus'] = df['processor_name'].str.contains('+', regex=False, na=False)

# Families in priority order: the first one found in the name wins
PROCESSOR_FAMILIES = ('snapdragon', 'dimensity', 'helio', 'unisoc', 'exynos', 'tiger', 'tensor')


def _processor_family(name):
    """Return the first processor family contained in `name`, else 'other'.

    Non-string input (e.g. NaN) is mapped to 'other' rather than raising a
    TypeError on the substring test.
    """
    if not isinstance(name, str):
        return 'other'
    return next((family for family in PROCESSOR_FAMILIES if family in name), 'other')


# Collapse the detailed model names into their family
df['processor_name'] = df['processor_name'].apply(_processor_family)

print(df['processor_name'].value_counts().to_string())
print(df['processor_name_contains_plus'].value_counts().to_string())


## g. no_of_sim

#### We notice that the values are quite clean, but:
#### - We could create a dedicated SIM column
#### - We could create a new column for each network feature

In [None]:
# Frequency of every raw 'no_of_sim' value
print(df['no_of_sim'].value_counts().to_string())

In [None]:
# Everything after the first comma lists the network features; drop the
# trailing comma and the surrounding spaces
df['network_proprieties'] = df['no_of_sim'].str.split(',', n=1).str[1].str.strip(',').str.strip()

# What precedes the first comma is the SIM description
df['no_of_sim'] = df['no_of_sim'].str.split(',', n=1).str[0]

# Drop the values that occur only once (keeps strictly more than 1 occurrence)
df = filter_by_occurrence(df, ['no_of_sim', 'network_proprieties'], 1)

# One list of features per row, e.g. ['3g', '4g', 'volte']
feature_lists = df['network_proprieties'].str.split(', ')

# Distinct features in order of first appearance
unique_values = list(dict.fromkeys(feature_lists.explode()))

# One 0/1 column per feature. Membership is tested on the split list, not on
# the raw text: a substring test would also flag "5g" for a row that only
# lists "vo5g".
for value in unique_values:
    df[value] = feature_lists.apply(lambda features, wanted=value: int(wanted in features))

# The helper column is no longer needed; '3g' and '4g' are dropped as well
# (presumably because nearly every remaining phone has them - TODO confirm)
df = df.drop(columns=["network_proprieties", '3g', '4g'])

## h. screen_resolution : ok

In [None]:
# Frequency of every raw 'screen_resolution' value
print(df['screen_resolution'].value_counts().to_string())

#### We notice that:
#### - We could split this column into screen_resolution_height and screen_resolution_width
#### - We must make sure that the width is always smaller than the height
#### - We could derive further attributes from screen_resolution, such as screen_area, aspect_ratio, etc.
#### - We need to remove useless text such as "px" and convert the values to integers

In [None]:
# Whatever follows the first four whitespace-separated tokens (e.g. "1080 x 2400 px")
# is kept as the notch description; rows without such a tail get "Unknown"
notch_description = df['screen_resolution'].str.extract(r'(?:\S+\s+){4}(.*)', expand=False)
df['notch_type'] = notch_description.fillna("Unknown")

# Remove the one value that carries no pixel resolution at all
df = df[df['screen_resolution'] != "full hd+ display with punch hole"]

# Result
print(df['notch_type'].value_counts(dropna=False).to_string())

In [None]:
# Pull both numbers straight out of "<first> x <second>". The pattern accepts
# any spacing around the "x"; extracting with that pattern and then splitting
# on the literal ' x ' failed on values such as "1080x2400".
# astype(int) still raises if a row has no parsable resolution.
resolution = df['screen_resolution'].str.extract(r'(\d+)\s*x\s*(\d+)').astype(int)
resolution.columns = ['screen_resolution_height', 'screen_resolution_width']
df[['screen_resolution_height', 'screen_resolution_width']] = resolution

# Always keep the smaller pixel count in width and the larger one in height
df = swap_columns_if_greater(df, 'screen_resolution_width', 'screen_resolution_height')

# screen_resolution is now redundant
df = df.drop('screen_resolution', axis=1)

# Derived features
df['aspect_ratio'] = df['screen_resolution_width'] / df['screen_resolution_height']
df['screen_area'] = df['screen_resolution_width'] * df['screen_resolution_height']

# Correlation between the two derived features; the author measured about 0.337,
# i.e. they do not duplicate each other
df['aspect_ratio'].corr(df['screen_area'])

## i. battery : ok

In [None]:
# Frequency of every raw 'battery' value
print(df['battery'].value_counts().to_string())

In [None]:
# Show the 10500 mAh entries, then remove them from the data
is_oversized_battery = df['battery'] == '10500 mah battery'
display(df[is_oversized_battery])
df = df[~is_oversized_battery]

#### We notice that the column is pretty clean, but:
#### - We still need to strip the non-numeric text from the values
#### - A 10500 mAh battery is too large for a phone: that device is a tablet, so we remove it

In [None]:
# Remove " mah battery" and anything that follows it (this also covers the
# "mah battery with 22.5w fast charging" value), leaving only the capacity
df['battery'] = df['battery'].str.replace(r'\s*mah battery.*$', '', regex=True)

# Parse the remaining text as a number (pd.to_numeric, unparseable -> NaN)
df = normalize_text_column(df, ['battery'],['converts_into_integer'])

In [None]:
# Frequency of the cleaned, numeric 'battery' values
print(df['battery'].value_counts().to_string())

## j. price

In [None]:
# Limit displayed rows to 50, then show the frequency of each raw price (NaN included)
pd.set_option("display.max_rows", 50)
display(df['price'].value_counts(dropna=False))

#### We notice that the column is pretty clean, but we need to remove the "," that sometimes appears in the values and convert them to int

In [None]:
# Strip the thousands separators and convert to int. Going through astype(str)
# keeps the cell re-runnable once the column is already numeric (.str fails on
# an int column). The extra pd.to_numeric pass was a no-op on an int column
# and has been removed. A missing price still raises here.
df["price"] = df["price"].astype(str).str.replace(",", "", regex=False).astype(int)

## j. fast_charging

In [None]:
# Frequency of every raw 'fast_charging' value, NaN included
display(df['fast_charging'].value_counts(dropna=False))

In [None]:
# Keep only digits and dots; entries without any number become NaN
df = normalize_text_column(df, ['fast_charging'], ['string_removal'])
df['fast_charging'] = df['fast_charging'].replace('', np.nan)

# Remember which rows had no wattage of their own before they are imputed
df['fast_charging_was_guessed'] = df['fast_charging'].isnull()

# Parse the remaining text as float
df = normalize_text_column(df, ['fast_charging'], ['converts_into_float'])

# Impute the missing wattages with the median of the same (inbuilt_memory, company) group
df = calculate_median_from_other_columns(df, 'fast_charging', ['inbuilt_memory', 'company'])

# Drop the rows that are still missing a value after imputation
df = df.dropna(subset=['fast_charging'])

## b. processor

#### We notice that the processor column doesn't really add any information

In [None]:
# Show the value frequencies one last time, then drop the column
display(df['processor'].value_counts(dropna=False))
df = df.drop('processor', axis=1)

## b. inbuilt_memory & ram

In [None]:
# Raw value frequencies (NaN included) of the two memory columns
display(df['inbuilt_memory'].value_counts(dropna=False))
display(df['ram'].value_counts(dropna=False))

In [None]:
# Rows with unusually small storage
display((df[df['inbuilt_memory'].isin(['16 gb inbuilt',"8 gb inbuilt"])]))
# Every row of one specific model
display((df[df['name'].isin(['realme gt flash 5g'])]))
# Rows with uncommon RAM sizes
display((df[df['ram'].isin(['1.5 gb ram','1 gb ram','18 gb ram'])]))

#### What do we notice?
#### - When inbuilt_memory is empty, its value is found in the ram column
#### - The values 258, 8 and 16 in inbuilt_memory look uncommon and need to be verified. The value of 1 tb needs to be adjusted
#### - The values 24, 1 and 1.5 in ram look uncommon and need to be verified

In [None]:
df = df.reset_index(drop=True)

# Some rows carry the storage text ("... inbuilt") in the ram column:
# move it to inbuilt_memory and blank the ram value
storage_in_ram = df['ram'].str.contains('inbuilt', na=False)
df.loc[storage_in_ram, 'inbuilt_memory'] = df.loc[storage_in_ram, 'ram']
df.loc[storage_in_ram, 'ram'] = np.nan

# Replace "258" with "256" and "1 tb" with "1024 gb"
df['inbuilt_memory'] = df['inbuilt_memory'].replace({
    "258 gb inbuilt": "256 gb inbuilt",
    "1 tb inbuilt": "1024 gb inbuilt"
})

# Strip the units and parse both columns as numbers (NaN is preserved)
df = normalize_text_column(df, ['inbuilt_memory', 'ram'], ['string_removal', 'converts_into_integer'])

# Author's correction for the realme gt flash 5g: storage 8 -> 16.
# It has to run after the numeric conversion: while the column still held
# text such as "8 gb inbuilt", the comparison with 8.0 never matched and the
# correction was silently skipped.
is_flash_with_8gb = (df['name'] == 'realme gt flash 5g') & (df['inbuilt_memory'] == 8)
df.loc[is_flash_with_8gb, 'inbuilt_memory'] = 16

# Fill the missing ram values with the median ram of the same (inbuilt_memory, company) group
df = calculate_median_from_other_columns(df, 'ram', ['inbuilt_memory', 'company'])

In [None]:
# Value frequencies (NaN included) of the two memory columns after cleaning
display(df['inbuilt_memory'].value_counts(dropna=False))
display(df['ram'].value_counts(dropna=False))

##   f - android_version & external_memory

In [None]:
# Let's see how dirty the android_version and external_memory columns are.
# The row limit is lifted for these two listings and then set to 30.
pd.set_option("display.max_rows", None)
display(df['android_version'].value_counts(dropna=False))
display(df['external_memory'].value_counts(dropna=False))
pd.set_option("display.max_rows", 30)

In [None]:
# Rows whose android_version is missing
df[df['android_version'].isnull()]

#### We notice that when android_version is null, the value can sometimes be found in "external_memory"
#### There are version values such as "7.1.1" that need to be changed
#### external_memory is too dirty to be usable

In [None]:
# When android_version is missing, a short external_memory text (fewer than 19
# characters) that contains a digit is taken to be the misplaced version.
# Vectorised and NaN-safe: the previous row loop raised a TypeError on len()
# whenever external_memory itself was NaN.
external_text = df['external_memory']
misplaced_version = (
    df['android_version'].isna()
    & (external_text.str.len() < 19)
    & external_text.str.contains(r'\d', regex=True, na=False)
)
df.loc[misplaced_version, 'android_version'] = external_text[misplaced_version]
df.loc[misplaced_version, 'external_memory'] = np.nan

# Keep only digits and dots
df = normalize_text_column(df, ['android_version'], ['string_removal'])

# "7.1.1" cannot be parsed as a number; encode it as 7.11
df['android_version'] = df['android_version'].replace({"7.1.1": "7.11"})

# Parse as float
df = normalize_text_column(df, ['android_version'], ['converts_into_float'])

# Fill the remaining NaN with the median android_version of the same
# (inbuilt_memory, company) group
df = calculate_median_from_other_columns(df, 'android_version', ['inbuilt_memory', 'company'])

# external_memory is too dirty to be usable
df = df.drop(columns=['external_memory'])

# Result
display(df['android_version'].value_counts(dropna=False))

In [None]:
# Missing-value summary per column (axis=0) after cleaning
display(get_most_missing_data(df,0))

## III - Outliers

#### For the outliers, we are going to generate several plots per numerical column that will reveal any anomaly in the distribution of the data

In [None]:
import warnings

# Silence FutureWarning messages
warnings.filterwarnings("ignore", category=FutureWarning)

# Every non-object column gets one row of three plots:
# scatter against spec_score, density estimate, box plot
numeric_df = df.select_dtypes(exclude='object')
name_numerical_columns = list(numeric_df)

fig, axes = plt.subplots(len(name_numerical_columns), 3, figsize=(30, 120), dpi=100)

for cpt, column_name in enumerate(name_numerical_columns):
    sns.scatterplot(ax=axes[cpt, 0], x=column_name, y='spec_score', data=numeric_df)
    sns.kdeplot(ax=axes[cpt, 1], x=column_name, data=numeric_df)
    try:
        sns.boxplot(ax=axes[cpt, 2], y=column_name, data=numeric_df)
    except Exception as e:
        # Report the failing box plot and carry on with the next column
        print(f"Boxplot skipped for {column_name}: {e}")

# price: drop the rows priced above 14000 whose spec_score is below 75
suspicious_price = (df['price'] > 14000) & (df['spec_score'] < 75)
df = df.drop(df[suspicious_price].index)


In [None]:
# clustermap is a figure-level function: it builds its own figure, so the size
# and the title are set through it. The former plt.figure() call only produced
# an extra empty figure, and plt.title() landed on whichever axes was current.
cluster_grid = sns.clustermap(
    df.select_dtypes(exclude='object').corr(),
    annot=True,
    cmap='coolwarm',
    fmt=".2f",
    figsize=(12, 8),
)
# y > 1 keeps the title clear of the column dendrogram
cluster_grid.ax_heatmap.figure.suptitle("Correlation clustermap", y=1.02)
plt.show()

In [None]:
# screen_area is too strongly correlated with screen_resolution_width;
# name and camera are removed in the same call
df = df.drop(columns=['screen_area', 'name', 'camera'])

## IV - Dummies

In [None]:
# Split the frame by dtype: object (text) columns on one side, the rest on the other
df_objs = df.select_dtypes(include='object')
df_nums = df.select_dtypes(exclude='object')

# One-hot encode the text columns; the first level of each is dropped to avoid multicollinearity
df_objs_dummies = pd.get_dummies(df_objs, drop_first=True)

# Put the non-object columns and the encoded columns side by side
final_df = pd.concat((df_nums, df_objs_dummies), axis=1)

# Report the shape of the final dataset
print(f"Final dataset dimensions: {final_df.shape}")

In [None]:
# Final overview of the encoded dataset
pd.set_option("display.max_columns", None)
# DataFrame.info() prints its report itself and returns None; wrapping it in
# display() only added a stray "None" to the output.
final_df.info()
display(final_df.describe())
display(final_df.head())
# Lift the row limit for the per-column unique counts, then set it to 30
pd.set_option("display.max_rows", None)
display(final_df.nunique())
pd.set_option("display.max_rows", 30)

In [None]:
# Correlation of every non-object column of final_df with price, in ascending
# order; the row limit is lifted so that the whole list is shown
pd.set_option("display.max_rows", None)
final_df.select_dtypes(exclude='object').corr()['price'].sort_values(ascending=True)

## V - Lazy predict

In [None]:
# Imports
!pip install lazypredict pandas scikit-learn

from sklearn.model_selection import train_test_split
from lazypredict.Supervised import LazyRegressor
from sklearn.preprocessing import StandardScaler  # Changed from MinMaxScaler
import pandas as pd

# Load the dataset
X = final_df.drop(columns=['price'])  # Features
y = final_df['price']  # Target variable

# Normalize the data using StandardScaler
scaler = StandardScaler()  # Changed from MinMaxScaler
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize LazyRegressor
reg = LazyRegressor(verbose=0, ignore_warnings=True, custom_metric=None)

# Train the models with all features
models, predictions = reg.fit(X_train, X_test, y_train, y_test)

# Find the best R² score among all models
best_r2_reference = models['R-Squared'].max()

# Display the results
print("\n📊 LazyRegressor Model Results:\n")
print(models)
