# Used Bike Price Prediction - Exploratory Data Analysis (EDA)

This notebook performs a thorough and reproducible EDA on the used bike dataset. It includes data loading, cleaning, feature engineering, visualization, and insights to support robust model development.

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the raw used-bike listings from CSV and preview them
file_path = '../bikes.csv'  # Adjust path if needed
df = pd.read_csv(file_path)

# Preview the first rows, then report the overall size of the frame.
display(df.head())
row_count, column_count = df.shape
print(f"Rows: {row_count}, Columns: {column_count}")

In [None]:
# Basic info and missing values
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() would add a stray "None" to the output.
df.info()

# Summary statistics for every column (numeric and non-numeric alike).
display(df.describe(include='all'))

# Per-column count of missing entries in the raw data.
print('Missing values per column:')
display(df.isnull().sum())

In [None]:
# Data cleaning and feature engineering
#
# Rebinds `df` to a cleaned copy: adds `brand` and `bike_age` (when their
# source columns exist), coerces the numeric columns, imputes missing values
# and drops exact duplicate rows.
df = df.copy()

# Reference year for the age calculation. Kept as a fixed constant (rather
# than "today") so re-running the notebook yields the same ages.
REFERENCE_YEAR = 2025


def _first_token(name):
    """Return the first whitespace-separated token of `name`, or NaN if there is none."""
    # Keep missing names missing: str(NaN) would otherwise produce a bogus brand 'nan'.
    if pd.isna(name):
        return np.nan
    tokens = str(name).split()
    # Blank / whitespace-only names have no token; indexing [0] would raise IndexError.
    return tokens[0] if tokens else np.nan


# Extract brand from model_name
if 'model_name' in df.columns:
    df['brand'] = df['model_name'].map(_first_token)

# Calculate bike age from model_year
if 'model_year' in df.columns:
    df['bike_age'] = REFERENCE_YEAR - df['model_year']

# Convert numeric columns and handle missing values
for col in ['mileage', 'power', 'price']:
    if col in df.columns:
        # Unparseable entries become NaN and are then imputed with the column mean.
        df[col] = pd.to_numeric(df[col], errors='coerce')
        # Assign the result back instead of `df[col].fillna(..., inplace=True)`:
        # the chained in-place form updates a temporary under pandas
        # Copy-on-Write and leaves the NaNs in `df`.
        # NOTE(review): `price` looks like the prediction target; imputing it
        # with the mean may be undesirable for modelling - confirm.
        df[col] = df[col].fillna(df[col].mean())

# Fill missing categorical values
# `brand` only exists if model_name was present (or the CSV already had it),
# so it is guarded like every other optional column.
if 'brand' in df.columns:
    df['brand'] = df['brand'].fillna('Unknown')
if 'owner' in df.columns:
    df['owner'] = df['owner'].fillna('Unknown')

# Remove duplicates (row labels of the surviving rows are kept as they were)
df = df.drop_duplicates()

# Show cleaned data
display(df.head())
print('Missing values after cleaning:')
display(df.isnull().sum())

In [None]:
# Histograms (with a KDE overlay) for the main numeric features
num_cols = ['price', 'mileage', 'power', 'bike_age']
for col in num_cols:
    # Features that are absent from this dataset are silently skipped.
    if col not in df.columns:
        continue
    values = df[col]
    plt.figure(figsize=(7, 4))
    sns.histplot(values, kde=True, bins=30)
    plt.title(f'Distribution of {col}')
    plt.show()

In [None]:
# Bar charts of the 20 most frequent values of each categorical feature
cat_cols = ['brand', 'owner']
for col in cat_cols:
    # Features that are absent from this dataset are silently skipped.
    if col not in df.columns:
        continue
    top_counts = df[col].value_counts().head(20)
    plt.figure(figsize=(10, 4))
    top_counts.plot(kind='bar')
    plt.title(f'Top 20 {col} counts')
    plt.ylabel('Count')
    plt.show()

In [None]:
# One horizontal boxplot per numeric feature to make outliers visible
# (num_cols is the feature list defined in the distribution cell)
for col in num_cols:
    # Features that are absent from this dataset are silently skipped.
    if col not in df.columns:
        continue
    values = df[col]
    plt.figure(figsize=(7, 2))
    sns.boxplot(x=values)
    plt.title(f'Boxplot of {col}')
    plt.show()

In [None]:
# Pairwise scatterplots and correlation heatmap
# Restrict to the numeric features that actually exist in `df`: selecting
# df[num_cols] directly raises KeyError if any listed column is missing
# (e.g. bike_age is only created when model_year is present), whereas the
# other plotting cells skip absent columns.
pair_cols = [col for col in num_cols if col in df.columns]

if len(pair_cols) >= 2:
    # pairplot creates its own figure, so suptitle applies to that figure.
    sns.pairplot(df[pair_cols])
    plt.suptitle('Pairwise Scatterplots', y=1.02)
    plt.show()

    plt.figure(figsize=(8, 6))
    sns.heatmap(df[pair_cols].corr(), annot=True, cmap='coolwarm', fmt='.2f')
    plt.title('Correlation Heatmap')
    plt.show()
else:
    # Pairwise relationships need at least two features to compare.
    print('Not enough numeric features available for pairwise plots.')

In [None]:
# Compare price spread across the most common brands and across owner types
if 'brand' in df.columns:
    # Limit the chart to the ten brands with the most listings.
    brand_counts = df['brand'].value_counts()
    top_brands = brand_counts.head(10).index
    top_brand_rows = df[df['brand'].isin(top_brands)]

    plt.figure(figsize=(14, 6))
    sns.boxplot(x='brand', y='price', data=top_brand_rows)
    plt.title('Price Distribution by Top Brands')
    plt.xticks(rotation=45)
    plt.show()

if 'owner' in df.columns:
    # Every owner category is shown; no top-N filtering here.
    plt.figure(figsize=(8, 4))
    sns.boxplot(x='owner', y='price', data=df)
    plt.title('Price Distribution by Owner Type')
    plt.show()

## EDA Summary & Next Steps

- The dataset has been cleaned and key features engineered (brand, bike_age).
- Distributions, outliers, and correlations have been visualized for all major features.
- Price varies significantly by brand and owner type.
- Next steps: advanced feature engineering, encoding, and model training for price prediction.