# EDA Assignment - Bike Details Dataset
Assignment Code: DA-AG-009

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the bike listings CSV (path is relative to the working directory)
# into the DataFrame that the later cells operate on.
df = pd.read_csv(filepath_or_buffer="BIKE DETAILS.csv")

### Question 1
Read the Bike Details dataset into a Pandas DataFrame and display its first 10 rows.

**Theory:** The first step in EDA is reading the dataset to understand its structure. Shape and column names provide insights into dataset size and features.

In [None]:
# Report the dataset's dimensions and column labels, then preview the
# first 10 rows (the trailing expression is what the notebook renders).
for label, value in (("Shape:", df.shape), ("Columns:", df.columns)):
    print(label, value)
df.head(n=10)

### Question 2
Check for missing values in all columns.

**Theory:** Missing values can be dropped or imputed (mean, median, mode, interpolation). The right choice depends on how much data is missing and how important the affected column is to the analysis.

In [None]:
# Count the missing entries in each column.
df.isna().sum()

### Question 3
Plot the distribution of selling prices.

**Theory:** A histogram shows how selling prices are distributed and highlights skewness or concentration.

In [None]:
# Histogram of selling prices, split into 50 bins with outlined bars.
fig, ax = plt.subplots(figsize=(8, 5))
ax.hist(df['selling_price'], bins=50, edgecolor='black')
ax.set_title('Distribution of Selling Prices')
ax.set_xlabel('Selling Price')
ax.set_ylabel('Frequency')
plt.show()

### Question 4
Create a bar plot of average selling price per seller_type.

**Theory:** Seller type impacts resale value. Dealers may charge more due to warranties/services.

In [None]:
# Bar chart of selling price per seller type; seaborn's barplot aggregates
# each category with its default estimator (the mean).
fig, ax = plt.subplots(figsize=(6, 4))
sns.barplot(data=df, x='seller_type', y='selling_price', ax=ax)
ax.set_title('Average Selling Price by Seller Type')
plt.show()

### Question 5
Compute average km_driven per ownership type.

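**Theory:** Ownership history affects value: first-owner bikes are typically better maintained and priced higher. Comparing the average km_driven across ownership types shows whether bikes with more previous owners have also accumulated more mileage.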

In [None]:
# Mean km_driven for each ownership category, flattened back into a
# two-column DataFrame so it can be handed to seaborn.
km_by_owner = df.groupby('owner')['km_driven']
avg_km = km_by_owner.mean().reset_index()

fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(data=avg_km, x='owner', y='km_driven', ax=ax)
ax.set_title('Average Km Driven by Ownership Type')
plt.xticks(rotation=45)  # tilt the owner labels
plt.show()

### Question 6
Remove outliers in km_driven using IQR.

**Theory:** Outliers distort averages. The IQR method keeps only values within [Q1 − 1.5·IQR, Q3 + 1.5·IQR], where IQR = Q3 − Q1, and removes anything outside that range.

In [None]:
# Quartiles and interquartile range of km_driven.
Q1, Q3 = (df['km_driven'].quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

# Keep only the rows whose km_driven lies inside the fences (bounds inclusive).
within_fences = df['km_driven'].between(lower_fence, upper_fence)
df_clean = df[within_fences]

# Summary statistics of km_driven before and after filtering.
before = df['km_driven'].describe()
after = df_clean['km_driven'].describe()

print("Before:\n", before)
print("\nAfter:\n", after)

### Question 7
Scatter plot of year vs selling_price.

**Theory:** Newer bikes generally sell for higher prices, showing depreciation with age.

In [None]:
# Scatter of year against selling price; points are half-transparent so
# overlapping listings remain visible.
fig, ax = plt.subplots(figsize=(8, 5))
ax.scatter(df['year'], df['selling_price'], alpha=0.5)
ax.set_title('Year vs. Selling Price')
ax.set_xlabel('Year')
ax.set_ylabel('Selling Price')
plt.show()

### Question 8
One-hot encode seller_type.

**Theory:** Machine learning models require numeric inputs. One-hot encoding converts categorical values into binary columns.

In [None]:
# One-hot encode seller_type: the column is replaced by one indicator column
# per category (named seller_type_<category>); all other columns are kept.
# dtype=int forces 0/1 integers. pandas >= 2.0 otherwise returns True/False
# boolean columns, which is not the numeric encoding this step is meant to
# demonstrate.
encoded = pd.get_dummies(df, columns=['seller_type'], dtype=int)
encoded.head()

### Question 9
Correlation heatmap.

**Theory:** Heatmaps show correlations (-1 to +1). Strong positive/negative correlations indicate relationships or redundancy.

In [None]:
# Pairwise correlations between the numeric columns only, drawn as an
# annotated heatmap.
corr_matrix = df.corr(numeric_only=True)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()

### Question 10
Summary Findings.

**Theory:**
- **Important factors:** Year (age), km_driven, seller_type, ownership history.
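- **Data cleaning:** Missing values checked with `isnull().sum()`; `km_driven` outliers filtered with the IQR rule (the filtered data is stored in `df_clean`, while the plots above use the unfiltered `df`).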
- **Feature engineering:** One-hot encoding categorical variables.

Overall, depreciation, mileage, and ownership history strongly impact bike resale value.