# Exploratory Data Analysis (EDA) Notebook

In [None]:
import matplotlib.pyplot as plt
# Importing Necessary Libraries
import numpy as np
import pandas as pd

In [None]:
# Setting Visualization Aesthetics
# seaborn is imported here because the import cell does not bring `sns` into
# scope; without this line the call below raises a NameError.
import seaborn as sns

sns.set(style="whitegrid")

# Section 1: Understanding Random Variables
In a dataset, each column can be treated as a random variable, and each one is either numerical or categorical.
Numerical variables can be:
  - Continuous: Values with decimals (e.g., height in cm)
  - Discrete: Whole numbers (e.g., count of items)
Categorical variables can be:
  - Nominal: Unordered categories (e.g., eye color)
  - Ordinal: Ordered categories (e.g., education level)

In [None]:
from sklearn.datasets import fetch_california_housing

# Fetch the California housing data (downloaded if it is not available locally)
dataset = fetch_california_housing(download_if_missing=True)

# Combine the feature matrix and the target vector into one DataFrame
df = pd.DataFrame(dataset.data, columns=dataset.feature_names).assign(target=dataset.target)

# Preview the first rows
df.head()

In [None]:
### Exercise 1.1
"""
For each column in the dataset, identify its random variable type.
"""


In [None]:
# Show the first row of the DataFrame: one value per column
df.iloc[0]

# Section 2: Descriptive Statistics
Exploring key metrics such as mean, median, mode, variance, and IQR.


In [None]:
# List the column labels of the DataFrame
df.columns

In [None]:
# Select the 'HouseAge' column; the descriptive statistics in the following
# cells (median, mode, variance, IQR) are computed on this Series
age = df['HouseAge']

In [None]:
# Mean: Sum of all values divided by the number of values
# (computed here with the built-in pandas Series.mean)
mean_age = df['HouseAge'].mean()
mean_age  # display the result

In [None]:
### Exercise 2.1
"""
Write the mean function from scratch.
"""

In [None]:
# Median: the middle value of the sorted data; with an even number of values,
# the mean of the two middle values
sorted_age = sorted(age)
n = len(sorted_age)
mid = n // 2
median_age = sorted_age[mid] if n % 2 else (sorted_age[mid - 1] + sorted_age[mid]) / 2

median_age

In [None]:
### Exercise 2.2
"""
What's the function to calculate the median in Pandas?
"""


In [None]:
# Mode: Most frequently occurring value
# Series.mode() returns a Series holding every value tied for the highest
# count; [0] takes its first entry
mode_age = age.mode()[0]

In [None]:
# Sample variance: squared deviations from the mean, summed and divided by (n - 1)
squared_deviations = ((value - mean_age) ** 2 for value in age)
variance_age = sum(squared_deviations) / (len(age) - 1)
variance_age

In [None]:
# Standard Deviation: Square root of variance
# (a sample standard deviation, because variance_age divides by n - 1)
std_dev_age = variance_age ** 0.5

In [None]:
### Exercise 2.3
"""
What's the function to calculate the standard deviation in Pandas?
"""


In [None]:
# IQR: 75th percentile (Q3) minus 25th percentile (Q1)
q1, q3 = np.percentile(age, [25, 75])
iqr = q3 - q1
iqr

In [None]:
# Print every statistic computed in the earlier cells, one "label: value" line each
print("\nDescriptive Statistics for 'HouseAge':")
house_age_stats = {
    "Mean": mean_age,
    "Median": median_age,
    "Mode": mode_age,
    "Variance": variance_age,
    "Standard Deviation": std_dev_age,
    "IQR": iqr,
}
for label, value in house_age_stats.items():
    print(f"{label}: {value}")

In [None]:
### Exercise 2.4
"""
Is there a function to print the statistics of a dataset?
"""

In [None]:
### Exercise 2.5
"""
Explore the fetch_20newsgroups dataset and answer the following questions:
1. How many different categories are there?
2. What are the names of the categories?
3. What is the distribution of samples across categories?

"""
from sklearn.datasets import fetch_20newsgroups
import pandas as pd

# Load the training split of 20 Newsgroups with headers, footers and quoted
# replies removed from each document
newsgroups = fetch_20newsgroups(
    subset='train',
    remove=('headers', 'footers', 'quotes'),
)

# One row per document: its text and its category label
df_news = pd.DataFrame(dict(text=newsgroups.data, target=newsgroups.target))

In [None]:
### Exercise 2.6
"""
Explore the fetch_20newsgroups dataset and complete the following tasks:
1. Calculate the average length of the text documents
2. Explore the most frequent words in the dataset using a word cloud or frequency distribution. (Requires additional libraries like nltk and wordcloud)
"""



# Section 3: Data Cleaning and Preparation
Ensuring data quality by handling missing values, duplicates, and inconsistencies.


In [None]:
# Example: Handling Missing Values
# Work on a copy so the original df is left unchanged
non_cleaned_data = df.copy()
# Deliberately blank out one 'AveRooms' entry (row label 2) to create a missing value
non_cleaned_data.loc[2, 'AveRooms'] = np.nan
print("\nData with Missing Value:")
non_cleaned_data  # display the DataFrame containing the NaN

In [None]:
# Fill Missing Values with Median
# Series.median() skips NaN by default and does not need sorted input, so no
# separate dropna()/sort_values() pass is required before calling it.
median = non_cleaned_data['AveRooms'].median()
median

In [None]:
# Replace the missing 'AveRooms' entries with the previously computed `median`
non_cleaned_data['AveRooms'] = non_cleaned_data['AveRooms'].fillna(median)
print("\nData with Filled Missing Values:")
non_cleaned_data  # display the DataFrame after filling

# Section 4: Data Visualization
Visualizing data using histograms, scatter plots, and box plots.

In [None]:
# Univariate Visualization: histogram of a single column
fig, ax = plt.subplots(figsize=(6, 4))
ax.hist(df['HouseAge'], bins=30, edgecolor='black')  # Adjust the number of bins as needed

ax.set_title("Age Distribution")
ax.set_xlabel("House Age")
ax.set_ylabel("Frequency")

plt.show()

In [None]:
# Scatter of house age against average rooms, with points coloured by MedInc
fig, ax = plt.subplots(figsize=(6, 4))
scatter = ax.scatter(
    df["HouseAge"],
    df["AveRooms"],
    c=df["MedInc"],
    cmap="viridis",
    edgecolor='k',
    alpha=0.7,
)

fig.colorbar(scatter, ax=ax, label="MedInc")
ax.set_title("Age vs AveRooms")
ax.set_xlabel("House Age")
ax.set_ylabel("Average Rooms")

plt.show()

In [None]:
# Multivariate Visualization
import pandas as pd
from pandas.plotting import parallel_coordinates

# Bin MedInc into three equal-frequency groups computed over the full dataset
income_band = pd.qcut(df['MedInc'], q=3, labels=['Low', 'Medium', 'High'])

# Keep the first 100 rows of the two plotted features; assign() aligns the
# group labels to those rows by index
df_subset = df[['HouseAge', 'MedInc']].head(100).assign(Category=income_band)

fig, ax = plt.subplots(figsize=(8, 5))
parallel_coordinates(df_subset, class_column='Category', colormap=plt.cm.viridis, alpha=0.7, ax=ax)

ax.set_title("Parallel Coordinates Plot")
ax.set_xlabel("Features")
ax.set_ylabel("Values")
ax.tick_params(axis="x", labelrotation=20)  # Rotate labels for better readability
ax.grid(True)

plt.show()

In [None]:
### Exercise 4.1
"""
Create a histogram of the 'MedInc' (Median Income) column.
Customize the plot with:
  - A title: "Distribution of Median Income"
  - X-axis label: "Median Income"
  - Y-axis label: "Frequency"
  - A different color for the bars (e.g., 'green')
  - A specified number of bins (e.g., 20)
"""


In [None]:
### Exercise 4.2
"""
Create a scatter plot of 'AveRooms' (Average Rooms) vs. 'target' (Housing Price).
Add a regression line to the plot to show the trend.
Customize the plot with appropriate labels and a title.
"""

In [None]:
### Exercise 4.3
"""
Create a box plot of 'target' (Housing Price) grouped by 'MedInc' categories (Median Income).
Use the pandas `qcut` function to create 4 income categories.
Customize the plot with appropriate labels and a title.
"""

In [None]:
### Exercise 4.4
"""
Formulate 3 hypotheses about the dataset and test them using the plots you created, or additional ones.
"""

# Section 5: Exercises
1. Load a dataset of your choice and identify its random variables. (https://scikit-learn.org/1.5/api/sklearn.datasets.html)
2. Explore each column of the dataset.
3. Calculate descriptive statistics (mean, median, mode, variance, standard deviation, IQR) for at least two numerical variables.
4. Clean the data by handling missing values and removing duplicates.
5. Identify outliers using the IQR method and handle them appropriately.