In [None]:
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Data Analysis Project\n",
    "## Exploring the Iris Dataset"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Task 1: Load and Explore the Dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "from sklearn.datasets import load_iris\n",
    "\n",
    "# Build one DataFrame holding the feature measurements, the numeric class label\n",
    "# ('target') and a categorical 'species' column carrying the class names.\n",
    "try:\n",
    "    iris = load_iris()\n",
    "    df = pd.DataFrame(data=np.c_[iris['data'], iris['target']],\n",
    "                     columns=iris['feature_names'] + ['target'])\n",
    "    df['species'] = pd.Categorical.from_codes(iris.target, iris.target_names)\n",
    "except Exception as e:\n",
    "    print(f\"Error loading dataset: {e}\")\n",
    "    # Every later cell needs df; re-raise so the run stops here with the real\n",
    "    # cause instead of failing further down with a NameError.\n",
    "    raise\n",
    "else:\n",
    "    print(\"Dataset loaded successfully!\")\n",
    "    print(f\"Shape of dataset: {df.shape}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Preview the top five rows of the DataFrame\n",
    "df.head(n=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Column dtypes, followed by the count of missing values in each column\n",
    "print(\"\\nData types:\", df.dtypes, sep=\"\\n\")\n",
    "\n",
    "print(\"\\nMissing values:\", df.isna().sum(), sep=\"\\n\")\n",
    "\n",
    "# No missing values are expected in this dataset; if there were any,\n",
    "# two common remedies would be:\n",
    "# df = df.dropna()  # drop the rows that contain missing values\n",
    "# OR\n",
    "# df = df.fillna(df.mean())  # fill the gaps with the column means"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Task 2: Basic Data Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Summary statistics of the numeric columns, printed under a heading\n",
    "print(\"Basic statistics for numerical columns:\", df.describe(), sep=\"\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Per-species mean of every remaining column.\n",
    "# 'species' is categorical: observed=True groups only on categories that occur\n",
    "# in the data and avoids the pandas FutureWarning about the changing default.\n",
    "print(\"\\nMean measurements by species:\")\n",
    "print(df.groupby('species', observed=True).mean())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Takeaways from the tables above, emitted one per line\n",
    "print(\n",
    "    \"\\nKey Observations:\",\n",
    "    \"1. Setosa has significantly smaller petal dimensions than other species\",\n",
    "    \"2. Virginica has the largest measurements on average\",\n",
    "    \"3. All species have similar sepal widths (around 3 cm)\",\n",
    "    sep=\"\\n\",\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Task 3: Data Visualization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# All four panels are drawn in a single cell on one explicit figure. With the\n",
    "# inline backend a figure is rendered and closed when its cell finishes, so\n",
    "# panels spread over several cells would never end up in the same 2x2 grid.\n",
    "sns.set_style(\"whitegrid\")\n",
    "fig, axes = plt.subplots(2, 2, figsize=(12, 8))\n",
    "\n",
    "# 1. Line chart (showing trends by index since we don't have time data)\n",
    "ax = axes[0, 0]\n",
    "df['sepal length (cm)'].plot(ax=ax, title='Sepal Length Trend', color='green')\n",
    "ax.set_ylabel('cm')\n",
    "ax.set_xlabel('Sample Index')\n",
    "\n",
    "# 2. Bar chart (average petal length by species)\n",
    "ax = axes[0, 1]\n",
    "sns.barplot(x='species', y='petal length (cm)', data=df, ax=ax)\n",
    "ax.set_title('Average Petal Length by Species')\n",
    "ax.set_ylabel('cm')\n",
    "\n",
    "# 3. Histogram (sepal width distribution)\n",
    "ax = axes[1, 0]\n",
    "sns.histplot(df['sepal width (cm)'], bins=15, kde=True, color='purple', ax=ax)\n",
    "ax.set_title('Sepal Width Distribution')\n",
    "ax.set_xlabel('cm')\n",
    "\n",
    "# 4. Scatter plot (sepal length vs petal length)\n",
    "ax = axes[1, 1]\n",
    "sns.scatterplot(x='sepal length (cm)', y='petal length (cm)',\n",
    "                hue='species', data=df, palette='deep', ax=ax)\n",
    "ax.set_title('Sepal vs Petal Length')\n",
    "ax.set_xlabel('Sepal Length (cm)')\n",
    "ax.set_ylabel('Petal Length (cm)')\n",
    "\n",
    "fig.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Additional Visualizations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pairplot of the measurements, coloured by species.\n",
    "# 'target' is left out: it is the numeric encoding of 'species' (already shown\n",
    "# through hue) and would only add an uninformative row and column to the grid.\n",
    "sns.pairplot(df.drop(columns='target'), hue='species')\n",
    "plt.suptitle('Pairwise Relationships in Iris Dataset', y=1.02)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Petal width per species as box-and-whisker plots on a dedicated wide figure\n",
    "fig, ax = plt.subplots(figsize=(12, 6))\n",
    "sns.boxplot(x='species', y='petal width (cm)', data=df, ax=ax)\n",
    "ax.set_title('Petal Width Distribution by Species')\n",
    "ax.set_ylabel('cm')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Final Observations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Closing summary of the analysis, emitted one finding per line\n",
    "print(\n",
    "    \"\\nFinal Findings:\",\n",
    "    \"1. The three iris species are clearly separable based on petal measurements\",\n",
    "    \"2. Setosa has the most distinct characteristics with smaller petals\",\n",
    "    \"3. Versicolor and Virginica show some overlap in sepal measurements\",\n",
    "    \"4. Petal measurements are more reliable for species classification than sepal measurements\",\n",
    "    sep=\"\\n\",\n",
    ")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}