In [None]:
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Import necessary libraries\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "from scipy.stats import zscore\n",
    "\n",
    "# Location of the raw measurements; a relative path, so it is resolved\n",
    "# against the kernel's current working directory.\n",
    "DATA_PATH = 'solar_data.csv'\n",
    "\n",
    "# Read the CSV into the DataFrame used by every later cell\n",
    "df = pd.read_csv(DATA_PATH)\n",
    "\n",
    "# Preview the first rows as a quick sanity check on the load\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Summary Statistics\n",
    "# describe() reports count, mean, std, min, quartiles and max per column.\n",
    "summary_stats = df.describe()\n",
    "\n",
    "# Leave the frame as the cell's last expression so Jupyter renders it as a\n",
    "# rich table rather than the plain-text dump that print() produces.\n",
    "summary_stats"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Data Quality Check\n",
    "# Count the missing entries in every column.\n",
    "missing_values = df.isna().sum()\n",
    "print(\"Missing Values:\\n\", missing_values)\n",
    "\n",
    "# GHI, DNI and DHI are expected to hold only positive values, so count the\n",
    "# readings below zero in each of those columns.\n",
    "irradiance_cols = ['GHI', 'DNI', 'DHI']\n",
    "negative_values = (df[irradiance_cols] < 0).sum()\n",
    "print(\"Negative Values:\\n\", negative_values)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Time Series Analysis\n",
    "# Promote Timestamp to a datetime index. The guard keeps this cell re-runnable:\n",
    "# once Timestamp has become the index it is no longer a column, so repeating\n",
    "# the conversion unconditionally would raise a KeyError.\n",
    "if 'Timestamp' in df.columns:\n",
    "    df['Timestamp'] = pd.to_datetime(df['Timestamp'])\n",
    "    df = df.set_index('Timestamp')\n",
    "\n",
    "# Plot GHI, DNI, DHI, and Tamb over time (the x-axis comes from the index)\n",
    "plt.figure(figsize=(12, 6))\n",
    "for column in ['GHI', 'DNI', 'DHI']:\n",
    "    plt.plot(df[column], label=column)\n",
    "# Tamb is drawn with partial transparency (alpha=0.7)\n",
    "plt.plot(df['Tamb'], label='Tamb', alpha=0.7)\n",
    "plt.legend()\n",
    "plt.title('Time Series Analysis of Solar Radiation and Temperature')\n",
    "plt.xlabel('Timestamp')\n",
    "plt.ylabel('Values')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Impact of Cleaning on ModA and ModB\n",
    "# Draw one box plot per module column, grouped by the Cleaning column.\n",
    "for module in ('ModA', 'ModB'):\n",
    "    plt.figure(figsize=(12, 6))\n",
    "    sns.boxplot(data=df, x='Cleaning', y=module)\n",
    "    plt.title(f'Impact of Cleaning on {module}')\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Correlation Analysis\n",
    "# Restrict to numeric columns first: DataFrame.corr() raises on non-numeric\n",
    "# columns in pandas >= 2.0 (older versions dropped them silently).\n",
    "correlation_matrix = df.select_dtypes(include='number').corr()\n",
    "\n",
    "# Annotated heatmap of the pairwise correlation coefficients\n",
    "plt.figure(figsize=(10, 8))\n",
    "sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')\n",
    "plt.title('Correlation Matrix')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Wind Analysis: Wind Rose\n",
    "from windrose import WindroseAxes\n",
    "\n",
    "# WD is passed as the direction argument and WS as the value argument\n",
    "wind_direction = df['WD']\n",
    "wind_speed = df['WS']\n",
    "\n",
    "ax = WindroseAxes.from_ax()\n",
    "ax.bar(\n",
    "    wind_direction,\n",
    "    wind_speed,\n",
    "    normed=True,\n",
    "    opening=0.8,\n",
    "    edgecolor='white',\n",
    ")\n",
    "ax.set_legend()\n",
    "plt.title('Wind Rose')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Temperature vs Humidity Analysis\n",
    "# Scatter of Tamb against RH on an explicitly created axes.\n",
    "fig, scatter_ax = plt.subplots(figsize=(8, 6))\n",
    "sns.scatterplot(data=df, x='RH', y='Tamb', alpha=0.7, ax=scatter_ax)\n",
    "scatter_ax.set_title('Temperature vs Relative Humidity')\n",
    "scatter_ax.set_xlabel('Relative Humidity (%)')\n",
    "scatter_ax.set_ylabel('Ambient Temperature (°C)')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Histogram of Variables\n",
    "variables = ['GHI', 'DNI', 'DHI', 'WS', 'Tamb']\n",
    "\n",
    "# One figure per column: a 30-bin histogram with a KDE curve overlaid\n",
    "for column in variables:\n",
    "    fig, hist_ax = plt.subplots(figsize=(8, 6))\n",
    "    sns.histplot(data=df, x=column, bins=30, kde=True, ax=hist_ax)\n",
    "    hist_ax.set_title(f'Histogram of {column}')\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Z-Score Analysis\n",
    "# nan_policy='omit' makes zscore ignore missing readings when computing each\n",
    "# column's mean and standard deviation. With the default ('propagate') a single\n",
    "# NaN turns the whole column's z-scores into NaN, and because NaN > 3 is False\n",
    "# every outlier in that column would silently go uncounted.\n",
    "z_scores = df[['GHI', 'DNI', 'DHI', 'Tamb']].apply(zscore, nan_policy='omit')\n",
    "\n",
    "# A row counts as an outlier when any of the four columns lies more than\n",
    "# 3 standard deviations from that column's mean.\n",
    "outliers = (z_scores.abs() > 3).any(axis=1)\n",
    "print(f'Number of outliers: {outliers.sum()}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Bubble Chart\n",
    "# GHI on x, Tamb on y, colour taken from RH, marker size from BP divided by 10.\n",
    "bubble_sizes = df['BP'] / 10\n",
    "\n",
    "fig, bubble_ax = plt.subplots(figsize=(10, 8))\n",
    "bubble = bubble_ax.scatter(\n",
    "    df['GHI'],\n",
    "    df['Tamb'],\n",
    "    s=bubble_sizes,\n",
    "    c=df['RH'],\n",
    "    cmap='viridis',\n",
    "    alpha=0.5,\n",
    ")\n",
    "fig.colorbar(bubble, ax=bubble_ax, label='Relative Humidity (%)')\n",
    "bubble_ax.set_title('GHI vs Tamb vs RH (Bubble Size: Barometric Pressure)')\n",
    "bubble_ax.set_xlabel('GHI (W/m²)')\n",
    "bubble_ax.set_ylabel('Ambient Temperature (°C)')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Data Cleaning Example\n",
    "# Replace negative values in GHI, DNI, and DHI with NaN.\n",
    "# mask() blanks exactly the cells where the condition is True; passing the\n",
    "# boolean DataFrame to .loc as a row selector raises a ValueError instead,\n",
    "# because .loc does not accept a two-dimensional key.\n",
    "irradiance_cols = ['GHI', 'DNI', 'DHI']\n",
    "df[irradiance_cols] = df[irradiance_cols].mask(df[irradiance_cols] < 0)\n",
    "\n",
    "# Fill missing values using forward fill: each gap takes the last valid value\n",
    "# above it. ffill() replaces the deprecated fillna(method='ffill').\n",
    "df = df.ffill()\n",
    "df.head()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
