# In [None]:
# Week 2: Exploratory Data Analysis (EDA)
# Virtual Water Trade Project

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset (update the filename if yours differs)
df = pd.read_csv("virtual_water_trade.csv")

# Basic information
print("Dataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()

print("\nSummary Statistics:")
print(df.describe())

# Per-column count of null entries
print("\nMissing Values:")
print(df.isnull().sum())

# Preview first rows
print("\nFirst 5 rows:")
print(df.head())

# Distribution plots for key numeric features
numeric_features = ['virtual_water_export_m3', 'virtual_water_import_m3', 'trade_value_usd', 'water_intensity']

# Plot only the columns that are actually present: indexing the frame with a
# missing label raises KeyError, which would abort the whole analysis. This
# mirrors the column-existence checks used elsewhere in this script.
available_numeric = [col for col in numeric_features if col in df.columns]
missing_numeric = [col for col in numeric_features if col not in df.columns]

if missing_numeric:
    print("\nSkipping histograms for columns not found in the dataset:", missing_numeric)

if available_numeric:
    df[available_numeric].hist(figsize=(12, 8), bins=30)
    # y > 1 places the super-title above the subplot grid
    plt.suptitle("Distribution of Key Virtual Water Trade Metrics", y=1.02)
    plt.tight_layout()
    plt.show()

# Breakdown by trade type; the whole section is skipped when the column is absent
if 'trade_type' in df.columns:
    # Number of records per trade type
    sns.countplot(data=df, x='trade_type')
    plt.title("Trade Type Distribution")
    plt.xlabel("Trade Type")
    plt.ylabel("Count")
    plt.show()

    # One boxplot per flow direction (export first, then import), each on its
    # own 10x6 figure
    for volume_column, plot_title in (
        ('virtual_water_export_m3', "Virtual Water Export Volume by Trade Type"),
        ('virtual_water_import_m3', "Virtual Water Import Volume by Trade Type"),
    ):
        plt.figure(figsize=(10, 6))
        sns.boxplot(data=df, x='trade_type', y=volume_column)
        plt.title(plot_title)
        plt.show()

# Fill gaps in numeric columns with that column's mean (non-numeric columns
# are left untouched because the means are computed with numeric_only=True)
column_means = df.mean(numeric_only=True)
df.fillna(column_means, inplace=True)

# Net trade balance: exported volume minus imported volume
export_volume = df['virtual_water_export_m3']
import_volume = df['virtual_water_import_m3']
df['net_virtual_water_trade'] = export_volume - import_volume

# Show the new column next to the two columns it was derived from
balance_columns = ['virtual_water_export_m3', 'virtual_water_import_m3', 'net_virtual_water_trade']
print("\nSample net_virtual_water_trade values:")
print(df[balance_columns].head())

# Feature correlation heatmap
# Correlation is only defined for numeric data. Calling df.corr() on the full
# frame raises ValueError on pandas >= 2.0 when string columns (for example
# 'trade_type') are present, so restrict to numeric/boolean columns first.
# select_dtypes is used rather than corr(numeric_only=True) so this also works
# on pandas releases that predate that keyword.
numeric_df = df.select_dtypes(include=['number', 'bool'])

plt.figure(figsize=(10, 8))
sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Feature Correlation Heatmap - Virtual Water Trade')
plt.show()

# Assemble the feature matrix X and target vector y for potential modeling
features = ['trade_value_usd', 'water_intensity', 'gdp_per_capita', 'population', 'agricultural_land_percent']
target = 'net_virtual_water_trade'

# Every feature column plus the target must be present before slicing
required_columns = [*features, target]
absent_columns = [name for name in required_columns if name not in df.columns]

if absent_columns:
    print("\nPlease check that all features and target columns exist in your dataset.")
else:
    X = df[features]
    y = df[target]
    print("\nFeature matrix shape:", X.shape)
    print("Target vector shape:", y.shape)
