In [None]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
warnings.filterwarnings('ignore')

# Set style
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)
%matplotlib inline


In [None]:
# Read the raw CSV; the path is relative to the notebook's working directory
df = pd.read_csv(filepath_or_buffer='../data/raw/crop_yield_data.csv')

# Orientation: dimensions and column names, followed by a preview of the first rows
print(
    f"Dataset shape: {df.shape}",
    f"\nColumns: {df.columns.tolist()}",
    sep="\n",
)
df.head(n=5)


In [None]:
# Keep only rows whose Element is 'Yield' and whose Value is present, reduce
# them to the four columns used below, and rename those columns by name
# (an explicit mapping instead of positional assignment, so a change in the
# selected column order cannot silently mislabel data).
df_yield = (
    df.loc[
        (df['Element'] == 'Yield') & df['Value'].notna(),
        ['Area', 'Item', 'Year', 'Value'],
    ]
    .rename(columns={'Area': 'district', 'Item': 'crop', 'Year': 'year', 'Value': 'yield'})
)

print(f"Yield data shape: {df_yield.shape}")
print(f"\nNumber of unique crops: {df_yield['crop'].nunique()}")
# .tolist() turns NumPy scalars into plain Python numbers, so the list prints
# as [1990, 1991, ...] instead of [np.int64(1990), ...] under NumPy >= 2.
print(f"Years: {sorted(df_yield['year'].unique().tolist())}")
df_yield.head()


In [None]:
# Descriptive statistics of the numeric columns of the yield table
print("Basic Statistics:", df_yield.describe(), sep="\n")

# Per-column count of missing entries
print("\nMissing values:", df_yield.isna().sum(), sep="\n")


In [None]:
# Mean yield per year, averaged over every crop and district in df_yield
yearly_avg = df_yield.groupby('year')['yield'].mean().reset_index()

# savefig() does not create missing folders; make sure the target exists so
# the cell also works on a fresh checkout.
os.makedirs('../images', exist_ok=True)

# Line chart of the yearly mean, drawn on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(yearly_avg['year'], yearly_avg['yield'], marker='o', linewidth=2, markersize=8)
ax.set_title('Average Crop Yield Trend Over Years', fontsize=16, fontweight='bold')
ax.set_xlabel('Year', fontsize=12)
ax.set_ylabel('Average Yield', fontsize=12)
ax.grid(True, alpha=0.3)
fig.tight_layout()
fig.savefig('../images/yearly_yield_trend.png', dpi=300, bbox_inches='tight')
plt.show()


In [None]:
# The 20 crops with the highest mean yield, ordered from highest to lowest
top_crops = df_yield.groupby('crop')['yield'].mean().sort_values(ascending=False).head(20)

# savefig() does not create missing folders; make sure the target exists so
# the cell also works on a fresh checkout.
os.makedirs('../images', exist_ok=True)

# Horizontal bar chart on an explicit axes object
fig, ax = plt.subplots(figsize=(14, 8))
top_crops.plot(kind='barh', ax=ax)
ax.set_title('Top 20 Crops by Average Yield', fontsize=16, fontweight='bold')
ax.set_xlabel('Average Yield', fontsize=12)
ax.set_ylabel('Crop', fontsize=12)
# barh draws the first row at the bottom; flip so the highest-yield crop is on top
ax.invert_yaxis()
fig.tight_layout()
fig.savefig('../images/top_crops_yield.png', dpi=300, bbox_inches='tight')
plt.show()


In [None]:
# Distribution of yield values: histogram (left panel) and box plot (right panel)

# savefig() does not create missing folders; make sure the target exists so
# the cell also works on a fresh checkout.
os.makedirs('../images', exist_ok=True)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax_hist, ax_box = axes

# Left: histogram with 50 bins
ax_hist.hist(df_yield['yield'], bins=50, edgecolor='black', alpha=0.7)
ax_hist.set_title('Yield Distribution', fontsize=14, fontweight='bold')
ax_hist.set_xlabel('Yield', fontsize=12)
ax_hist.set_ylabel('Frequency', fontsize=12)
ax_hist.grid(True, alpha=0.3)

# Right: box plot of the same column
ax_box.boxplot(df_yield['yield'])
ax_box.set_title('Yield Box Plot', fontsize=14, fontweight='bold')
ax_box.set_ylabel('Yield', fontsize=12)
ax_box.grid(True, alpha=0.3)

fig.tight_layout()
fig.savefig('../images/yield_distribution.png', dpi=300, bbox_inches='tight')
plt.show()


In [None]:
# Restrict to the 15 crops with the highest mean yield
top_15_crops = df_yield.groupby('crop')['yield'].mean().nlargest(15).index
df_top = df_yield[df_yield['crop'].isin(top_15_crops)]

# Crop x year matrix holding the mean yield of each (crop, year) pair
pivot_data = df_top.pivot_table(values='yield', index='crop', columns='year', aggfunc='mean')

# savefig() does not create missing folders; make sure the target exists so
# the cell also works on a fresh checkout.
os.makedirs('../images', exist_ok=True)

# Annotated heatmap drawn on an explicit axes object
fig, ax = plt.subplots(figsize=(10, 12))
sns.heatmap(pivot_data, annot=True, fmt='.0f', cmap='YlOrRd', cbar_kws={'label': 'Yield'}, ax=ax)
ax.set_title('Crop Yield Heatmap (Top 15 Crops)', fontsize=16, fontweight='bold')
ax.set_xlabel('Year', fontsize=12)
ax.set_ylabel('Crop', fontsize=12)
fig.tight_layout()
fig.savefig('../images/yield_heatmap.png', dpi=300, bbox_inches='tight')
plt.show()
