# Crop Type Classification - Geometry-Based Starter

This notebook uses only the provided GeoJSON files and derives all of its features from the field geometries.

In [None]:
import geopandas as gpd
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import matplotlib.pyplot as plt
from shapely.geometry import Polygon, MultiPolygon

## 1. Load and Prepare Data

In [None]:
# Read the training GeoJSON and report its size and label balance
train_path = '/kaggle/input/ctedivoire-byte-sizedagriculturechallengedataset/train.geojson'
train = gpd.read_file(train_path)
print(f"Training samples: {len(train)}")
crop_counts = train['crop'].value_counts()
print("Crop types distribution:")
print(crop_counts)

# Read the test GeoJSON and report its size
test_path = '/kaggle/input/ctedivoire-byte-sizedagriculturechallengedataset/test.geojson'
test = gpd.read_file(test_path)
print(f"\nTest samples: {len(test)}")

## 2. Feature Engineering from Geometries

In [None]:
# Column order of the returned feature table (also used when the input is empty).
_FEATURE_COLUMNS = [
    'area', 'perimeter', 'compactness', 'solidity', 'aspect_ratio',
    'num_polygons', 'largest_area', 'smallest_area', 'mean_area', 'area_std',
    'mu20', 'mu02', 'mu11', 'width', 'height',
]


def _vertex_moments(poly, minx, miny):
    """Return (mu20, mu02, mu11) of the exterior-ring vertices of *poly*.

    The values are second central moments of the vertex cloud, i.e. the
    variance of x, the variance of y and the x/y covariance, each normalised
    by the number of distinct vertices. A ring without vertices yields zeros.
    """
    coords = np.asarray(poly.exterior.coords, dtype=float)
    if coords.ndim != 2 or coords.shape[0] == 0:
        return 0.0, 0.0, 0.0

    # A closed ring repeats its first vertex at the end; drop the duplicate
    # so that vertex is not weighted twice.
    if coords.shape[0] > 1 and np.array_equal(coords[0], coords[-1]):
        coords = coords[:-1]

    # Shifting by the bounding-box corner does not change central moments but
    # keeps the arithmetic on small numbers when coordinates are large.
    dx = coords[:, 0] - minx
    dy = coords[:, 1] - miny
    dx_c = dx - dx.mean()
    dy_c = dy - dy.mean()

    return (
        float(np.mean(dx_c ** 2)),
        float(np.mean(dy_c ** 2)),
        float(np.mean(dx_c * dy_c)),
    )


def extract_geometry_features(gdf):
    """Extract shape features from polygon geometries.

    Args:
        gdf: Object whose ``geometry`` attribute iterates over Polygon or
            MultiPolygon geometries (e.g. a GeoDataFrame).

    Returns:
        A ``pandas.DataFrame`` with one row per geometry, in input order, and
        the columns listed in ``_FEATURE_COLUMNS``:

        * ``area``, ``perimeter``: summed over all parts of the geometry.
        * ``compactness``: ``4 * pi * area / perimeter ** 2`` (0 when the
          perimeter is 0).
        * ``num_polygons``, ``largest_area``, ``smallest_area``,
          ``mean_area``, ``area_std``: statistics over the parts.
        * ``solidity``, ``aspect_ratio``, ``width``, ``height``, ``mu20``,
          ``mu02``, ``mu11``: computed on the largest part only.

        Ratios whose denominator is not positive are reported as 0 rather
        than raising or producing ``inf``.
    """
    rows = []

    for geom in gdf.geometry:
        # Treat a plain Polygon as a one-part collection so both cases share
        # the same code path.
        if geom.geom_type == 'MultiPolygon':
            parts = list(geom.geoms)
        else:
            parts = [geom]

        areas = [part.area for part in parts]
        perims = [part.length for part in parts]

        # Per-shape descriptors are taken from the part with the largest
        # area, not simply the first part in storage order.
        poly = max(parts, key=lambda part: part.area)

        # Basic shape features
        area = sum(areas)
        perimeter = sum(perims)
        compactness = (4 * np.pi * area) / (perimeter ** 2) if perimeter > 0 else 0.0

        # Convex hull features: compare the largest part with its own hull so
        # the ratio stays within [0, 1] for multi-part geometries as well.
        hull_area = poly.convex_hull.area
        solidity = poly.area / hull_area if hull_area > 0 else 0.0

        # Bounding box features
        minx, miny, maxx, maxy = poly.bounds
        width = maxx - minx
        height = maxy - miny
        aspect_ratio = width / height if height > 0 else 0.0

        # Second central moments of the exterior-ring vertices
        mu20, mu02, mu11 = _vertex_moments(poly, minx, miny)

        rows.append({
            'area': area,
            'perimeter': perimeter,
            'compactness': compactness,
            'solidity': solidity,
            'aspect_ratio': aspect_ratio,
            'num_polygons': len(areas),
            'largest_area': max(areas),
            'smallest_area': min(areas),
            'mean_area': np.mean(areas),
            'area_std': np.std(areas),
            'mu20': mu20,
            'mu02': mu02,
            'mu11': mu11,
            'width': width,
            'height': height,
        })

    # Passing the column list keeps the schema stable even for empty input.
    return pd.DataFrame(rows, columns=_FEATURE_COLUMNS)

# Build the geometry feature table for the training set, then the test set
train_features, test_features = (
    extract_geometry_features(frame) for frame in (train, test)
)

# Attach the crop label to the training features
train_features['crop'] = train['crop'].values

# Preview the first rows of the training table
print("\nTraining features:", train_features.head(), sep="\n")

## 3. Train/Test Split

In [None]:
# Separate the label column from the feature columns
y = train_features['crop']
X = train_features.drop(columns='crop')

# Hold out 20% of the rows for validation, stratified on the crop label
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X, y, **split_options)

## 4. Model Training

In [None]:
# Random Forest hyper-parameters, kept together in one place
rf_params = {
    'n_estimators': 200,
    'max_depth': 10,
    'min_samples_split': 5,
    'random_state': 42,
    'class_weight': 'balanced',
}

# Fit the classifier on the training split
model = RandomForestClassifier(**rf_params)
model.fit(X_train, y_train)

# Score the held-out validation split
val_preds = model.predict(X_val)
accuracy = accuracy_score(y_val, val_preds)
f1 = f1_score(y_val, val_preds, average='weighted')

print(f"Validation Accuracy: {accuracy:.3f}")
print(f"Validation F1 Score: {f1:.3f}")

# Horizontal bar chart of the ten largest feature importances
feat_importances = pd.Series(model.feature_importances_, index=X.columns)
top_importances = feat_importances.nlargest(10)
top_importances.plot(kind='barh')
plt.title('Top 10 Important Features')
plt.show()

## 5. Make Predictions and Create Submission

In [None]:
# Run the fitted model on the test feature table
test_ids = test['ID']
test_preds = model.predict(test_features)

# Assemble the submission table; the 'Target' column name is an assumption
# about the required submission format
submission_columns = {'ID': test_ids, 'Target': test_preds}
submission = pd.DataFrame(submission_columns)

# Write the CSV without the index column and show a preview
submission.to_csv('submission.csv', index=False)
print("\nSubmission saved:", submission.head(), sep="\n")

## 6. Visualize Feature Relationships

In [None]:
# Scatter plots of three feature pairs, one colour series per crop type.
# Each entry: (subplot position, x column, y column, x label, y label, legend?)
panel_specs = [
    (131, 'area', 'perimeter', 'Area', 'Perimeter', True),
    (132, 'compactness', 'solidity', 'Compactness', 'Solidity', False),
    (133, 'aspect_ratio', 'mu20', 'Aspect Ratio', 'Moment 20', False),
]

# Crop labels in order of first appearance; shared by all panels
crop_labels = train['crop'].unique()

plt.figure(figsize=(15, 5))

for position, x_col, y_col, x_label, y_label, with_legend in panel_specs:
    plt.subplot(position)
    for crop in crop_labels:
        subset = train_features[train_features['crop'] == crop]
        plt.scatter(subset[x_col], subset[y_col], label=crop, alpha=0.5)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    # Only the first panel carries a legend
    if with_legend:
        plt.legend()

plt.tight_layout()
plt.show()