In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import gzip
import json
import pandas as pd
from joblib import dump, load

def load_puzzle_data(file_path):
    """Load puzzle records from a gzip-compressed JSON file.

    Args:
        file_path: Path to a gzip-compressed file whose decompressed
            content is a single JSON document.

    Returns:
        The decoded JSON document, exactly as produced by ``json.load``.

    Raises:
        OSError: If the file cannot be opened or is not valid gzip data.
        json.JSONDecodeError: If the decompressed text is not valid JSON.
    """
    # JSON text is UTF-8; pin the encoding so decoding does not depend on
    # the platform's locale default.
    with gzip.open(file_path, 'rt', encoding='utf-8') as f:
        return json.load(f)

def extract_features(puzzles):
    """Build a feature table and a target vector from raw puzzle records.

    Each puzzle is a mapping with an ``'mtx'`` entry (the board, given as a
    sequence of row strings) and a ``'tw'`` entry (used as the target).

    Args:
        puzzles: Iterable of puzzle mappings.

    Returns:
        A ``(features, outcomes)`` pair. ``features`` is a DataFrame with
        one row per puzzle and the columns ``num_rows``, ``num_cols``,
        ``unique_letters``, ``vowels`` and ``consonants``; ``outcomes`` is
        a NumPy array holding each puzzle's ``'tw'`` value in the same
        order.

    Raises:
        KeyError: If a puzzle lacks the ``'mtx'`` or ``'tw'`` entry.
    """
    # Fixed column list so the frame has the same schema even when
    # ``puzzles`` is empty.
    columns = ['num_rows', 'num_cols', 'unique_letters', 'vowels', 'consonants']
    rows = []
    outcomes = []

    for puzzle in puzzles:
        matrix = puzzle['mtx']
        total_words = puzzle['tw']

        # Flatten the board once; every per-character feature reads from it.
        cells = ''.join(matrix)
        # The vowel check is case-sensitive: only lowercase a/e/i/o/u match.
        vowel_count = sum(1 for char in cells if char in 'aeiou')

        rows.append({
            'num_rows': len(matrix),
            # An empty board has no first row to measure.
            'num_cols': len(matrix[0]) if matrix else 0,
            # Counts distinct characters of any kind, not only letters.
            'unique_letters': len(set(cells)),
            'vowels': vowel_count,
            # Every character that is not a lowercase vowel lands here.
            'consonants': len(cells) - vowel_count,
        })
        outcomes.append(total_words)

    return pd.DataFrame(rows, columns=columns), np.array(outcomes)

def preprocess_data(features, outcomes, *, test_size=0.2, random_state=42):
    """Split features and outcomes into training and test partitions.

    Args:
        features: Feature table, one row per sample.
        outcomes: Target values aligned with ``features``.
        test_size: Share of the data held out for testing, passed straight
            to ``train_test_split``. Defaults to 0.2.
        random_state: Seed passed to ``train_test_split`` so the split is
            reproducible. Defaults to 42.

    Returns:
        The tuple ``(X_train, X_test, y_train, y_test)``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        features,
        outcomes,
        test_size=test_size,
        random_state=random_state,
    )
    return X_train, X_test, y_train, y_test

def build_model(n_estimators=100, random_state=42):
    """Create an unfitted random forest regressor.

    Args:
        n_estimators: Number of trees in the forest. Defaults to 100.
        random_state: Seed passed to the estimator so training is
            reproducible. Defaults to 42.

    Returns:
        A new, unfitted ``RandomForestRegressor``.
    """
    return RandomForestRegressor(
        n_estimators=n_estimators,
        random_state=random_state,
    )

def evaluate_model(model, X_test, y_test):
    """Score a fitted model on held-out data.

    Args:
        model: Object exposing ``predict``.
        X_test: Features to predict on.
        y_test: True target values for ``X_test``.

    Returns:
        The pair ``(mae, mse)``: mean absolute error and mean squared
        error of the model's predictions against ``y_test``.
    """
    predictions = model.predict(X_test)
    return (
        mean_absolute_error(y_test, predictions),
        mean_squared_error(y_test, predictions),
    )

def plot_feature_importance(model, feature_names):
    """Show a bar chart of the model's feature importances, largest first.

    Args:
        model: Fitted estimator exposing ``feature_importances_``.
        feature_names: Indexable collection of names, positionally aligned
            with ``model.feature_importances_``.
    """
    scores = model.feature_importances_
    # Indices that sort the scores in descending order.
    order = np.argsort(scores)[::-1]
    positions = range(len(scores))

    plt.figure(figsize=(10, 6))
    plt.title("Feature Importances")
    plt.bar(positions, scores[order])
    tick_labels = [feature_names[idx] for idx in order]
    plt.xticks(positions, tick_labels, rotation=45)
    plt.tight_layout()
    plt.show()

def main() -> None:
    """Run the full pipeline: load, featurize, split, train, evaluate, save.

    Progress and the test-set metrics are printed to standard output, a
    feature-importance chart is shown, and the fitted model is written to
    ``random_forest_puzzle_predictor.joblib`` in the current directory.
    """
    # Load data
    # NOTE(review): the path has a .json name but load_puzzle_data opens it
    # with gzip - confirm the file is actually gzip-compressed.
    puzzle_data = load_puzzle_data('buggle-training-data/training_data.json')
    print(f"Got {len(puzzle_data)} puzzles.")

    # Extract features (a DataFrame) and the matching target values
    features, outcomes = extract_features(puzzle_data)
    print(f"Extracted features for {len(features)} puzzles.")

    # Split into training and test partitions
    X_train, X_test, y_train, y_test = preprocess_data(features, outcomes)
    print(f"Preprocessed {len(X_train)} training examples and {len(X_test)} test examples.")

    # Build and train the model
    model = build_model()
    model.fit(X_train, y_train)
    print("Model trained.")

    # Evaluate the model on the held-out test partition
    mae, mse = evaluate_model(model, X_test, y_test)
    print(f"Test MAE: {mae:.2f}")
    print(f"Test MSE: {mse:.2f}")

    # Plot feature importance, labelled with the DataFrame's column names
    plot_feature_importance(model, features.columns)

    # Save the fitted model with joblib
    dump(model, 'random_forest_puzzle_predictor.joblib')
    print("Model saved as 'random_forest_puzzle_predictor.joblib'")

# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()