# In [2]:
import tensorflow as tf
import json
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import MinMaxScaler

# Load the puzzle data. The encoding is pinned to UTF-8 (the standard JSON
# interchange encoding) so parsing does not depend on the platform's locale
# default.
with open('training_data.json', 'r', encoding='utf-8') as f:
    puzzle_data = json.load(f)

# Extract features into arrays, one entry per puzzle record. These use plain
# indexing, so a record missing any of these keys raises KeyError.
# NOTE(review): 'letterFrequencies' is presumably a fixed-length list per
# puzzle (it is stacked further down without a reshape) -- confirm against the
# data file.
letter_frequencies = np.array([puzzle['letterFrequencies'] for puzzle in puzzle_data])
average_word_length = np.array([puzzle['averageWordLength'] for puzzle in puzzle_data])
percent_uncommon = np.array([puzzle['percentUncommon'] for puzzle in puzzle_data])
total_words = np.array([puzzle['totalWords'] for puzzle in puzzle_data])
compactness = np.array([puzzle['compactness'] for puzzle in puzzle_data])
path_complexity = np.array([puzzle['pathComplexity'] for puzzle in puzzle_data])
intersection_count = np.array([puzzle['intersectionCount'] for puzzle in puzzle_data])
vowel_consonant_ratio = np.array([puzzle['vowelConsonantRatio'] for puzzle in puzzle_data])

# Word-length features: one row per puzzle holding the values stored under
# 'wordLength_3' through 'wordLength_15', in ascending length order. A key
# that is absent from a record contributes 0.
word_lengths = np.array([
    [puzzle.get(f'wordLength_{length}', 0) for length in range(3, 16)]
    for puzzle in puzzle_data
])

# Min and Max word counts: one row per puzzle laid out as
#   totalWordMin, totalWordMax, min3, max3, min4, max4, ..., min15, max15
# A missing "min" key falls back to 0 and a missing "max" key to 99999.
min_max_words = np.array([
    [puzzle.get('totalWordMin', 0), puzzle.get('totalWordMax', 99999)]
    + [
        puzzle.get(f'{bound}{length}', fallback)
        for length in range(3, 16)
        for bound, fallback in (('min', 0), ('max', 99999))
    ]
    for puzzle in puzzle_data
])

# Assemble the feature matrix. The three multi-column blocks are used as they
# are; each per-puzzle scalar feature is reshaped into a single column so that
# everything can be joined side by side.
# NOTE(review): the wordLength_* values and totalWordMin/totalWordMax are used
# as inputs while totalWords is the target. If those fields are derived from
# (or sum to) totalWords, the model can reconstruct the label directly --
# confirm this is intended.
scalar_columns = [
    feature.reshape(-1, 1)
    for feature in (
        average_word_length,
        percent_uncommon,
        compactness,
        path_complexity,
        intersection_count,
        vowel_consonant_ratio,
    )
]
X = np.hstack((letter_frequencies, word_lengths, min_max_words, *scalar_columns))

# Labels: total words
y = total_words

# Split the raw data into training and testing sets *before* any scaling, so
# that the scaler's min/max statistics are learned from training rows only and
# no information about the test rows leaks into preprocessing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalize features: fit the scaler on the training split, then apply the
# same (already fitted) transform to the test split.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Kept for any code that still expects the full normalized matrix. Because the
# scaler was fitted on the training rows only, rows outside the training split
# may fall slightly outside the [0, 1] range.
X_normalized = scaler.transform(X)

# Initialize the Random Forest Regressor (fixed seed for reproducible runs)
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Step 1: Calculate the mean of totalWords on the training labels only, so the
# baseline has access to exactly the same information as the fitted model.
mean_total_words = np.mean(y_train)

# Step 2: Create baseline predictions (every prediction is the training mean).
# The array is built with an explicit float dtype: np.full_like would inherit
# the label array's dtype, and with integer labels the mean would be silently
# truncated to a whole number.
baseline_predictions = np.full(np.shape(y_test), mean_total_words, dtype=float)

# Step 3: Calculate MAE and MSE for the baseline on the test split, i.e. on
# the same rows the model is scored on below, so the numbers are comparable.
baseline_mae = mean_absolute_error(y_test, baseline_predictions)
baseline_mse = mean_squared_error(y_test, baseline_predictions)

# Print the baseline results
print(f"Baseline Mean Absolute Error: {baseline_mae}")
print(f"Baseline Mean Squared Error: {baseline_mse}")

# Evaluate the model's performance using mean_absolute_error and mean_squared_error
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print(f"Mean Absolute Error: {mae}")
print(f"Mean Squared Error: {mse}")


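# Captured cell output:
#   ModuleNotFoundError: No module named 'tensorflow'
# (raised by `import tensorflow as tf`; nothing in the code shown here uses `tf`)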