# Inspect Training Data Generator Logic

### Import

In [26]:
# Put the local 'source' directory on the import path so the project module below
# can be imported (presumably training_data_generator.py lives there — confirm).
import sys
sys.path.append('source')
from training_data_generator import TrainingDataGenerator

### Initialise

In [27]:
# Inputs handed to the generator, in the positional order its constructor is called with:
# the question bank markdown file, then the trees CSV, then the patches CSV.
question_bank_path = 'question_bank.md'
trees_csv_path = 'data/trees_df.csv'
patches_csv_path = 'data/patches_df.csv'

generator = TrainingDataGenerator(question_bank_path, trees_csv_path, patches_csv_path)

### Load Data

In [28]:
# Ask the generator to load its data.
# NOTE(review): presumably this reads the files passed to the constructor — confirm in training_data_generator.
generator.load_data()

### Check Templates

In [29]:
# Display the parsed question bank. In the output below, each intent name maps to a
# 'fill_with' description and a list of question 'templates' containing a '___' placeholder.
generator.parse_question_bank()

{'tree_recommendations': {'fill_with': 'level numbers from trees_df.level_requirement',
  'templates': ['What is the best tree at level ___?',
   'Which are the best trees at level ___?',
   'Which tree should I grow at level ___?',
   'What tree should I plant at level ___?',
   'What tree is best at level ___?',
   'What should I grow at level ___?',
   'What should I plant at level ___?']},
 'level_requirements': {'fill_with': 'tree names from trees_df.tree_name',
  'templates': ['What level do I need for ___ trees?',
   'What level do I need to plant ___?',
   'What level is needed to grow ___ trees?',
   'What level is required for ___?',
   'What level is required for growing ___ trees?',
   'What is the level requirement for ___?',
   'What farming level do I need for ___ trees?',
   'At what level can I plant ___?']},
 'quest_requirements': {'fill_with': 'patch names from patches_df.location_simple',
  'templates': ['Which quests are required to use the ___ patch?',
   'What qu

### Check Fill Values

In [30]:
# Show the values available for each placeholder category, one list per line.
for fill_category in ('level numbers', 'tree names', 'patch names'):
    print(generator.get_fill_values(fill_category))

['15', '27', '30', '33', '39', '42', '45', '51', '57', '60', '68', '75', '81']
['oak', 'willow', 'maple', 'yew', 'magic', 'apple', 'banana', 'orange', 'curry', 'pineapple', 'papaya', 'palm', 'dragonfruit']
['lumbridge', 'varrock', 'falador', 'taverley', 'gnome stronghold', 'farming guild', 'nemus retreat', 'gnome stronghold', 'catherby', 'tree gnome village', 'brimhaven', 'lletya', 'farming guild', 'kastori']


# Generate Training Data

In [31]:
# NOTE(review): `pd` is not referenced in any of the cells shown here.
import pandas as pd
# Build the labelled dataset; the split cell below reads its 'text' and 'intent' columns.
df = generator.generate_dataset()

# Build Intent Classifier

### Split Data

In [32]:
from sklearn.model_selection import train_test_split

# The fill values can repeat (the patch names printed above list 'gnome stronghold'
# and 'farming guild' twice), so filled-in templates may yield identical rows.
# Drop exact duplicates before splitting so the same question cannot end up in both
# the training and the test split, which would inflate the test score.
# This is a no-op if the generator already de-duplicates its output.
df_unique = df.drop_duplicates(subset=['text', 'intent'])

X = df_unique['text'] # Questions
y = df_unique['intent'] # Intent

# Hold out 20% for testing; stratify on the label so every intent keeps the same
# proportion in both splits, and fix the seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print(f"Training: {len(X_train)} examples")
print(f"Test: {len(X_test)} examples")

Training: 409 examples
Test: 103 examples


### Vectorisation (TF-IDF)

In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF settings: lowercase the text, build features from single words and word
# pairs, and cap the vocabulary at 500 terms (the cap counts both kinds of term).
tfidf_options = {
    'max_features': 500,
    'ngram_range': (1, 2),
    'lowercase': True,
}
vectoriser = TfidfVectorizer(**tfidf_options)

# Fit on the training questions only, then apply the fitted vectoriser to the test questions.
X_train_tfidf = vectoriser.fit_transform(X_train)
X_test_tfidf = vectoriser.transform(X_test)
print("Feature vectorisation complete.")

Feature vectorisation complete.


### Train Model

In [34]:
from sklearn.linear_model import LogisticRegression

# Logistic regression over the TF-IDF features, with a fixed seed and an
# iteration cap of 1000. fit() returns the estimator itself, so the
# construction and the training can be chained.
classifier = LogisticRegression(random_state=42, max_iter=1000).fit(X_train_tfidf, y_train)
print("Model training complete.")

Model training complete.


### Evaluate Model Performance

In [35]:
from sklearn.metrics import accuracy_score, classification_report

# Predict intents for the held-out questions and score them against the true labels.
y_pred = classifier.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy:.2%}")
# Heading and per-intent precision/recall/F1 table, separated by a newline.
print("\nClassification Report:", report, sep="\n")

Accuracy: 100.00%

Classification Report:
                      precision    recall  f1-score   support

         growth_time       1.00      1.00      1.00        13
  level_requirements       1.00      1.00      1.00        21
             patches       1.00      1.00      1.00        16
             payment       1.00      1.00      1.00        16
  quest_requirements       1.00      1.00      1.00         7
      transportation       1.00      1.00      1.00        12
tree_recommendations       1.00      1.00      1.00        18

            accuracy                           1.00       103
           macro avg       1.00      1.00      1.00       103
        weighted avg       1.00      1.00      1.00       103



### Save Model

In [36]:
import os

import joblib

# joblib.dump does not create missing parent directories, so make sure the
# output folder exists before writing (e.g. on a fresh checkout).
os.makedirs('models', exist_ok=True)

# Persist both artefacts: the fitted vectoriser and the trained classifier.
joblib.dump(vectoriser, 'models/vectoriser.pkl')
joblib.dump(classifier, 'models/classifier.pkl')
print("Model saving complete.")

Model saving complete.


# Test Intent Classifier

In [37]:
import sys
# Add the project source directory only once, so re-running this cell does not
# keep appending the same entry to sys.path.
if 'source' not in sys.path:
    sys.path.append('source')
from intent_classifier import IntentClassifier

# Use a distinct name here: `classifier` already refers to the trained
# LogisticRegression from the training cell. Rebinding it to an IntentClassifier
# would make re-running the evaluation or save cells operate on the wrong object.
intent_classifier = IntentClassifier(model_dir='models')

# Hand-written questions to spot-check the saved model.
test_questions = [
    "What level do I need for magic trees?",
    "How long does yew take to grow?",
    "What should I plant at level 45?",
    "Where can I grow apple trees?"
]

# predict() is read here as returning a mapping with 'intent' and 'confidence' entries.
for question in test_questions:
    result = intent_classifier.predict(question)
    print(f"Q: {question}")
    print(f"   Intent: {result['intent']} (confidence: {result['confidence']:.2%})\n")

Q: What level do I need for magic trees?
   Intent: level_requirements (confidence: 85.39%)

Q: How long does yew take to grow?
   Intent: growth_time (confidence: 81.07%)

Q: What should I plant at level 45?
   Intent: tree_recommendations (confidence: 83.33%)

Q: Where can I grow apple trees?
   Intent: patches (confidence: 72.70%)

