In [4]:
# destination_classifier.py

import pandas as pd
import pickle
import string
from sklearn.pipeline import Pipeline
from sklearn import set_config
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.model_selection import train_test_split
from custom_preprocessors import PreprocessText  # Import from separate file

def build_destination_classifier(data_path):
    """
    Build and train a destination classifier with proper preprocessing
    
    Parameters:
    data_path (str): Path to the CSV file containing destination data
    
    Returns:
    tuple: (trained_pipeline, evaluation_metrics)
    """
    try:
        # Load data
        data = pd.read_csv(data_path)
        print(f"Loaded data shape: {data.shape}")
        
        # Validate data
        for col in ['Description', 'Country']:
            if col not in data.columns:
                raise ValueError(f"Missing required column: {col}")
            if data[col].isnull().any():
                raise ValueError(f"Found null values in column: {col}")
        
        # Prepare features and target
        X = data['Description']
        y = data['Country']
        
        # Split data
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, 
            stratify=y if len(y.unique()) > 1 else None
        )
        print(f"Training set size: {len(X_train)}")
        print(f"Test set size: {len(X_test)}")
        
        # Define stopwords
        new_stopwords = ['and', 'the', 'or', 'but'] + list(string.punctuation)
        
        # Build pipeline
        pipe = Pipeline([
            ('preprocess', PreprocessText()),
            ('vectorize', CountVectorizer(
                analyzer='word',
                stop_words=new_stopwords,
                decode_error='ignore',
                min_df=2
            )),
            ('classifier', GradientBoostingClassifier(
                n_estimators=100,
                random_state=42
            ))
        ])
        
        # Train pipeline
        print("Training pipeline...")
        pipe.fit(X_train, y_train)
        
        # Evaluate
        metrics = evaluate_model(pipe, X_train, X_test, y_train, y_test)
        
        return pipe, metrics
        
    except Exception as e:
        raise Exception(f"Error in pipeline: {str(e)}")

def evaluate_model(pipe, X_train, X_test, y_train, y_test):
    """
    Evaluate the model and return performance metrics
    """
    metrics = {
        'train_accuracy': accuracy_score(y_train, pipe.predict(X_train)),
        'test_accuracy': accuracy_score(y_test, pipe.predict(X_test)),
        'train_f1': f1_score(y_train, pipe.predict(X_train), average='weighted'),
        'test_f1': f1_score(y_test, pipe.predict(X_test), average='weighted'),
        'classification_report': classification_report(y_test, pipe.predict(X_test))
    }
    return metrics

def save_model(pipe, filename='destination_pipeline.pkl'):
    """
    Save the trained pipeline to a file
    """
    with open(filename, 'wb') as f:
        pickle.dump(pipe, f)
    print(f"\nPipeline saved as '{filename}'")

def load_model(filename='destination_pipeline.pkl'):
    """
    Load a trained pipeline from a file
    """
    with open(filename, 'rb') as f:
        pipe = pickle.load(f)
    return pipe

if __name__ == "__main__":
    try:
        # Build and evaluate model
        pipe, metrics = build_destination_classifier("best_travel_destinations_for_2025.csv")
        
        # Print results
        print("\nModel Performance:")
        print(f"Training Accuracy: {metrics['train_accuracy']:.4f}")
        print(f"Test Accuracy: {metrics['test_accuracy']:.4f}")
        print(f"Training F1: {metrics['train_f1']:.4f}")
        print(f"Test F1: {metrics['test_f1']:.4f}")
        print("\nDetailed Classification Report:")
        print(metrics['classification_report'])
        
        # Save model
        save_model(pipe)
        
    except Exception as e:
        print(f"Error: {str(e)}")

Loaded data shape: (18040, 4)
Training set size: 14432
Test set size: 3608
Training pipeline...
Original input shape: 14432
Transformed output shape: 14432
Original input shape: 14432
Transformed output shape: 14432
Original input shape: 3608
Transformed output shape: 3608
Original input shape: 14432
Transformed output shape: 14432
Original input shape: 3608
Transformed output shape: 3608
Original input shape: 3608
Transformed output shape: 3608

Model Performance:
Training Accuracy: 0.7455
Test Accuracy: 0.5496
Training F1: 0.7634
Test F1: 0.5709

Detailed Classification Report:
                      precision    recall  f1-score   support

           Argentina       0.60      0.29      0.39        72
           Australia       0.44      0.56      0.49       240
              Brazil       0.84      0.47      0.61       120
              Canada       0.20      0.65      0.30       240
               Chile       0.58      0.30      0.39        64
               China       0.69      0.5