In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.ensemble import VotingClassifier, RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.feature_selection import SelectKBest, chi2
import warnings
warnings.filterwarnings('ignore')

# ============================
# 1. Load the data
# ============================
# Read both CSV splits in one pass; the tuple order (train, then test)
# matches the unpacking order on the left-hand side.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("dataset/train.csv", "dataset/test.csv")
)

# ============================
# 2. ULTRA-ADVANCED TEXT PREPROCESSING
# ============================
def ultra_advanced_preprocess_text(text):
    """Normalize raw text into a lowercase string with numeric placeholders.

    Steps, in order:
      1. Convert the input to ``str`` and lowercase it.
      2. Replace decimal numbers (e.g. ``50.5``) with ``DECIMAL``.
      3. Replace standalone 4-digit numbers (e.g. ``1961``) with ``YEAR``.
      4. Replace every remaining standalone integer with ``NUM``.
      5. Replace every character that is not an ASCII letter, whitespace,
         ``.``, ``!`` or ``?`` with a space.
      6. Collapse runs of whitespace and strip the ends.

    Args:
        text: Any value; it is passed through ``str()`` first, so non-string
            inputs (e.g. ``None`` or NaN) become their string representation.

    Returns:
        The cleaned string. Placeholders are emitted in upper case.
    """
    text = str(text).lower()

    # Decimals must be tagged first. "." is a word boundary, so running the
    # YEAR/NUM patterns earlier would split "1961.5" into "YEAR" + "NUM" and
    # the DECIMAL pattern would never see the full number.
    text = re.sub(r'\b\d+\.\d+\b', ' DECIMAL ', text)  # Decimals (like 50.5)
    text = re.sub(r'\b\d{4}\b', ' YEAR ', text)  # Years (like 1961, 2007)
    text = re.sub(r'\b\d+\b', ' NUM ', text)  # Other numbers

    # Drop special characters but keep sentence punctuation (. ! ?).
    text = re.sub(r'[^a-zA-Z\s\.\!\?]', ' ', text)

    # Collapse the padding introduced by the substitutions above.
    return re.sub(r'\s+', ' ', text).strip()

print("   Applying ultra-advanced preprocessing...")
# Run the same cleaning function over the Text column of both splits so
# train and test receive identical normalization.
for frame in (train_df, test_df):
    frame['Text'] = frame['Text'].apply(ultra_advanced_preprocess_text)

# ============================
# 3. ULTRA-HIGH DIMENSION FEATURE COMBINATIONS
# ============================

# Feature set 1: word n-grams (1-3) concatenated with character n-grams (2-7).
ultra_word_1 = TfidfVectorizer(
    ngram_range=(1, 3),
    max_features=300000,
    stop_words="english",
    min_df=1,
    max_df=0.95,
    sublinear_tf=False,
)

ultra_char_1 = TfidfVectorizer(
    analyzer="char",
    ngram_range=(2, 7),
    max_features=100000,
    min_df=1,
)

ultra_features_1 = FeatureUnion(transformer_list=[
    ('word_ultra', ultra_word_1),
    ('char_ultra', ultra_char_1),
])

# Feature set 2: word n-grams up to 4 with sublinear TF scaling, plus
# character n-grams (3-6).
ultra_word_2 = TfidfVectorizer(
    ngram_range=(1, 4),
    max_features=250000,
    stop_words="english",
    min_df=1,
    max_df=0.98,
    sublinear_tf=True,
)

ultra_char_2 = TfidfVectorizer(
    analyzer="char",
    ngram_range=(3, 6),
    max_features=80000,
)

ultra_features_2 = FeatureUnion(transformer_list=[
    ('word_ultra_2', ultra_word_2),
    ('char_ultra_2', ultra_char_2),
])

# Feature set 3: word-only, n-grams up to 5, largest feature cap of the sets.
ultra_features_3 = TfidfVectorizer(
    ngram_range=(1, 5),
    max_features=400000,
    stop_words="english",
    min_df=1,
    max_df=0.9,
    sublinear_tf=True,
)

# Standard feature set: a smaller uni/bigram space that also drops terms
# appearing in fewer than two documents (min_df=2).
standard_features = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=70000,
    stop_words="english",
    min_df=2,
    max_df=0.95,
)

# ============================
# 4. CREATE ULTRA-OPTIMIZED CLASSIFIERS
# ============================

# Naive Bayes on word set 1; alpha=0.01 applies very little smoothing.
ultra_nb_1 = Pipeline(steps=[
    ('features', ultra_word_1),
    ('clf', MultinomialNB(alpha=0.01)),
])

# Naive Bayes on word set 2 with slightly more smoothing.
ultra_nb_2 = Pipeline(steps=[
    ('features', ultra_word_2),
    ('clf', MultinomialNB(alpha=0.02)),
])

# Linear SVM on the combined word + char features of set 1.
ultra_svm_1 = Pipeline(steps=[
    ('features', ultra_features_1),
    ('clf', LinearSVC(
        C=0.5,
        class_weight="balanced",
        max_iter=3000,
        random_state=42,
    )),
])

# Linear SVM on feature set 2, with a different C and seed.
ultra_svm_2 = Pipeline(steps=[
    ('features', ultra_features_2),
    ('clf', LinearSVC(
        C=0.8,
        class_weight="balanced",
        max_iter=3000,
        random_state=43,
    )),
])

# Logistic regression on the word-only feature set 3.
ultra_lr = Pipeline(steps=[
    ('features', ultra_features_3),
    ('clf', LogisticRegression(
        C=1.0,
        class_weight="balanced",
        max_iter=2000,
        random_state=42,
    )),
])

# Second logistic regression, this time on feature set 1 with a larger C.
ultra_lr_2 = Pipeline(steps=[
    ('features', ultra_features_1),
    ('clf', LogisticRegression(
        C=2.0,
        class_weight="balanced",
        max_iter=1500,
        random_state=44,
    )),
])

# Random forest for model diversity; it uses the smaller standard feature set.
ultra_rf = Pipeline(steps=[
    ('features', standard_features),
    ('clf', RandomForestClassifier(
        n_estimators=300,
        max_depth=25,
        min_samples_split=3,
        class_weight="balanced",
        random_state=45,
        n_jobs=-1,
    )),
])

# ============================
# 5. CREATE MULTIPLE ENSEMBLE STRATEGIES
# ============================

print("Creating Multiple Advanced Ensembles...")

# Strategy 1: stacking. A logistic-regression meta-learner is trained on the
# cross-validated (cv=3) outputs of the base pipelines; passthrough=False
# means it does not also receive the raw features.
base_estimators_stack = [
    ('ultra_nb_1', ultra_nb_1),
    ('ultra_nb_2', ultra_nb_2),
    ('ultra_svm_1', ultra_svm_1),
    ('ultra_svm_2', ultra_svm_2),
    ('ultra_lr', ultra_lr),
]

stacked_ensemble = StackingClassifier(
    estimators=base_estimators_stack,
    final_estimator=LogisticRegression(
        C=1.0,
        class_weight="balanced",
        random_state=42,
    ),
    cv=3,
    passthrough=False,
    n_jobs=-1,
)

# Strategy 2: majority vote over predicted labels.
hard_voting_ensemble = VotingClassifier(
    estimators=[
        ('ultra_nb_1', ultra_nb_1),
        ('ultra_nb_2', ultra_nb_2),
        ('ultra_svm_1', ultra_svm_1),
        ('ultra_lr', ultra_lr),
        ('ultra_rf', ultra_rf),
    ],
    voting='hard',
    n_jobs=-1,
)

# Strategy 3: soft voting over predicted probabilities. The LinearSVC
# pipelines are left out of this list.
soft_voting_estimators = [
    ('ultra_nb_1', ultra_nb_1),
    ('ultra_nb_2', ultra_nb_2),
    ('ultra_lr', ultra_lr),
    ('ultra_lr_2', ultra_lr_2),
    ('ultra_rf', ultra_rf),
]

soft_voting_ensemble = VotingClassifier(
    estimators=soft_voting_estimators,
    voting='soft',
    n_jobs=-1,
)

# ============================
# 6. EVALUATE ALL STRATEGIES
# ============================
print("   Evaluating All Ensemble Strategies...")

# One shuffled, seeded 3-fold splitter reused for every ensemble.
cv_folds = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

strategies = dict(
    Stacked_Ensemble=stacked_ensemble,
    Hard_Voting=hard_voting_ensemble,
    Soft_Voting=soft_voting_ensemble,
)

# name -> {'scores': per-fold macro-F1, 'mean': percent, 'std': percent}
results = {}

for name, model in strategies.items():
    print(f"Evaluating {name}...")
    cv_scores = cross_val_score(
        estimator=model,
        X=train_df['Text'],
        y=train_df['Subject'],
        scoring='f1_macro',
        cv=cv_folds,
        n_jobs=-1,
    )
    results[name] = dict(
        scores=cv_scores,
        mean=cv_scores.mean() * 100,
        std=cv_scores.std() * 100,
    )
    # The reported spread is two standard deviations, in percent.
    print(f"   {name}: {cv_scores.mean()*100:.4f} (+/- {cv_scores.std()*2*100:.4f})")

# ============================
# 7. EVALUATE TOP INDIVIDUAL MODELS
# ============================
print("\n   Evaluating Top Individual Models...")

individual_models = dict(
    Ultra_NB_1=ultra_nb_1,
    Ultra_NB_2=ultra_nb_2,
    Ultra_SVM_1=ultra_svm_1,
    Ultra_LR=ultra_lr,
)

# Score each standalone pipeline with a 3-fold splitter built from the same
# settings as the ensembles, and store it in the shared results table.
for name, model in individual_models.items():
    cv_scores = cross_val_score(
        estimator=model,
        X=train_df['Text'],
        y=train_df['Subject'],
        scoring='f1_macro',
        cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=42),
        n_jobs=-1,
    )
    results[name] = dict(
        scores=cv_scores,
        mean=cv_scores.mean() * 100,
        std=cv_scores.std() * 100,
    )
    print(f"{name}: {cv_scores.mean()*100:.4f}")

# ============================
# 8. DETERMINE BEST APPROACH AND TRAIN
# ============================
print("\n FINAL COMPETITION SCORE COMPARISON:")
# The winner is the entry with the highest mean macro-F1 (stored as percent).
best_approach = max(results, key=lambda x: results[x]['mean'])
best_score = results[best_approach]['mean']

for name, result in sorted(results.items(), key=lambda x: x[1]['mean'], reverse=True):
    # Mark the winning row. Without this assignment the f-string below
    # raised NameError because `indicator` was never defined.
    indicator = "*" if name == best_approach else " "
    print(f"{indicator} {name}: {result['mean']:.4f} (+/- {result['std']*2:.4f})")

print(f"\n BEST APPROACH: {best_approach} ({best_score:.4f} points)")

# Resolve the winning estimator. Every key in `results` comes from one of
# these two dicts; strategies is merged last so it keeps precedence, matching
# the order of the original if/elif lookup.
best_model = {**individual_models, **strategies}[best_approach]

print(f" Training {best_approach} on full training data...")
best_model.fit(train_df['Text'], train_df['Subject'])

print("Predicting on test data...")
final_predictions = best_model.predict(test_df['Text'])

# ============================
# 9. CREATE SUBMISSION
# ============================
# The file name is derived from the winning approach so successive runs with
# different winners do not overwrite each other.
submission_filename = f"{best_approach.lower()}_ultra_submission.csv"

# Two-column frame: the test IDs alongside their predicted subjects.
final_submission = test_df[["ID"]].assign(Subject=final_predictions)
final_submission.to_csv(submission_filename, index=False)

print(f"   Best model submission saved as: {submission_filename}")

# ============================
# 10. FINAL RECOMMENDATIONS
# ============================
print("\nFINAL ANALYSIS:")
# The "expected" score is the winner's mean cross-validated macro-F1 in percent.
print(f"Expected competition score: {best_score:.4f}")


# Additional backup submission: written only when the stacked ensemble's
# cross-validated score reaches the 89.0 threshold.
if results['Stacked_Ensemble']['mean'] >= 89.0:
    print("\nCreating backup Stacked Ensemble submission...")
    if best_approach == 'Stacked_Ensemble':
        # best_model is the same stacked_ensemble object and has already been
        # fitted on the full training data and used to predict the test set,
        # so reuse those predictions instead of repeating the expensive fit.
        stacked_preds = final_predictions
    else:
        stacked_ensemble.fit(train_df['Text'], train_df['Subject'])
        stacked_preds = stacked_ensemble.predict(test_df['Text'])
    backup_submission = pd.DataFrame({
        "ID": test_df["ID"],
        "Subject": stacked_preds
    })
    backup_submission.to_csv("backup_stacked_ensemble.csv", index=False)
    print("Backup stacked ensemble saved!")


ENHANCED PUSH: Advanced Feature Engineering + Multiple Ensembles + Stacking
   Applying ultra-advanced preprocessing...
 Creating Multiple Advanced Ensembles...
Evaluating All Ensemble Strategies...
Evaluating Stacked_Ensemble...
 Stacked_Ensemble: 88.7833 (+/- 1.4206)
Evaluating Hard_Voting...
Hard_Voting: 87.8908 (+/- 1.3267)
Evaluating Soft_Voting...
Soft_Voting: 88.2930 (+/- 0.9658)

Evaluating Top Individual Models...
Ultra_NB_1: 87.2149
Ultra_NB_2: 87.1269
Ultra_SVM_1: 87.6898
Ultra_LR: 84.2396

FINAL COMPETITION SCORE COMPARISON:
Stacked_Ensemble: 88.7833 (+/- 1.4206)
Soft_Voting: 88.2930 (+/- 0.9658)
Hard_Voting: 87.8908 (+/- 1.3267)
Ultra_SVM_1: 87.6898 (+/- 1.2101)
Ultra_NB_1: 87.2149 (+/- 1.4652)
Ultra_NB_2: 87.1269 (+/- 1.3946)
 Ultra_LR: 84.2396 (+/- 0.7592)

BEST APPROACH: Stacked_Ensemble (88.7833 points)
Training Stacked_Ensemble on full training data...
 Predicting on test data...
Best model submission saved as: stacked_ensemble_ultra_submission.csv
