# News Article Classifier
This notebook contains all the functionality for classifying news articles, including:
- Text preprocessing
- Model training
- Article classification
- Key phrase extraction
- Context analysis

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import VotingClassifier
import joblib
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.base import BaseEstimator, TransformerMixin
from langdetect import detect

# Download required NLTK data.
# word_tokenize needs the 'punkt_tab' tokenizer tables on recent NLTK
# releases and the older 'punkt' pickles on earlier ones, so both are
# requested. Without the right one, tokenization raises LookupError.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

## Context Feature Extractor
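This class is a scikit-learn transformer that counts, for each topic (tech, business, sports, entertainment, politics), how many of that topic's keywords, special terms, and company names appear in an article.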

In [None]:
class ContextFeatureExtractor(BaseEstimator, TransformerMixin):
    """Transformer that turns raw article text into topic-indicator counts.

    For every context in ``self.contexts`` the transformer emits:

    * ``<context>_keyword_score`` - number of distinct keywords present,
    * ``<context>_term_score``    - number of distinct special terms present
      (only for contexts that define ``'terms'``),
    * ``<context>_company_score`` - number of distinct company names present,
      doubled (only for contexts that define ``'companies'``).

    Indicators are matched case-insensitively as whole words, optionally
    followed by a plural ``s``/``es``. Plain substring matching would count
    'ai' inside "said", 'app' inside "apple", 'intel' inside "intelligence"
    or 'bill' inside "billion".
    """

    def __init__(self):
        # Indicator vocabularies per topic. All entries are lowercase because
        # they are matched against lowercased text.
        self.contexts = {
            'tech': {
                'keywords': ['technology', 'tech', 'ai', 'software', 'hardware', 'digital', 'innovation', 
                           'smartphone', 'iphone', 'android', 'app', 'computer', 'artificial intelligence',
                           'machine learning', 'cloud', 'cybersecurity', '5g', 'blockchain', 'mobile',
                           'device', 'platform', 'algorithm', 'interface', 'processor', 'chip'],
                'companies': ['apple', 'google', 'microsoft', 'amazon', 'meta', 'tesla', 'nvidia', 
                            'samsung', 'intel', 'ibm', 'oracle', 'cisco', 'qualcomm', 'adobe']
            },
            'business': {
                'keywords': ['earnings', 'revenue', 'profit', 'market', 'stock', 'shares', 'investors',
                           'quarterly', 'financial', 'economy', 'growth', 'sales', 'trading', 'price',
                           'investment', 'dividend', 'merger', 'acquisition', 'fiscal', 'shareholder'],
                'terms': ['q1', 'q2', 'q3', 'q4', 'year-over-year', 'yoy', 'quarter', 'fiscal']
            },
            'sports': {
                'keywords': ['game', 'match', 'tournament', 'championship', 'league', 'score', 'win',
                           'victory', 'team', 'player', 'season', 'coach', 'stadium', 'sports'],
                'terms': ['goal', 'points', 'referee', 'injury', 'transfer', 'contract']
            },
            'entertainment': {
                'keywords': ['movie', 'film', 'show', 'music', 'album', 'celebrity', 'actor', 'actress',
                           'director', 'performance', 'award', 'entertainment', 'concert', 'premiere'],
                'terms': ['box office', 'rating', 'review', 'star', 'episode', 'season']
            },
            'politics': {
                'keywords': ['government', 'policy', 'election', 'political', 'minister', 'president',
                           'congress', 'senate', 'law', 'legislation', 'vote', 'campaign', 'party'],
                'terms': ['bill', 'reform', 'regulation', 'democratic', 'republican', 'parliament']
            }
        }

    @staticmethod
    def _count_matches(text_lower, phrases):
        """Return how many of *phrases* occur in *text_lower* as whole words.

        Each phrase counts at most once, however often it appears. A trailing
        's' or 'es' is accepted so that simple plurals ("markets", "matches")
        still match. Patterns are built on the fly rather than stored on the
        instance, so instances unpickled from older saved models keep working.
        """
        return sum(
            1 for phrase in phrases
            if re.search(rf'\b{re.escape(phrase)}(?:s|es)?\b', text_lower)
        )

    def get_context_features(self, text):
        """Compute the indicator scores for a single article.

        Args:
            text: The article text (a ``str``).

        Returns:
            dict mapping feature name (e.g. ``'tech_keyword_score'``) to an
            integer score. The key set and order depend only on
            ``self.contexts``, not on the text.
        """
        text_lower = text.lower()
        features = {}

        for context, indicators in self.contexts.items():
            # Keyword score
            features[f'{context}_keyword_score'] = self._count_matches(
                text_lower, indicators['keywords'])

            # Special terms score
            if 'terms' in indicators:
                features[f'{context}_term_score'] = self._count_matches(
                    text_lower, indicators['terms'])

            # Company/entity score; companies are weighted double.
            if 'companies' in indicators:
                features[f'{context}_company_score'] = 2 * self._count_matches(
                    text_lower, indicators['companies'])

        return features

    def fit(self, X, y=None):
        """No-op: the transformer has nothing to learn. Returns ``self``."""
        return self

    def transform(self, X):
        """Return a DataFrame with one row of context scores per text in *X*."""
        return pd.DataFrame([self.get_context_features(text) for text in X])

## Key Phrase Extraction
This function extracts the most frequent words and two-word phrases from an article after removing stopwords.

In [None]:
def get_key_phrases(text, lang='en', top_n=8):
    """Return the most frequent words and bigrams in *text*.

    The text is lowercased and tokenized; tokens that are not purely
    alphabetic or that are stopwords are dropped. Bigrams are formed from
    neighbouring entries of the filtered word list, so a bigram may span a
    removed stopword or punctuation mark.

    Args:
        text: The article text.
        lang: ``'fr'`` selects French stopwords; any other value selects
            English stopwords.
        top_n: Maximum number of phrases to return.

    Returns:
        A list of ``(phrase, count)`` tuples, most frequent first, at most
        ``top_n`` long. An empty list is returned if extraction fails (for
        example when the NLTK tokenizer data is not installed); the failure
        is logged as a warning rather than raised.
    """
    import logging

    try:
        # Tokenize and clean
        tokens = word_tokenize(text.lower())

        # Use appropriate stopwords based on language. A missing or
        # unreadable stopword corpus degrades to "no stopwords", not a failure.
        try:
            stop_words = set(stopwords.words('french' if lang == 'fr' else 'english'))
        except (LookupError, OSError):
            stop_words = set()

        # Add custom stopwords
        stop_words.update({'said', 'says', 'will', 'would', 'could', 'may',
                           'might', 'also', 'one', 'two', 'three', 'new'})

        # Extract words and bigrams
        is_alpha = re.compile(r'^[a-zA-ZÀ-ÿ]+$').match
        words = [tok for tok in tokens if is_alpha(tok) and tok not in stop_words]
        bigrams = [f"{first} {second}" for first, second in zip(words, words[1:])]

        # Combine and get frequency
        freq_dist = nltk.FreqDist(words + bigrams)

        # Filter out single-character entries BEFORE truncating, so they
        # cannot push valid phrases out of the result.
        ranked = [(phrase, count) for phrase, count in freq_dist.most_common()
                  if len(phrase) > 1]
        return ranked[:top_n] if top_n > 0 else []
    except Exception as exc:
        # Deliberately best-effort: key phrases are auxiliary output, so a
        # failure here must not abort classification. Unlike a bare
        # ``except:``, this lets KeyboardInterrupt/SystemExit propagate.
        logging.getLogger(__name__).warning("Key phrase extraction failed: %s", exc)
        return []

## Model Training
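This function trains a gradient-boosting pipeline and a hard-voting ensemble (gradient boosting, logistic regression, linear SVM) on `bbc-text-cleaned.csv`, saves both models under `enhanced_models/`, and prints their accuracy on a held-out 20% test split.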

In [None]:
def _build_text_pipeline(classifier, **tfidf_kwargs):
    """Return a pipeline of TF-IDF + context features feeding *classifier*.

    Extra keyword arguments are forwarded to ``TfidfVectorizer`` on top of the
    shared ``max_features=5000, ngram_range=(1, 2)`` settings.
    """
    return Pipeline([
        ('features', FeatureUnion([
            ('tfidf', TfidfVectorizer(
                max_features=5000,
                ngram_range=(1, 2),
                **tfidf_kwargs
            )),
            ('context', ContextFeatureExtractor())
        ])),
        ('classifier', classifier)
    ])


def train_enhanced_models():
    """Train, save and evaluate the gradient-boosting and ensemble models.

    Reads ``bbc-text-cleaned.csv`` (columns ``text`` and ``category``), makes
    a stratified 80/20 train/test split, fits both models, writes them to
    ``enhanced_models/gradient_boosting_model.joblib`` and
    ``enhanced_models/ensemble_model.joblib``, and prints test accuracy.

    Returns:
        None. Results are reported via ``print`` and the saved model files.
    """
    import os

    model_dir = 'enhanced_models'
    # Create the output directory up front: joblib.dump does not create it,
    # and failing after a full training run would waste the work.
    os.makedirs(model_dir, exist_ok=True)

    print("Loading data...")
    df = pd.read_csv('bbc-text-cleaned.csv')

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(
        df['text'], df['category'],
        test_size=0.2,
        random_state=42,
        stratify=df['category']
    )

    print("\nTraining enhanced models...")

    # Stand-alone gradient-boosting pipeline. Classifiers are seeded so that
    # repeated runs match, in line with the seeded split above.
    enhanced_pipeline = _build_text_pipeline(
        GradientBoostingClassifier(
            n_estimators=200,
            learning_rate=0.1,
            max_depth=5,
            random_state=42
        ),
        stop_words='english'
    )
    enhanced_pipeline.fit(X_train, y_train)

    # Hard-voting ensemble. VotingClassifier fits its own clones of the
    # estimators, so enhanced_pipeline above is left as trained.
    enhanced_ensemble = VotingClassifier(estimators=[
        ('gb', enhanced_pipeline),
        ('lr', _build_text_pipeline(LogisticRegression(max_iter=1000))),
        ('svm', _build_text_pipeline(LinearSVC(max_iter=1000, random_state=42)))
    ], voting='hard')
    enhanced_ensemble.fit(X_train, y_train)

    # Save models
    print("\nSaving models...")
    joblib.dump(enhanced_pipeline, f'{model_dir}/gradient_boosting_model.joblib')
    joblib.dump(enhanced_ensemble, f'{model_dir}/ensemble_model.joblib')

    # Evaluate
    print("\nModel Evaluation:")
    print(f"Gradient Boosting Accuracy: {enhanced_pipeline.score(X_test, y_test):.4f}")
    print(f"Ensemble Accuracy: {enhanced_ensemble.score(X_test, y_test):.4f}")

## Article Classification
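This function classifies a single article with a saved model and returns the predicted category, per-class confidence scores (when the model provides probabilities), key phrases, and context scores.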

In [None]:
def classify_article(text, model_name='ensemble'):
    """Classify one article with a previously saved model.

    Args:
        text: The article text.
        model_name: Prefix of the model file to load; the file read is
            ``enhanced_models/<model_name>_model.joblib``.

    Returns:
        ``None`` if the model cannot be loaded (the reason is printed).
        Otherwise a dict with:

        * ``'category'`` - the predicted label,
        * ``'confidence_scores'`` - dict of class -> probability, or ``None``
          when the model has no ``predict_proba``,
        * ``'key_phrases'`` - output of ``get_key_phrases(text)``,
        * ``'context_scores'`` - output of
          ``ContextFeatureExtractor().get_context_features(text)``.
    """
    # Load model. Any load failure (missing file, corrupt or incompatible
    # pickle) is reported with its cause. ``except Exception`` rather than a
    # bare ``except`` lets KeyboardInterrupt/SystemExit propagate.
    model_path = f'enhanced_models/{model_name}_model.joblib'
    try:
        model = joblib.load(model_path)
    except Exception as exc:
        print(f"Error loading model '{model_path}': {exc}")
        return None

    # Make prediction
    prediction = model.predict([text])[0]

    # Get confidence scores, if the model can produce probabilities
    if hasattr(model, 'predict_proba'):
        probs = model.predict_proba([text])[0]
        confidence_scores = dict(zip(model.classes_, probs))
    else:
        confidence_scores = None

    return {
        'category': prediction,
        'confidence_scores': confidence_scores,
        'key_phrases': get_key_phrases(text),
        'context_scores': ContextFeatureExtractor().get_context_features(text)
    }

## Example Usage

In [None]:
# Example article
article = """
Apple reports record quarterly earnings as iPhone sales surge in emerging markets. 
The tech giant saw a 15% increase in revenue, largely driven by strong performance 
in India and Southeast Asia. CEO Tim Cook announced plans for expanding their AI initiatives.
"""

# Classify article
result = classify_article(article)

if result is None:
    # classify_article returns None (after printing an error) when the model
    # file cannot be loaded, e.g. because the models have not been trained yet.
    print("No classification result to display.")
else:
    # Print results
    print(f"Category: {result['category']}\n")

    print("Confidence Scores:")
    if result['confidence_scores'] is None:
        # Models without predict_proba (such as the hard-voting ensemble
        # used by default) yield no probabilities.
        print("(not available for this model)")
    else:
        for category, score in result['confidence_scores'].items():
            print(f"{category}: {score:.2%}")

    print("\nKey Phrases:")
    for phrase, count in result['key_phrases']:
        print(f"- {phrase} ({count} occurrences)")

    print("\nContext Analysis:")
    for context, score in result['context_scores'].items():
        if score > 0:
            print(f"- {context}: {score}")