# News Article Classifier with Sample Data
This notebook contains everything needed to classify news articles, including sample data for testing.

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import VotingClassifier
import joblib
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.base import BaseEstimator, TransformerMixin
from langdetect import detect

# Download required NLTK data
# word_tokenize relies on the Punkt tokenizer models. Recent NLTK releases look
# them up under the 'punkt_tab' resource while older ones use 'punkt', so both
# are requested; nltk.download reports (rather than raises on) a resource that
# the installed index does not know about.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(nltk_resource)

## Sample Training Data
Create a small, balanced dataset for testing: 15 articles, three for each of the five categories.

In [None]:
# Sample training data, grouped by label. Each category maps to its example
# articles; the mapping is flattened below into the parallel 'text' and
# 'category' lists that the DataFrame is built from.
articles_by_category = {
    'tech': [
        "Microsoft unveils groundbreaking AI features for Windows 12, integrating advanced machine learning capabilities across the operating system. The new update includes real-time language translation and predictive task automation.",
        "Apple launches new iPhone with revolutionary quantum processor chip. The device features advanced AI capabilities and unprecedented processing power for mobile applications.",
        "Google announces breakthrough in quantum computing, achieving quantum supremacy with its new 1000-qubit processor. The achievement marks a milestone in computing history.",
    ],
    'business': [
        "Goldman Sachs reports Q4 earnings beating market expectations, with revenue up 25% year-over-year. The investment bank saw strong growth in trading and investment banking divisions.",
        "Amazon's stock surges after announcing record-breaking holiday sales. The e-commerce giant reported a 30% increase in revenue compared to last year's holiday season.",
        "Federal Reserve announces interest rate decision, impacting global markets. Wall Street analysts expect continued economic growth despite inflation concerns.",
    ],
    'sport': [
        "Manchester United wins dramatic Champions League final with last-minute goal. The team's victory marks their fourth European championship title.",
        "LeBron James breaks NBA all-time scoring record in Lakers victory. The basketball legend surpassed Kareem Abdul-Jabbar's long-standing record.",
        "Serena Williams announces retirement after winning her 24th Grand Slam title. The tennis icon's career spans over two decades of dominance.",
    ],
    'entertainment': [
        "New Marvel superhero movie breaks box office records with $300 million opening weekend. Critics praise the film's special effects and storyline.",
        "Taylor Swift's latest album becomes fastest-selling record of the decade. The pop star's world tour has already sold out in major cities.",
        "Netflix series wins multiple Emmy awards, dominating streaming category. The show's creator credits innovative storytelling for its success.",
    ],
    'politics': [
        "Senate passes landmark climate change legislation with bipartisan support. The bill includes major investments in renewable energy infrastructure.",
        "Presidential candidates face off in final debate before election. Poll numbers show a tight race between the leading contenders.",
        "European Union announces new trade agreement with Asian nations. The deal is expected to boost economic cooperation and reduce tariffs.",
    ],
}

# Flatten the mapping; dict insertion order keeps texts and labels aligned.
sample_data = {
    'text': [
        article_text
        for category_articles in articles_by_category.values()
        for article_text in category_articles
    ],
    'category': [
        category_label
        for category_label, category_articles in articles_by_category.items()
        for _ in category_articles
    ],
}

# Create DataFrame and summarise it
df = pd.DataFrame(sample_data)
print(f"Sample dataset shape: {df.shape}")
print("\nCategory distribution:")
category_counts = df['category'].value_counts()
print(category_counts)

## Context Feature Extractor
Extracts context-specific features from articles

In [None]:
class ContextFeatureExtractor(BaseEstimator, TransformerMixin):
    """Transformer that scores text against hand-written category vocabularies.

    For every context in ``self.contexts`` the transformer counts how many of
    the context's indicator terms appear in a document and exposes the counts
    as numeric feature columns:

    * ``<context>_keyword_score`` - number of distinct keywords found.
    * ``<context>_term_score``    - number of distinct special terms found
      (only for contexts that define ``'terms'``).
    * ``<context>_company_score`` - number of distinct company names found,
      weighted by 2 (only for contexts that define ``'companies'``).

    Each indicator contributes at most once per document, no matter how often
    it occurs.
    """

    def __init__(self):
        # Vocabulary per context. Every entry has 'keywords'; 'terms' and
        # 'companies' are optional and produce their own feature columns.
        self.contexts = {
            'tech': {
                'keywords': ['technology', 'tech', 'ai', 'software', 'hardware', 'digital', 'innovation',
                             'smartphone', 'iphone', 'android', 'app', 'computer', 'artificial intelligence',
                             'machine learning', 'cloud', 'cybersecurity', '5g', 'blockchain', 'mobile',
                             'device', 'platform', 'algorithm', 'interface', 'processor', 'chip'],
                'companies': ['apple', 'google', 'microsoft', 'amazon', 'meta', 'tesla', 'nvidia',
                              'samsung', 'intel', 'ibm', 'oracle', 'cisco', 'qualcomm', 'adobe']
            },
            'business': {
                'keywords': ['earnings', 'revenue', 'profit', 'market', 'stock', 'shares', 'investors',
                             'quarterly', 'financial', 'economy', 'growth', 'sales', 'trading', 'price',
                             'investment', 'dividend', 'merger', 'acquisition', 'fiscal', 'shareholder'],
                'terms': ['q1', 'q2', 'q3', 'q4', 'year-over-year', 'yoy', 'quarter', 'fiscal']
            },
            'sports': {
                'keywords': ['game', 'match', 'tournament', 'championship', 'league', 'score', 'win',
                             'victory', 'team', 'player', 'season', 'coach', 'stadium', 'sports'],
                'terms': ['goal', 'points', 'referee', 'injury', 'transfer', 'contract']
            },
            'entertainment': {
                'keywords': ['movie', 'film', 'show', 'music', 'album', 'celebrity', 'actor', 'actress',
                             'director', 'performance', 'award', 'entertainment', 'concert', 'premiere'],
                'terms': ['box office', 'rating', 'review', 'star', 'episode', 'season']
            },
            'politics': {
                'keywords': ['government', 'policy', 'election', 'political', 'minister', 'president',
                             'congress', 'senate', 'law', 'legislation', 'vote', 'campaign', 'party'],
                'terms': ['bill', 'reform', 'regulation', 'democratic', 'republican', 'parliament']
            }
        }

    @staticmethod
    def _count_matches(indicators, text_lower):
        """Count the indicators that occur in *text_lower* as whole words.

        A plain substring test over-counts badly ('ai' is inside "said",
        'app' inside "apple", 'win' inside "windows"), so each indicator must
        be delimited by non-word characters. A trailing "s"/"es" is tolerated
        so simple plurals ("markets", "matches") still count.
        """
        return sum(
            1 for indicator in indicators
            if re.search(rf'(?<!\w){re.escape(indicator)}(?:s|es)?(?!\w)', text_lower)
        )

    def get_context_features(self, text):
        """Return a dict mapping feature name to score for a single document.

        Args:
            text: The document to score; matching is case-insensitive.

        Returns:
            dict with one ``*_keyword_score`` entry per context plus
            ``*_term_score`` / ``*_company_score`` entries for contexts that
            define those indicator lists.
        """
        text_lower = text.lower()
        features = {}

        for context, indicators in self.contexts.items():
            # Keyword score
            features[f'{context}_keyword_score'] = self._count_matches(
                indicators['keywords'], text_lower)

            # Special terms score
            if 'terms' in indicators:
                features[f'{context}_term_score'] = self._count_matches(
                    indicators['terms'], text_lower)

            # Company/entity score, weighted double relative to plain keywords
            if 'companies' in indicators:
                features[f'{context}_company_score'] = 2 * self._count_matches(
                    indicators['companies'], text_lower)

        return features

    def fit(self, X, y=None):
        """No-op: the vocabularies are fixed, so there is nothing to learn."""
        return self

    def transform(self, X):
        """Score every document in *X*.

        Args:
            X: Iterable of document strings.

        Returns:
            pandas.DataFrame with one row per document and one column per
            feature. The column set is derived from ``self.contexts``, so it
            is the same (and present) even when *X* is empty.
        """
        rows = [self.get_context_features(text) for text in X]
        # Scoring an empty string yields every feature name in a stable order.
        columns = list(self.get_context_features(''))
        return pd.DataFrame(rows, columns=columns)

## Key Phrase Extraction

In [None]:
def get_key_phrases(text, lang='en', top_n=8):
    """Return the most frequent words and bigrams found in *text*.

    The text is lower-cased and tokenized, purely alphabetic tokens that are
    not stopwords are kept, and bigrams are formed from neighbours in that
    filtered word list. Words and bigrams are then ranked together by
    frequency.

    Extraction is best-effort: any failure (for example missing NLTK data or
    a non-string *text*) yields an empty list instead of an exception.

    Args:
        text: The document to analyse.
        lang: ``'fr'`` selects French stopwords; any other value selects
            English stopwords.
        top_n: Maximum number of phrases to return.

    Returns:
        A list of at most *top_n* ``(phrase, count)`` tuples ordered by
        descending count; empty when *top_n* is not positive or extraction
        fails.
    """
    try:
        if top_n <= 0:
            return []

        # Tokenize and clean
        tokens = word_tokenize(text.lower())

        # Use appropriate stopwords; fall back to none if they cannot be loaded
        try:
            if lang == 'fr':
                stop_words = set(stopwords.words('french'))
            else:
                stop_words = set(stopwords.words('english'))
        except Exception:
            stop_words = set()

        # Add custom stopwords
        custom_stops = {'said', 'says', 'will', 'would', 'could', 'may', 'might', 'also'}
        stop_words.update(custom_stops)

        # Extract words and bigrams
        words = [word for word in tokens
                 if re.match(r'^[a-zA-ZÀ-ÿ]+$', word) and word not in stop_words]
        bigrams = [f"{first} {second}" for first, second in zip(words, words[1:])]

        # Combine and get frequency
        freq_dist = nltk.FreqDist(words + bigrams)

        # Drop single-character phrases BEFORE truncating, so the caller gets
        # top_n results whenever that many qualifying phrases exist.
        ranked = [(phrase, count) for phrase, count in freq_dist.most_common()
                  if len(phrase) > 1]
        return ranked[:top_n]
    except Exception:
        # Deliberately best-effort, but KeyboardInterrupt / SystemExit are no
        # longer swallowed as they were with a bare ``except``.
        return []

## Train Models

In [None]:
# Split data
# A stratified split must place at least one sample of every category in the
# test set. 20% of the 15-article sample set is only 3 rows for 5 categories,
# which train_test_split rejects, so the test size is the larger of the 20%
# target and the number of categories (passed as an absolute row count).
n_categories = df['category'].nunique()
n_test_samples = max(int(np.ceil(0.2 * len(df))), n_categories)

X_train, X_test, y_train, y_test = train_test_split(
    df['text'], df['category'],
    test_size=n_test_samples,
    random_state=42,
    stratify=df['category']
)

# Create enhanced pipeline: TF-IDF n-gram features and the hand-written context
# scores are concatenated column-wise and fed to a gradient-boosted classifier.
enhanced_pipeline = Pipeline([
    ('features', FeatureUnion([
        ('tfidf', TfidfVectorizer(
            max_features=5000,
            ngram_range=(1, 2),
            stop_words='english'
        )),
        ('context', ContextFeatureExtractor())
    ])),
    ('classifier', GradientBoostingClassifier(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=5,
        random_state=42  # same seed as the split, so reruns give the same model
    ))
])

# Train model
enhanced_pipeline.fit(X_train, y_train)

# Evaluate on the held-out articles
print(f"Model Accuracy: {enhanced_pipeline.score(X_test, y_test):.4f}")

## Test Classification

In [None]:
# Test articles. Each literal spells out its newlines and indentation
# explicitly so that the trailing whitespace inside the text stays visible.
test_articles = [
    # Mixed Tech/Business
    (
        "\n"
        "    Apple reports record quarterly earnings as iPhone sales surge in emerging markets. \n"
        "    The tech giant saw a 15% increase in revenue, largely driven by strong performance \n"
        "    in India and Southeast Asia. CEO Tim Cook announced plans for expanding their AI initiatives.\n"
        "    "
    ),

    # Pure Tech
    (
        "\n"
        "    Microsoft unveils groundbreaking AI features for Windows 12, integrating advanced \n"
        "    machine learning capabilities across the operating system. The new update includes \n"
        "    real-time language translation and predictive task automation.\n"
        "    "
    ),

    # Pure Business
    (
        "\n"
        "    Goldman Sachs reports Q4 earnings beating market expectations, with revenue up 25% \n"
        "    year-over-year. The investment bank saw strong growth in trading and investment \n"
        "    banking divisions, leading to increased shareholder dividends.\n"
        "    "
    ),
]

# The extractor only holds its fixed vocabularies, so a single instance can
# score every article.
context_extractor = ContextFeatureExtractor()

# Classify each article and report the prediction with supporting evidence
for i, article in enumerate(test_articles, 1):
    print(f"\nTest Article {i}:")
    print("-" * 50)
    print(f"Text: {article.strip()}\n")

    # Model output: the predicted label plus a probability per known category
    single_article_batch = [article]
    prediction = enhanced_pipeline.predict(single_article_batch)[0]
    probs = enhanced_pipeline.predict_proba(single_article_batch)[0]
    confidence_scores = dict(zip(enhanced_pipeline.classes_, probs))

    # Supporting evidence: frequent phrases and per-context vocabulary scores
    key_phrases = get_key_phrases(article)
    context_scores = context_extractor.get_context_features(article)

    print(f"Predicted Category: {prediction}\n")

    print("Confidence Scores:")
    ranked_confidences = sorted(
        confidence_scores.items(), key=lambda x: x[1], reverse=True
    )
    for category, score in ranked_confidences:
        print(f"{category}: {score:.2%}")

    print("\nKey Phrases:")
    for phrase, count in key_phrases:
        print(f"- {phrase} ({count} occurrences)")

    print("\nContext Analysis:")
    for context, score in context_scores.items():
        # Contexts with no vocabulary hits are omitted from the report
        if score <= 0:
            continue
        print(f"- {context}: {score}")