# Smart News AI - Data Exploration and Analysis

This notebook provides an interactive analysis of the Smart News AI system, including:
- Dataset exploration
- Classification model training and evaluation
- Recommendation system analysis
- Visualization of results

In [None]:
# Import required libraries
import sys
import os
# Make the project's own modules importable. The path is relative, so it is
# resolved against the kernel's current working directory.
# NOTE(review): presumably the notebook is expected to run from its own
# folder, with the sources in a sibling `src` directory - confirm.
sys.path.append('../src')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import warnings
# Silences every warning category for the rest of the session. Narrow or
# remove this filter when debugging, since it also hides deprecation notices.
warnings.filterwarnings('ignore')

# Set plotting style
plt.style.use('default')
sns.set_palette('husl')

# Import custom modules
# NOTE(review): these are project modules, presumably found through the
# '../src' entry appended to sys.path above; they must stay below that line.
from data_generator import create_sample_data
from data_preprocessing import FeatureExtractor, create_train_test_split
from news_classifier import NewsClassifier, ModelComparison
from recommendation_engine import HybridRecommender, generate_sample_interactions

print("Libraries imported successfully!")

## 1. Data Generation and Loading

In [None]:
# Locate the two CSV files the rest of the notebook depends on.
data_path = '../data'
articles_path = os.path.join(data_path, 'news_articles.csv')
interactions_path = os.path.join(data_path, 'user_interactions.csv')

# Generate sample data when EITHER file is missing. Checking only the articles
# file would let the interactions read below fail with FileNotFoundError.
# NOTE(review): presumably create_sample_data writes both files into
# data_path (and may overwrite an existing articles file) - confirm.
if not (os.path.exists(articles_path) and os.path.exists(interactions_path)):
    print("Generating sample data...")
    create_sample_data(data_path)

# Load datasets
articles_df = pd.read_csv(articles_path)
interactions_df = pd.read_csv(interactions_path)

print(f"Loaded {len(articles_df)} articles and {len(interactions_df)} interactions")
# Drop missing labels and coerce to str so the join cannot fail on NaN values.
print(f"Categories: {', '.join(articles_df['category'].dropna().astype(str).unique())}")

## 2. Dataset Exploration

In [None]:
# Basic dataset information
print("=== Articles Dataset ===")
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
articles_df.info()
print("\n=== Sample Articles ===")
print(articles_df.head())

In [None]:
# Category distribution: share of each category (pie) next to raw counts (bar)
category_counts = articles_df['category'].value_counts()

fig, (pie_ax, bar_ax) = plt.subplots(1, 2, figsize=(12, 6))

# Left panel - proportions
pie_ax.pie(category_counts.values, labels=category_counts.index, autopct='%1.1f%%')
pie_ax.set_title('Article Category Distribution')

# Right panel - absolute counts
category_counts.plot(kind='bar', ax=bar_ax)
bar_ax.set(title='Articles per Category', xlabel='Category', ylabel='Count')
bar_ax.tick_params(axis='x', rotation=45)

fig.tight_layout()
plt.show()

print("Category Statistics:")
print(category_counts)

In [None]:
# Content length analysis
# Character counts are stored on articles_df because a later cell reads the
# 'content_length' column again.
articles_df['content_length'] = articles_df['content'].str.len()
articles_df['title_length'] = articles_df['title'].str.len()

fig, (content_ax, title_ax, box_ax) = plt.subplots(1, 3, figsize=(15, 5))

# Histogram of article body lengths
content_ax.hist(articles_df['content_length'], bins=30, alpha=0.7)
content_ax.set(title='Content Length Distribution', xlabel='Characters', ylabel='Frequency')

# Histogram of headline lengths
title_ax.hist(articles_df['title_length'], bins=30, alpha=0.7, color='orange')
title_ax.set(title='Title Length Distribution', xlabel='Characters', ylabel='Frequency')

# Body length spread per category
sns.boxplot(data=articles_df, x='category', y='content_length', ax=box_ax)
box_ax.set_title('Content Length by Category')
box_ax.tick_params(axis='x', rotation=45)

fig.tight_layout()
plt.show()

print("Content Length Statistics:")
print(articles_df['content_length'].describe())

In [None]:
# User interaction analysis
# Both aggregates are reused by later cells, so they keep their original names.
rating_by_category = interactions_df.groupby('category')['rating'].mean().sort_values()
user_activity = interactions_df['user_id'].value_counts()

fig, (rating_ax, category_ax, activity_ax) = plt.subplots(1, 3, figsize=(15, 5))

# How ratings are distributed overall
interactions_df['rating'].hist(bins=5, alpha=0.7, color='green', ax=rating_ax)
rating_ax.set(title='Rating Distribution', xlabel='Rating', ylabel='Frequency')

# Mean rating per category, lowest to highest
rating_by_category.plot(kind='bar', color='skyblue', ax=category_ax)
category_ax.set(title='Average Rating by Category', xlabel='Category', ylabel='Average Rating')
category_ax.tick_params(axis='x', rotation=45)

# How many interactions each user contributed
activity_ax.hist(user_activity, bins=20, alpha=0.7, color='purple')
activity_ax.set(title='User Activity Distribution',
                xlabel='Number of Interactions', ylabel='Number of Users')

fig.tight_layout()
plt.show()

unique_users = interactions_df['user_id'].nunique()
print("Interaction Statistics:")
print(f"Average rating: {interactions_df['rating'].mean():.2f}")
print(f"Total unique users: {unique_users}")
print(f"Average interactions per user: {len(interactions_df) / unique_users:.1f}")

## 3. Text Analysis and Word Clouds

In [None]:
# Create word clouds for different categories
MAX_WORDCLOUDS = 4  # the 2x2 grid below has exactly four panels

# Missing category labels are skipped; they cannot be title-cased or matched.
categories = articles_df['category'].dropna().unique()[:MAX_WORDCLOUDS]

fig, axes = plt.subplots(2, 2, figsize=(16, 12))
axes = axes.ravel()

for ax, category in zip(axes, categories):
    in_category = articles_df['category'] == category
    # Drop missing bodies and coerce to str so the join cannot fail on NaN.
    category_text = ' '.join(articles_df.loc[in_category, 'content'].dropna().astype(str))

    ax.set_title(f'{str(category).title()} Category', fontsize=14)
    ax.axis('off')

    try:
        wordcloud = WordCloud(
            width=400,
            height=300,
            background_color='white',
            max_words=50,
            colormap='viridis'
        ).generate(category_text)
        ax.imshow(wordcloud, interpolation='bilinear')
    except Exception as e:
        # Best effort: one failing category must not break the whole grid, but
        # the reason is shown in the panel instead of being silently discarded.
        ax.text(0.5, 0.5, f'WordCloud error for {category}: {e}',
                ha='center', va='center', transform=ax.transAxes, wrap=True)

# Hide the panels left over when there are fewer categories than grid slots.
for ax in axes[len(categories):]:
    ax.axis('off')

plt.tight_layout()
plt.show()

## 4. Classification Model Training and Comparison

In [None]:
# Prepare data for classification
print("Preparing data for classification...")

# A single FeatureExtractor instance handles both the article text and the
# category labels; later cells reuse it (feature names, label encoder).
feature_extractor = FeatureExtractor(max_features=3000)
article_texts = articles_df['content'].tolist()
article_labels = articles_df['category'].tolist()
X = feature_extractor.fit_transform_text(article_texts)
y = feature_extractor.fit_transform_labels(article_labels)

# Hold out 20% of the samples for evaluation
X_train, X_test, y_train, y_test = create_train_test_split(X, y, test_size=0.2)

split_summary = (
    f"Training set: {X_train.shape[0]} samples",
    f"Test set: {X_test.shape[0]} samples",
    f"Features: {X_train.shape[1]}",
)
print("\n".join(split_summary))

In [None]:
# Compare multiple models
print("Comparing classification models...")

model_types = ['random_forest', 'logistic_regression', 'naive_bayes']
comparison = ModelComparison(model_types)
results, best_model = comparison.compare_models(X_train, y_train, X_test, y_test)

# Tabular view of the comparison
results_df = comparison.get_results_dataframe()
print("\nModel Comparison Results:")
print(results_df)

# One bar-chart panel per metric on a 2x2 grid; every metric shares the 0-1 scale
metrics = ['accuracy', 'precision', 'recall', 'f1_score']
fig, metric_axes = plt.subplots(2, 2, figsize=(12, 8))

for panel, metric in zip(metric_axes.flat, metrics):
    results_df[metric].plot(kind='bar', ax=panel)
    panel.set(title=f'{metric.title()} Comparison', ylabel=metric.title(), ylim=(0, 1))
    panel.tick_params(axis='x', rotation=45)

fig.tight_layout()
plt.show()

print(f"\nBest performing model: {best_model}")

In [None]:
# Detailed analysis of the best model
best_classifier = comparison.models[best_model]
detailed_metrics = best_classifier.evaluate(X_test, y_test, detailed=True)

print(f"Detailed Analysis - {best_model}")
print("=" * 40)
print(detailed_metrics['classification_report'])

# Confusion matrix heatmap.
# plt and sns are already imported by the notebook's first cell, so they are
# not re-imported here; the matrix itself comes precomputed from evaluate().
cm = detailed_metrics['confusion_matrix']
# Named class_labels rather than `categories` so this cell does not rebind the
# `categories` array that the word-cloud cell created for a different purpose.
class_labels = feature_extractor.label_encoder.classes_

plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_labels, yticklabels=class_labels)
plt.title(f'Confusion Matrix - {best_model}')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

In [None]:
# Feature importance analysis
TOP_N_PLOTTED = 20  # number of features requested and drawn in the bar chart
TOP_N_PRINTED = 10  # number of features echoed as text below the chart

feature_names = feature_extractor.get_feature_names()

top_features = None
if len(feature_names) > 0 and hasattr(best_classifier, 'feature_importance'):
    top_features = best_classifier.get_top_features(feature_names, n_features=TOP_N_PLOTTED)

# Guard against an empty (or missing) result: zip(*[]) yields nothing, and the
# two-name unpacking below would raise ValueError.
# NOTE(review): whether get_top_features can return None/empty for models
# without importances is not visible here - confirm against news_classifier.
if top_features is not None and len(top_features) > 0:
    # Split the (feature, importance) pairs into parallel sequences for plotting
    features, importances = zip(*top_features)

    plt.figure(figsize=(12, 8))
    plt.barh(range(len(features)), importances)
    plt.yticks(range(len(features)), features)
    plt.xlabel('Feature Importance')
    plt.title(f'Top {TOP_N_PLOTTED} Important Features - {best_model}')
    plt.gca().invert_yaxis()  # first pair in the list ends up at the top
    plt.tight_layout()
    plt.show()

    print(f"Top {TOP_N_PRINTED} Important Features:")
    for feature, importance in top_features[:TOP_N_PRINTED]:
        print(f"{feature}: {importance:.4f}")
else:
    print("Feature importance not available for this model.")

## 5. Recommendation System Analysis

In [None]:
# Train recommendation system
print("Training hybrid recommendation system...")

# Weights passed to the hybrid recommender for its two score components
hybrid_weights = {'content_weight': 0.6, 'collaborative_weight': 0.4}
recommender = HybridRecommender(**hybrid_weights)
recommender.fit(articles_df, interactions_df)

print("Recommendation system trained successfully!")

In [None]:
# Analyze user preferences for the five most active users
sample_users = interactions_df['user_id'].value_counts().head(5).index.tolist()

print("User Preference Analysis")
print("=" * 40)

for user_id in sample_users:
    # NOTE(review): the profile is fetched but never displayed; the call is
    # kept because its side effects (if any) are not visible from this
    # notebook. Print it or drop it once its structure is confirmed.
    user_profile = recommender.get_user_profile(user_id)
    user_interactions = interactions_df[interactions_df['user_id'] == user_id]

    print(f"\nUser: {user_id}")
    print(f"Total interactions: {len(user_interactions)}")
    print(f"Average rating: {user_interactions['rating'].mean():.2f}")

    # Category preferences
    category_ratings = user_interactions.groupby('category')['rating'].agg(['count', 'mean'])
    print("Category preferences:")
    # itertuples keeps each column's dtype. iterrows would squeeze every row
    # into one float Series and print the integer count as e.g. "3.0 articles".
    for category, n_rated, mean_rating in category_ratings.itertuples(name=None):
        print(f"  {category}: {n_rated} articles, avg rating {mean_rating:.2f}")

In [None]:
# Demo recommendation for the most active sample user
demo_user = sample_users[0]

print(f"Generating recommendations for user: {demo_user}")

recommendations = recommender.get_recommendations(demo_user, n_recommendations=5)

print(f"\nTop 5 Recommendations for {demo_user}:")
print("=" * 50)

# (printed label, key in the recommendation dict) for the three score lines
score_fields = (
    ('Hybrid Score', 'hybrid_score'),
    ('Content Score', 'content_score'),
    ('Collaborative Score', 'collaborative_score'),
)

for rank, rec in enumerate(recommendations, start=1):
    print(f"{rank}. {rec['title']} ({rec['category']})")
    for label, key in score_fields:
        print(f"   {label}: {rec[key]:.3f}")
    print()

## 6. Model Performance Visualization

In [None]:
# Create comprehensive performance dashboard (2 rows x 3 columns).
# Reuses category_counts, results_df, user_activity and rating_by_category
# computed by earlier cells.
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
(dist_ax, accuracy_ax, length_ax), (rating_ax, activity_ax, cat_rating_ax) = axes

# 1. Category distribution
dist_ax.pie(category_counts.values, labels=category_counts.index, autopct='%1.1f%%')
dist_ax.set_title('Article Category Distribution')

# 2. Model performance comparison
results_df['accuracy'].plot(kind='bar', ax=accuracy_ax, color='skyblue')
accuracy_ax.set(title='Model Accuracy Comparison', ylabel='Accuracy')
accuracy_ax.tick_params(axis='x', rotation=45)

# 3. Content length by category
category_length = articles_df.groupby('category')['content_length'].mean()
category_length.plot(kind='bar', ax=length_ax, color='lightgreen')
length_ax.set(title='Average Content Length by Category', ylabel='Characters')
length_ax.tick_params(axis='x', rotation=45)

# 4. Rating distribution
interactions_df['rating'].hist(bins=5, ax=rating_ax, alpha=0.7, color='orange')
rating_ax.set(title='User Rating Distribution', xlabel='Rating', ylabel='Frequency')

# 5. User activity
user_activity.head(20).plot(kind='bar', ax=activity_ax, color='purple')
activity_ax.set(title='Top 20 Most Active Users', ylabel='Interactions')
activity_ax.tick_params(axis='x', rotation=90, labelsize=8)

# 6. Average rating by category
rating_by_category.plot(kind='bar', ax=cat_rating_ax, color='coral')
cat_rating_ax.set(title='Average Rating by Category', ylabel='Average Rating')
cat_rating_ax.tick_params(axis='x', rotation=45)

fig.tight_layout()
plt.show()

## 7. Conclusion and Insights

In [None]:
# Final text summary. All figures are gathered first, then the report is
# assembled section by section and printed in one go.
most_popular = category_counts.index[0]        # value_counts() puts the largest first
highest_rated = rating_by_category.index[-1]   # series was sorted ascending
best_accuracy = results_df.loc[best_model, 'accuracy']
best_f1 = results_df.loc[best_model, 'f1_score']
avg_engagement = interactions_df.groupby('user_id').size().mean()
n_classes = len(feature_extractor.label_encoder.classes_)

summary_lines = [
    "Smart News AI - Analysis Summary",
    "=" * 40,
    # Dataset overview
    "📊 Dataset Statistics:",
    f"   • Total articles: {len(articles_df):,}",
    f"   • Categories: {len(articles_df['category'].unique())}",
    f"   • User interactions: {len(interactions_df):,}",
    f"   • Unique users: {interactions_df['user_id'].nunique():,}",
    # Classifier results
    "\n🎯 Classification Performance:",
    f"   • Best model: {best_model}",
    f"   • Best accuracy: {best_accuracy:.3f}",
    f"   • Best F1-score: {best_f1:.3f}",
    # Headline findings
    "\n💡 Key Insights:",
    f"   • Most popular category: {most_popular} ({category_counts.iloc[0]} articles)",
    f"   • Highest rated category: {highest_rated} (avg: {rating_by_category.iloc[-1]:.2f})",
    f"   • Average user engagement: {avg_engagement:.1f} interactions per user",
    # What the system offers
    "\n🚀 System Capabilities:",
    f"   • Multi-class text classification with {n_classes} categories",
    "   • Hybrid recommendation system (content + collaborative filtering)",
    "   • Real-time article classification and personalized recommendations",
    "   • Interactive CLI interface for easy usage",
    # Follow-up ideas
    "\n📈 Next Steps:",
    "   • Experiment with deep learning models (BERT, transformers)",
    "   • Implement online learning for real-time model updates",
    "   • Add sentiment analysis and topic modeling",
    "   • Develop web interface for broader accessibility",
]
print("\n".join(summary_lines))