In [6]:
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import os

In [7]:
def load_and_clean_data(filepath, min_books=10):
    """Load the books CSV and return a cleaned frame for publisher analysis.

    Parameters
    ----------
    filepath : str or path-like
        Location of the CSV file; it is read as UTF-8.
    min_books : int, default 10
        Minimum number of rows (after dropping incomplete rows) a publisher
        must have to be kept. The default reproduces the original hard-coded
        threshold.

    Returns
    -------
    pandas.DataFrame
        The input rows with whitespace-stripped column names, a parsed
        ``publication_date``, a derived ``publication_year`` column, numeric
        ``num_pages`` / ``average_rating`` / ``ratings_count`` /
        ``text_reviews_count`` columns, and only publishers that have at
        least ``min_books`` rows.

    Notes
    -----
    Malformed CSV lines are skipped silently (``on_bad_lines='skip'``) and
    unparseable dates/numbers become NaT/NaN (``errors='coerce'``), so the
    returned frame can contain fewer rows than the file.
    """
    df = pd.read_csv(filepath,
                     encoding='utf-8',
                     quoting=1,  # numeric value of csv.QUOTE_ALL
                     escapechar='\\',
                     on_bad_lines='skip')

    # Header cells may carry stray padding; normalise before column lookups.
    df.columns = df.columns.str.strip()

    # Convert dates and numeric columns (bad values become NaT / NaN)
    df['publication_date'] = pd.to_datetime(
        df['publication_date'], errors='coerce')
    df['publication_year'] = df['publication_date'].dt.year
    numeric_columns = ['num_pages', 'average_rating',
                       'ratings_count', 'text_reviews_count']
    for column in numeric_columns:
        df[column] = pd.to_numeric(df[column], errors='coerce')

    # Remove rows with missing crucial data
    df = df.dropna(subset=['publisher', 'average_rating', 'ratings_count'])

    # Filter out publishers with fewer than `min_books` remaining books
    publisher_counts = df['publisher'].value_counts()
    valid_publishers = publisher_counts[publisher_counts >= min_books].index
    df = df[df['publisher'].isin(valid_publishers)]

    return df

In [8]:
def calculate_core_metrics(df):
    """Summarise each publisher's catalogue with basic descriptive statistics.

    Groups ``df`` by ``publisher`` and returns one row per publisher with:
    the count of ``bookID`` values; mean, standard deviation and median of
    ``average_rating``; sum and mean of ``ratings_count``; sum and mean of
    ``text_reviews_count``; and mean and standard deviation of ``num_pages``.
    Every value is rounded to two decimals.
    """
    per_publisher = df.groupby('publisher')

    # Named aggregation yields the flat output column names directly.
    summary = per_publisher.agg(
        total_books=('bookID', 'count'),
        avg_rating=('average_rating', 'mean'),
        rating_std=('average_rating', 'std'),
        median_rating=('average_rating', 'median'),
        total_ratings=('ratings_count', 'sum'),
        avg_ratings_per_book=('ratings_count', 'mean'),
        total_reviews=('text_reviews_count', 'sum'),
        avg_reviews_per_book=('text_reviews_count', 'mean'),
        avg_pages=('num_pages', 'mean'),
        pages_std=('num_pages', 'std'),
    )

    return summary.round(2)

In [9]:
def analyze_temporal_trends(df, recent_window=5, reference_year=None):
    """Analyze publisher performance over time.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``publisher``, ``publication_year``, ``average_rating``
        and ``ratings_count`` columns.
    recent_window : int, default 5
        Books with ``publication_year >= reference_year - recent_window``
        count as "recent".
    reference_year : int, optional
        Year the recent window is measured back from. Defaults to the current
        calendar year, which reproduces the original behaviour. For a
        historical dataset pass an explicit year (for example the latest
        ``publication_year`` in ``df``); otherwise the window may be empty
        and results will change as the calendar advances.

    Returns
    -------
    pandas.DataFrame
        Indexed by publisher with columns ``publication_growth`` (mean
        fractional change in books published between successive year
        columns), ``recent_avg_rating`` and ``recent_avg_ratings`` (mean
        ``average_rating`` / ``ratings_count`` over recent books). Publishers
        without recent books get NaN in the two ``recent_*`` columns.
    """
    if reference_year is None:
        reference_year = datetime.now().year
    cutoff_year = reference_year - recent_window

    recent_df = df[df['publication_year'] >= cutoff_year]
    recent_metrics = recent_df.groupby('publisher').agg({
        'average_rating': 'mean',
        'ratings_count': 'mean'
    }).add_prefix('recent_')

    # One row per publisher, one column per year that occurs in the data;
    # years in which a publisher released nothing are filled with 0.
    yearly_counts = df.groupby(
        ['publisher', 'publication_year']).size().unstack(fill_value=0)

    # A 0 -> n transition is an infinite relative change. Left in place, a
    # single gap year would make the publisher's mean growth infinite, so
    # those steps are masked and excluded from the average.
    yearly_change = yearly_counts.pct_change(axis=1).replace(
        [np.inf, -np.inf], np.nan)
    growth_rates = yearly_change.mean(axis=1)

    return pd.DataFrame({
        'publication_growth': growth_rates,
        'recent_avg_rating': recent_metrics['recent_average_rating'],
        'recent_avg_ratings': recent_metrics['recent_ratings_count']
    })

In [10]:
def analyze_portfolio(df):
    """Describe how varied each publisher's catalogue is.

    Returns a frame indexed by publisher with three columns:
    ``portfolio_diversity`` (standard deviation of ``average_rating``),
    ``avg_book_length`` (mean of ``num_pages``) and ``length_consistency``
    (standard deviation of ``num_pages``).
    """
    by_publisher = df.groupby('publisher')

    length_summary = by_publisher['num_pages'].agg(['mean', 'std'])
    rating_spread = by_publisher['average_rating'].std()

    columns = {
        'portfolio_diversity': rating_spread,
        'avg_book_length': length_summary['mean'],
        'length_consistency': length_summary['std'],
    }
    return pd.DataFrame(columns)

In [11]:
def calculate_final_score(metrics, weights=None):
    """Calculate a weighted, min-max-normalised success score per row.

    Parameters
    ----------
    metrics : pandas.DataFrame
        Must contain every column named in ``weights``. The frame is not
        modified.
    weights : dict[str, float], optional
        Maps metric column name to its weight. Defaults to
        ``{'avg_rating': 0.3, 'portfolio_diversity': 0.3, 'total_books': 0.4}``.

    Returns
    -------
    pandas.Series
        The weighted sum of the normalised metrics, aligned to
        ``metrics.index``. A row with NaN (or +/-inf, which is treated as
        NaN) in any weighted metric gets a NaN score.

    Raises
    ------
    KeyError
        If a weighted column is missing from ``metrics``.
    """
    if weights is None:
        weights = {
            'avg_rating': 0.3,
            'portfolio_diversity': 0.3,
            'total_books': 0.4
        }

    columns = list(weights)
    missing = [col for col in columns if col not in metrics.columns]
    if missing:
        raise KeyError(f"metrics is missing required columns: {missing}")

    # `replace` returns a new frame, so the caller's data is left untouched.
    values = metrics[columns].replace([np.inf, -np.inf], np.nan)

    def _min_max(col):
        low = col.min()
        span = col.max() - low
        if not span > 0:
            # Constant (or all-NaN) column: dividing by a zero span would
            # turn every score into NaN. A constant metric cannot separate
            # rows, so it contributes 0 (NaN entries stay NaN).
            return col * 0.0
        return (col - low) / span

    normalized = values.apply(_min_max)

    score = pd.Series(0.0, index=metrics.index)
    for metric, weight in weights.items():
        score = score + normalized[metric] * weight
    return score

In [None]:
def visualize_publisher_performance(df, metrics, top_n=10):
    """Create a 2x2 summary figure for the highest-scoring publishers.

    Parameters
    ----------
    df : pandas.DataFrame
        Book-level data with ``publisher`` and ``average_rating`` columns;
        used for the rating box plot.
    metrics : pandas.DataFrame
        Publisher-indexed metrics containing ``final_success_score``,
        ``total_books`` and ``portfolio_diversity``.
    top_n : int, default 10
        Number of publishers (by ``final_success_score``) to display.

    Returns
    -------
    matplotlib.figure.Figure
        The figure holding the four panels. It is not shown or closed here.

    Notes
    -----
    Calls ``plt.style.use('default')``, which changes the global matplotlib
    style for subsequent plots as well.
    """
    plt.style.use('default')  # Use default style instead of seaborn

    fig, axes = plt.subplots(2, 2, figsize=(15, 12))

    # Get top publishers
    top_publishers = metrics.nlargest(top_n, 'final_success_score')
    labels = top_publishers.index
    positions = range(len(top_publishers))

    def _bar_panel(ax, column, title, ylabel):
        """Draw one bar chart of `column` for the top publishers on `ax`."""
        ax.bar(positions, top_publishers[column])
        ax.set_xticks(positions)
        ax.set_xticklabels(labels, rotation=45, ha='right')
        ax.set_title(title)
        ax.set_ylabel(ylabel)

    # Plot 1: Success Score Distribution
    _bar_panel(axes[0, 0], 'final_success_score',
               'Top Publishers by Success Score', 'Success Score')

    # Plot 2: Rating Distribution for Top Publishers
    # NaNs are dropped because boxplot statistics are undefined for them.
    box_data = [df.loc[df['publisher'] == publisher, 'average_rating'].dropna()
                for publisher in labels]
    rating_ax = axes[0, 1]
    # No `tick_labels=`/`labels=` keyword here: its name differs between
    # matplotlib versions, and set_xticklabels below sets the labels anyway.
    rating_ax.boxplot(box_data)
    rating_ax.set_xticklabels(labels, rotation=45, ha='right')
    rating_ax.set_title('Rating Distribution for Top Publishers')
    rating_ax.set_ylabel('Rating')

    # Plot 3: Total Books Distribution
    _bar_panel(axes[1, 0], 'total_books',
               'Total Books Published', 'Books')

    # Plot 4: Portfolio Diversity
    _bar_panel(axes[1, 1], 'portfolio_diversity',
               'Portfolio Diversity', 'Portfolio Diversity')

    plt.tight_layout()
    return fig

In [13]:
def analyze_publisher_success(df):
    """Run every metric stage and rank publishers by composite score.

    Computes the core, temporal and portfolio metric tables from ``df``,
    left-joins the latter two onto the core table, adds a
    ``final_success_score`` column and returns the combined table sorted by
    that score, highest first.
    """
    base_table = calculate_core_metrics(df)
    trend_table = analyze_temporal_trends(df)
    portfolio_table = analyze_portfolio(df)

    # Attach the supplementary tables to the core one (core rows are kept).
    supplementary = [trend_table, portfolio_table]
    scored = base_table.join(supplementary, how='left')

    scored['final_success_score'] = calculate_final_score(scored)

    ranked = scored.sort_values(by='final_success_score', ascending=False)
    return ranked

In [None]:
def save_publisher_visualizations(df, metrics, output_dir='visualizations',
                                  filename='publisher_analysis.png', dpi=300):
    """Render the publisher summary figure and save it as an image file.

    Parameters
    ----------
    df : pandas.DataFrame
        Book-level data passed through to ``visualize_publisher_performance``.
    metrics : pandas.DataFrame
        Publisher metrics passed through to ``visualize_publisher_performance``.
    output_dir : str, default 'visualizations'
        Directory to write into; created if it does not exist.
    filename : str, default 'publisher_analysis.png'
        Name of the image file inside ``output_dir``.
    dpi : int, default 300
        Resolution passed to ``Figure.savefig``.
    """
    os.makedirs(output_dir, exist_ok=True)
    fig = visualize_publisher_performance(df, metrics)
    try:
        fig.savefig(os.path.join(output_dir, filename),
                    dpi=dpi,
                    bbox_inches='tight')
    finally:
        # Close even when saving fails so the figure is not left open.
        plt.close(fig)

In [20]:
# Run the full pipeline on data/books.csv
df = load_and_clean_data('data/books.csv')
success_metrics = analyze_publisher_success(df)

# Report the ten best-scoring publishers
print("\nTop 10 Publishers by Success Score:")
print(success_metrics.head(10)[['final_success_score', 'avg_rating',
                                'total_books']])

# Write the summary figure to disk
save_publisher_visualizations(df, success_metrics)


Top 10 Publishers by Success Score:
                           final_success_score  avg_rating  total_books
publisher                                                              
Vintage                               0.569443        3.89          318
Penguin Books                         0.501338        3.92          261
Penguin Classics                      0.397892        3.94          184
VIZ Media LLC                         0.382974        4.24           88
Andrews McMeel Publishing             0.379577        4.35           13
Mariner Books                         0.378845        3.93          150
Routledge                             0.368694        3.69           33
VIZ Media                             0.351626        4.43           14
Ballantine Books                      0.351337        3.88          144
HarperCollins                         0.349404        4.04          112
