In [10]:
"""
Research Opportunities Dataset Explorer
Comprehensive analysis and visualization of funding opportunities
"""

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Plot styling, applied once when this module/cell is executed:
# seaborn's "whitegrid" theme, plus a 14x8 inch default size for any
# figure that is created without an explicit figsize.
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (14, 8)

def load_dataset(filepath):
    """Read the merged opportunities CSV into a DataFrame.

    The file is parsed as UTF-8 with ``quoting=1``. Any failure while
    reading (or while printing the success line) is reported on stdout
    instead of being raised.

    Args:
        filepath: Location of the CSV file, in any form ``pd.read_csv`` accepts.

    Returns:
        The loaded DataFrame, or ``None`` when loading failed.
    """
    try:
        frame = pd.read_csv(filepath, encoding='utf-8', quoting=1)
        row_total = len(frame)
        print(f"‚úÖ Loaded dataset: {row_total} opportunities")
    except Exception as err:
        # Best-effort loader: callers check for None rather than catching.
        print(f"‚ùå Error loading dataset: {err}")
        return None
    return frame

def basic_statistics(df):
    """Print headline numbers for the dataset and summarise missing values.

    Printed to stdout: row and column totals, how many columns are numeric
    versus ``object`` dtype, the ten columns with the most missing values,
    and the deep memory footprint in MB.

    Args:
        df: DataFrame to describe.

    Returns:
        DataFrame indexed by column name with ``Missing_Count`` and
        ``Percentage`` columns. It holds only columns that have at least one
        missing value, ordered by ``Missing_Count`` descending.
    """
    rule = "="*80
    print("\n" + rule)
    print("üìä BASIC DATASET STATISTICS")
    print(rule)

    n_rows = len(df)
    print(f"\nüìù Total Opportunities: {n_rows}")
    print(f"üìã Total Columns: {len(df.columns)}")
    numeric_columns = df.select_dtypes(include=[np.number]).columns
    print(f"üî¢ Numeric Columns: {len(numeric_columns)}")
    text_columns = df.select_dtypes(include=['object']).columns
    print(f"üìÑ Text Columns: {len(text_columns)}")

    # Per-column null counts and their share of all rows.
    print("\nüîç Missing Data Analysis:")
    null_counts = df.isnull().sum()
    missing_df = pd.DataFrame({
        'Missing_Count': null_counts,
        'Percentage': (null_counts / n_rows) * 100,
    })
    has_gaps = missing_df['Missing_Count'] > 0
    missing_df = missing_df[has_gaps].sort_values('Missing_Count', ascending=False)

    if missing_df.empty:
        print("   No missing data! üéâ")
    else:
        print(missing_df.head(10).to_string())

    # deep=True also counts the contents of object columns.
    total_bytes = df.memory_usage(deep=True).sum()
    print(f"\nüíæ Memory Usage: {total_bytes / 1024**2:.2f} MB")

    return missing_df

def geographic_analysis(df):
    """Report where the opportunities are located, by country and by region.

    For a ``country`` column: prints the 15 most frequent values and saves a
    horizontal bar chart of them. For a ``region`` column: prints all value
    counts and saves a pie chart. Either section is skipped when its column
    is absent.

    Args:
        df: DataFrame of opportunities.
    """
    rule = "="*80
    print("\n" + rule)
    print("üåç GEOGRAPHIC ANALYSIS")
    print(rule)

    present = df.columns

    # --- Countries ---------------------------------------------------
    if 'country' in present:
        per_country = df['country'].value_counts()
        leading = per_country.head(15)
        print(f"\nüìç Top Countries (Total: {len(per_country)}):")
        print(leading.to_string())

        # Horizontal bars for the same top-15 slice that was printed.
        plt.figure(figsize=(14, 6))
        leading.plot(kind='barh', color='steelblue')
        plt.title('Top 15 Countries by Number of Opportunities', fontsize=16, fontweight='bold')
        plt.xlabel('Number of Opportunities', fontsize=12)
        plt.ylabel('Country', fontsize=12)
        plt.tight_layout()
        plt.savefig(r'D:\D1\WTF\Hakathon\outputs\country_distribution.png', dpi=300, bbox_inches='tight')
        print("   üìä Saved: country_distribution.png")
        plt.close()

    # --- Regions -----------------------------------------------------
    if 'region' in present:
        per_region = df['region'].value_counts()
        print("\nüåé Regional Distribution:")
        print(per_region.to_string())

        # Pie chart with percentage labels; the y-label is blanked out.
        plt.figure(figsize=(10, 6))
        slice_colors = sns.color_palette("Set2")
        per_region.plot(kind='pie', autopct='%1.1f%%', colors=slice_colors)
        plt.title('Regional Distribution of Opportunities', fontsize=16, fontweight='bold')
        plt.ylabel('')
        plt.tight_layout()
        plt.savefig(r'D:\D1\WTF\Hakathon\outputs\region_distribution.png', dpi=300, bbox_inches='tight')
        print("   üìä Saved: region_distribution.png")
        plt.close()

def funding_analysis(df):
    """Report opportunity types, funding-amount statistics and currencies.

    Three independent sections, each skipped when its columns are absent:

    * ``opportunity_type``: prints value counts and saves a bar chart.
    * ``funding_amount_min`` / ``funding_amount_max`` / ``funding_amount_avg``:
      prints min, max, mean and median of each column that has at least one
      numeric value.
    * ``currency``: prints value counts.

    Args:
        df: DataFrame of opportunities. It is not modified.
    """
    print("\n" + "="*80)
    print("üí∞ FUNDING ANALYSIS")
    print("="*80)

    # Opportunity types
    if 'opportunity_type' in df.columns:
        type_counts = df['opportunity_type'].value_counts()
        print(f"\nüìã Opportunity Types:")
        print(type_counts.to_string())

        plt.figure(figsize=(12, 6))
        type_counts.plot(kind='bar', color='coral')
        plt.title('Distribution by Opportunity Type', fontsize=16, fontweight='bold')
        plt.xlabel('Opportunity Type', fontsize=12)
        plt.ylabel('Count', fontsize=12)
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()
        # Written to the same outputs folder as the other charts in this file.
        plt.savefig(r'D:\D1\WTF\Hakathon\outputs\opportunity_types.png', dpi=300, bbox_inches='tight')
        print("   üìä Saved: opportunity_types.png")
        plt.close()

    # Funding amount analysis
    funding_cols = ['funding_amount_min', 'funding_amount_max', 'funding_amount_avg']
    available_cols = [col for col in funding_cols if col in df.columns]

    if available_cols:
        print(f"\nüíµ Funding Statistics:")
        for col in available_cols:
            # A column read as text would break the numeric formatting below,
            # so coerce first; values that are not numbers become NaN and are
            # dropped together with genuinely missing ones.
            amounts = pd.to_numeric(df[col], errors='coerce').dropna()
            if amounts.empty:
                continue
            print(f"\n   {col}:")
            print(f"      Min:    ${amounts.min():,.2f}")
            print(f"      Max:    ${amounts.max():,.2f}")
            print(f"      Mean:   ${amounts.mean():,.2f}")
            print(f"      Median: ${amounts.median():,.2f}")

    # Currency distribution
    if 'currency' in df.columns:
        currency_counts = df['currency'].value_counts()
        print(f"\nüí± Currency Distribution:")
        print(currency_counts.to_string())

def deadline_analysis(df):
    """Summarise every column whose name contains "deadline".

    Each such column is parsed with ``pd.to_datetime(errors='coerce')``.
    When at least one value parses, the earliest and latest date, the share
    of valid dates and a per-year count are printed. When more than five
    values parse, a bar chart of deadlines per calendar month is saved as
    ``deadline_months_<column>.png``.

    Parsed dates are kept in a local Series, so ``df`` is left unchanged and
    later column counts are not inflated by helper columns.

    Args:
        df: DataFrame of opportunities. It is not modified.
    """
    print("\n" + "="*80)
    print("üìÖ DEADLINE ANALYSIS")
    print("="*80)

    month_labels = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                    'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
    deadline_cols = [col for col in df.columns if 'deadline' in col.lower()]

    for col in deadline_cols:
        # Best-effort per column: one failing column must not stop the rest.
        try:
            parsed = pd.to_datetime(df[col], errors='coerce')
            valid_dates = parsed.dropna()

            if valid_dates.empty:
                continue

            print(f"\nüìÜ {col}:")
            print(f"   Earliest: {valid_dates.min()}")
            print(f"   Latest:   {valid_dates.max()}")
            print(f"   Valid dates: {len(valid_dates)}/{len(df)}")

            # Count by year
            year_counts = valid_dates.dt.year.value_counts().sort_index()
            print(f"\n   By Year:")
            print(year_counts.to_string())

            # Plot deadline distribution by month
            if len(valid_dates) > 5:
                # Reindex to all 12 months so bar i is always month i+1;
                # otherwise months with no deadlines would shift the bars
                # relative to the Jan..Dec tick labels.
                month_counts = (
                    valid_dates.dt.month.value_counts()
                    .reindex(range(1, 13), fill_value=0)
                )
                plt.figure(figsize=(12, 6))
                month_counts.plot(kind='bar', color='mediumseagreen')
                plt.title(f'Deadline Distribution by Month - {col}', fontsize=16, fontweight='bold')
                plt.xlabel('Month', fontsize=12)
                plt.ylabel('Number of Opportunities', fontsize=12)
                plt.xticks(range(12), month_labels, rotation=0)
                plt.tight_layout()
                # rf-string: raw for the Windows backslashes, formatted so each
                # deadline column gets its own file.
                plt.savefig(rf'D:\D1\WTF\Hakathon\outputs\deadline_months_{col}.png', dpi=300, bbox_inches='tight')
                print(f"   üìä Saved: deadline_months_{col}.png")
                plt.close()
        except Exception as e:
            print(f"   ‚ö†Ô∏è Could not parse {col}: {e}")

def eligibility_analysis(df):
    """Report who the opportunities are open to.

    Covers three optional columns, each skipped when absent:
    ``career_stage`` (printed value counts plus a saved horizontal bar
    chart), ``field_of_study`` and ``nationality_requirement`` (ten most
    frequent values each, printed only).

    Args:
        df: DataFrame of opportunities.
    """
    rule = "="*80
    print("\n" + rule)
    print("‚úÖ ELIGIBILITY ANALYSIS")
    print(rule)

    # Career stage is the only section that also produces a chart.
    if 'career_stage' in df.columns:
        per_stage = df['career_stage'].value_counts()
        print("\nüë®‚Äçüéì Career Stage Distribution:")
        print(per_stage.to_string())

        plt.figure(figsize=(10, 6))
        per_stage.plot(kind='barh', color='mediumpurple')
        plt.title('Opportunities by Career Stage', fontsize=16, fontweight='bold')
        plt.xlabel('Number of Opportunities', fontsize=12)
        plt.ylabel('Career Stage', fontsize=12)
        plt.tight_layout()
        plt.savefig(r'D:\D1\WTF\Hakathon\outputs\career_stages.png', dpi=300, bbox_inches='tight')
        print("   üìä Saved: career_stages.png")
        plt.close()

    # The remaining sections share one shape: heading, then the top ten values.
    text_sections = (
        ('field_of_study', "\nüìö Field of Study Distribution:"),
        ('nationality_requirement', "\nüåê Nationality Requirements (Top 10):"),
    )
    for column, heading in text_sections:
        if column in df.columns:
            top_values = df[column].value_counts().head(10)
            print(heading)
            print(top_values.to_string())

def feature_analysis(df):
    """Rank program features by how many opportunities offer them.

    Looks at every ``object``-dtype column whose name starts with
    ``feature_`` and counts its ``'Yes'`` values; columns with no ``'Yes'``
    are left out. The 15 most common features are printed and saved as a
    horizontal bar chart. Nothing is printed beyond the section header when
    no feature has a ``'Yes'``.

    Args:
        df: DataFrame of opportunities.
    """
    print("\n" + "="*80)
    print("‚≠ê FEATURE ANALYSIS")
    print("="*80)

    feature_cols = [col for col in df.columns if col.startswith('feature_')]

    feature_summary = {}
    for col in feature_cols:
        if df[col].dtype != 'object':
            continue
        # Count Yes/No
        counts = df[col].value_counts()
        if 'Yes' in counts.index:
            feature_summary[col.replace('feature_', '')] = counts['Yes']

    if not feature_summary:
        return

    feature_df = pd.DataFrame.from_dict(feature_summary, orient='index', columns=['Count'])
    feature_df = feature_df.sort_values('Count', ascending=False)

    print(f"\n‚ú® Most Common Features:")
    print(feature_df.head(15).to_string())

    # Plot features. DataFrame.plot opens a figure of its own unless it is
    # given an Axes, so draw onto an explicit one: the 12x8 size is then
    # honoured and no empty figure is left open.
    fig, ax = plt.subplots(figsize=(12, 8))
    feature_df.head(15).plot(kind='barh', color='gold', legend=False, ax=ax)
    ax.set_title('Top 15 Program Features', fontsize=16, fontweight='bold')
    ax.set_xlabel('Number of Programs Offering', fontsize=12)
    ax.set_ylabel('Feature', fontsize=12)
    fig.tight_layout()
    fig.savefig(r'D:\D1\WTF\Hakathon\outputs\program_features.png', dpi=300, bbox_inches='tight')
    print("   üìä Saved: program_features.png")
    plt.close(fig)

def competitiveness_analysis(df):
    """Report how selective the opportunities are.

    Two optional sections, each skipped when its column is absent:

    * ``acceptance_rate_category``: prints value counts and saves a pie chart.
    * ``acceptance_rate_percent``: prints min, max, mean and median of the
      values that are numeric.

    Args:
        df: DataFrame of opportunities. It is not modified.
    """
    print("\n" + "="*80)
    print("üéØ COMPETITIVENESS ANALYSIS")
    print("="*80)

    if 'acceptance_rate_category' in df.columns:
        comp_counts = df['acceptance_rate_category'].value_counts()
        print(f"\nüèÜ Acceptance Rate Categories:")
        print(comp_counts.to_string())

        plt.figure(figsize=(10, 6))
        comp_counts.plot(kind='pie', autopct='%1.1f%%', colors=sns.color_palette("RdYlGn_r"))
        plt.title('Competitiveness Distribution', fontsize=16, fontweight='bold')
        plt.ylabel('')
        plt.tight_layout()
        # Written to the same outputs folder as the other charts in this file.
        plt.savefig(r'D:\D1\WTF\Hakathon\outputs\competitiveness.png', dpi=300, bbox_inches='tight')
        print("   üìä Saved: competitiveness.png")
        plt.close()

    if 'acceptance_rate_percent' in df.columns:
        # A column read as text would break the numeric formatting below, so
        # coerce first; values that are not numbers become NaN and are
        # dropped together with genuinely missing ones.
        acc_rate = pd.to_numeric(df['acceptance_rate_percent'], errors='coerce').dropna()
        if not acc_rate.empty:
            print(f"\nüìä Acceptance Rate Statistics:")
            print(f"   Min:    {acc_rate.min():.1f}%")
            print(f"   Max:    {acc_rate.max():.1f}%")
            print(f"   Mean:   {acc_rate.mean():.1f}%")
            print(f"   Median: {acc_rate.median():.1f}%")

def generate_summary_report(df, output_path):
    """Write a plain-text overview of the dataset to ``output_path``.

    The report has a banner, the row and column totals and, for each of the
    columns that exist, a numbered top-10 list of ``country`` values, a
    numbered top-10 list of ``institution`` values and a bulleted list of all
    ``opportunity_type`` values with their counts. The file is written as
    UTF-8 and its location is printed afterwards.

    Args:
        df: DataFrame of opportunities.
        output_path: Destination of the text file; overwritten if present.
    """
    rule = "="*80
    report_lines = [
        rule,
        "RESEARCH OPPORTUNITIES DATASET - COMPREHENSIVE SUMMARY REPORT",
        rule,
        "",
        # Basic info
        f"Total Opportunities: {len(df)}",
        f"Total Columns: {len(df.columns)}",
        "",
    ]

    # Ranked top-10 sections share one layout: heading, numbered rows, blank.
    ranked_sections = (
        ('country', "TOP 10 COUNTRIES:"),
        ('institution', "TOP 10 INSTITUTIONS:"),
    )
    for column, heading in ranked_sections:
        if column not in df.columns:
            continue
        report_lines.append(heading)
        leaders = df[column].value_counts().head(10)
        report_lines.extend(
            f"  {rank}. {label}: {tally} opportunities"
            for rank, (label, tally) in enumerate(leaders.items(), 1)
        )
        report_lines.append("")

    # Opportunity types are listed in full, as bullets rather than ranks.
    if 'opportunity_type' in df.columns:
        report_lines.append("OPPORTUNITY TYPES:")
        per_type = df['opportunity_type'].value_counts()
        report_lines.extend(f"  ‚Ä¢ {label}: {tally}" for label, tally in per_type.items())
        report_lines.append("")

    # Save report
    report_text = '\n'.join(report_lines)
    with open(output_path, 'w', encoding='utf-8') as handle:
        handle.write(report_text)

    print(f"\nüìÑ Summary report saved to: {output_path}")

def main():
    """Run the whole exploration pipeline on the merged opportunities CSV.

    Checks that the dataset file exists, loads it, runs every analysis
    section in turn, writes the text summary report and finally prints the
    list of artefact names together with the output folder. Returns early
    (after printing a message) when the file is missing or cannot be loaded.
    """
    print("\nüöÄ Research Opportunities Dataset Explorer")
    print("="*80)

    # Find the merged CSV
    csv_path = r'D:\D1\WTF\Hakathon\Data Batches\research_opportunities_complete.csv'
    # Folder that receives the summary report; it is also the location
    # reported to the user at the end, so both come from this one variable.
    output_dir = r'D:\D1\WTF\Hakathon\outputs'

    if not Path(csv_path).exists():
        print(f"‚ùå Dataset not found at: {csv_path}")
        print("   Please run merge_batches.py first!")
        return

    # Load dataset
    df = load_dataset(csv_path)
    if df is None:
        return

    # Run all analyses
    basic_statistics(df)
    geographic_analysis(df)
    funding_analysis(df)
    deadline_analysis(df)
    eligibility_analysis(df)
    feature_analysis(df)
    competitiveness_analysis(df)

    # Generate summary report
    generate_summary_report(df, output_dir + '\\dataset_summary_report.txt')

    print("\n" + "="*80)
    print("‚úÖ ANALYSIS COMPLETE!")
    print("="*80)
    print("\nüìÅ Generated Files:")
    print("   ‚Ä¢ country_distribution.png")
    print("   ‚Ä¢ region_distribution.png")
    print("   ‚Ä¢ opportunity_types.png")
    print("   ‚Ä¢ deadline_months_*.png")
    print("   ‚Ä¢ career_stages.png")
    print("   ‚Ä¢ program_features.png")
    print("   ‚Ä¢ competitiveness.png")
    print("   ‚Ä¢ dataset_summary_report.txt")
    print(f"\nüéâ All visualizations saved to {output_dir}")

# Run the full analysis only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


🚀 Research Opportunities Dataset Explorer
✅ Loaded dataset: 72 opportunities

📊 BASIC DATASET STATISTICS

📝 Total Opportunities: 72
📋 Total Columns: 92
🔢 Numeric Columns: 3
📄 Text Columns: 89

🔍 Missing Data Analysis:
                        Missing_Count  Percentage
funding_amount_max                 72  100.000000
funding_amount_min                 72  100.000000
age_restrictions                   68   94.444444
age_limit                          64   88.888889
minimum_education                  42   58.333333
target_career_stage                42   58.333333
target_demographics                42   58.333333
currency_code                      42   58.333333
citizenship_required               42   58.333333
funding_amount_typical             42   58.333333

💾 Memory Usage: 0.35 MB

🌍 GEOGRAPHIC ANALYSIS

📍 Top Countries (Total: 29):
country
United States         14
United Kingdom         7
Multiple               7
Canada                 5
Multiple coun