# Stažení záznamů o všech datasetech v Zenodu za r. 2025

## Základní parametry dotazu:
* `'q': 'resource_type.type:dataset AND publication_date:2025'`
* `'size': page_size,`
* `'page': page,`
* `'sort': 'mostrecent'`

In [None]:
import requests
import pandas as pd
import time
from datetime import datetime
import json
from typing import List, Dict, Any
import os

class ZenodoDatasetRetriever:
    """
    Download Zenodo dataset records published in 2025 and export them.

    The class pages through the Zenodo records search endpoint, flattens each
    raw record into a tabular row, counts creator/contributor affiliations and
    writes the results to JSON, CSV and XLSX files.
    """

    # Seconds to wait for a response before giving up on a request; without a
    # timeout a stalled connection would block the pagination loop forever.
    REQUEST_TIMEOUT = 60

    # Translation table removing the control characters that openpyxl rejects
    # with IllegalCharacterError (0x00-0x08, 0x0B, 0x0C, 0x0E-0x1F).
    _XLSX_ILLEGAL_CHARS = dict.fromkeys([*range(0, 9), 11, 12, *range(14, 32)])

    def __init__(self):
        self.base_url = "https://zenodo.org/api/records"
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'ZenodoDatasetRetriever/1.0'
        })

    @staticmethod
    def _as_count(total):
        """
        Normalise the 'total' field of a search response to an int.

        Accepts a plain number or a dict carrying the number under 'value'.
        Returns None when no usable number is present, meaning "unknown".
        """
        if isinstance(total, dict):
            total = total.get('value')
        try:
            return int(total)
        except (TypeError, ValueError):
            return None

    @staticmethod
    def _sub_field(container: Dict[Any, Any], key: str, sub_key: str) -> Any:
        """
        Return container[key][sub_key], or '' when either level is missing
        or container[key] is not a dict (e.g. JSON null).
        """
        value = container.get(key)
        if isinstance(value, dict):
            return value.get(sub_key, '')
        return ''

    @classmethod
    def _excel_safe(cls, frame: pd.DataFrame) -> pd.DataFrame:
        """
        Return a copy of frame whose string cells contain no characters that
        openpyxl refuses to write. Non-string cells are left untouched.
        """
        def clean(value):
            if isinstance(value, str):
                return value.translate(cls._XLSX_ILLEGAL_CHARS)
            return value

        safe = frame.copy()
        for column in safe.columns:
            # Numeric columns cannot hold strings, so skip the per-cell pass.
            if not pd.api.types.is_numeric_dtype(safe[column]):
                safe[column] = safe[column].map(clean)
        return safe

    def get_datasets_2025(self, page_size: int = 100) -> List[Dict[Any, Any]]:
        """
        Retrieve all dataset records from Zenodo with a 2025 publication date.

        Pages through the search endpoint until the reported total is reached
        or an empty page comes back. On a request or decoding error the loop
        stops and the records collected so far are returned, so the result
        may be partial; the error is printed.

        Args:
            page_size: Number of records requested per page.

        Returns:
            The raw record dicts in the order the API returned them.
        """
        all_records = []
        page = 1
        total_hits = None  # None = total not (yet) known

        while True:
            print(f"Fetching page {page}...")

            # Parameters for the API request
            params = {
                'q': 'resource_type.type:dataset AND publication_date:2025',
                'size': page_size,
                'page': page,
                'sort': 'mostrecent'
            }

            # Keep the try body limited to the calls that perform network I/O
            # and response decoding.
            try:
                response = self.session.get(
                    self.base_url, params=params, timeout=self.REQUEST_TIMEOUT
                )
                response.raise_for_status()
                data = response.json()
            except (requests.exceptions.RequestException, ValueError) as e:
                # NOTE(review): search backends commonly cap deep pagination;
                # if that applies here, very large result sets end in this
                # branch with a partial result - confirm against the API docs.
                print(f"Error fetching page {page}: {e}")
                break

            hits_section = data.get('hits') or {}
            hits = hits_section.get('hits') or []

            # Print total hits on first page
            if page == 1:
                total_hits = self._as_count(hits_section.get('total'))
                print(f"Total hits found: {total_hits}")

            if not hits:
                print(f"No more records found. Total pages processed: {page-1}")
                break

            all_records.extend(hits)
            print(f"Retrieved {len(hits)} records from page {page}. Total so far: {len(all_records)}")

            # Stop once everything reported by the API has been collected.
            # When the total is unknown, the empty-page check above ends the loop.
            if total_hits is not None and len(all_records) >= total_hits:
                print(f"Retrieved all {total_hits} records")
                break

            page += 1

            # Be respectful to the API - add a small delay
            time.sleep(0.1)

        return all_records

    def extract_record_info(self, record: Dict[Any, Any]) -> Dict[str, Any]:
        """
        Flatten one raw Zenodo record into a dict of scalar columns.

        Missing or null fields fall back to '' (or 0 for sizes/counts) instead
        of raising, so one malformed record cannot abort a whole export.

        Args:
            record: A raw record dict as returned by the search endpoint.

        Returns:
            A dict with identifiers, descriptive metadata, file statistics
            and a JSON-encoded list of per-file details ('files_info').
        """
        metadata = record.get('metadata') or {}

        # Creator names; entries that are not dicts are ignored.
        creator_names = [
            str(creator.get('name') or '')
            for creator in (metadata.get('creators') or [])
            if isinstance(creator, dict)
        ]

        # Keywords; a bare string is treated as a single keyword so it is not
        # split into characters by the join below.
        keywords = metadata.get('keywords') or []
        if isinstance(keywords, str):
            keywords = [keywords]
        keywords = [str(keyword) for keyword in keywords]

        related_identifiers = metadata.get('related_identifiers') or []

        # File information; only dict entries carry key/size/checksum.
        files = [entry for entry in (record.get('files') or []) if isinstance(entry, dict)]
        file_info = []
        total_size = 0
        for file in files:
            size = file.get('size') or 0  # null size counts as 0 bytes
            file_info.append({
                'filename': file.get('key', ''),
                'size': size,
                'checksum': file.get('checksum', '')
            })
            total_size += size

        record_id = record.get('id', '')

        return {
            'id': record_id,
            'doi': record.get('doi', ''),
            'title': metadata.get('title', ''),
            'description': metadata.get('description', ''),
            'publication_date': metadata.get('publication_date', ''),
            'creators': '; '.join(creator_names),
            'keywords': '; '.join(keywords),
            'license': self._sub_field(metadata, 'license', 'id'),
            'access_right': metadata.get('access_right', ''),
            'resource_type': self._sub_field(metadata, 'resource_type', 'type'),
            'language': metadata.get('language', ''),
            'version': metadata.get('version', ''),
            'file_count': len(files),
            'total_size_bytes': total_size,
            'total_size_mb': round(total_size / (1024 * 1024), 2),
            'zenodo_url': f"https://zenodo.org/record/{record_id}",
            'created': record.get('created', ''),
            'updated': record.get('updated', ''),
            'conceptrecid': record.get('conceptrecid', ''),
            'conceptdoi': record.get('conceptdoi', ''),
            'related_identifiers_count': len(related_identifiers),
            'bucket_url': self._sub_field(record, 'links', 'bucket'),
            'files_info': json.dumps(file_info) if file_info else ''
        }

    def extract_affiliations(self, records: List[Dict[Any, Any]]) -> pd.DataFrame:
        """
        Count how often each affiliation occurs among creators and contributors.

        Only non-blank string affiliations are counted; they are stripped of
        surrounding whitespace first. Every person entry counts once, so an
        institution with several authors on one record is counted several times.

        Args:
            records: Raw record dicts as returned by the search endpoint.

        Returns:
            A DataFrame with columns 'instituce' (affiliation) and
            'počet_výskytů' (number of occurrences), ordered as produced by
            ``Series.value_counts()``. Empty (but with both columns) when no
            affiliation is found.
        """
        affiliations = []

        for record in records:
            metadata = record.get('metadata') or {}
            # Creators first, then contributors, both handled identically.
            for role in ('creators', 'contributors'):
                for person in metadata.get(role) or []:
                    if not isinstance(person, dict):
                        continue
                    affiliation = person.get('affiliation')
                    if isinstance(affiliation, str) and affiliation.strip():
                        affiliations.append(affiliation.strip())

        if not affiliations:
            return pd.DataFrame(columns=['instituce', 'počet_výskytů'])

        affiliation_counts = pd.Series(affiliations).value_counts()
        return pd.DataFrame({
            'instituce': affiliation_counts.index,
            'počet_výskytů': affiliation_counts.values
        })

    @staticmethod
    def _build_summary(df: pd.DataFrame, affiliations_df: pd.DataFrame, retrieved_at: datetime) -> pd.DataFrame:
        """Build the two-column Metric/Value table for the 'Summary' sheet."""
        license_mode = df['license'].mode()
        language_mode = df['language'].mode()
        has_rows = len(df) > 0

        return pd.DataFrame({
            'Metric': [
                'Total Datasets',
                'Date Range',
                'Most Common License',
                'Most Common Language',
                'Total Size (GB)',
                'Average Files per Dataset',
                'Unique Affiliations',
                'Most Common Affiliation',
                'Retrieval Date'
            ],
            'Value': [
                len(df),
                '2025',
                license_mode.iloc[0] if not license_mode.empty else 'N/A',
                language_mode.iloc[0] if not language_mode.empty else 'N/A',
                round(df['total_size_bytes'].sum() / (1024**3), 2) if has_rows else 0,
                round(df['file_count'].mean(), 2) if has_rows else 0,
                len(affiliations_df),
                affiliations_df['instituce'].iloc[0] if len(affiliations_df) > 0 else 'N/A',
                retrieved_at.strftime('%Y-%m-%d %H:%M:%S')
            ]
        })

    def save_to_files(self, records: List[Dict[Any, Any]], base_filename: str = 'zenodo_datasets_2025'):
        """
        Save records to JSON, CSV and XLSX files in the working directory.

        Four files are written, all sharing one timestamp suffix: the raw
        records as JSON, the flattened records as CSV, the affiliation counts
        as CSV, and an XLSX workbook with 'Datasets', 'Affiliations' and
        'Summary' sheets.

        Args:
            records: Raw record dicts as returned by the search endpoint.
            base_filename: Prefix used for every output file name.

        Returns:
            A 6-tuple ``(csv_filename, xlsx_filename, json_filename,
            affiliations_filename, df, affiliations_df)``.
        """
        # Process records to extract structured information
        processed_records = []

        print("Processing records...")
        for i, record in enumerate(records, 1):
            if i % 100 == 0:
                print(f"Processed {i}/{len(records)} records")
            processed_records.append(self.extract_record_info(record))

        df = pd.DataFrame(processed_records)
        if df.columns.empty:
            # No records: still provide the expected columns so the summary
            # and the exports below do not fail on missing column names.
            df = pd.DataFrame(columns=list(self.extract_record_info({})))

        # Extract affiliations
        print("Extracting affiliations...")
        affiliations_df = self.extract_affiliations(records)

        # One clock reading for all file names and embedded dates.
        now = datetime.now()
        timestamp = now.strftime("%Y%m%d_%H%M%S")
        csv_filename = f"{base_filename}_{timestamp}.csv"
        xlsx_filename = f"{base_filename}_{timestamp}.xlsx"
        json_filename = f"{base_filename}_{timestamp}.json"
        affiliations_filename = f"{base_filename}_affiliations_{timestamp}.csv"

        # Save raw JSON data
        print(f"Saving to JSON: {json_filename}")
        with open(json_filename, 'w', encoding='utf-8') as f:
            json.dump({
                'metadata': {
                    'total_records': len(records),
                    'retrieval_date': now.isoformat(),
                    'query': 'resource_type.type:dataset AND publication_date:2025',
                    'source': 'Zenodo API'
                },
                'records': records
            }, f, indent=2, ensure_ascii=False)

        # Save to CSV
        print(f"Saving to CSV: {csv_filename}")
        df.to_csv(csv_filename, index=False, encoding='utf-8')

        # Save affiliations to CSV
        print(f"Saving affiliations to CSV: {affiliations_filename}")
        affiliations_df.to_csv(affiliations_filename, index=False, encoding='utf-8')

        # Save to Excel with multiple sheets. Text is sanitised for this
        # format only; the CSV/JSON outputs and the returned frames keep the
        # original characters.
        print(f"Saving to Excel: {xlsx_filename}")
        with pd.ExcelWriter(xlsx_filename, engine='openpyxl') as writer:
            self._excel_safe(df).to_excel(writer, sheet_name='Datasets', index=False)
            self._excel_safe(affiliations_df).to_excel(writer, sheet_name='Affiliations', index=False)
            summary_df = self._build_summary(df, affiliations_df, now)
            self._excel_safe(summary_df).to_excel(writer, sheet_name='Summary', index=False)

        print(f"Successfully saved {len(processed_records)} records to all formats")
        print(f"Files created: {csv_filename}, {xlsx_filename}, {json_filename}, {affiliations_filename}")
        print(f"Found {len(affiliations_df)} unique affiliations")

        return csv_filename, xlsx_filename, json_filename, affiliations_filename, df, affiliations_df

def main():
    """
    Run the full Zenodo dataset retrieval for 2025 and print a summary.

    Fetches the records, writes the export files via
    ``ZenodoDatasetRetriever.save_to_files`` and prints basic statistics,
    the ten most frequent keywords and the ten most frequent affiliations.

    Returns:
        ``(df, affiliations_df)`` when records were retrieved, otherwise
        None (nothing is written in that case).
    """
    print("Starting Zenodo Dataset Retrieval for 2025...")
    print("=" * 50)

    retriever = ZenodoDatasetRetriever()

    # Retrieve all dataset records from 2025
    records = retriever.get_datasets_2025()

    if not records:
        print("No records found for 2025")
        return None

    print(f"\nTotal records retrieved: {len(records)}")

    # Save to files
    csv_file, xlsx_file, json_file, affiliations_file, df, affiliations_df = retriever.save_to_files(records)

    # Display basic statistics
    license_mode = df['license'].mode()
    most_common_license = license_mode.iloc[0] if not license_mode.empty else 'N/A'

    print("\n" + "=" * 50)
    print("SUMMARY STATISTICS:")
    print("=" * 50)
    print(f"Total datasets: {len(df)}")
    print("Date range: 2025-01-01 to 2025-12-31")
    print(f"Total size: {df['total_size_bytes'].sum() / (1024**3):.2f} GB")
    print(f"Average files per dataset: {df['file_count'].mean():.2f}")
    print(f"Most common license: {most_common_license}")

    # Top 10 most common keywords. The 'keywords' column holds keywords joined
    # with '; '; blank fragments are dropped so they are not counted as a keyword.
    all_keywords = [
        kw.strip()
        for keywords_str in df['keywords'].dropna()
        for kw in keywords_str.split(';')
        if kw.strip()
    ]

    if all_keywords:
        keyword_counts = pd.Series(all_keywords).value_counts().head(10)
        print("\nTop 10 keywords:")
        for keyword, count in keyword_counts.items():
            print(f"  {keyword}: {count}")

    print("\nTop 10 affiliations:")
    if len(affiliations_df) > 0:
        top_affiliations = affiliations_df.head(10)
        for name, count in zip(top_affiliations['instituce'], top_affiliations['počet_výskytů']):
            print(f"  {name}: {count}")
    else:
        print("  No affiliations found")

    print("\nFiles saved:")
    print(f"  CSV: {csv_file}")
    print(f"  Excel: {xlsx_file}")
    print(f"  JSON: {json_file}")
    print(f"  Affiliations: {affiliations_file}")

    return df, affiliations_df

# Entry point: run the retrieval when executed directly, otherwise only announce usage
if __name__ != "__main__":
    # Reached when __name__ is not "__main__" (e.g. the file is imported as a
    # module): nothing is downloaded, only a usage hint is printed.
    print("Zenodo Dataset Retriever loaded. Use main() to run the full retrieval.")
    print("Or create a ZenodoDatasetRetriever instance for custom usage.")
else:
    # main() returns either None or the tuple (df, affiliations_df); whatever
    # it returns is bound to `df` unchanged.
    df = main()