# Hypothese 6


In [1]:
import gc
import json
import os
import pickle
from collections import Counter
from itertools import combinations

import numpy as np
import pandas as pd
import plotly.graph_objects as go
import pyarrow.parquet as pq
from plotly.offline import iplot

## Daten einlesen

In [2]:
# Keep the path string and the opened file under separate names:
# `parquet_dataset` is always the ParquetFile handle that the analysis
# functions below receive, never a plain string.
PARQUET_PATH = "data/input/Data_OpenAlex.parquet"
parquet_dataset = pq.ParquetFile(PARQUET_PATH)

## Länder zählen

In [3]:
def count_countries(parquet_dataset, chunk_size=100000):
    """
    Count, for every country code, the number of papers listing that country.

    The ``country_codes`` column is streamed in batches so the whole file never
    has to be held in memory at once.

    Parameters
    ----------
    parquet_dataset : object
        Must provide ``num_row_groups`` and
        ``iter_batches(batch_size=..., columns=...)`` whose batches offer
        ``to_pandas()`` (e.g. ``pyarrow.parquet.ParquetFile``).
    chunk_size : int, default 100000
        Maximum number of rows per batch.

    Returns
    -------
    tuple[Counter, int]
        ``(country_counter, total_papers)``: papers per upper-cased country
        code, and the total number of rows processed (including rows without
        usable country codes).

    Notes
    -----
    * Codes may be separated by ``,``, ``;``, ``|`` or spaces, also mixed
      within one cell.
    * A code repeated within one paper is counted once, so every value is a
      paper count and can never exceed ``total_papers``.
    * Tokens longer than three characters are ignored; non-string cells are
      skipped.
    """
    # Map every supported separator onto ',' so one split handles mixed input.
    separator_table = str.maketrans({';': ',', '|': ',', ' ': ','})

    country_counter = Counter()
    total_papers = 0

    print(f"Anzahl Row Groups: {parquet_dataset.num_row_groups}")
    print(f"Chunk-Größe: {chunk_size:,} Zeilen")

    for batch_idx, batch in enumerate(parquet_dataset.iter_batches(
        batch_size=chunk_size,
        columns=['country_codes']
    )):
        df_chunk = batch.to_pandas()

        for country_codes_str in df_chunk['country_codes']:
            # Check the type first: missing values arrive as None/NaN and are
            # neither strings nor countable.
            if not isinstance(country_codes_str, str) or not country_codes_str:
                continue

            tokens = (
                token.strip().upper()
                for token in country_codes_str.translate(separator_table).split(',')
            )
            # dict.fromkeys de-duplicates while keeping first-seen order, which
            # keeps Counter insertion order (and tie order) deterministic.
            unique_codes = dict.fromkeys(
                token for token in tokens if token and len(token) <= 3
            )
            for country in unique_codes:
                country_counter[country] += 1

        total_papers += len(df_chunk)

        if (batch_idx + 1) % 10 == 0:
            print(f"Verarbeitet: {total_papers:,} Papers, {len(country_counter)} verschiedene Länder gefunden")

        # Release the batch before the next one is materialised.
        del df_chunk
        gc.collect()

    print(f"Total verarbeitete Papers: {total_papers:,}")
    print(f"Verschiedene Länder gefunden: {len(country_counter)}")

    return country_counter, total_papers

In [4]:
# Stream the whole file once, then list the 20 most frequent country codes
# together with their share of all processed papers.
country_counts, total_papers = count_countries(parquet_dataset)

print("Top 20 Länder nach Anzahl Papers")
top_countries = country_counts.most_common(20)
for position, (code, n_papers) in enumerate(top_countries, start=1):
    share = n_papers / total_papers * 100
    print(f"{position:2d}. {code:3s}: {n_papers:8,} Papers ({share:.2f}%)")

Anzahl Row Groups: 16
Chunk-Größe: 100,000 Zeilen
Verarbeitet: 1,000,000 Papers, 222 verschiedene Länder gefunden
Verarbeitet: 2,000,000 Papers, 223 verschiedene Länder gefunden
Verarbeitet: 3,000,000 Papers, 223 verschiedene Länder gefunden
Verarbeitet: 4,000,000 Papers, 223 verschiedene Länder gefunden
Verarbeitet: 5,000,000 Papers, 224 verschiedene Länder gefunden
Verarbeitet: 6,000,000 Papers, 224 verschiedene Länder gefunden
Verarbeitet: 7,000,000 Papers, 224 verschiedene Länder gefunden
Verarbeitet: 8,000,000 Papers, 224 verschiedene Länder gefunden
Verarbeitet: 9,000,000 Papers, 224 verschiedene Länder gefunden
Verarbeitet: 10,000,000 Papers, 225 verschiedene Länder gefunden
Verarbeitet: 11,000,000 Papers, 225 verschiedene Länder gefunden
Verarbeitet: 12,000,000 Papers, 226 verschiedene Länder gefunden
Verarbeitet: 13,000,000 Papers, 226 verschiedene Länder gefunden
Verarbeitet: 14,000,000 Papers, 226 verschiedene Länder gefunden
Verarbeitet: 15,000,000 Papers, 226 verschiedene 

In [5]:
# Tidy frame of paper counts per country, largest first.
# Column names are given explicitly so the frame has both columns (and can be
# sorted) even if the counter is empty; a list of dicts would then produce a
# column-less frame and sort_values would raise KeyError.
countries_df = (
    pd.DataFrame(list(country_counts.items()), columns=['Country_Code', 'Paper_Count'])
    .sort_values('Paper_Count', ascending=False)
    .reset_index(drop=True)
)

print(f"Länder DataFrame erstellt mit {len(countries_df)} Ländern")
countries_df.head(10)

Länder DataFrame erstellt mit 226 Ländern


Unnamed: 0,Country_Code,Paper_Count
0,GB,3057191
1,DE,2684903
2,FR,2340240
3,US,1715128
4,IT,1628325
5,ES,1490117
6,RU,1454541
7,NL,883788
8,PL,841244
9,CH,700694


## Company-Institutionen und EU-Länder-Kollaborationen

In [6]:
# Every supported separator is mapped onto ',' so one split handles mixed input.
_COUNTRY_SEPARATORS = str.maketrans({';': ',', '|': ',', ' ': ','})


def _split_country_codes(country_codes_str):
    """Return the distinct, upper-cased codes of one ``country_codes`` cell in first-seen order."""
    if not isinstance(country_codes_str, str) or not country_codes_str:
        return []
    tokens = (
        token.strip().upper()
        for token in country_codes_str.translate(_COUNTRY_SEPARATORS).split(',')
    )
    return list(dict.fromkeys(token for token in tokens if token))


def _paper_has_company(institutions_data):
    """Return True if any institution entry of one paper has ``type == 'company'``."""
    if isinstance(institutions_data, str):
        if not institutions_data:
            return False
        try:
            institutions = json.loads(institutions_data)
        except (json.JSONDecodeError, TypeError):
            # Best effort: unparsable JSON is treated as "no company".
            return False
    else:
        institutions = institutions_data

    # Type check before any truthiness test: array-like cells have no
    # unambiguous truth value.
    if not isinstance(institutions, (list, tuple, np.ndarray)):
        return False
    return any(
        isinstance(institution, dict) and institution.get('type') == 'company'
        for institution in institutions
    )


def analyze_company_institutions_and_collaborations(parquet_dataset, chunk_size=5000000):
    """
    Analyse collaboration between EU-27 countries and industry.

    Streams the columns ``institutions``, ``country_codes`` and ``fwci`` and,
    for every paper with at least one EU-27 country, counts papers per
    country, papers per country with a company institution, and papers per
    country pair. Positive FWCI values are collected per country and reduced
    to a median.

    Parameters
    ----------
    parquet_dataset : object
        Must provide ``num_row_groups`` and
        ``iter_batches(batch_size=..., columns=...)`` whose batches offer
        ``to_pandas()`` (e.g. ``pyarrow.parquet.ParquetFile``).
    chunk_size : int, default 5000000
        Maximum number of rows per batch.

    Returns
    -------
    dict
        Keys: ``total_papers`` (all rows seen), ``papers_with_companies``
        (EU papers with a company institution), ``country_counts``,
        ``company_country_collab``, ``country_country_collab`` (Counters; the
        last is keyed by alphabetically sorted code pairs), ``eu_countries``
        (set) and ``country_median_fwci`` (dict, only countries with at least
        one positive FWCI).

    Side effects
    ------------
    Pickles the result dict to
    ``data/cache/hypothese_6_company_institutions_collabos.pkl``.

    Notes
    -----
    Each paper contributes at most once per country to every statistic,
    including the FWCI lists, even if a code is repeated in ``country_codes``.
    """
    # EU-27 member states (ISO 3166-1 alpha-2 codes)
    eu_countries = {
        'AT', 'BE', 'BG', 'HR', 'CY', 'CZ', 'DK', 'EE', 'FI', 'FR',
        'DE', 'GR', 'HU', 'IE', 'IT', 'LV', 'LT', 'LU', 'MT', 'NL',
        'PL', 'PT', 'RO', 'SK', 'SI', 'ES', 'SE'
    }

    papers_with_companies = 0
    total_papers_analyzed = 0
    country_paper_counts = Counter()
    company_country_collaborations = Counter()
    country_country_collaborations = Counter()
    country_fwci_scores = {country: [] for country in eu_countries}

    print(f"Verarbeite {parquet_dataset.num_row_groups} Row Groups...")

    for batch_idx, batch in enumerate(parquet_dataset.iter_batches(
        batch_size=chunk_size,
        columns=['institutions', 'country_codes', 'fwci']
    )):
        df_chunk = batch.to_pandas()

        # Iterate the three columns in lock-step instead of DataFrame.iterrows():
        # iterrows builds a Series per row, which is very slow on batches of
        # this size.
        for institutions_data, country_codes_str, fwci_score in zip(
            df_chunk['institutions'], df_chunk['country_codes'], df_chunk['fwci']
        ):
            total_papers_analyzed += 1

            # Distinct EU countries of this paper, in first-seen order.
            countries_in_paper = [
                country
                for country in _split_country_codes(country_codes_str)
                if country in eu_countries
            ]
            if not countries_in_paper:
                continue

            # NaN fails the "> 0" comparison, so missing scores are skipped too.
            has_valid_fwci = (
                isinstance(fwci_score, (int, float, np.number)) and fwci_score > 0
            )
            # The institutions JSON is only parsed for papers that matter.
            has_company = _paper_has_company(institutions_data)

            for country in countries_in_paper:
                country_paper_counts[country] += 1
                if has_valid_fwci:
                    country_fwci_scores[country].append(fwci_score)
                if has_company:
                    company_country_collaborations[country] += 1

            if has_company:
                papers_with_companies += 1

            # Pairs are built from the sorted codes so (A, B) and (B, A) share a key.
            if len(countries_in_paper) > 1:
                for country1, country2 in combinations(sorted(countries_in_paper), 2):
                    country_country_collaborations[(country1, country2)] += 1

        if (batch_idx + 1) % 20 == 0:
            print(f"Verarbeitet: {total_papers_analyzed:,} Papers")

        del df_chunk
        gc.collect()

    # Median FWCI per country; sorted for a reproducible printout.
    country_median_fwci = {}
    for country, fwci_list in sorted(country_fwci_scores.items()):
        if fwci_list:
            country_median_fwci[country] = np.median(fwci_list)
            print(f"{country}: {len(fwci_list):,} Papers mit FWCI, Median: {country_median_fwci[country]:.3f}")

    results = {
        'total_papers': total_papers_analyzed,
        'papers_with_companies': papers_with_companies,
        'country_counts': country_paper_counts,
        'company_country_collab': company_country_collaborations,
        'country_country_collab': country_country_collaborations,
        'eu_countries': eu_countries,
        'country_median_fwci': country_median_fwci
    }

    # Cache the results on disk so the notebook can skip this pass next time.
    cache_path = "data/cache/hypothese_6_company_institutions_collabos.pkl"
    os.makedirs(os.path.dirname(cache_path), exist_ok=True)
    with open(cache_path, "wb") as f:
        pickle.dump(results, f)

    return results

In [7]:
# Reuse the pickled results when they exist; otherwise run the full analysis,
# which writes the cache file itself.
# NOTE: unpickling executes code embedded in the file, so only load caches
# produced by this notebook.
cache_file = "data/cache/hypothese_6_company_institutions_collabos.pkl"
if not os.path.exists(cache_file):
    print("Keine gecachte Version vorhanden -> Analyse wird gestartet")
    analysis_results = analyze_company_institutions_and_collaborations(parquet_dataset)
else:
    print("Lade Ergebnisse aus Cache")
    with open(cache_file, "rb") as cache_handle:
        analysis_results = pickle.load(cache_handle)

Lade Ergebnisse aus Cache


In [8]:
# Headline figures of the EU / industry analysis.
print("Ergebnisse")
for caption, key in (
    ("Total analysierte Papers", 'total_papers'),
    ("Papers mit Company-Institutionen", 'papers_with_companies'),
):
    print(f"{caption}: {analysis_results[key]:,}")
print(f"EU-Länder gefunden: {len(analysis_results['country_counts'])}")
fwci_medians = analysis_results['country_median_fwci'].values()
print(f"Median FWCI-Bereich: {min(fwci_medians):.3f} - {max(fwci_medians):.3f}")

Ergebnisse
Total analysierte Papers: 16,751,942
Papers mit Company-Institutionen: 688,905
EU-Länder gefunden: 27
Median FWCI-Bereich: 0.738 - 1.653


In [9]:
# Ten EU countries with the most papers and their share of all analysed papers.
print("Top 10 EU-Länder nach Anzahl Papers")
top_eu_countries = analysis_results['country_counts'].most_common(10)
for position, (code, n_papers) in enumerate(top_eu_countries, start=1):
    share = n_papers / analysis_results['total_papers'] * 100
    print(f"{position:2d}. {code}: {n_papers:8,} Papers ({share:.2f}%)")

Top 10 EU-Länder nach Anzahl Papers
 1. DE: 2,684,903 Papers (16.03%)
 2. FR: 2,340,240 Papers (13.97%)
 3. IT: 1,628,325 Papers (9.72%)
 4. ES: 1,490,117 Papers (8.90%)
 5. NL:  883,788 Papers (5.28%)
 6. PL:  841,244 Papers (5.02%)
 7. SE:  565,233 Papers (3.37%)
 8. BE:  533,431 Papers (3.18%)
 9. PT:  429,884 Papers (2.57%)
10. DK:  405,820 Papers (2.42%)


In [10]:
# Ten EU countries with the most papers that include a company institution.
print("Top 10 Company-Country Kollaborationen")
top_company_collab = analysis_results['company_country_collab'].most_common(10)
for position, (code, n_papers) in enumerate(top_company_collab, start=1):
    print(f"{position:2d}. {code}: {n_papers:8,} Papers mit Companies")

Top 10 Company-Country Kollaborationen
 1. DE:  233,502 Papers mit Companies
 2. FR:  139,291 Papers mit Companies
 3. IT:  104,189 Papers mit Companies
 4. NL:   79,626 Papers mit Companies
 5. ES:   79,034 Papers mit Companies
 6. SE:   55,836 Papers mit Companies
 7. BE:   50,955 Papers mit Companies
 8. AT:   35,713 Papers mit Companies
 9. CZ:   33,948 Papers mit Companies
10. DK:   31,183 Papers mit Companies


## Industrieanteile pro Land

In [11]:
# NOTE: from here on `country_counts` holds the EU-only paper counts of the
# cached analysis, not the all-country counter computed further up.
country_counts = analysis_results['country_counts']
company_country_collab = analysis_results['company_country_collab']
country_median_fwci = analysis_results['country_median_fwci']

# Industry share per EU country.
# The loop variable is named `paper_count` on purpose: calling it
# `total_papers` would overwrite the dataset-wide paper total returned by
# count_countries with the count of whichever country is iterated last.
industry_analysis = []
for country, paper_count in country_counts.items():
    industry_papers = company_country_collab.get(country, 0)
    industry_ratio = (industry_papers / paper_count) if paper_count > 0 else 0
    # Countries without any positive FWCI fall back to 1.0.
    fwci_median = country_median_fwci.get(country, 1.0)

    industry_analysis.append({
        'country': country,
        'total_papers': paper_count,
        'industry_papers': industry_papers,
        'industry_ratio': industry_ratio,
        'fwci_median': fwci_median
    })

# Highest industry share first
industry_analysis.sort(key=lambda x: x['industry_ratio'], reverse=True)

In [12]:
# Header and rows share the same column widths; the count columns are wide
# enough for seven-digit, comma-grouped values (e.g. 2,684,903), which
# previously overflowed and shifted the columns.
print("Top 15 Länder mit höchstem Industrieanteil")
print(f"\n{'Land':<16} {'Gesamt':>10} {'Industrie':>10} {'Anteil':>7} {'FWCI':>5}")

for i, data in enumerate(industry_analysis[:15], 1):
    country = data['country']
    total = data['total_papers']
    industry = data['industry_papers']
    ratio = data['industry_ratio']
    fwci = data['fwci_median']

    # "NN. " (4 chars) + 12-char country field = the 16-char 'Land' header column
    print(f"{i:2d}. {country:<12} {total:>10,} {industry:>10,} {ratio:>7.1%} {fwci:>5.2f}")

Top 15 Länder mit höchstem Industrieanteil

Land            Gesamt   Industrie Anteil   FWCI  
 1. CZ           329,260  33,948  10.3%  1.02
 2. SE           565,233  55,836   9.9%  1.58
 3. BE           533,431  50,955   9.6%  1.45
 4. AT           383,322  35,713   9.3%  1.41
 5. NL           883,788  79,626   9.0%  1.65
 6. DE           2,684,903 233,502   8.7%  1.33
 7. CY            23,566   1,871   7.9%  1.58
 8. FI           297,821  23,507   7.9%  1.55
 9. DK           405,820  31,183   7.7%  1.61
10. IE           225,952  16,034   7.1%  1.43
11. SI            95,454   6,691   7.0%  1.17
12. EE            47,562   3,245   6.8%  1.42
13. IT           1,628,325 104,189   6.4%  1.35
14. LU            43,109   2,570   6.0%  1.50
15. FR           2,340,240 139,291   6.0%  1.23


In [13]:
country_counts = analysis_results['country_counts']
company_country_collab = analysis_results['company_country_collab']

# Keep countries with at least 1000 papers and compute their industry share
# (the threshold guarantees a non-zero denominator).
relevant_countries_test = [
    code for code, n_papers in country_counts.items() if n_papers >= 1000
]
industry_ratios_test = [
    company_country_collab.get(code, 0) / country_counts[code]
    for code in relevant_countries_test
]

# (code, total papers, industry share) tuples, largest countries first
country_data = sorted(
    (
        (code, country_counts[code], share)
        for code, share in zip(relevant_countries_test, industry_ratios_test)
    ),
    key=lambda entry: entry[1],
    reverse=True,
)

print("Top 10 Länder bei Industrieanteile")
print(f"{'Land':<4} {'Gesamt Papers':<12} {'Industrie':<10} {'Anteil':<8}")

for code, paper_total, share in country_data[:10]:
    print(f"{code:<4} {paper_total:>10,} {company_country_collab.get(code, 0):>9,} {share:>6.1%}")

Top 10 Länder bei Industrieanteile
Land Gesamt Papers Industrie  Anteil  
DE    2,684,903   233,502   8.7%
FR    2,340,240   139,291   6.0%
IT    1,628,325   104,189   6.4%
ES    1,490,117    79,034   5.3%
NL      883,788    79,626   9.0%
PL      841,244    27,221   3.2%
SE      565,233    55,836   9.9%
BE      533,431    50,955   9.6%
PT      429,884    19,379   4.5%
DK      405,820    31,183   7.7%


In [14]:
# Expected segment sizes: one segment per country sized by its industry share,
# plus one extra segment that receives the mean share; all normalised to 1.
ratios_only = [share for _code, _n_papers, share in country_data]
avg_ratio = sum(ratios_only) / len(ratios_only)
print(f"Durchschnittlicher Industrie-Anteil: {avg_ratio:.1%}")

# Normalise the shares (countries first, mean-share segment last)
all_ratios = [*ratios_only, avg_ratio]
total_ratio = sum(all_ratios)
segment_props = [value / total_ratio for value in all_ratios]

print("\nErwartete Segmentgrößen:")
print(f"{'Land':<4} {'Industrie-Anteil':<15} {'Segment-Größe':<13}")

# segment_props follows the order of country_data, so the two can be paired directly
for (code, _n_papers, share), segment_size in zip(country_data[:10], segment_props):
    print(f"{code:<4} {share:>13.1%} {segment_size:>11.1%}")

Durchschnittlicher Industrie-Anteil: 6.2%

Erwartete Segmentgrößen:
Land Industrie-Anteil Segment-Größe
DE            8.7%        5.0%
FR            6.0%        3.4%
IT            6.4%        3.7%
ES            5.3%        3.0%
NL            9.0%        5.1%
PL            3.2%        1.8%
SE            9.9%        5.6%
BE            9.6%        5.5%
PT            4.5%        2.6%
DK            7.7%        4.4%


In [15]:
def create_chord_diagram(analysis_results, min_papers=1000):
    """
    Prepare the data for the EU-country / industry chord diagram.

    Nothing is drawn here; the function only derives labels, the connection
    matrix and the segment sizes, and prints a short summary.

    Parameters
    ----------
    analysis_results : dict
        Needs the keys ``country_counts``, ``company_country_collab``,
        ``country_country_collab`` and ``papers_with_companies``.
    min_papers : int, default 1000
        Countries with fewer papers are left out.

    Returns
    -------
    tuple
        ``(labels, matrix, normalized_sizes, industry_papers, segment_proportions)``

        * ``labels`` – country codes sorted by descending industry share,
          followed by ``"Industry"``.
        * ``matrix`` – symmetric ``numpy`` array of joint paper counts
          (country–country and country–industry), indexed like ``labels``.
        * ``normalized_sizes`` – industry paper counts scaled to 10..60.
        * ``industry_papers`` – industry paper count per label; the
          ``"Industry"`` entry is ``papers_with_companies``.
        * ``segment_proportions`` – industry shares normalised to sum to 1;
          the ``"Industry"`` entry is based on the mean country share.

    Notes
    -----
    If no selected country has any industry paper, the segments fall back to
    equal proportions and all sizes to the minimum of 10 instead of dividing
    by zero.
    """
    country_counts = analysis_results['country_counts']
    company_country_collab = analysis_results['company_country_collab']
    country_country_collab = analysis_results['country_country_collab']

    # Collect the countries above the threshold together with their industry data
    relevant_countries_data = []
    for country, count in country_counts.items():
        if count >= min_papers:
            country_industry_papers = company_country_collab.get(country, 0)
            # count can still be 0 when min_papers <= 0
            industry_ratio = (country_industry_papers / count) if count > 0 else 0
            relevant_countries_data.append({
                'country': country,
                'total_papers': count,
                'industry_papers': country_industry_papers,
                'industry_ratio': industry_ratio
            })

    # Highest industry share first
    relevant_countries_data.sort(key=lambda x: x['industry_ratio'], reverse=True)
    relevant_countries = [data['country'] for data in relevant_countries_data]

    print(f"Relevante EU-Länder (>= {min_papers} Papers): {len(relevant_countries)}")
    print(f"Sortiert nach Industrie-Anteil:")
    for i, data in enumerate(relevant_countries_data[:10]):
        print(f"  {i+1:2d}. {data['country']}: {data['industry_ratio']:.1%} ({data['industry_papers']:,} von {data['total_papers']:,})")

    labels = relevant_countries + ["Industry"]
    n_nodes = len(labels)
    matrix = np.zeros((n_nodes, n_nodes))

    # "Industry" is always the last node
    country_to_idx = {country: i for i, country in enumerate(relevant_countries)}
    industry_idx = n_nodes - 1

    # Country <-> industry links (symmetric)
    for country in relevant_countries:
        if country in company_country_collab:
            country_idx = country_to_idx[country]
            collab_count = company_country_collab[country]
            matrix[country_idx][industry_idx] = collab_count
            matrix[industry_idx][country_idx] = collab_count

    # Country <-> country links (symmetric); pairs with a filtered-out country are skipped
    for (country1, country2), count in country_country_collab.items():
        if country1 in country_to_idx and country2 in country_to_idx:
            idx1 = country_to_idx[country1]
            idx2 = country_to_idx[country2]
            matrix[idx1][idx2] = count
            matrix[idx2][idx1] = count

    industry_ratios = [data['industry_ratio'] for data in relevant_countries_data]
    industry_papers = [data['industry_papers'] for data in relevant_countries_data]

    # Append the "Industry" node: total company papers, mean country share
    # (0.1 as placeholder when no country passed the filter)
    industry_papers.append(analysis_results['papers_with_companies'])
    avg_industry_ratio = sum(industry_ratios) / len(industry_ratios) if industry_ratios else 0.1
    industry_ratios.append(avg_industry_ratio)

    # Normalise the shares to segment proportions
    total_ratio = sum(industry_ratios)
    if total_ratio > 0:
        segment_proportions = [ratio / total_ratio for ratio in industry_ratios]
    else:
        # No industry involvement at all: give every segment the same width
        segment_proportions = [1 / len(industry_ratios)] * len(industry_ratios)

    print(f"Segmentgrößen:")
    for label, prop, ratio in zip(labels, segment_proportions, industry_ratios):
        if label == "Industry":
            print(f"{label}: {prop:.1%} (Durchschnitt: {ratio:.1%})")
        else:
            print(f"{label}: {prop:.1%} (Industrie-Anteil: {ratio:.1%})")

    # Scale the absolute industry paper counts to the range 10..60
    max_papers = max(industry_papers)
    if max_papers > 0:
        normalized_sizes = [(size / max_papers) * 50 + 10 for size in industry_papers]
    else:
        normalized_sizes = [10.0] * len(industry_papers)

    return labels, matrix, normalized_sizes, industry_papers, segment_proportions

In [16]:
# Smoke-test create_chord_diagram on the analysis results. The five return
# values are, in order: labels, connection matrix, normalised sizes, industry
# paper counts and segment proportions; labels_corrected and props_corrected
# are read again by the cells below.
print("Test der Chord-Diagramm Funktion")
labels_corrected, matrix_corrected, sizes_corrected, actual_corrected, props_corrected = create_chord_diagram(analysis_results, min_papers=1000)

Test der Chord-Diagramm Funktion
Relevante EU-Länder (>= 1000 Papers): 27
Sortiert nach Industrie-Anteil:
   1. CZ: 10.3% (33,948 von 329,260)
   2. SE: 9.9% (55,836 von 565,233)
   3. BE: 9.6% (50,955 von 533,431)
   4. AT: 9.3% (35,713 von 383,322)
   5. NL: 9.0% (79,626 von 883,788)
   6. DE: 8.7% (233,502 von 2,684,903)
   7. CY: 7.9% (1,871 von 23,566)
   8. FI: 7.9% (23,507 von 297,821)
   9. DK: 7.7% (31,183 von 405,820)
  10. IE: 7.1% (16,034 von 225,952)
Segmentgrößen:
CZ: 5.9% (Industrie-Anteil: 10.3%)
SE: 5.6% (Industrie-Anteil: 9.9%)
BE: 5.5% (Industrie-Anteil: 9.6%)
AT: 5.3% (Industrie-Anteil: 9.3%)
NL: 5.1% (Industrie-Anteil: 9.0%)
DE: 5.0% (Industrie-Anteil: 8.7%)
CY: 4.5% (Industrie-Anteil: 7.9%)
FI: 4.5% (Industrie-Anteil: 7.9%)
DK: 4.4% (Industrie-Anteil: 7.7%)
IE: 4.1% (Industrie-Anteil: 7.1%)
SI: 4.0% (Industrie-Anteil: 7.0%)
EE: 3.9% (Industrie-Anteil: 6.8%)
IT: 3.7% (Industrie-Anteil: 6.4%)
LU: 3.4% (Industrie-Anteil: 6.0%)
FR: 3.4% (Industrie-Anteil: 6.0%)
GR: 3.

In [17]:
# Side-by-side view of segment proportion and industry share for the first
# ten labels, followed by the proportion of the industry segment (last entry).
print("Vergleich der Segmentgrößen:")
print(f"{'Land':<4} {'Segment':<8} {'Industrie-Anteil':<15}")

for label, segment_share in zip(labels_corrected[:10], props_corrected):
    if label == "Industry":
        continue
    n_papers = analysis_results['country_counts'][label]
    n_industry = analysis_results['company_country_collab'].get(label, 0)
    share = n_industry / n_papers if n_papers > 0 else 0

    print(f"{label:<4} {segment_share:>6.1%} {share:>13.1%}")

print(f"{'IND':<4} {props_corrected[-1]:>6.1%}")

Vergleich der Segmentgrößen:
Land Segment  Industrie-Anteil
CZ     5.9%         10.3%
SE     5.6%          9.9%
BE     5.5%          9.6%
AT     5.3%          9.3%
NL     5.1%          9.0%
DE     5.0%          8.7%
CY     4.5%          7.9%
FI     4.5%          7.9%
DK     4.4%          7.7%
IE     4.1%          7.1%
IND    3.6%


In [18]:
# Sanity check: all segments are divided by the same total, so the ratio of
# two segment sizes must equal the ratio of the underlying industry shares.
cz_idx = labels_corrected.index('CZ') if 'CZ' in labels_corrected else -1
se_idx = labels_corrected.index('SE') if 'SE' in labels_corrected else -1

if cz_idx >= 0 and se_idx >= 0:
    cz_size = props_corrected[cz_idx]
    se_size = props_corrected[se_idx]
    print(f"CZ={cz_size:.1%}, SE={se_size:.1%}")
    print(f"Verhältnis CZ/SE: {cz_size/se_size:.2f}")

    # Derive the expected ratio from the analysis results rather than from
    # hand-copied, rounded shares, so the check stays valid when the data changes.
    eu_paper_counts = analysis_results['country_counts']
    eu_industry_counts = analysis_results['company_country_collab']
    cz_ratio = eu_industry_counts.get('CZ', 0) / eu_paper_counts['CZ']
    se_ratio = eu_industry_counts.get('SE', 0) / eu_paper_counts['SE']
    expected_ratio = cz_ratio / se_ratio
    print(f"Erwartetes Verhältnis: {expected_ratio:.2f}")
else:
    print("CZ oder SE nicht gefunden!")

CZ=5.9%, SE=5.6%
Verhältnis CZ/SE: 1.04
Erwartetes Verhältnis: 1.04


In [19]:
# Textual legend for the chord diagram, one line per visual encoding.
legend_lines = (
    "Legende für das Chord-Diagramm",
    "Segmentgröße: Größe entspricht dem Anteil der Papers mit Industriebeteiligung",
    "Flaggen: Jede Flagge repräsentiert ein EU-Land, Kraftwerksymbol repräsentiert Industrie",
    "Farbiger äußerer Rand: Median FWCI-Score (Forschungsqualität) des Landes. Grün=Hoch, Gelb=Mittel, Rot=Niedrig",
    "Verbindungslinien zwischen Ländern: absolute Anzahl gemeinsamer Papers, Dicke entspricht Anzahl der gemeinsamen Papers",
)
print("\n".join(legend_lines))

Legende für das Chord-Diagramm
Segmentgröße: Größe entspricht dem Anteil der Papers mit Industriebeteiligung
Flaggen: Jede Flagge repräsentiert ein EU-Land, Kraftwerksymbol repräsentiert Industrie
Farbiger äußerer Rand: Median FWCI-Score (Forschungsqualität) des Landes. Grün=Hoch, Gelb=Mittel, Rot=Niedrig
Verbindungslinien zwischen Ländern: absolute Anzahl gemeinsamer Papers, Dicke entspricht Anzahl der gemeinsamen Papers


In [20]:
# Countries ranked by number of papers with a company institution, shown with
# that number's share of the country's own papers.
print("Top 15 Country-Industry Kollaborationen")
top_company_collaborations = analysis_results['company_country_collab'].most_common(15)
for position, (code, n_industry) in enumerate(top_company_collaborations, start=1):
    n_country = analysis_results['country_counts'][code]
    share = n_industry / n_country * 100 if n_country > 0 else 0
    print(f"{position:2d}. {code}: {n_industry:6,} Papers ({share:.1f}% der Länder-Papers)")

Top 15 Country-Industry Kollaborationen
 1. DE: 233,502 Papers (8.7% der Länder-Papers)
 2. FR: 139,291 Papers (6.0% der Länder-Papers)
 3. IT: 104,189 Papers (6.4% der Länder-Papers)
 4. NL: 79,626 Papers (9.0% der Länder-Papers)
 5. ES: 79,034 Papers (5.3% der Länder-Papers)
 6. SE: 55,836 Papers (9.9% der Länder-Papers)
 7. BE: 50,955 Papers (9.6% der Länder-Papers)
 8. AT: 35,713 Papers (9.3% der Länder-Papers)
 9. CZ: 33,948 Papers (10.3% der Länder-Papers)
10. DK: 31,183 Papers (7.7% der Länder-Papers)
11. PL: 27,221 Papers (3.2% der Länder-Papers)
12. FI: 23,507 Papers (7.9% der Länder-Papers)
13. PT: 19,379 Papers (4.5% der Länder-Papers)
14. GR: 16,660 Papers (5.8% der Länder-Papers)
15. IE: 16,034 Papers (7.1% der Länder-Papers)


In [21]:
# Most frequent EU country pairs by number of joint papers.
print("Top 15 Country-Country Kollaborationen")
top_country_collaborations = analysis_results['country_country_collab'].most_common(15)
for position, (pair, n_joint) in enumerate(top_country_collaborations, start=1):
    print(f"{position:2d}. {'-'.join(pair)}: {n_joint:6,} gemeinsame Papers")

Top 15 Country-Country Kollaborationen
 1. DE-FR: 158,689 gemeinsame Papers
 2. DE-IT: 138,216 gemeinsame Papers
 3. FR-IT: 129,180 gemeinsame Papers
 4. DE-NL: 126,181 gemeinsame Papers
 5. ES-IT: 110,568 gemeinsame Papers
 6. DE-ES: 104,129 gemeinsame Papers
 7. ES-FR: 101,508 gemeinsame Papers
 8. AT-DE: 88,057 gemeinsame Papers
 9. BE-FR: 74,758 gemeinsame Papers
10. IT-NL: 69,820 gemeinsame Papers
11. DE-SE: 69,167 gemeinsame Papers
12. FR-NL: 68,098 gemeinsame Papers
13. BE-DE: 60,765 gemeinsame Papers
14. BE-NL: 58,919 gemeinsame Papers
15. ES-NL: 53,176 gemeinsame Papers


In [22]:
# German display names for the EU-27 country codes
country_names = {
    'DE': 'Deutschland', 'FR': 'Frankreich', 'IT': 'Italien', 'ES': 'Spanien',
    'NL': 'Niederlande', 'BE': 'Belgien', 'AT': 'Österreich', 'PL': 'Polen',
    'SE': 'Schweden', 'DK': 'Dänemark', 'FI': 'Finnland', 'IE': 'Irland',
    'PT': 'Portugal', 'GR': 'Griechenland', 'CZ': 'Tschechien', 'RO': 'Rumänien',
    'HU': 'Ungarn', 'BG': 'Bulgarien', 'HR': 'Kroatien', 'SK': 'Slowakei',
    'SI': 'Slowenien', 'EE': 'Estland', 'LV': 'Lettland', 'LT': 'Litauen',
    'LU': 'Luxemburg', 'MT': 'Malta', 'CY': 'Zypern'
}

# One line per EU country found in the data, alphabetically by code;
# unknown codes fall back to the code itself as display name.
print("EU-Länder mit deutschen Namen")
eu_countries_in_data = sorted(analysis_results['country_counts'])
for code in eu_countries_in_data:
    n_papers = analysis_results['country_counts'][code]
    n_industry = analysis_results['company_country_collab'].get(code, 0)
    share = n_industry / n_papers * 100 if n_papers > 0 else 0
    print(f"{code} ({country_names.get(code, code)}): {n_papers:6,} Papers, {n_industry:5,} mit Industry ({share:.1f}%)")

EU-Länder mit deutschen Namen
AT (Österreich): 383,322 Papers, 35,713 mit Industry (9.3%)
BE (Belgien): 533,431 Papers, 50,955 mit Industry (9.6%)
BG (Bulgarien): 89,259 Papers, 2,773 mit Industry (3.1%)
CY (Zypern): 23,566 Papers, 1,871 mit Industry (7.9%)
CZ (Tschechien): 329,260 Papers, 33,948 mit Industry (10.3%)
DE (Deutschland): 2,684,903 Papers, 233,502 mit Industry (8.7%)
DK (Dänemark): 405,820 Papers, 31,183 mit Industry (7.7%)
EE (Estland): 47,562 Papers, 3,245 mit Industry (6.8%)
ES (Spanien): 1,490,117 Papers, 79,034 mit Industry (5.3%)
FI (Finnland): 297,821 Papers, 23,507 mit Industry (7.9%)
FR (Frankreich): 2,340,240 Papers, 139,291 mit Industry (6.0%)
GR (Griechenland): 285,386 Papers, 16,660 mit Industry (5.8%)
HR (Kroatien): 120,725 Papers, 4,399 mit Industry (3.6%)
HU (Ungarn): 196,164 Papers, 10,667 mit Industry (5.4%)
IE (Irland): 225,952 Papers, 16,034 mit Industry (7.1%)
IT (Italien): 1,628,325 Papers, 104,189 mit Industry (6.4%)
LT (Litauen): 63,132 Papers, 1,66

In [23]:
# Per-country totals in the order of eu_countries_in_data
paper_totals = [
    analysis_results['country_counts'][code] for code in eu_countries_in_data
]
industry_totals = [
    analysis_results['company_country_collab'].get(code, 0) for code in eu_countries_in_data
]

# One record per EU country: code, German name, totals and industry share in percent
eu_data = [
    {
        'Country_Code': code,
        'Country_Name_DE': country_names.get(code, code),
        'Total_Papers': n_papers,
        'Industry_Papers': n_industry,
        'Industry_Percentage': (n_industry / n_papers * 100) if n_papers > 0 else 0,
    }
    for code, n_papers, n_industry in zip(eu_countries_in_data, paper_totals, industry_totals)
]

eu_df = pd.DataFrame(eu_data).sort_values('Total_Papers', ascending=False)
eu_df.head(10)

Unnamed: 0,Country_Code,Country_Name_DE,Total_Papers,Industry_Papers,Industry_Percentage
5,DE,Deutschland,2684903,233502,8.696851
10,FR,Frankreich,2340240,139291,5.951996
15,IT,Italien,1628325,104189,6.398538
8,ES,Spanien,1490117,79034,5.303879
20,NL,Niederlande,883788,79626,9.009627
21,PL,Polen,841244,27221,3.235803
24,SE,Schweden,565233,55836,9.878404
1,BE,Belgien,533431,50955,9.552313
22,PT,Portugal,429884,19379,4.50796
6,DK,Dänemark,405820,31183,7.683949


In [24]:
def plot_chord_diagram(
    analysis_results, labels, matrix, node_sizes, actual_sizes, segment_proportions
):
    """
    Build a chord diagram of EU countries and their industry collaborations.

    Every label becomes a ring segment (radius 0.75 to 0.9) whose angular width
    is its entry in ``segment_proportions``. Country segments additionally get
    an outer band (radius 0.9 to 0.95) coloured by the country's median FWCI on
    a red -> yellow -> green gradient. Two nodes ``i`` and ``j`` are joined by a
    curved line when both ``matrix[i][j]`` and ``matrix[j][i]`` are positive;
    each pair is drawn once, using the weight seen from the lower index.

    Args:
        analysis_results: Mapping providing the dicts ``"country_median_fwci"``,
            ``"country_counts"`` and ``"company_country_collab"``, each keyed
            by label.
        labels: Node labels. The label ``"Industry"`` is treated as the
            industry node, every other label as a country code.
        matrix: Collaboration weights, indexed as ``matrix[i][j]`` in the
            order of ``labels``.
        node_sizes: Not used by this function; kept so existing callers work.
        actual_sizes: Per-node paper counts used for the hover percentages.
        segment_proportions: Per-node fraction of the full circle.

    Returns:
        The assembled ``plotly.graph_objects.Figure`` (chords below segments).
    """
    # Mapping from country codes to flag emojis
    country_flags = {
        "AT": "🇦🇹",
        "BE": "🇧🇪",
        "BG": "🇧🇬",
        "HR": "🇭🇷",
        "CY": "🇨🇾",
        "CZ": "🇨🇿",
        "DK": "🇩🇰",
        "EE": "🇪🇪",
        "FI": "🇫🇮",
        "FR": "🇫🇷",
        "DE": "🇩🇪",
        "GR": "🇬🇷",
        "HU": "🇭🇺",
        "IE": "🇮🇪",
        "IT": "🇮🇹",
        "LV": "🇱🇻",
        "LT": "🇱🇹",
        "LU": "🇱🇺",
        "MT": "🇲🇹",
        "NL": "🇳🇱",
        "PL": "🇵🇱",
        "PT": "🇵🇹",
        "RO": "🇷🇴",
        "SK": "🇸🇰",
        "SI": "🇸🇮",
        "ES": "🇪🇸",
        "SE": "🇸🇪",
    }

    n_nodes = len(labels)

    # Angular width of every segment
    segment_angles = [prop * 2 * np.pi for prop in segment_proportions]

    # Start and end angle of every segment, laid out consecutively from 0
    segment_starts = []
    segment_ends = []
    current_angle = 0

    for i in range(n_nodes):
        segment_starts.append(current_angle)
        current_angle += segment_angles[i]
        segment_ends.append(current_angle)

    # FWCI data and paper counts
    country_median_fwci = analysis_results["country_median_fwci"]
    country_counts = analysis_results["country_counts"]
    company_country_collab = analysis_results["company_country_collab"]

    def is_missing(value):
        """Return True for an absent (None) or NaN FWCI value."""
        return value is None or np.isnan(value)

    # Colour scale bounds come from the countries that actually have a median
    # FWCI. Selecting by label (instead of dropping the last entry) keeps this
    # correct even when "Industry" is not the final label.
    known_fwci = [
        country_median_fwci[label]
        for label in labels
        if label != "Industry" and not is_missing(country_median_fwci.get(label))
    ]
    min_fwci = min(known_fwci, default=0.0)
    max_fwci = max(known_fwci, default=0.0)
    fwci_range = max_fwci - min_fwci

    def fwci_color_mapping(fwci_score):
        """
        Map an FWCI score onto a red (low) -> yellow (mid) -> green (high)
        gradient. Missing scores are rendered in neutral grey.
        """
        if is_missing(fwci_score):
            return "rgb(200, 200, 200)"

        # Normalise to [0, 1]; a degenerate scale (all countries share the
        # same value) maps to the middle of the gradient instead of dividing
        # by zero.
        if fwci_range > 0:
            normalized = (fwci_score - min_fwci) / fwci_range
        else:
            normalized = 0.5
        normalized = min(1.0, max(0.0, normalized))

        if normalized <= 0.5:
            # Red (255,0,0) -> yellow (255,255,0)
            factor = normalized * 2
            red = 255
            green = int(255 * factor)
            blue = 0
        else:
            # Yellow (255,255,0) -> green (0,255,0)
            factor = (normalized - 0.5) * 2
            red = int(255 * (1 - factor))
            green = 255
            blue = 0

        return f"rgb({red}, {green}, {blue})"

    def ring_polygon(angles, r_inner, r_outer):
        """Return x/y of a closed annular sector: outer arc, inner arc back, close."""
        x_outer = r_outer * np.cos(angles)
        y_outer = r_outer * np.sin(angles)
        x_inner = r_inner * np.cos(angles[::-1])
        y_inner = r_inner * np.sin(angles[::-1])
        return (
            np.concatenate([x_outer, x_inner, [x_outer[0]]]),
            np.concatenate([y_outer, y_inner, [y_outer[0]]]),
        )

    # Chord positions: each node's segment is subdivided among its partners,
    # proportional to the partner's share of the node's row sum.
    chord_positions = {}
    node_total_collabs = [sum(matrix[i]) for i in range(n_nodes)]
    for i in range(n_nodes):
        segment_span = segment_angles[i]

        connections_for_node = [
            (j, matrix[i][j])
            for j in range(n_nodes)
            if i != j and matrix[i][j] > 0
        ]
        # Strongest partner first
        connections_for_node.sort(key=lambda x: x[1], reverse=True)

        current_angle = segment_starts[i]
        chord_positions[i] = []

        if node_total_collabs[i] > 0:
            for target_node, weight in connections_for_node:
                proportion = weight / node_total_collabs[i]
                chord_angle_span = segment_span * proportion

                chord_start = current_angle
                chord_end = current_angle + chord_angle_span

                chord_positions[i].append(
                    {
                        "target": target_node,
                        "weight": weight,
                        "start_angle": chord_start,
                        "end_angle": chord_end,
                        "mid_angle": (chord_start + chord_end) / 2,
                        "proportion": proportion,
                    }
                )

                current_angle += chord_angle_span

    # Segment, FWCI band and label traces
    all_traces = []
    for i, label in enumerate(labels):
        start_angle = segment_starts[i]
        end_angle = segment_ends[i]
        is_industry = label == "Industry"

        angles_seg = np.linspace(start_angle, end_angle, 50)

        # Main segment (radius 0.75 to 0.9)
        x_main, y_main = ring_polygon(angles_seg, 0.75, 0.9)

        # Segment colour: dark blue for industry, yellow for public research
        main_color = "#00257C" if is_industry else "#FFC620"

        # Hover text
        if is_industry:
            hover_text = f"{label}<br>{actual_sizes[i]:,} Papers mit Industrie-Beteiligung<br>Segmentgröße: {segment_proportions[i]*100:.1f}% vom Kreis"
        else:
            total_papers = country_counts.get(label, 0)
            industry_papers = company_country_collab.get(label, 0)
            industry_ratio = (industry_papers / total_papers) if total_papers > 0 else 0
            fwci_score = country_median_fwci.get(label)
            # Show a placeholder rather than an invented value when missing
            fwci_text = "n/a" if is_missing(fwci_score) else f"{fwci_score:.3f}"
            flag = country_flags.get(label, label)

            hover_text = (
                f"{flag} {label}<br>"
                f"{total_papers:,} Gesamt-Papers<br>"
                f"{industry_papers:,} Papers mit Industrie ({industry_ratio:.1%})<br>"
                f"Median FWCI: {fwci_text}<br>"
                f"Segmentgröße: {segment_proportions[i]*100:.1f}% vom Kreis"
            )

        all_traces.append(
            go.Scatter(
                x=x_main,
                y=y_main,
                fill="toself",
                fillcolor=main_color,
                line=dict(color="white", width=2),
                mode="lines",
                hoverinfo="text",
                hovertext=hover_text,
                showlegend=False,
                name=label,
            )
        )

        # FWCI band above country segments (radius 0.9 to 0.95)
        if not is_industry:
            fwci_color = fwci_color_mapping(country_median_fwci.get(label))
            x_fwci, y_fwci = ring_polygon(angles_seg, 0.9, 0.95)

            all_traces.append(
                go.Scatter(
                    x=x_fwci,
                    y=y_fwci,
                    fill="toself",
                    fillcolor=fwci_color,
                    line=dict(color="white", width=0.5),
                    mode="lines",
                    hoverinfo="skip",
                    showlegend=False,
                )
            )

        # Label in the middle of the main segment
        mid_angle = start_angle + segment_angles[i] / 2
        label_radius = 0.825
        label_x = label_radius * np.cos(mid_angle)
        label_y = label_radius * np.sin(mid_angle)

        if is_industry:
            display_text = "🏭"
            text_size = 24
            text_color = "white"
        else:
            display_text = country_flags.get(label, label)
            # Scale the flag with the segment, clamped to 18..28
            text_size = max(18, min(28, segment_proportions[i] * 600))
            text_color = "black"

        all_traces.append(
            go.Scatter(
                x=[label_x],
                y=[label_y],
                mode="text",
                text=[display_text],
                textfont=dict(size=text_size, color=text_color),
                textposition="middle center",
                hoverinfo="skip",
                showlegend=False,
            )
        )

    # Chord lines
    chord_traces = []
    processed_pairs = set()
    # default=0 keeps an empty label list from raising; the value is only used
    # as a divisor for chords, which exist only when some weight is positive.
    max_weight = max((max(matrix[i]) for i in range(n_nodes)), default=0)

    for i in range(n_nodes):
        for chord_info in chord_positions.get(i, []):
            j = chord_info["target"]
            weight = chord_info["weight"]

            # Draw each unordered pair only once
            pair_key = tuple(sorted([i, j]))
            if pair_key in processed_pairs:
                continue
            processed_pairs.add(pair_key)

            # Find the matching slot on the partner's segment; skip the pair
            # when the partner has no slot pointing back at this node.
            target_chord_info = next(
                (
                    candidate
                    for candidate in chord_positions.get(j, [])
                    if candidate["target"] == i
                ),
                None,
            )
            if target_chord_info is None:
                continue

            source_angle = chord_info["mid_angle"]
            target_angle = target_chord_info["mid_angle"]

            x0 = 0.75 * np.cos(source_angle)
            y0 = 0.75 * np.sin(source_angle)
            x1 = 0.75 * np.cos(target_angle)
            y1 = 0.75 * np.sin(target_angle)

            # Quadratic Bezier curve; the control point is the chord midpoint
            # pulled towards the circle centre.
            t = np.linspace(0, 1, 100)
            control_factor = 0.2
            cx = (x0 + x1) / 2 * control_factor
            cy = (y0 + y1) / 2 * control_factor

            curve_x = (1 - t) ** 2 * x0 + 2 * (1 - t) * t * cx + t**2 * x1
            curve_y = (1 - t) ** 2 * y0 + 2 * (1 - t) * t * cy + t**2 * y1

            line_width = max(0.5, (weight / max_weight) * 12)
            alpha = max(0.4, (weight / max_weight) * 0.9)

            # Line colour: dark blue when industry is involved, yellow otherwise
            if labels[i] == "Industry" or labels[j] == "Industry":
                line_color = f"rgba(0, 37, 124, {alpha})"
            else:
                line_color = f"rgba(255, 198, 32, {alpha})"

            source_percentage = (
                (weight / actual_sizes[i]) * 100 if actual_sizes[i] > 0 else 0
            )
            target_percentage = (
                (weight / actual_sizes[j]) * 100 if actual_sizes[j] > 0 else 0
            )

            chord_traces.append(
                go.Scatter(
                    x=curve_x,
                    y=curve_y,
                    mode="lines",
                    line=dict(width=line_width, color=line_color),
                    hoverinfo="text",
                    hovertext=f"{labels[i]} ↔ {labels[j]}<br>{weight:,} gemeinsame Papers<br>"
                    f"{labels[i]}: {source_percentage:.1f}% seiner Papers<br>"
                    f"{labels[j]}: {target_percentage:.1f}% seiner Papers",
                    showlegend=False,
                )
            )

    # Chords first so the ring segments are drawn on top of them
    fig = go.Figure(data=chord_traces + all_traces)
    fig.update_layout(
        title=dict(
            text="Einfluss von Industrie-Kollaborationen auf den Forschungserfolg der EU-Mitgliedsstaaten",
            x=0.5,
            font=dict(size=16),
        ),
        showlegend=False,
        xaxis=dict(
            showgrid=False,
            zeroline=False,
            showticklabels=False,
            scaleanchor="y",
            scaleratio=1,
            range=[0, 1.1],
        ),
        yaxis=dict(
            showgrid=False, zeroline=False, showticklabels=False, range=[-1.1, 1.1]
        ),
        plot_bgcolor="white",
        width=1300,
        height=900,
        margin=dict(l=50, r=50, t=100, b=50),
    )

    return fig

In [25]:
def add_legend_to_chord_diagram(fig):
    """
    Add a hand-drawn legend to the chord diagram.

    The legend is built from paper-coordinate annotations and rectangles,
    stacked top-down from the upper right corner, with a grey box drawn
    behind it.

    Args:
        fig: Figure object providing ``add_annotation`` and ``add_shape``.

    Returns:
        The same ``fig`` object, modified in place.
    """
    legend_x0 = 0.68  # left edge of the legend text (paper coordinates)
    tile = 0.018      # edge length of a colour tile
    pad = 0.03        # vertical distance between legend rows
    top_y = 1
    y = top_y         # running cursor, moved down after every row

    def heading(txt):
        """Write a section heading at the cursor and move one row down."""
        nonlocal y
        fig.add_annotation(
            x=legend_x0, y=y, xref="paper", yref="paper",
            text=f"{txt}",
            showarrow=False,
            font=dict(size=13, color="black"),
            xanchor="left"
        )
        y -= pad * 1.0

    def entry(color, label):
        """Draw a colour tile with its label and move one row down."""
        nonlocal y
        # Coloured tile
        fig.add_shape(
            type="rect",
            xref="paper",
            yref="paper",
            x0=legend_x0 + tile,
            x1=legend_x0 + 2*tile,
            y0=y - tile / 2 - pad * 0.4,
            y1=y + tile / 2 - pad * 0.4,
            fillcolor=color,
            line=dict(color="white", width=1),
        )
        # Text next to the tile
        fig.add_annotation(
            x=legend_x0 + 2*tile + 0.012,
            y=y - pad * 0.4,
            xref="paper",
            yref="paper",
            text=label,
            showarrow=False,
            xanchor="left",
            yanchor="middle",
            font=dict(size=11),
        )
        y -= pad * 1.0

    # Outer ring
    heading("Äußerer Ring: Median FWCI")
    entry("rgb(255,0,0)", "Niedriger FWCI")
    entry("rgb(255,255,0)", "Mittlerer FWCI")
    entry("rgb(0,255,0)", "Hoher FWCI")

    # Segment size (typo in the rendered legend text corrected)
    y -= pad * 0.2
    heading("Segmentgröße: Relativer Anteil von Papern mit Industrie")
    y -= pad * 0.2

    # Connection lines (typos in the rendered legend text corrected)
    heading("Verbindungslinien: Absolute Anzahl von Papern mit")
    entry("rgba(0,37,124,0.8)", "Industrie")
    entry("rgba(255,198,32,0.8)", "Öffentliche Forschung")

    # Grey box behind the whole legend
    bottom_y = y - pad * 0.7
    fig.add_shape(
        type="rect",
        xref="paper",
        yref="paper",
        x0=legend_x0 - 0.02,
        x1=1.00,
        y0=bottom_y,
        y1=top_y + 0.03,
        fillcolor="rgb(230,230,230)",
        line=dict(color="rgb(200,200,200)", width=1),
        layer="below",
    )

    return fig

In [26]:
# Build the chord diagram from the corrected inputs, attach the legend,
# and render the result.
fig_improved = add_legend_to_chord_diagram(
    plot_chord_diagram(
        analysis_results=analysis_results,
        labels=labels_corrected,
        matrix=matrix_corrected,
        node_sizes=sizes_corrected,
        actual_sizes=actual_corrected,
        segment_proportions=props_corrected,
    )
)
fig_improved.show()

In [27]:
# Save the figure as PDF (the file written here is a PDF, not an SVG)
output_dir = "data/output/"
os.makedirs(output_dir, exist_ok=True)  # a fresh checkout may lack the folder
pdf_path = os.path.join(output_dir, "Hypothese_6.pdf")
svg_path = pdf_path  # legacy name, kept in case later cells still refer to it
fig_improved.write_image(pdf_path, format="pdf", width=1300, height=900)
print(f"PDF gespeichert: {pdf_path}")

SVG gespeichert: data/output/Hypothese_6.pdf


## Korrelationsanalyse

In [28]:
# Assemble one row per country for the correlation analysis:
# share of papers with industry participation vs. median FWCI.
correlation_data = []
country_counts = analysis_results['country_counts']
company_country_collab = analysis_results['company_country_collab']
country_median_fwci = analysis_results['country_median_fwci']

for country in country_counts.keys():
    total_papers = country_counts[country]
    industry_papers = company_country_collab.get(country, 0)
    # A missing median FWCI stays NaN instead of being imputed as 1.0:
    # pandas excludes NaN pairs from corr() and from the descriptive
    # statistics, so no invented data point enters the analysis.
    median_fwci = country_median_fwci.get(country, np.nan)

    # Share of papers with industry participation, in percent
    relative_industry_papers = (industry_papers / total_papers) * 100 if total_papers > 0 else 0

    if total_papers >= 100:  # require at least 100 papers for stable ratios
        correlation_data.append({
            'Country': country,
            'Total_Papers': total_papers,
            'Industry_Papers': industry_papers,
            'Relative_Industry_Papers': relative_industry_papers,
            'Median_FWCI': median_fwci
        })

# Explicit columns keep the frame well-formed even when no country qualifies
corr_df = pd.DataFrame(
    correlation_data,
    columns=[
        'Country',
        'Total_Papers',
        'Industry_Papers',
        'Relative_Industry_Papers',
        'Median_FWCI',
    ],
)
print(f"Anzahl Länder für Korrelationsanalyse: {len(corr_df)}")

Anzahl Länder für Korrelationsanalyse: 27


In [29]:
# Correlation between the industry share and the median FWCI per country
def interpret_correlation(corr):
    """
    Translate a correlation coefficient into a verbal strength label.

    Only the absolute value matters. An undefined coefficient (NaN, e.g. when
    one of the series is constant) is reported as such instead of being
    labelled as a very weak correlation.
    """
    if pd.isna(corr):
        return "nicht definiert"
    abs_corr = abs(corr)
    if abs_corr >= 0.8:
        return "sehr stark"
    elif abs_corr >= 0.6:
        return "stark"
    elif abs_corr >= 0.4:
        return "moderat"
    elif abs_corr >= 0.2:
        return "schwach"
    else:
        return "sehr schwach"


if len(corr_df) >= 3:  # at least 3 data points for a meaningful correlation
    industry_share = corr_df['Relative_Industry_Papers']
    fwci_series = corr_df['Median_FWCI']

    # Pearson (linear relationship)
    pearson_corr = industry_share.corr(fwci_series, method='pearson')
    # Spearman (monotonic relationship)
    spearman_corr = industry_share.corr(fwci_series, method='spearman')
    # Kendall's tau (robust, rank-based)
    kendall_corr = industry_share.corr(fwci_series, method='kendall')

    print("Korrelationsanalyse:")
    print(f"Pearson-Korrelation:\t{pearson_corr:.4f}")
    print(f"Spearman-Korrelation:\t{spearman_corr:.4f}")
    print(f"Kendall-Tau:\t\t{kendall_corr:.4f}\n")

    # Verbal interpretation of the correlation strength
    print("Interpretation der Korrelationsstärke:")
    print(f"Pearson-Korrelation:\t{interpret_correlation(pearson_corr)} ({pearson_corr:.4f})")
    print(f"Spearman-Korrelation:\t{interpret_correlation(spearman_corr)} ({spearman_corr:.4f})")
    print(f"Kendall-Korrelation:\t{interpret_correlation(kendall_corr)} ({kendall_corr:.4f})\n")

    print("Deskriptive Statistiken:")
    print(" Relative Industry-Papers (%):")
    print(f"   Mittelwert: {industry_share.mean():.2f}%")
    print(f"   Median:     {industry_share.median():.2f}%")
    print(f"   Min/Max:    {industry_share.min():.2f}% - {industry_share.max():.2f}%")
    print(f"   Std.abw.:   {industry_share.std():.2f}%\n")

    print("\n Median FWCI-Score:")
    print(f"   Mittelwert: {fwci_series.mean():.3f}")
    print(f"   Median:     {fwci_series.median():.3f}")
    print(f"   Min/Max:    {fwci_series.min():.3f} - {fwci_series.max():.3f}")
    print(f"   Std.abw.:   {fwci_series.std():.3f}\n")
else:
    print("Nicht genügend Daten für Korrelationsanalyse")

Korrelationsanalyse:
Pearson-Korrelation:	0.6838
Spearman-Korrelation:	0.7088
Kendall-Tau:		0.5499

Interpreation der Korrelationsstärke:
Pearson-Korrelation:	stark (0.6838)
Spearman-Korrelation:	stark (0.7088)
Kendall-Korrelation:	moderat (0.5499)

Deskriptive Statistiken:
 Relative Industry-Papers (%):
   Mittelwert: 6.25%
   Median:     5.96%
   Min/Max:    2.50% - 10.31%
   Std.abw.:   2.36%


 Median FWCI-Score:
   Mittelwert: 1.258
   Median:     1.262
   Min/Max:    0.738 - 1.653
   Std.abw.:   0.252

