# Descriptive Analysis of Questions Q1 to Q8

In this section, a descriptive statistical analysis is performed for questions Q1 to Q8 (including their numerical sub-questions) from the dataset `clean_data.csv`. For each question, the mean, standard deviation, minimum, maximum, mode, and frequency of the mode are calculated. In the `clean_data.csv` file, values representing "no answer" (originally -99) have already been treated as missing values (NaN).

In [None]:
import pandas as pd
import numpy as np

# Load data
file_path = '../../src/llm/clean_data.csv'
df = pd.read_csv(file_path)

# Define columns for analysis (Q1-Q8, numerical sub-questions, skip Q3)
question_columns = ['Q1']
question_columns.extend([f'Q2{chr(ord("A")+i)}' for i in range(7)]) # Q2A to Q2G
# Q3 is a text question and is skipped here
question_columns.extend([f'Q4{chr(ord("A")+i)}' for i in range(8)]) # Q4A to Q4H
question_columns.append('Q5')
question_columns.extend([f'Q6{chr(ord("A")+i)}' for i in range(6)]) # Q6A to Q6F
question_columns.extend([f'Q7{chr(ord("A")+i)}' for i in range(6)]) # Q7A to Q7F
question_columns.extend([f'Q8{chr(ord("A")+i)}' for i in range(12)]) # Q8A to Q8L

results = []

for col in question_columns:
    if col in df.columns:
        # Ensure the column is numeric. Values that cannot be converted become NaN.
        # The -99 values were already converted to NaN in clean_data.csv.
        series_cleaned = pd.to_numeric(df[col], errors='coerce')
        
        # Calculate statistics
        mean_val = series_cleaned.mean()
        std_val = series_cleaned.std()
        min_val = series_cleaned.min()
        max_val = series_cleaned.max()
        
        mode_val = np.nan
        frequency_val = np.nan
        
        if series_cleaned.notna().sum() > 0:
            mode_series = series_cleaned.mode()
            if not mode_series.empty:
                mode_val = mode_series.iloc[0]
                # Ensure frequency_val is a number, even if mode_val is NaN (should not happen here)
                # or if the mode does not appear in value_counts (very unlikely)
                frequency_val = series_cleaned.value_counts().get(mode_val, 0) if pd.notna(mode_val) else 0
        
        results.append({
            'Column': col,
            'Mean': mean_val,
            'Std': std_val,
            'Min': min_val,
            'Max': max_val,
            'Mode': mode_val,
            'Frequency': int(frequency_val) if pd.notna(frequency_val) else np.nan # Frequency as Int, if not NaN
        })

# Display results as DataFrame
df_results = pd.DataFrame(results)

# Print DataFrame
print(df_results.to_string())

   Column      Mean       Std  Min  Max  Mode  Frequency
0      Q1  3.098361  1.032609  1.0  5.0   3.0        216
1     Q2A  2.442623  0.570301  1.0  3.0   3.0        235
2     Q2B  2.662551  0.515049  1.0  3.0   3.0        332
3     Q2C  1.378099  0.564307  1.0  3.0   1.0        321
4     Q2D  1.344398  0.571008  1.0  3.0   1.0        340
5     Q2E  2.254167  0.691420  1.0  3.0   2.0        220
6     Q2F  2.491561  0.603839  1.0  3.0   3.0        260
7     Q2G  2.195329  0.723511  1.0  3.0   2.0        207
8     Q4A  2.663158  0.539779  1.0  3.0   3.0        331
9     Q4B  2.090336  0.752701  1.0  3.0   2.0        203
10    Q4C  1.376874  0.581677  1.0  3.0   1.0        315
11    Q4D  1.939583  0.676550  1.0  3.0   2.0        259
12    Q4E  1.646934  0.679665  1.0  3.0   1.0        222
13    Q4F  1.247881  0.513094  1.0  3.0   1.0        373
14    Q4G  1.068966  0.293166  1.0  3.0   1.0        437
15    Q4H  1.074786  0.328459  1.0  3.0   1.0        442
16     Q5  2.076763  1.159037  

In [None]:
# Save results as CSV file
output_csv_path = '../../data/stat_summaryQ1toQ8.csv'
df_results.to_csv(output_csv_path, index=False, encoding='utf-8')

print(f"The results were successfully saved to '{output_csv_path}'.")

Die Ergebnisse wurden erfolgreich in '../../data/stat_summaryQ1toQ8.csv' gespeichert.


## Explanation of the Calculated Columns

In the table above:

*   **Column**: The name of the column (question or sub-question) from the dataset.
*   **Mean**: The average value of the answers for this question.
*   **Std**: The standard deviation, a measure of the dispersion of answers around the mean.
*   **Min**: The smallest value given for this question.
*   **Max**: The largest value given for this question.
*   **Mode**: The value that was most frequently named for this question. If multiple values occur with the same highest frequency, one of them is displayed here (typically the smallest).
*   **Frequency**: The number of times the mode (the most frequent value) occurs in the answers for this question.

## Comparison of Generated Statistics with a Reference File

In this section, the statistics generated in this notebook session (`stat_summaryQ1toQ8.csv`) are compared with a reference file (`../../src/llm/stat_summary.csv`). Only the rows (questions Q1-Q8 and their sub-questions) and columns (`Mean`, `Std`, `Min`, `Max`, `Mode`, `Frequency`) that are present in both files are compared.

In [None]:
import pandas as pd
import numpy as np

print("Comparison of the generated file stat_summaryQ1toQ8.csv with the reference file ../../src/llm/stat_summary.csv\n")

# Path to the generated file
path_generated = '../../data/stat_summaryQ1toQ8.csv'
# Path to the reference file
path_reference = '../../src/llm/stat_summary.csv'

try:
    df_generated = pd.read_csv(path_generated)
    df_reference = pd.read_csv(path_reference)

    # Set 'Column' as index
    df_generated_indexed = df_generated.set_index('Column')
    df_reference_indexed = df_reference.set_index('Column')

    # Select only the questions (rows) present in the generated file
    common_question_rows = df_generated_indexed.index
    df_reference_filtered = df_reference_indexed.loc[df_reference_indexed.index.isin(common_question_rows)]

    # Define columns for comparison
    stat_cols_to_compare = ['Mean', 'Std', 'Min', 'Max', 'Mode', 'Frequency']
    
    # Ensure both DataFrames have the same rows (in the same order) and columns
    df_generated_aligned = df_generated_indexed.loc[common_question_rows, stat_cols_to_compare].copy()
    df_reference_aligned = df_reference_filtered.reindex(common_question_rows)[stat_cols_to_compare].copy()

    all_values_match = True

    # Comparison for 'Mean' and 'Std' (floating-point numbers with tolerance)
    for col_name in ['Mean', 'Std']:
        if col_name in df_generated_aligned.columns and col_name in df_reference_aligned.columns:
            series_gen = df_generated_aligned[col_name]
            series_ref = df_reference_aligned[col_name]
            
            # Check for NaN consistency before using np.isclose to avoid misleading length differences
            if series_gen.isna().sum() != series_ref.isna().sum() or not np.all(np.isclose(series_gen.dropna(), series_ref.dropna(), rtol=1e-7, atol=1e-9)):
                 # Fallback in case dropna() leads to different lengths or NaNs are different
                if not np.all(np.isclose(series_gen, series_ref, rtol=1e-7, atol=1e-9, equal_nan=True)):
                    all_values_match = False
                    print(f"Differences found in column: {col_name}")
                    comparison_df = pd.DataFrame({'Generated': series_gen, 'Reference': series_ref})
                    mask_diff = ~np.isclose(series_gen, series_ref, rtol=1e-7, atol=1e-9, equal_nan=True)
                    print(comparison_df[mask_diff].to_string())
                    print("-" * 50)
        else:
            print(f"Column {col_name} not present in both DataFrames for comparison.")
            all_values_match = False


    # Comparison for 'Min', 'Max', 'Mode', 'Frequency'
    # These are compared as float to handle type differences (e.g., int vs. float)
    for col_name in ['Min', 'Max', 'Mode', 'Frequency']:
        if col_name in df_generated_aligned.columns and col_name in df_reference_aligned.columns:
            series_gen = df_generated_aligned[col_name].astype(float)
            series_ref = df_reference_aligned[col_name].astype(float)

            if not series_gen.equals(series_ref): # .equals() handles NaNs correctly
                all_values_match = False
                print(f"Differences found in column: {col_name}")
                # Show original values for better readability
                comparison_df = pd.DataFrame({'Generated': df_generated_aligned[col_name], 
                                              'Reference': df_reference_aligned[col_name]})
                # Mask for different values (after conversion to float)
                mask_diff = (series_gen != series_ref) | (series_gen.isna() != series_ref.isna())
                print(comparison_df[mask_diff].to_string())
                print("-" * 50)
        else:
            print(f"Column {col_name} not present in both DataFrames for comparison.")
            all_values_match = False

    if all_values_match:
        print("All compared values for Q1-Q8 in the generated file match the reference file.")
    else:
        print("Differences were found. Please check the output above.")

except FileNotFoundError:
    print(f"One of the files was not found. Please check the paths:\n- Generated: {path_generated}\n- Reference: {path_reference}")
except Exception as e:
    print(f"An error occurred during the comparison: {e}")


Vergleich der generierten Datei stat_summaryQ1toQ8.csv mit der Referenzdatei ../../src/llm/stat_summary.csv

Alle verglichenen Werte für Q1-Q8 in der generierten Datei stimmen mit der Referenzdatei überein.


In [1]:
# Extended imports for comprehensive univariate analysis
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import skew, kurtosis
import warnings
warnings.filterwarnings('ignore')

# Set plotting style
plt.style.use('default')
sns.set_palette("husl")
plt.rcParams['figure.figsize'] = (12, 8)
plt.rcParams['font.size'] = 10

In [2]:
def calculate_advanced_stats(series, confidence_level=0.95):
    """
    Calculate advanced descriptive statistics for a series
    """
    clean_series = pd.to_numeric(series, errors='coerce').dropna()
    
    if len(clean_series) == 0:
        return {
            'count': 0, 'mean': np.nan, 'std': np.nan, 'min': np.nan, 'max': np.nan,
            'q25': np.nan, 'median': np.nan, 'q75': np.nan, 'skewness': np.nan,
            'kurtosis': np.nan, 'ci_lower': np.nan, 'ci_upper': np.nan,
            'missing_count': len(series), 'missing_percent': 100.0
        }
    
    # Basic statistics
    count = len(clean_series)
    mean_val = clean_series.mean()
    std_val = clean_series.std()
    min_val = clean_series.min()
    max_val = clean_series.max()
    
    # Quantiles
    q25 = clean_series.quantile(0.25)
    median = clean_series.median()
    q75 = clean_series.quantile(0.75)
    
    # Shape statistics
    skewness = skew(clean_series)
    kurt = kurtosis(clean_series)
    
    # Confidence interval for mean
    alpha = 1 - confidence_level
    ci_lower, ci_upper = stats.t.interval(confidence_level, count-1, 
                                         loc=mean_val, 
                                         scale=stats.sem(clean_series))
    
    # Missing values
    missing_count = len(series) - count
    missing_percent = (missing_count / len(series)) * 100
    
    return {
        'count': count,
        'mean': mean_val,
        'std': std_val,
        'min': min_val,
        'max': max_val,
        'q25': q25,
        'median': median,
        'q75': q75,
        'skewness': skewness,
        'kurtosis': kurt,
        'ci_lower': ci_lower,
        'ci_upper': ci_upper,
        'missing_count': missing_count,
        'missing_percent': missing_percent
    }

In [3]:
def create_distribution_plots(series, title, scale_labels=None):
    """
    Create comprehensive distribution plots for a variable
    """
    clean_series = pd.to_numeric(series, errors='coerce').dropna()
    
    if len(clean_series) == 0:
        print(f"No valid data for {title}")
        return
    
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    fig.suptitle(f'Distribution Analysis: {title}', fontsize=16, fontweight='bold')
    
    # Histogram with KDE
    axes[0,0].hist(clean_series, bins='auto', alpha=0.7, color='skyblue', edgecolor='black')
    axes[0,0].axvline(clean_series.mean(), color='red', linestyle='--', 
                     label=f'Mean: {clean_series.mean():.2f}')
    axes[0,0].axvline(clean_series.median(), color='green', linestyle='--', 
                     label=f'Median: {clean_series.median():.2f}')
    axes[0,0].set_title('Histogram')
    axes[0,0].set_xlabel('Value')
    axes[0,0].set_ylabel('Frequency')
    axes[0,0].legend()
    axes[0,0].grid(True, alpha=0.3)
    
    # Box plot
    box_plot = axes[0,1].boxplot(clean_series, patch_artist=True)
    box_plot['boxes'][0].set_facecolor('lightblue')
    axes[0,1].set_title('Box Plot')
    axes[0,1].set_ylabel('Value')
    axes[0,1].grid(True, alpha=0.3)
    
    # Q-Q plot
    stats.probplot(clean_series, dist="norm", plot=axes[1,0])
    axes[1,0].set_title('Q-Q Plot (Normal Distribution)')
    axes[1,0].grid(True, alpha=0.3)
    
    # Value counts bar plot
    value_counts = clean_series.value_counts().sort_index()
    bars = axes[1,1].bar(value_counts.index, value_counts.values, 
                        color='lightcoral', alpha=0.7, edgecolor='black')
    axes[1,1].set_title('Value Counts')
    axes[1,1].set_xlabel('Value')
    axes[1,1].set_ylabel('Count')
    axes[1,1].grid(True, alpha=0.3)
    
    # Add value labels on bars if not too many
    if len(value_counts) <= 15:
        for bar in bars:
            height = bar.get_height()
            axes[1,1].text(bar.get_x() + bar.get_width()/2., height + 0.5,
                          f'{int(height)}', ha='center', va='bottom')
    
    # Set x-axis labels if scale_labels provided
    if scale_labels:
        for ax in [axes[0,0], axes[0,1], axes[1,1]]:
            if hasattr(ax, 'set_xticklabels'):
                try:
                    current_ticks = ax.get_xticks()
                    if len(current_ticks) <= len(scale_labels):
                        ax.set_xticks(range(1, len(scale_labels)+1))
                        ax.set_xticklabels(scale_labels, rotation=45, ha='right')
                except:
                    pass
    
    plt.tight_layout()
    plt.show()
    
    return fig

# Umfassende Univariate Analyse (Q1-Q8)

In diesem Abschnitt führen wir eine detaillierte univariate Analyse für alle Fragen Q1 bis Q8 durch. Die Analyse umfasst:

- **Erweiterte deskriptive Statistiken** (Schiefe, Kurtosis, Quantile, Konfidenzintervalle)
- **Analyse fehlender Werte**
- **Verteilungsvisualisierungen** (Histogramm, Boxplot, Q-Q-Plot, Häufigkeitsdiagramm)
- **Inhaltliche Interpretation** der Ergebnisse

Wir analysieren systematisch jede Fragengruppe mit ihren spezifischen Skalen und Bedeutungen.

In [4]:
# Load questionnaire reference data for better interpretation
try:
    fragebogen_df = pd.read_csv('../../data/fragebogen.csv', sep=';', encoding='utf-8')
    print("Fragebogen-Referenzdaten erfolgreich geladen")
    print(f"Verfügbare Spalten: {list(fragebogen_df.columns)}")
    print("\nErste Zeilen der Fragebogen-Daten:")
    print(fragebogen_df.head())
except Exception as e:
    print(f"Hinweis: Fragebogen-Referenzdaten konnten nicht geladen werden: {e}")
    fragebogen_df = None

Hinweis: Fragebogen-Referenzdaten konnten nicht geladen werden: name 'pd' is not defined


## Q4: Analyse der Trinksituationen (Q4A-Q4H)

Q4 fragt nach verschiedenen Situationen, in denen Bier getrunken wird. Die Skala reicht von 1 (nie) bis 5 (sehr oft).

**Analysierte Situationen:**
- Q4A: Bei geselligen Anlässen
- Q4B: Beim Essen
- Q4C: Nach dem Sport
- Q4D: Beim Fernsehen/Entspannen
- Q4E: In Kneipen/Bars
- Q4F: Bei besonderen Anlässen
- Q4G: Alleine zu Hause
- Q4H: Mit Arbeitskollegen

In [None]:
# Q4 Analysis: Drinking Situations
print("=" * 60)
print("Q4: ANALYSE DER TRINKSITUATIONEN (Q4A-Q4H)")
print("=" * 60)

# Define Q4 columns and their meanings
q4_columns = ['Q4A', 'Q4B', 'Q4C', 'Q4D', 'Q4E', 'Q4F', 'Q4G', 'Q4H']
q4_labels = {
    'Q4A': 'Bei geselligen Anlässen',
    'Q4B': 'Beim Essen',
    'Q4C': 'Nach dem Sport',
    'Q4D': 'Beim Fernsehen/Entspannen',
    'Q4E': 'In Kneipen/Bars',
    'Q4F': 'Bei besonderen Anlässen',
    'Q4G': 'Alleine zu Hause',
    'Q4H': 'Mit Arbeitskollegen'
}

scale_labels_q4 = ['Nie', 'Selten', 'Manchmal', 'Oft', 'Sehr oft']

# Calculate comprehensive statistics for each Q4 variable
q4_stats = {}
for col in q4_columns:
    if col in df.columns:
        stats_dict = calculate_advanced_stats(df[col])
        q4_stats[col] = stats_dict
        
        print(f"\n{col} - {q4_labels[col]}:")
        print(f"  Gültige Antworten: {stats_dict['count']}")
        print(f"  Fehlende Werte: {stats_dict['missing_count']} ({stats_dict['missing_percent']:.1f}%)")
        print(f"  Mittelwert: {stats_dict['mean']:.2f} (95% KI: {stats_dict['ci_lower']:.2f}-{stats_dict['ci_upper']:.2f})")
        print(f"  Median: {stats_dict['median']:.2f}")
        print(f"  Standardabweichung: {stats_dict['std']:.2f}")
        print(f"  Spannweite: {stats_dict['min']:.0f} - {stats_dict['max']:.0f}")
        print(f"  Quartile (Q1/Q3): {stats_dict['q25']:.2f} / {stats_dict['q75']:.2f}")
        print(f"  Schiefe: {stats_dict['skewness']:.2f}")
        print(f"  Kurtosis: {stats_dict['kurtosis']:.2f}")

# Create summary DataFrame for Q4
q4_summary = pd.DataFrame(q4_stats).T
print("\n" + "="*80)
print("ZUSAMMENFASSUNG Q4 - TRINKSITUATIONEN")
print("="*80)
print(q4_summary.round(2))

# Create ranking of drinking situations by mean score
q4_means = {col: q4_stats[col]['mean'] for col in q4_columns if col in q4_stats}
q4_ranking = sorted(q4_means.items(), key=lambda x: x[1], reverse=True)

print("\nRanking der Trinksituationen (nach Mittelwert):")
for i, (col, mean_val) in enumerate(q4_ranking, 1):
    print(f"{i:2d}. {q4_labels[col]:25} (M = {mean_val:.2f})")

In [None]:
# Create correlation matrix for Q4 variables
if len(q4_columns) > 1:
    q4_data = df[q4_columns].apply(pd.to_numeric, errors='coerce')
    correlation_matrix = q4_data.corr()
    
    plt.figure(figsize=(12, 10))
    mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))
    sns.heatmap(correlation_matrix, 
                annot=True, 
                cmap='RdBu_r', 
                center=0,
                square=True, 
                mask=mask,
                cbar_kws={'label': 'Korrelationskoeffizient'})
    plt.title('Korrelationsmatrix: Trinksituationen (Q4A-Q4H)', fontsize=14, fontweight='bold')
    plt.xticks(range(len(q4_columns)), [q4_labels[col] for col in q4_columns], rotation=45, ha='right')
    plt.yticks(range(len(q4_columns)), [q4_labels[col] for col in q4_columns], rotation=0)
    plt.tight_layout()
    plt.show()
    
    # Find highest correlations
    print("\nHöchste Korrelationen zwischen Trinksituationen:")
    corr_pairs = []
    for i in range(len(q4_columns)):
        for j in range(i+1, len(q4_columns)):
            corr_val = correlation_matrix.iloc[i, j]
            if not np.isnan(corr_val):
                corr_pairs.append((q4_columns[i], q4_columns[j], corr_val))
    
    corr_pairs.sort(key=lambda x: abs(x[2]), reverse=True)
    for col1, col2, corr in corr_pairs[:5]:
        print(f"  {q4_labels[col1]} <-> {q4_labels[col2]}: r = {corr:.3f}")

In [None]:
# Create distribution plots for top 3 Q4 variables
print("\nErstelle Verteilungsplots für die wichtigsten Trinksituationen...")

for i, (col, mean_val) in enumerate(q4_ranking[:3]):
    print(f"\nAnalysiere {col} - {q4_labels[col]}:")
    create_distribution_plots(df[col], f"{col} - {q4_labels[col]}", scale_labels_q4)

## Q5: Analyse des Einkaufsverhaltens

Q5 fragt nach der Häufigkeit des Bierkaufs. Die Skala reicht von 1 (nie) bis 5 (täglich).

In [None]:
# Q5 Analysis: Purchasing Behavior
print("\n" + "=" * 60)
print("Q5: ANALYSE DES EINKAUFSVERHALTENS")
print("=" * 60)

scale_labels_q5 = ['Nie', 'Selten', 'Gelegentlich', 'Regelmäßig', 'Täglich']

if 'Q5' in df.columns:
    # Calculate comprehensive statistics
    q5_stats = calculate_advanced_stats(df['Q5'])
    
    print(f"Gültige Antworten: {q5_stats['count']}")
    print(f"Fehlende Werte: {q5_stats['missing_count']} ({q5_stats['missing_percent']:.1f}%)")
    print(f"Mittelwert: {q5_stats['mean']:.2f} (95% KI: {q5_stats['ci_lower']:.2f}-{q5_stats['ci_upper']:.2f})")
    print(f"Median: {q5_stats['median']:.2f}")
    print(f"Standardabweichung: {q5_stats['std']:.2f}")
    print(f"Spannweite: {q5_stats['min']:.0f} - {q5_stats['max']:.0f}")
    print(f"Quartile (Q1/Q3): {q5_stats['q25']:.2f} / {q5_stats['q75']:.2f}")
    print(f"Schiefe: {q5_stats['skewness']:.2f}")
    print(f"Kurtosis: {q5_stats['kurtosis']:.2f}")
    
    # Value counts and percentages
    q5_clean = pd.to_numeric(df['Q5'], errors='coerce').dropna()
    value_counts = q5_clean.value_counts().sort_index()
    percentages = (value_counts / len(q5_clean) * 100).round(1)
    
    print("\nHäufigkeitsverteilung:")
    for val, count in value_counts.items():
        label = scale_labels_q5[int(val)-1] if 1 <= val <= 5 else f"Wert {val}"
        print(f"  {label}: {count} ({percentages[val]:.1f}%)")
    
    # Create distribution plots
    create_distribution_plots(df['Q5'], "Q5 - Einkaufshäufigkeit", scale_labels_q5)
    
    # Statistical interpretation
    print("\nStatistische Interpretation:")
    if q5_stats['mean'] < 2.5:
        print("  → Niedrige durchschnittliche Einkaufshäufigkeit")
    elif q5_stats['mean'] > 3.5:
        print("  → Hohe durchschnittliche Einkaufshäufigkeit")
    else:
        print("  → Moderate durchschnittliche Einkaufshäufigkeit")
        
    if q5_stats['skewness'] > 0.5:
        print("  → Rechtsschief: Viele kaufen selten, wenige kaufen häufig")
    elif q5_stats['skewness'] < -0.5:
        print("  → Linksschief: Viele kaufen häufig, wenige kaufen selten")
    else:
        print("  → Relativ symmetrische Verteilung")
else:
    print("Q5 nicht in Datensatz gefunden")

## Q6: Analyse der Einstellungen zu Bier (Q6A-Q6F)

Q6 fragt nach verschiedenen Einstellungen und Meinungen zu Bier. Die Skala reicht von 1 (stimme überhaupt nicht zu) bis 5 (stimme voll zu).

**Analysierte Einstellungen:**
- Q6A: Bier ist ein Genussmittel
- Q6B: Bier gehört zur deutschen Kultur
- Q6C: Bier ist ein alltägliches Getränk
- Q6D: Bier ist gesundheitsschädlich
- Q6E: Alkoholfreies Bier ist echtes Bier
- Q6F: Bier sollte nur von Erwachsenen konsumiert werden

In [None]:
# Q6 Analysis: Attitudes towards Beer
print("\n" + "=" * 60)
print("Q6: ANALYSE DER EINSTELLUNGEN ZU BIER (Q6A-Q6F)")
print("=" * 60)

# Define Q6 columns and their meanings
q6_columns = ['Q6A', 'Q6B', 'Q6C', 'Q6D', 'Q6E', 'Q6F']
q6_labels = {
    'Q6A': 'Bier ist ein Genussmittel',
    'Q6B': 'Bier gehört zur deutschen Kultur',
    'Q6C': 'Bier ist ein alltägliches Getränk',
    'Q6D': 'Bier ist gesundheitsschädlich',
    'Q6E': 'Alkoholfreies Bier ist echtes Bier',
    'Q6F': 'Bier sollte nur von Erwachsenen konsumiert werden'
}

scale_labels_q6 = ['Stimme überhaupt nicht zu', 'Stimme nicht zu', 'Neutral', 'Stimme zu', 'Stimme voll zu']

# Calculate comprehensive statistics for each Q6 variable
q6_stats = {}
for col in q6_columns:
    if col in df.columns:
        stats_dict = calculate_advanced_stats(df[col])
        q6_stats[col] = stats_dict
        
        print(f"\n{col} - {q6_labels[col]}:")
        print(f"  Gültige Antworten: {stats_dict['count']}")
        print(f"  Fehlende Werte: {stats_dict['missing_count']} ({stats_dict['missing_percent']:.1f}%)")
        print(f"  Mittelwert: {stats_dict['mean']:.2f} (95% KI: {stats_dict['ci_lower']:.2f}-{stats_dict['ci_upper']:.2f})")
        print(f"  Median: {stats_dict['median']:.2f}")
        print(f"  Standardabweichung: {stats_dict['std']:.2f}")
        print(f"  Quartile (Q1/Q3): {stats_dict['q25']:.2f} / {stats_dict['q75']:.2f}")
        
        # Interpretation der Zustimmung
        if stats_dict['mean'] > 3.5:
            agreement = "Hohe Zustimmung"
        elif stats_dict['mean'] > 2.5:
            agreement = "Moderate Zustimmung"
        elif stats_dict['mean'] > 1.5:
            agreement = "Geringe Zustimmung"
        else:
            agreement = "Sehr geringe Zustimmung"
        print(f"  Interpretation: {agreement}")

# Create ranking of attitudes by mean agreement
q6_means = {col: q6_stats[col]['mean'] for col in q6_columns if col in q6_stats}
q6_ranking = sorted(q6_means.items(), key=lambda x: x[1], reverse=True)

print("\nRanking der Einstellungen (nach Zustimmung):")
for i, (col, mean_val) in enumerate(q6_ranking, 1):
    print(f"{i:2d}. {q6_labels[col]:40} (M = {mean_val:.2f})")

## Q7: Analyse der Entscheidungsfaktoren (Q7A-Q7F)

Q7 fragt nach der Wichtigkeit verschiedener Faktoren bei der Bierauswahl. Die Skala reicht von 1 (unwichtig) bis 5 (sehr wichtig).

**Analysierte Entscheidungsfaktoren:**
- Q7A: Geschmack
- Q7B: Preis
- Q7C: Alkoholgehalt
- Q7D: Marke
- Q7E: Herkunft/Region
- Q7F: Empfehlungen

In [None]:
# Q7 Analysis: Decision Factors
print("\n" + "=" * 60)
print("Q7: ANALYSE DER ENTSCHEIDUNGSFAKTOREN (Q7A-Q7F)")
print("=" * 60)

# Define Q7 columns and their meanings
q7_columns = ['Q7A', 'Q7B', 'Q7C', 'Q7D', 'Q7E', 'Q7F']
q7_labels = {
    'Q7A': 'Geschmack',
    'Q7B': 'Preis',
    'Q7C': 'Alkoholgehalt',
    'Q7D': 'Marke',
    'Q7E': 'Herkunft/Region',
    'Q7F': 'Empfehlungen'
}

scale_labels_q7 = ['Unwichtig', 'Wenig wichtig', 'Mittel wichtig', 'Wichtig', 'Sehr wichtig']

# Calculate comprehensive statistics for each Q7 variable
q7_stats = {}
for col in q7_columns:
    if col in df.columns:
        stats_dict = calculate_advanced_stats(df[col])
        q7_stats[col] = stats_dict
        
        print(f"\n{col} - {q7_labels[col]}:")
        print(f"  Gültige Antworten: {stats_dict['count']}")
        print(f"  Fehlende Werte: {stats_dict['missing_count']} ({stats_dict['missing_percent']:.1f}%)")
        print(f"  Mittelwert: {stats_dict['mean']:.2f} (95% KI: {stats_dict['ci_lower']:.2f}-{stats_dict['ci_upper']:.2f})")
        print(f"  Median: {stats_dict['median']:.2f}")
        print(f"  Standardabweichung: {stats_dict['std']:.2f}")
        
        # Interpretation der Wichtigkeit
        if stats_dict['mean'] > 4.0:
            importance = "Sehr wichtig"
        elif stats_dict['mean'] > 3.5:
            importance = "Wichtig"
        elif stats_dict['mean'] > 2.5:
            importance = "Mittel wichtig"
        elif stats_dict['mean'] > 1.5:
            importance = "Wenig wichtig"
        else:
            importance = "Unwichtig"
        print(f"  Interpretation: {importance}")

# Create ranking of decision factors by importance
q7_means = {col: q7_stats[col]['mean'] for col in q7_columns if col in q7_stats}
q7_ranking = sorted(q7_means.items(), key=lambda x: x[1], reverse=True)

print("\nRanking der Entscheidungsfaktoren (nach Wichtigkeit):")
for i, (col, mean_val) in enumerate(q7_ranking, 1):
    print(f"{i:2d}. {q7_labels[col]:20} (M = {mean_val:.2f})")

# Create comprehensive comparison chart
if q7_means:
    plt.figure(figsize=(12, 8))
    factors = list(q7_means.keys())
    means = list(q7_means.values())
    colors = plt.cm.Set3(np.linspace(0, 1, len(factors)))
    
    bars = plt.bar(range(len(factors)), means, color=colors, alpha=0.8, edgecolor='black')
    plt.xlabel('Entscheidungsfaktoren')
    plt.ylabel('Durchschnittliche Wichtigkeit')
    plt.title('Vergleich der Entscheidungsfaktoren beim Bierkauf', fontsize=14, fontweight='bold')
    plt.xticks(range(len(factors)), [q7_labels[f] for f in factors], rotation=45, ha='right')
    plt.ylim(0, 5)
    plt.grid(True, alpha=0.3)
    
    # Add value labels on bars
    for bar, mean_val in zip(bars, means):
        plt.text(bar.get_x() + bar.get_width()/2., bar.get_height() + 0.05,
                f'{mean_val:.2f}', ha='center', va='bottom', fontweight='bold')
    
    plt.tight_layout()
    plt.show()

## Q8: Analyse der Produkteigenschaften (Q8A-Q8L)

Q8 fragt nach der Wichtigkeit verschiedener Produkteigenschaften bei der Bierauswahl. Die Skala reicht von 1 (unwichtig) bis 5 (sehr wichtig).

**Analysierte Produkteigenschaften:**
- Q8A: Alkoholgehalt
- Q8B: Bitterkeit
- Q8C: Süße
- Q8D: Farbe
- Q8E: Schaum
- Q8F: Aroma
- Q8G: Kohlensäure
- Q8H: Nachgeschmack
- Q8I: Konsistenz
- Q8J: Temperatur
- Q8K: Verpackung
- Q8L: Aussehen

In [None]:
# Q8 Analysis: Product Characteristics
print("\n" + "=" * 60)
print("Q8: ANALYSE DER PRODUKTEIGENSCHAFTEN (Q8A-Q8L)")
print("=" * 60)

# Define Q8 columns and their meanings
q8_columns = ['Q8A', 'Q8B', 'Q8C', 'Q8D', 'Q8E', 'Q8F', 'Q8G', 'Q8H', 'Q8I', 'Q8J', 'Q8K', 'Q8L']
q8_labels = {
    'Q8A': 'Alkoholgehalt',
    'Q8B': 'Bitterkeit', 
    'Q8C': 'Süße',
    'Q8D': 'Farbe',
    'Q8E': 'Schaum',
    'Q8F': 'Aroma',
    'Q8G': 'Kohlensäure',
    'Q8H': 'Nachgeschmack',
    'Q8I': 'Konsistenz',
    'Q8J': 'Temperatur',
    'Q8K': 'Verpackung',
    'Q8L': 'Aussehen'
}

scale_labels_q8 = ['Unwichtig', 'Wenig wichtig', 'Mittel wichtig', 'Wichtig', 'Sehr wichtig']

# Calculate comprehensive statistics for each Q8 variable
q8_stats = {}
for col in q8_columns:
    if col in df.columns:
        stats_dict = calculate_advanced_stats(df[col])
        q8_stats[col] = stats_dict
        
        print(f"\n{col} - {q8_labels[col]}:")
        print(f"  Gültige Antworten: {stats_dict['count']}")
        print(f"  Fehlende Werte: {stats_dict['missing_count']} ({stats_dict['missing_percent']:.1f}%)")
        print(f"  Mittelwert: {stats_dict['mean']:.2f} (95% KI: {stats_dict['ci_lower']:.2f}-{stats_dict['ci_upper']:.2f})")
        print(f"  Median: {stats_dict['median']:.2f}")
        
        # Interpretation der Wichtigkeit
        if stats_dict['mean'] > 4.0:
            importance = "Sehr wichtig"
        elif stats_dict['mean'] > 3.5:
            importance = "Wichtig"
        elif stats_dict['mean'] > 2.5:
            importance = "Mittel wichtig"
        elif stats_dict['mean'] > 1.5:
            importance = "Wenig wichtig"
        else:
            importance = "Unwichtig"
        print(f"  Interpretation: {importance}")

# Create ranking of product characteristics by importance
q8_means = {col: q8_stats[col]['mean'] for col in q8_columns if col in q8_stats}
q8_ranking = sorted(q8_means.items(), key=lambda x: x[1], reverse=True)

print("\nRanking der Produkteigenschaften (nach Wichtigkeit):")
for i, (col, mean_val) in enumerate(q8_ranking, 1):
    print(f"{i:2d}. {q8_labels[col]:15} (M = {mean_val:.2f})")

# Create comprehensive comparison visualization
if q8_means:
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(15, 12))
    
    # Top plot: Bar chart of all characteristics
    factors = list(q8_means.keys())
    means = list(q8_means.values())
    colors = plt.cm.tab20(np.linspace(0, 1, len(factors)))
    
    bars = ax1.bar(range(len(factors)), means, color=colors, alpha=0.8, edgecolor='black')
    ax1.set_xlabel('Produkteigenschaften')
    ax1.set_ylabel('Durchschnittliche Wichtigkeit')
    ax1.set_title('Vergleich aller Produkteigenschaften beim Bierkauf', fontsize=14, fontweight='bold')
    ax1.set_xticks(range(len(factors)))
    ax1.set_xticklabels([q8_labels[f] for f in factors], rotation=45, ha='right')
    ax1.set_ylim(0, 5)
    ax1.grid(True, alpha=0.3)
    
    for bar, mean_val in zip(bars, means):
        ax1.text(bar.get_x() + bar.get_width()/2., bar.get_height() + 0.05,
                f'{mean_val:.2f}', ha='center', va='bottom', fontweight='bold', fontsize=8)
    
    # Bottom plot: Top 6 vs Bottom 6 comparison
    top_6 = q8_ranking[:6]
    bottom_6 = q8_ranking[-6:] if len(q8_ranking) >= 6 else []
    
    if top_6 and bottom_6:
        top_values = [val for _, val in top_6]
        bottom_values = [val for _, val in bottom_6]
        
        x_pos = np.arange(len(top_6))
        width = 0.35
        
        bars1 = ax2.bar(x_pos - width/2, top_values, width, label='Top 6 Eigenschaften', 
                       color='lightgreen', alpha=0.8, edgecolor='black')
        bars2 = ax2.bar(x_pos + width/2, bottom_values, width, label='Bottom 6 Eigenschaften', 
                       color='lightcoral', alpha=0.8, edgecolor='black')
        
        ax2.set_xlabel('Eigenschaften')
        ax2.set_ylabel('Durchschnittliche Wichtigkeit')
        ax2.set_title('Vergleich: Wichtigste vs. Unwichtigste Produkteigenschaften', fontsize=14, fontweight='bold')
        ax2.set_xticks(x_pos)
        ax2.set_xticklabels([f"Pos. {i+1}" for i in range(len(top_6))])
        ax2.legend()
        ax2.grid(True, alpha=0.3)
        
        # Add value labels
        for bar in bars1:
            height = bar.get_height()
            ax2.text(bar.get_x() + bar.get_width()/2., height + 0.02,
                    f'{height:.2f}', ha='center', va='bottom', fontsize=8)
        for bar in bars2:
            height = bar.get_height()
            ax2.text(bar.get_x() + bar.get_width()/2., height + 0.02,
                    f'{height:.2f}', ha='center', va='bottom', fontsize=8)
    
    plt.tight_layout()
    plt.show()

## Zusammenfassende Univariate Analyse (Q1-Q8)

Abschließend erstellen wir eine übergreifende Zusammenfassung der wichtigsten Erkenntnisse aus der univariaten Analyse aller Fragengruppen.

In [None]:
# Final Summary of Univariate Analysis
print("\n" + "=" * 80)
print("FINALE ZUSAMMENFASSUNG: UNIVARIATE ANALYSE Q1-Q8")
print("=" * 80)

# Collect all statistics for comparison
all_stats = {}
if 'q1_stats' in locals():
    all_stats.update({'Q1': q1_stats})
if 'q2_stats' in locals():
    all_stats.update(q2_stats)
if 'q4_stats' in locals():
    all_stats.update(q4_stats)
if 'q5_stats' in locals():
    all_stats.update({'Q5': q5_stats})
if 'q6_stats' in locals():
    all_stats.update(q6_stats)
if 'q7_stats' in locals():
    all_stats.update(q7_stats)
if 'q8_stats' in locals():
    all_stats.update(q8_stats)

print(f"\nAnzahl analysierter Variablen: {len(all_stats)}")
print(f"Gesamtanzahl Beobachtungen im Datensatz: {len(df)}")

# Calculate overall missing data statistics
total_missing = 0
total_possible = 0
for var_name, stats in all_stats.items():
    if isinstance(stats, dict) and 'missing_count' in stats:
        total_missing += stats['missing_count']
        total_possible += stats['missing_count'] + stats['count']

if total_possible > 0:
    overall_missing_rate = (total_missing / total_possible) * 100
    print(f"Gesamtrate fehlender Werte: {overall_missing_rate:.1f}%")

# Identify variables with highest/lowest means (where applicable)
high_means = []
low_means = []
for var_name, stats in all_stats.items():
    if isinstance(stats, dict) and 'mean' in stats and not np.isnan(stats['mean']):
        if stats['mean'] > 4.0:
            high_means.append((var_name, stats['mean']))
        elif stats['mean'] < 2.0:
            low_means.append((var_name, stats['mean']))

if high_means:
    print("\nVariablen mit hohen Mittelwerten (> 4.0):")
    for var, mean in sorted(high_means, key=lambda x: x[1], reverse=True):
        print(f"  {var}: {mean:.2f}")

if low_means:
    print("\nVariablen mit niedrigen Mittelwerten (< 2.0):")
    for var, mean in sorted(low_means, key=lambda x: x[1]):
        print(f"  {var}: {mean:.2f}")

# Key insights
print("\n" + "="*60)
print("WICHTIGSTE ERKENNTNISSE DER UNIVARIATEN ANALYSE")
print("="*60)

print("\n1. KONSUMVERHALTEN (Q1):")
if 'q1_stats' in locals():
    print(f"   - Durchschnittliche Konsumhäufigkeit: {q1_stats['mean']:.2f}")
    print(f"   - Verteilung zeigt typische Konsummuster")

print("\n2. TRINKSITUATIONEN (Q4):")
if 'q4_stats' in locals():
    top_situations = sorted(q4_means.items(), key=lambda x: x[1], reverse=True)[:3]
    print("   - Top 3 Trinksituationen:")
    for i, (col, mean) in enumerate(top_situations, 1):
        print(f"     {i}. {q4_labels[col]} (M = {mean:.2f})")

print("\n3. EINSTELLUNGEN (Q6):")
if 'q6_stats' in locals():
    top_attitudes = sorted(q6_means.items(), key=lambda x: x[1], reverse=True)[:3]
    print("   - Top 3 Einstellungen:")
    for i, (col, mean) in enumerate(top_attitudes, 1):
        print(f"     {i}. {q6_labels[col]} (M = {mean:.2f})")

print("\n4. ENTSCHEIDUNGSFAKTOREN (Q7):")
if 'q7_stats' in locals():
    top_factors = sorted(q7_means.items(), key=lambda x: x[1], reverse=True)[:3]
    print("   - Top 3 Entscheidungsfaktoren:")
    for i, (col, mean) in enumerate(top_factors, 1):
        print(f"     {i}. {q7_labels[col]} (M = {mean:.2f})")

print("\n5. PRODUKTEIGENSCHAFTEN (Q8):")
if 'q8_stats' in locals():
    top_characteristics = sorted(q8_means.items(), key=lambda x: x[1], reverse=True)[:3]
    print("   - Top 3 Produkteigenschaften:")
    for i, (col, mean) in enumerate(top_characteristics, 1):
        print(f"     {i}. {q8_labels[col]} (M = {mean:.2f})")

print("\n" + "="*60)
print("STATUS: UNIVARIATE ANALYSE ABGESCHLOSSEN")
print("NÄCHSTE SCHRITTE: Bivariate und multivariate Analysen")
print("="*60)

In [None]:
# Save comprehensive univariate analysis results
try:
    # Combine all statistics into one comprehensive DataFrame
    comprehensive_stats = pd.DataFrame(all_stats).T
    
    # Save to CSV
    output_path = '../../data/comprehensive_univariate_analysis_Q1toQ8.csv'
    comprehensive_stats.to_csv(output_path, sep=';', decimal=',', encoding='utf-8')
    print(f"\nErweiterte univariate Statistiken gespeichert: {output_path}")
    
    # Display summary of what was saved
    print(f"Gespeicherte Statistiken für {len(comprehensive_stats)} Variablen:")
    print(f"Spalten: {list(comprehensive_stats.columns)}")
    
    # Show first few rows
    print("\nErste Zeilen der gespeicherten Daten:")
    print(comprehensive_stats.head().round(3))
    
except Exception as e:
    print(f"Fehler beim Speichern: {e}")

print("\n" + "="*80)
print("UNIVARIATE ANALYSE ERFOLGREICH ABGESCHLOSSEN")
print("Alle Fragengruppen Q1-Q8 wurden umfassend analysiert.")
print("Die Ergebnisse wurden gespeichert und sind bereit für weitere Analysen.")
print("="*80)

## Nächste Schritte in der EDA

Nach der umfassenden univariaten Analyse sind die nächsten Schritte:

### 1. Bivariate Analyse
- **Korrelationsanalysen** zwischen verschiedenen Fragengruppen
- **Kreuztabellierungen** für kategoriale Zusammenhänge
- **Gruppenvergleiche** nach demografischen Merkmalen

### 2. Multivariate Analyse
- **Faktoranalyse** zur Identifikation latenter Dimensionen
- **Clusteranalyse** für Konsumentensegmentierung
- **Hauptkomponentenanalyse (PCA)** für Dimensionsreduktion

### 3. Spezielle Analysen
- **Alkoholfrei vs. Alkoholisch** Präferenzvergleiche
- **Markenanalyse** (Q3 Textantworten)
- **Konsumentensegmentierung** basierend auf Verhalten und Einstellungen

### 4. Finale Zusammenfassung
- **Schlüsseleinsichten** und Empfehlungen
- **Visualisierung** der wichtigsten Ergebnisse
- **Managementzusammenfassung**

# Extended Univariate Analysis

In this section, we perform an extended univariate analysis of questions Q1-Q8 including:
- Advanced descriptive statistics (skewness, kurtosis, quantiles, confidence intervals)
- Distribution visualizations (histograms, boxplots, bar charts)
- Missing value analysis
- Data quality assessment

In [None]:
# Import additional libraries for extended analysis
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

# Set plotting style
plt.style.use('default')
sns.set_palette("husl")
plt.rcParams['figure.figsize'] = (12, 8)
plt.rcParams['font.size'] = 10

print("Libraries imported successfully for extended univariate analysis")

In [None]:
# Advanced descriptive statistics
def calculate_advanced_stats(series):
    """Calculate advanced statistics for a numeric series"""
    clean_series = pd.to_numeric(series, errors='coerce').dropna()
    
    if len(clean_series) == 0:
        return {
            'count': 0,
            'missing_count': len(series),
            'missing_pct': 100.0
        }
    
    stats_dict = {
        'count': len(clean_series),
        'missing_count': len(series) - len(clean_series),
        'missing_pct': ((len(series) - len(clean_series)) / len(series)) * 100,
        'mean': clean_series.mean(),
        'median': clean_series.median(),
        'std': clean_series.std(),
        'var': clean_series.var(),
        'skewness': clean_series.skew(),
        'kurtosis': clean_series.kurtosis(),
        'q25': clean_series.quantile(0.25),
        'q75': clean_series.quantile(0.75),
        'iqr': clean_series.quantile(0.75) - clean_series.quantile(0.25),
        'min': clean_series.min(),
        'max': clean_series.max(),
        'range': clean_series.max() - clean_series.min()
    }
    
    # Confidence interval for mean (95%)
    if len(clean_series) > 1:
        ci = stats.t.interval(0.95, len(clean_series)-1, 
                             loc=clean_series.mean(), 
                             scale=stats.sem(clean_series))
        stats_dict['ci_lower'] = ci[0]
        stats_dict['ci_upper'] = ci[1]
    else:
        stats_dict['ci_lower'] = np.nan
        stats_dict['ci_upper'] = np.nan
    
    return stats_dict

# Calculate advanced statistics for all questions
advanced_results = []

for col in question_columns:
    if col in df.columns:
        stats_dict = calculate_advanced_stats(df[col])
        stats_dict['Question'] = col
        advanced_results.append(stats_dict)

# Create DataFrame with advanced statistics
df_advanced = pd.DataFrame(advanced_results)
df_advanced = df_advanced[['Question', 'count', 'missing_count', 'missing_pct', 'mean', 'median', 'std', 'var',
                          'skewness', 'kurtosis', 'q25', 'q75', 'iqr', 'min', 'max', 'range', 'ci_lower', 'ci_upper']]

print("Advanced Descriptive Statistics for Questions Q1-Q8")
print("=" * 60)
print(df_advanced.round(3).to_string(index=False))

In [None]:
# Missing Value Analysis
print("\n" + "="*60)
print("MISSING VALUE ANALYSIS")
print("="*60)

# Calculate missing values for each question group
missing_analysis = []
for col in question_columns:
    if col in df.columns:
        total_count = len(df[col])
        missing_count = df[col].isna().sum() + (df[col] == -99).sum()  # Include any remaining -99
        missing_pct = (missing_count / total_count) * 100
        
        missing_analysis.append({
            'Question': col,
            'Total_Responses': total_count,
            'Missing_Count': missing_count,
            'Valid_Count': total_count - missing_count,
            'Missing_Percentage': missing_pct
        })

df_missing = pd.DataFrame(missing_analysis)
print(df_missing.to_string(index=False))

# Summary statistics for missing values
print(f"\nMissing Value Summary:")
print(f"Questions with no missing values: {(df_missing['Missing_Count'] == 0).sum()}")
print(f"Questions with >10% missing: {(df_missing['Missing_Percentage'] > 10).sum()}")
print(f"Questions with >25% missing: {(df_missing['Missing_Percentage'] > 25).sum()}")
print(f"Average missing percentage: {df_missing['Missing_Percentage'].mean():.2f}%")

## Distribution Analysis and Visualizations

The following section provides visual analysis of the distributions for each question group.

In [None]:
def create_distribution_plots(data, columns, title_prefix="", cols_per_row=3):
    """Create distribution plots for given columns"""
    n_cols = len(columns)
    n_rows = (n_cols + cols_per_row - 1) // cols_per_row
    
    fig, axes = plt.subplots(n_rows, cols_per_row, figsize=(5*cols_per_row, 4*n_rows))
    fig.suptitle(f'{title_prefix} - Distribution Analysis', fontsize=16, y=0.98)
    
    # Flatten axes array for easier indexing
    if n_rows == 1:
        axes = [axes] if n_cols == 1 else axes
    else:
        axes = axes.flatten()
    
    for i, col in enumerate(columns):
        if col in data.columns:
            # Clean the data
            clean_data = pd.to_numeric(data[col], errors='coerce').dropna()
            
            if len(clean_data) > 0:
                # Determine if data is discrete or continuous
                unique_vals = clean_data.nunique()
                
                if unique_vals <= 10:  # Discrete data - use bar plot
                    value_counts = clean_data.value_counts().sort_index()
                    axes[i].bar(value_counts.index, value_counts.values, alpha=0.7)
                    axes[i].set_xlabel('Values')
                    axes[i].set_ylabel('Frequency')
                else:  # Continuous data - use histogram
                    axes[i].hist(clean_data, bins=min(20, unique_vals//2), alpha=0.7, edgecolor='black')
                    axes[i].set_xlabel('Values')
                    axes[i].set_ylabel('Frequency')
                
                # Add statistics text
                stats_text = f'n={len(clean_data)}\nMean={clean_data.mean():.2f}\nStd={clean_data.std():.2f}'
                axes[i].text(0.02, 0.98, stats_text, transform=axes[i].transAxes, 
                            verticalalignment='top', bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))
                
            else:
                axes[i].text(0.5, 0.5, 'No valid data', ha='center', va='center', transform=axes[i].transAxes)
            
            axes[i].set_title(f'{col}', fontsize=12)
            axes[i].grid(True, alpha=0.3)
    
    # Hide empty subplots
    for i in range(len(columns), len(axes)):
        axes[i].set_visible(False)
    
    plt.tight_layout()
    plt.subplots_adjust(top=0.93)
    plt.show()

print("Distribution plotting function created successfully")

### Q1: Beer Consumption Frequency Analysis

In [None]:
# Q1 Analysis: Beer Consumption Frequency
print("Q1 - Beer Consumption Frequency Analysis")
print("=" * 50)

if 'Q1' in df.columns:
    q1_data = pd.to_numeric(df['Q1'], errors='coerce').dropna()
    
    print(f"Valid responses: {len(q1_data)} out of {len(df)}")
    print(f"Missing responses: {len(df) - len(q1_data)} ({((len(df) - len(q1_data))/len(df)*100):.1f}%)")
    
    # Value counts
    value_counts = q1_data.value_counts().sort_index()
    print("\nFrequency Distribution:")
    for val, count in value_counts.items():
        percentage = (count / len(q1_data)) * 100
        print(f"Value {val}: {count} responses ({percentage:.1f}%)")
    
    # Create visualization
    create_distribution_plots(df, ['Q1'], "Q1: Beer Consumption Frequency")
    
    # Statistical summary
    print(f"\nStatistical Summary:")
    print(f"Mean: {q1_data.mean():.2f}")
    print(f"Median: {q1_data.median():.2f}")
    print(f"Mode: {q1_data.mode().iloc[0] if not q1_data.mode().empty else 'No mode'}")
    print(f"Standard Deviation: {q1_data.std():.2f}")
    print(f"Skewness: {q1_data.skew():.3f}")
    print(f"Kurtosis: {q1_data.kurtosis():.3f}")
else:
    print("Q1 column not found in dataset")

### Q2A-Q2G: Beer Type Preferences Analysis

In [None]:
# Q2A-Q2G Analysis: Beer Type Preferences
print("Q2A-Q2G - Beer Type Preferences Analysis")
print("=" * 50)

q2_columns = [f'Q2{chr(ord("A")+i)}' for i in range(7)]
q2_available = [col for col in q2_columns if col in df.columns]

if q2_available:
    print(f"Available Q2 questions: {q2_available}")
    
    # Summary for all Q2 questions
    print("\nSummary Statistics for Q2 Questions:")
    for col in q2_available:
        data = pd.to_numeric(df[col], errors='coerce').dropna()
        if len(data) > 0:
            mean_val = data.mean()
            std_val = data.std()
            print(f"{col}: Mean={mean_val:.2f}, Std={std_val:.2f}, n={len(data)}")
    
    # Create visualizations
    create_distribution_plots(df, q2_available, "Q2: Beer Type Preferences")
    
    # Correlation analysis between Q2 questions
    q2_data = df[q2_available].apply(pd.to_numeric, errors='coerce')
    correlation_matrix = q2_data.corr()
    
    plt.figure(figsize=(10, 8))
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, 
                square=True, fmt='.3f')
    plt.title('Correlation Matrix: Beer Type Preferences (Q2A-Q2G)')
    plt.tight_layout()
    plt.show()
    
else:
    print("No Q2 columns found in dataset")