In [None]:
import pandas as pd
from statsmodels.formula.api import ols
from scipy.stats import pearsonr


In [None]:

# Load the song-level table (Excel) and the country-level climate table
# (semicolon-separated CSV).
df_songs = pd.read_excel('Song_Data_With_Genres.xlsx')
df_climate = pd.read_csv('Global_Climate_Data_Formatted.csv', delimiter=';')

# --- Climate dtype clean-up ---
# Temperature: cast to str so decimal commas can be swapped for dots, then
# parse the result as float.
df_climate['Average_Temp_Celsius'] = (
    df_climate['Average_Temp_Celsius']
    .astype(str)
    .str.replace(',', '.')
    .astype(float)
)
# Rainfall: anything that cannot be parsed as a number is coerced to NaN.
# NOTE(review): unlike temperature, decimal commas are not rewritten here, so
# such values would become NaN -- confirm the column never contains them.
df_climate['Rainfall_mm_per_year'] = pd.to_numeric(
    df_climate['Rainfall_mm_per_year'],
    errors='coerce',
)

# --- Country-name harmonisation ---
# Maps spellings found in the song data onto the spellings used by the climate
# data so that the merge below can match them.
# NOTE(review): the 'South Korea' entry maps to itself and changes nothing.
country_corrections = {
    'United Arab': 'United Arab Emirates',
    'South Korea': 'South Korea',
    'Ivory Coast': "Cote d'Ivoire",
    'Costa': 'Costa Rica',
    'Saudi': 'Saudi Arabia',
    'Türkiye': 'Turkey',
    'Czechia': 'Czech Republic'
}
df_songs['Country'] = df_songs['Country'].replace(country_corrections)


# Inner join: only countries that exist in both tables are kept.
df_merged = df_songs.merge(df_climate, on='Country', how='inner')

print(f"Data Merged successfully. Total Rows: {len(df_merged)}")
print(f"Unique Countries: {df_merged['Country'].nunique()}")


def get_country_proportions(df, col):
    """
    Compute, per country, the share of songs that carry each tag in ``col``.

    Steps:
    1. Split multi-tag strings on '/' (e.g., "Pop / R&B" -> "Pop", "R&B") and
       strip surrounding whitespace from every tag.
    2. Drop empty tags (produced by a trailing or doubled '/') and count each
       tag at most once per song, so a repeated tag such as "Pop / Pop" cannot
       push a proportion above 1.
    3. Count the songs per (country, tag) and divide by the total number of
       songs of that country in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per song; must contain a 'Country' column and the column
        named by ``col``.
    col : str
        Name of the string column holding '/'-separated tags.

    Returns
    -------
    pandas.DataFrame
        Indexed by country, one column per distinct tag; tags that never occur
        in a country are 0 there.
    """
    # Work on a two-column frame with a fresh positional index so that every
    # input row (song) has a unique id, whatever index `df` came with.
    songs = df[['Country', col]].reset_index(drop=True)

    # One row per (song, tag): split "A / B" into a list, then explode it.
    tags = songs.assign(**{col: songs[col].str.split('/')}).explode(col)
    tags[col] = tags[col].str.strip()

    # Discard missing values and the empty strings left by stray separators.
    tags = tags[tags[col].notna() & (tags[col] != '')]

    # Count a tag only once per song.
    tags = (
        tags
        .rename_axis('_song_id')
        .reset_index()
        .drop_duplicates(subset=['_song_id', col])
    )

    # Songs per (country, tag), pivoted so that tags become columns.
    counts = tags.groupby(['Country', col]).size().unstack(fill_value=0)

    # Normalize by the number of songs per country (all rows of `df`, tagged
    # or not).
    total_songs = df.groupby('Country').size()
    return counts.div(total_songs, axis=0)

# Per-country tag proportions for each tag family.
mood_props = get_country_proportions(df_merged, 'Mood')
genre_props = get_country_proportions(df_merged, 'Genre')

# Climate values keyed by country: keep the country and the two climate
# columns, collapse identical rows, and index by country so the table lines up
# with the proportion tables.
climate_features = (
    df_merged
    .loc[:, ['Country', 'Average_Temp_Celsius', 'Rainfall_mm_per_year']]
    .drop_duplicates()
    .set_index('Country')
)

# Final tables for hypothesis testing: tag proportions plus climate columns,
# restricted to countries present on both sides of the join.
analysis_df = mood_props.join(climate_features, how='inner')  # MOOD testing
genre_analysis_df = genre_props.join(climate_features, how='inner')  # GENRE testing


In [None]:

# Testing Hypothesis
def precise_hypothesis_test(test_name, var_name, metric_name, df, alpha=0.05):
    """
    Run a Pearson correlation test between two columns and print a report.

    Rows where either column is missing are dropped before testing. The report
    lists the variables, the null/alternative hypotheses, the correlation
    coefficient with its direction, the p-value, and the conclusion at the
    chosen significance level.

    Parameters
    ----------
    test_name : str
        Title printed at the top of the report.
    var_name : str
        Column of ``df`` used as the independent variable (X).
    metric_name : str
        Column of ``df`` used as the dependent variable (Y).
    df : pandas.DataFrame
        Table containing both columns.
    alpha : float, optional
        Significance level; the null hypothesis is rejected when the p-value
        is strictly below it. Defaults to 0.05.

    Returns
    -------
    None
        The function only prints.

    Raises
    ------
    KeyError
        If either column is absent from ``df``.
    """
    # Keep only rows where both variables are present.
    subset = df[[var_name, metric_name]].dropna()
    x = subset[var_name]
    y = subset[metric_name]

    corr, p_val = pearsonr(x, y)

    # The statistic can come back as NaN (scipy documents this for constant
    # input). NaN fails every comparison, so it must be handled explicitly
    # instead of falling through to a "NEGATIVE" / "fail to reject" branch.
    undefined = bool(pd.isna(corr) or pd.isna(p_val))

    print(f"HYPOTHESIS TEST: {test_name}")

    print("1. Variables Tested:")
    print(f"   - Independent Variable (X): {var_name}")
    print(f"   - Dependent Variable (Y):   Percentage of songs with '{metric_name}' tag")

    print("\n2. Hypothesis Definitions:")
    print(f"   - Null Hypothesis (H0):     There is NO correlation between {var_name} and '{metric_name}' music (r = 0).")
    print("   - Alternative Hypothesis (H1): There IS a significant correlation (r != 0).")

    print("\n3. Statistical Results:")
    print(f"   - Pearson Correlation (r): {corr:.4f}")

    # Direction of the relationship; zero and undefined get their own labels.
    if undefined:
        direction = "UNDEFINED (r could not be computed; is one variable constant?)"
    elif corr > 0:
        direction = "POSITIVE (They move in the same direction)"
    elif corr < 0:
        direction = "NEGATIVE (They move in opposite directions)"
    else:
        direction = "NONE (r is exactly 0)"

    print(f"   - Direction:               {direction}")
    print(f"   - P-Value:                 {p_val:.6f}")

    print("\n4. Conclusion:")
    if undefined:
        print("   - The correlation is undefined for this data, so the hypothesis cannot be tested.")
    elif p_val < alpha:
        print(f"   - Since P-Value ({p_val:.6f}) < {alpha}, we REJECT the Null Hypothesis.")
        print("   - STATISTICAL PROOF: There is a significant relationship.")
    else:
        # This branch also covers p == alpha, hence ">=" rather than ">".
        print(f"   - Since P-Value ({p_val:.6f}) >= {alpha}, we FAIL TO REJECT the Null Hypothesis.")
        print("   - STATISTICAL PROOF: We cannot prove a relationship exists.")
    print("\n")

# ==========================================
# 3. RUN THE TESTS
# ==========================================
# Each call prints one full report for a climate variable paired with a mood
# tag column of analysis_df.
print("Running Formal Hypothesis Tests...\n")

# Test 1: Winter Melancholy -- temperature against the share of "Dark" songs
precise_hypothesis_test(
    test_name="The Winter Melancholy Theory",
    var_name="Average_Temp_Celsius",
    metric_name="Dark",
    df=analysis_df,
)

# Test 2: Sunny Disposition -- temperature against the share of "Dance" songs
precise_hypothesis_test(
    test_name="The Sunny Disposition Theory",
    var_name="Average_Temp_Celsius",
    metric_name="Dance",
    df=analysis_df,
)

# Test 3: Rainy Day Vibe -- rainfall against the share of "Chill" songs
precise_hypothesis_test(
    test_name="The Rainy Day Vibe Theory",
    var_name="Rainfall_mm_per_year",
    metric_name="Chill",
    df=analysis_df,
)