In [None]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import os

# Load the data files
cost_df = pd.read_csv("../data/CostOfLiving.csv")
gpi_df = pd.read_csv("../data/GPI.csv")
temp_df = pd.read_csv("../data/combined_temperature.csv")
CO2_df = pd.read_csv("../data/CO2Emission.csv")

# Clean and prepare the data.
# All four frames are joined on their country name further down, so headers
# and country names are stripped in every frame (not only cost/CO2); stray
# whitespace would otherwise turn one country into several merge keys.
for _frame in (cost_df, gpi_df, temp_df, CO2_df):
    _frame.columns = _frame.columns.str.strip()
    _frame['Country'] = _frame['Country'].str.strip()

# Process temperature data: after sorting by Year, keep the last entry of
# each country. .copy() makes the two-column selection an independent frame,
# so adding Country_Mapped below does not raise SettingWithCopyWarning.
temp_df_latest = temp_df.sort_values('Year').groupby('Country').last().reset_index()
temp_df_latest = temp_df_latest[['Country', 'Annual Mean']].copy()
temp_df_latest.columns = ['Country', 'Annual_Mean_Temperature']

# Process GPI data: only the 2023 column is used. .copy() for the same
# reason as above (a new column is assigned to this frame later).
gpi_latest = gpi_df[['Country', '2023']].copy()
gpi_latest.columns = ['Country', 'GPI']

# Simplified country name mapping.
# NOTE: every entry currently maps a name to itself, so applying the map
# changes nothing; it is the single place to add aliases if the sources
# turn out to spell a country differently.
country_name_map = {
    'United States': 'United States',
    'Russia': 'Russia',
    'South Korea': 'South Korea',
    'Bosnia And Herzegovina': 'Bosnia And Herzegovina',
    'United Kingdom': 'United Kingdom',
    'Czech Republic': 'Czech Republic',
    'Dominican Republic': 'Dominican Republic',
    'Costa Rica': 'Costa Rica',
    'Saudi Arabia': 'Saudi Arabia',
    'South Africa': 'South Africa',
    'New Zealand': 'New Zealand',
    'United Arab Emirates': 'United Arab Emirates',
    'Trinidad And Tobago': 'Trinidad And Tobago',
    'Papua New Guinea': 'Papua New Guinea'
}

# Create standardized country name column for each dataframe; names missing
# from the map fall back to the original country name.
cost_df['Country_Mapped'] = cost_df['Country'].map(country_name_map).fillna(cost_df['Country'])
gpi_latest['Country_Mapped'] = gpi_latest['Country'].map(country_name_map).fillna(gpi_latest['Country'])
temp_df_latest['Country_Mapped'] = temp_df_latest['Country'].map(country_name_map).fillna(temp_df_latest['Country'])
CO2_df['Country_Mapped'] = CO2_df['Country'].map(country_name_map).fillna(CO2_df['Country'])

# Select only needed columns for each dataframe
cost_cols = ['Country_Mapped', 'Cost of Living Index', 'Rent Index', 'Groceries Index', 'Restaurant Price Index']
gpi_cols = ['Country_Mapped', 'GPI']
temp_cols = ['Country_Mapped', 'Annual_Mean_Temperature']
co2_cols = ['Country_Mapped', 'CO2']

# Merge datasets step by step. Outer joins keep a country even when it is
# missing from some sources; the gaps show up as NaN in merged_df.
merged_df = cost_df[cost_cols].merge(gpi_latest[gpi_cols], on='Country_Mapped', how='outer')
merged_df = merged_df.merge(temp_df_latest[temp_cols], on='Country_Mapped', how='outer')
merged_df = merged_df.merge(CO2_df[co2_cols], on='Country_Mapped', how='outer')

# Define metrics for correlation analysis as (column name, display label) pairs
available_metrics = [
    ('Cost of Living Index', 'Cost of Living Index'),
    ('GPI', 'Global Peace Index'),
    ('Rent Index', 'Rent Index'),
    ('Annual_Mean_Temperature', 'Temperature (°C)'),
    ('Groceries Index', 'Groceries Index'),
    ('CO2', 'CO2 Emissions (kg)'),
    ('Restaurant Price Index', 'Restaurant Prices')
]

# Create a comprehensive correlation matrix
correlation_data = merged_df[[metric[0] for metric in available_metrics]].copy()

# Calculate correlation matrix
corr_matrix = correlation_data.corr()

# Take the axis labels from the columns that are actually present in
# corr_matrix rather than from available_metrics, so labels and z values stay
# aligned even if corr() leaves a column out (e.g. a non-numeric one,
# depending on the pandas version).
metric_labels = dict(available_metrics)
heatmap_labels = [metric_labels[column] for column in corr_matrix.columns]

# Create interactive correlation heatmap
fig_heatmap = go.Figure(data=go.Heatmap(
    z=corr_matrix.values,
    x=heatmap_labels,
    y=heatmap_labels,
    colorscale='RdBu',
    zmid=0,  # centre the diverging colour scale on "no correlation"
    text=np.round(corr_matrix.values, 3),
    texttemplate='%{text}',
    textfont={"size": 10},
    colorbar=dict(title="Correlation")
))

fig_heatmap.update_layout(
    title='Correlation Matrix - Tourism Destination Factors',
    xaxis_title='',
    yaxis_title='',
    width=800,
    height=700,
    font=dict(size=12)
)

# Show the heatmap. A bare `fig_heatmap` expression is only rendered when it
# is the last statement of a notebook cell, so request the rendering
# explicitly. Rendering is best-effort: plotly raises ValueError when its
# notebook renderer is unavailable (e.g. nbformat is not installed), and that
# must not abort the rest of the analysis.
try:
    fig_heatmap.show()
except ValueError as render_error:
    print(f"Could not display the heatmap inline: {render_error}")

# Build a summary table with one row per unordered pair of metrics
key_correlations = []
for position, (first_key, first_label) in enumerate(available_metrics):
    # Pair each metric only with the ones listed after it, so every pair
    # is visited exactly once and no metric is paired with itself.
    for second_key, second_label in available_metrics[position + 1:]:
        # Rows where both metrics are present
        paired = merged_df.dropna(subset=[first_key, second_key])
        n_countries = len(paired)
        if n_countries <= 10:
            # Too few data points for the correlation to be worth showing
            continue
        key_correlations.append({
            'Metric 1': first_label,
            'Metric 2': second_label,
            'Correlation': paired[first_key].corr(paired[second_key]),
            'Countries': n_countries
        })

# Rank the pairs by correlation strength, ignoring the sign
if key_correlations:
    key_correlations_df = pd.DataFrame(key_correlations)
    key_correlations_df = key_correlations_df.assign(
        Abs_Correlation=key_correlations_df['Correlation'].abs()
    ).sort_values('Abs_Correlation', ascending=False)

    summary_columns = ['Metric 1', 'Metric 2', 'Correlation', 'Countries']
    print("\nKey Correlations (sorted by strength):")
    print(key_correlations_df[summary_columns].head(10))

# Write the heatmap to a standalone HTML file, creating the folder if needed
os.makedirs('../assets', exist_ok=True)
fig_heatmap.write_html('../assets/correlation_heatmap.html')

print("\nCorrelation heatmap saved as '../assets/correlation_heatmap.html'!")



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed