In [18]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import os

# Load the data files
cost_df = pd.read_csv("../data/CostOfLiving.csv")
gpi_df = pd.read_csv("../data/GPI.csv")
temp_df = pd.read_csv("../data/combined_temperature.csv")
CO2_df = pd.read_csv("../data/CO2Emission.csv")

# Clean and prepare the data
cost_df.columns = cost_df.columns.str.strip()
cost_df['Country'] = cost_df['Country'].str.strip()

# Clean CO2 data
CO2_df.columns = CO2_df.columns.str.strip()
CO2_df['Country'] = CO2_df['Country'].str.strip()

# Process temperature data
temp_df_latest = temp_df.sort_values('Year').groupby('Country').last().reset_index()
temp_df_latest = temp_df_latest[['Country', 'Annual Mean']]
temp_df_latest.columns = ['Country', 'Annual_Mean_Temperature']

# Process GPI data
gpi_latest = gpi_df[['Country', '2023']]
gpi_latest.columns = ['Country', 'GPI']

# Simplified country name mapping
country_name_map = {
    'United States': 'United States',
    'Russia': 'Russia',
    'South Korea': 'South Korea',
    'Bosnia And Herzegovina': 'Bosnia And Herzegovina',
    'United Kingdom': 'United Kingdom',
    'Czech Republic': 'Czech Republic',
    'Dominican Republic': 'Dominican Republic',
    'Costa Rica': 'Costa Rica',
    'Saudi Arabia': 'Saudi Arabia',
    'South Africa': 'South Africa',
    'New Zealand': 'New Zealand',
    'United Arab Emirates': 'United Arab Emirates',
    'Trinidad And Tobago': 'Trinidad And Tobago',
    'Papua New Guinea': 'Papua New Guinea'
}

# Create standardized country name column for each dataframe
cost_df['Country_Mapped'] = cost_df['Country'].map(country_name_map).fillna(cost_df['Country'])
gpi_latest['Country_Mapped'] = gpi_latest['Country'].map(country_name_map).fillna(gpi_latest['Country'])
temp_df_latest['Country_Mapped'] = temp_df_latest['Country'].map(country_name_map).fillna(temp_df_latest['Country'])
CO2_df['Country_Mapped'] = CO2_df['Country'].map(country_name_map).fillna(CO2_df['Country'])

# Select only needed columns for each dataframe
cost_cols = ['Country_Mapped', 'Cost of Living Index', 'Rent Index', 'Groceries Index', 'Restaurant Price Index']
gpi_cols = ['Country_Mapped', 'GPI']
temp_cols = ['Country_Mapped', 'Annual_Mean_Temperature']
co2_cols = ['Country_Mapped', 'CO2'] # Assuming 'CO2' is the correct column name after stripping

# Merge datasets step by step
merged_df = cost_df[cost_cols].merge(gpi_latest[gpi_cols], on='Country_Mapped', how='outer')
merged_df = merged_df.merge(temp_df_latest[temp_cols], on='Country_Mapped', how='outer')
merged_df = merged_df.merge(CO2_df[co2_cols], on='Country_Mapped', how='outer')

# Define metrics for correlation analysis
available_metrics = [
    ('Cost of Living Index', 'Cost of Living Index'),
    ('GPI', 'Global Peace Index'),
    ('Rent Index', 'Rent Index'),
    ('Annual_Mean_Temperature', 'Temperature (°C)'),
    ('Groceries Index', 'Groceries Index'),
    ('CO2', 'CO2 Emissions (kg)'),
    ('Restaurant Price Index', 'Restaurant Prices')
]

# Create a comprehensive correlation matrix
metric_keys = [metric[0] for metric in available_metrics]
# Ensure only existing columns are selected, especially after potential merge issues or if CO2 column name was different
valid_metric_keys = [key for key in metric_keys if key in merged_df.columns]
correlation_data = merged_df[valid_metric_keys].copy()


# Calculate correlation matrix
corr_matrix = correlation_data.corr()

# Ensure metric names for x and y axes match the actual data used for corr_matrix
heatmap_x_labels = [metric[1] for metric in available_metrics if metric[0] in valid_metric_keys]
heatmap_y_labels = [metric[1] for metric in available_metrics if metric[0] in valid_metric_keys]


# Create interactive correlation heatmap
fig_heatmap = go.Figure(data=go.Heatmap(
    z=corr_matrix.values,
    x=heatmap_x_labels,
    y=heatmap_y_labels,
    colorscale='RdBu',
    zmid=0,
    text=np.round(corr_matrix.values, 3),
    texttemplate='%{text}',
    textfont={"size": 9}, # Slightly reduced for potentially tighter fit
    colorbar=dict(title=dict(text="Correlation"), tickfont=dict(size=9))
))

fig_heatmap.update_layout(
    title=dict(
        text='Correlation Matrix - Tourism Destination Factors',
        x=0.5, # Center title
        font=dict(size=14)
    ),
    xaxis_title='',
    yaxis_title='',
    autosize=True, # Key change: allow autosizing
    # Remove fixed width and height to allow iframe to control it
    # width=800, # Removed
    # height=700, # Removed
    margin=dict(l=130, r=50, t=60, b=100, pad=4), # Adjust margins: l(left), r(right), t(top), b(bottom)
                                                # Increased left/bottom margin for potentially longer labels
    font=dict(size=10) # General font size for axis titles etc.
)
fig_heatmap.update_xaxes(tickangle=-45, tickfont=dict(size=9)) # Angle labels to save space
fig_heatmap.update_yaxes(tickfont=dict(size=9))


# Show the heatmap (optional, for testing in script environment)
# fig_heatmap.show()

# Create a summary table of key correlations
key_correlations = []
for i, (metric1_key, metric1_name) in enumerate(available_metrics):
    # Ensure metric1_key is in corr_matrix columns (i.e., was in valid_metric_keys)
    if metric1_key not in corr_matrix.columns:
        continue
    for j, (metric2_key, metric2_name) in enumerate(available_metrics):
        # Ensure metric2_key is in corr_matrix columns
        if metric2_key not in corr_matrix.columns:
            continue
        if i < j:
            valid_data = merged_df[
                (merged_df[metric1_key].notna()) &
                (merged_df[metric2_key].notna())
            ]
            if len(valid_data) > 10:  # Only show correlations with enough data points
                # Ensure the correlation value exists in the matrix
                if metric1_key in corr_matrix.index and metric2_key in corr_matrix.columns:
                    correlation = corr_matrix.loc[metric1_key, metric2_key]
                    key_correlations.append({
                        'Metric 1': metric1_name,
                        'Metric 2': metric2_name,
                        'Correlation': correlation,
                        'Countries': len(valid_data)
                    })
                else:
                     print(f"Warning: Could not find correlation between {metric1_key} and {metric2_key} in matrix.")


# Sort by absolute correlation value
if key_correlations:
    key_correlations_df = pd.DataFrame(key_correlations)
    key_correlations_df['Abs_Correlation'] = key_correlations_df['Correlation'].abs()
    key_correlations_df = key_correlations_df.sort_values('Abs_Correlation', ascending=False)

    print("\nKey Correlations (sorted by strength):")
    print(key_correlations_df[['Metric 1', 'Metric 2', 'Correlation', 'Countries']].head(10))

# Save the heatmap
# Ensure the target directory exists
output_dir = '../assets'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
    print(f"Created directory: {output_dir}")

heatmap_path = os.path.join(output_dir, 'correlation_heatmap.html')
fig_heatmap.write_html(heatmap_path, include_plotlyjs='cdn') # Using 'cdn' can reduce file size

print(f"\nCorrelation heatmap saved as '{heatmap_path}'!")


Key Correlations (sorted by strength):
                Metric 1            Metric 2  Correlation  Countries
3   Cost of Living Index     Groceries Index     0.946954        139
5   Cost of Living Index   Restaurant Prices     0.916608        139
1   Cost of Living Index          Rent Index     0.858366        139
19       Groceries Index   Restaurant Prices     0.837287        139
14            Rent Index   Restaurant Prices     0.785055        139
12            Rent Index     Groceries Index     0.780299        139
10    Global Peace Index   Restaurant Prices    -0.524321        119
0   Cost of Living Index  Global Peace Index    -0.523900        119
16      Temperature (°C)  CO2 Emissions (kg)     0.512394         94
8     Global Peace Index     Groceries Index    -0.508812        119

Correlation heatmap saved as '../assets\correlation_heatmap.html'!




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

