In [2]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import numpy as np

In [3]:
# Generate reproducible sample 3D data.
# A seeded Generator makes Restart & Run All produce the same point cloud every
# time; the previous unseeded np.random.randn calls gave a new one on each run.
rng = np.random.default_rng(42)
x = rng.standard_normal(100)
y = rng.standard_normal(100)
z = rng.standard_normal(100)

# Create the 3D scatter plot and save it as a standalone HTML page.
# auto_open=True also opens the saved file in a browser once it is written.
fig = px.scatter_3d(x=x, y=y, z=z)
fig.write_html('3d_scatter.html', auto_open=True)

In [4]:
# Rebuild the scatter from the x, y, z arrays so the styling below is applied
# to a fresh figure.
fig = px.scatter_3d(x=x, y=y, z=z)

# Interaction settings for the 3D scene: initial camera, rotation mode and
# axis titles.
fig.update_layout(
    scene={
        'camera': {
            'up': {'x': 0, 'y': 0, 'z': 1},
            'center': {'x': 0, 'y': 0, 'z': 0},
            'eye': {'x': 1.5, 'y': 1.5, 'z': 1.5},  # initial camera position
        },
        'dragmode': 'orbit',  # 'orbit' or 'turntable' rotation modes
        'xaxis_title': 'X Axis',
        'yaxis_title': 'Y Axis',
        'zaxis_title': 'Z Axis',
    }
)

fig.write_html('3d_scatter_2.html', auto_open=True)

In [7]:
# Load the project table from the CSV file
df = pd.read_csv('Simulated_Influence_on_Project_Capacities.csv')

# 1. Basic Statistics and Overview
# Row count, column names, then summary statistics of the numeric columns.
print("Dataset Overview:", f"Total number of projects: {len(df)}", sep="\n")
print("\nColumns in dataset:", list(df.columns))
print("\nBasic statistics for numerical columns:")
print(df.describe())

# 2. Projects by Country
# value_counts() orders countries by frequency, so the first ten rows are the
# ten countries with the most projects.
country_counts = df['Country'].value_counts().iloc[:10]
fig1 = px.bar(
    x=country_counts.index,
    y=country_counts.values,
    labels={'x': 'Country', 'y': 'Number of Projects'},
    title='Top 10 Countries by Number of Projects',
)
fig1.write_html('projects_by_country.html', auto_open=True)

# 3. Capacity Distribution
# Histogram of the capacity column, requesting 50 bins.
fig2 = px.histogram(
    data_frame=df,
    x='Capacity (kt H2/y)',
    nbins=50,
    title='Distribution of Project Capacities',
)
fig2.write_html('capacity_distribution.html', auto_open=True)

# 4. Timeline Analysis (if dates are available)
# The describe() table printed by this cell lists 'Date Online' as a numeric
# column holding calendar years (min 1965, max 2043). pd.to_datetime() reads
# bare numbers as nanosecond offsets from 1970-01-01, so converting first and
# then taking .dt.year put every project in 1970. Numeric columns are therefore
# used as years directly; only non-numeric columns are parsed as dates.
# `df` itself is left unmodified so re-running the cell is idempotent.
date_online = df['Date Online']
if pd.api.types.is_numeric_dtype(date_online):
    online_year = date_online
else:
    online_year = pd.to_datetime(date_online, errors='coerce').dt.year

# Drop missing values before casting so the year axis shows whole numbers.
yearly_projects = online_year.dropna().astype(int).value_counts().sort_index()
fig3 = px.line(x=yearly_projects.index, y=yearly_projects.values,
               title='Number of Projects Coming Online by Year',
               labels={'x': 'Year', 'y': 'Number of Projects'})
fig3.write_html('timeline_analysis.html', auto_open=True)

# 5. Technology Distribution
# One pie slice per technology, sized by how many projects use it.
tech_dist = df['Technology'].value_counts()
fig4 = px.pie(
    names=tech_dist.index,
    values=tech_dist.values,
    title='Distribution of Technologies',
)
fig4.write_html('technology_distribution.html', auto_open=True)

# 6. 3D Scatter Plot: Capacity vs Investment Cost vs Length
# Points are coloured by country.
# NOTE(review): 'Length (km)' does not appear in the numeric describe() table
# printed by this cell, so it may be stored as text - confirm before reading
# the z axis as a numeric scale.
fig5 = px.scatter_3d(
    data_frame=df,
    x='Capacity (kt H2/y)',
    y='Investment Cost (MUSD)',
    z='Length (km)',
    color='Country',
    title='3D View: Capacity vs Investment vs Length',
)
fig5.write_html('3d_analysis.html', auto_open=True)

# 7. Investment Cost vs Capacity
# Plain 2D scatter coloured by country; no trendline argument is passed
# (an earlier trendline was removed because it caused problems).
fig6 = px.scatter(
    data_frame=df,
    x='Capacity (kt H2/y)',
    y='Investment Cost (MUSD)',
    color='Country',
    title='Investment Cost vs Capacity',
)
fig6.write_html('cost_vs_capacity.html', auto_open=True)

# 8. Average Capacity by Technology
# Mean capacity per technology, largest first so the bars read in rank order.
avg_cap_by_tech = (
    df.groupby('Technology')['Capacity (kt H2/y)']
    .mean()
    .sort_values(ascending=False)
)
fig7 = px.bar(
    x=avg_cap_by_tech.index,
    y=avg_cap_by_tech.values,
    labels={'x': 'Technology', 'y': 'Average Capacity (kt H2/y)'},
    title='Average Capacity by Technology',
)
fig7.write_html('avg_capacity_by_tech.html', auto_open=True)

Dataset Overview:
Total number of projects: 1981

Columns in dataset: ['Project Name', 'Country', 'Start Date', 'Capacity (kt H2/y)', 'Length (km)', 'Investment Cost (MUSD)', 'Source', 'Date Online', 'Technology', 'Capacity Change (Simulated)']

Basic statistics for numerical columns:
        Start Date  Capacity (kt H2/y)  Investment Cost (MUSD)  Date Online  \
count     8.000000         1981.000000             1981.000000  1583.000000   
mean   2029.500000          532.266825               92.656245  2024.253948   
std       2.507133         2070.844406              358.356876     6.033614   
min    2027.000000            0.000000                0.000039  1965.000000   
25%    2028.500000            2.500000                0.374827  2023.000000   
50%    2029.000000           40.000000                6.779478  2025.000000   
75%    2030.000000          298.295399               50.960384  2027.000000   
max    2035.000000        41574.271700             7202.881152  2043.000000   

  

In [9]:
# Reload the project table from the CSV so this cell works from the file's
# contents rather than from whatever state `df` was left in by earlier cells.
df = pd.read_csv('Simulated_Influence_on_Project_Capacities.csv')

# Function to calculate influence score
def calculate_influence_score(row1, row2, max_capacity=None):
    """Return the weighted similarity ("influence") score of two projects.

    The score is ``S = 0.5 * T + 0.3 * G + 0.2 * C`` where

    * ``T`` is 1 when both projects have the same 'Technology', else 0;
    * ``G`` is 1 when both projects have the same 'Country', else 0;
    * ``C`` is ``1 - |capacity1 - capacity2| / max_capacity``.

    Parameters
    ----------
    row1, row2 : mapping-like
        Objects indexable by 'Technology', 'Country' and 'Capacity (kt H2/y)'
        (for example rows yielded by ``DataFrame.iterrows()``).
    max_capacity : number, optional
        Normaliser for the capacity difference. When omitted, the maximum of
        the 'Capacity (kt H2/y)' column of the module-level ``df`` is looked
        up on every call. Callers scoring many pairs should compute it once
        and pass it in.

    Returns
    -------
    float
        The combined score ``S``.
    """
    # Technology similarity (T_ij)
    T = 1 if row1['Technology'] == row2['Technology'] else 0

    # Geography similarity (G_ij)
    G = 1 if row1['Country'] == row2['Country'] else 0

    # Capacity similarity (C_ij)
    if max_capacity is None:
        # Backward-compatible default: normalise by the global table's maximum.
        max_capacity = df['Capacity (kt H2/y)'].max()
    capacity_gap = abs(row1['Capacity (kt H2/y)'] - row2['Capacity (kt H2/y)'])
    if max_capacity == 0:
        # No scale to normalise against: equal capacities count as fully
        # similar, anything else as fully dissimilar, instead of dividing by 0.
        C = 1.0 if capacity_gap == 0 else 0.0
    else:
        C = 1 - capacity_gap / max_capacity

    # Calculate influence score
    return 0.5 * T + 0.3 * G + 0.2 * C

# Create a 3D visualization of influence scores
# Work on a fixed-seed random sample of projects so the pairwise plot stays
# readable.
sample_size = 50
sample_df = df.sample(n=sample_size, random_state=42)

# Score every ordered pair of distinct sampled projects, one record per pair.
influence_scores = []
for label_a, project_a in sample_df.iterrows():
    for label_b, project_b in sample_df.iterrows():
        if label_a == label_b:
            continue  # a project is not compared with itself
        influence_scores.append({
            'Project1': project_a['Project Name'],
            'Project2': project_b['Project Name'],
            'Score': calculate_influence_score(project_a, project_b),
            'Technology1': project_a['Technology'],
            'Country1': project_a['Country'],
            'Capacity1': project_a['Capacity (kt H2/y)'],
            'Technology2': project_b['Technology'],
            'Country2': project_b['Country'],
            'Capacity2': project_b['Capacity (kt H2/y)'],
        })

# Build the frame once from the collected records.
scores_df = pd.DataFrame(influence_scores)

# 3D scatter of the pairwise results: the two capacities on x/y, the score on
# z, with the score also driving the colour scale.
fig = px.scatter_3d(
    data_frame=scores_df,
    x='Capacity1',
    y='Capacity2',
    z='Score',
    color='Score',
    labels={
        'Capacity1': 'Project 1 Capacity (kt H2/y)',
        'Capacity2': 'Project 2 Capacity (kt H2/y)',
        'Score': 'Influence Score',
    },
    title='Project Influence Scores',
)

# Scene layout: initial camera placement plus explicit axis titles.
fig.update_layout(
    scene={
        'camera': {
            'up': {'x': 0, 'y': 0, 'z': 1},
            'center': {'x': 0, 'y': 0, 'z': 0},
            'eye': {'x': 1.5, 'y': 1.5, 'z': 1.5},
        },
        'xaxis_title': 'Project 1 Capacity (kt H2/y)',
        'yaxis_title': 'Project 2 Capacity (kt H2/y)',
        'zaxis_title': 'Influence Score',
    }
)

# Write the interactive figure to disk (and open it in a browser).
fig.write_html('influence_scores_3d.html', auto_open=True)

# Heatmap of the mean influence score for each (Country1, Country2) pair.
# Step 1: average the pairwise scores per country pair (long format).
country_scores = (
    scores_df
    .groupby(['Country1', 'Country2'])['Score']
    .mean()
    .reset_index()
)
# Step 2: reshape to a Country1 x Country2 matrix for imshow.
country_pivot = country_scores.pivot(
    index='Country1',
    columns='Country2',
    values='Score',
)

fig2 = px.imshow(
    country_pivot,
    labels={'x': 'Country 2', 'y': 'Country 1', 'color': 'Influence Score'},
    title='Average Influence Scores Between Countries',
)

# Write the heatmap to disk (and open it in a browser).
fig2.write_html('country_influence_heatmap.html', auto_open=True)