# Exploratory Data Analysis

In [1]:
# Load libraries
import pandas as pd
import geopandas as gpd
import plotly.express as px

In [2]:
# Load data
project_path = "../data/all_projects.csv"
coords_path = "../data/project_coords.csv"

df = pd.read_csv(project_path)
coords = pd.read_csv(coords_path)

## I. Initial data exploration

In [11]:
# Check the distribution of projects by country
fig = px.bar(df, x='country', color='sector_type')
fig.update_layout(
    title = 'Projects by sector per country', 
    xaxis = dict(title = 'Country'), 
    yaxis = dict(title = 'Project count'), 
    barmode = 'group', 
    paper_bgcolor = '#FFFFFF', 
    showlegend = True,
    width = 800,
    height = 500
)

In [37]:
# Check the distribution of projects by country
fig = px.bar(df, x='country_of_company_hq', color='sector_type')
fig.update_layout(
    title = 'Where do these companies come from?', 
    xaxis = dict(title = 'Country'), 
    yaxis = dict(title = 'Project count'), 
    barmode = 'group', 
    paper_bgcolor = '#FFFFFF', 
    showlegend = True,
    width = 800,
    height = 500
)

In [31]:
# Total number of projects per country of company HQ
df['country_of_company_hq'].value_counts()

country_of_company_hq
canada                    99
brazil                    64
bolivia                   62
ecuador                   40
spain                     39
china                     32
colombia                  24
peru                      24
chile                     22
australia                 21
usa                       17
uk                        16
argentina                 16
russia                    15
venezuela                 11
france                     7
suriname                   6
italy                      6
portugal                   5
south_korea                5
norway                     4
singapore                  3
taiwan                     2
algeria                    2
british_virgin_islands     2
india                      1
vietnam                    1
japan                      1
belarus                    1
the_netherlands            1
trinidad_and_tobago        1
germany                    1
french_guiana              1
cayman_islands       

In [41]:
# Check who owns the most projects in the region by sector
country_counts = df.groupby(['country_of_company_hq', 'sector_type']).size().reset_index(name='project_count')
sorted_country_counts = country_counts.groupby('sector_type').apply(lambda x: x.sort_values('project_count', ascending=False)).reset_index(drop=True)

# Create separate bar plots for each sector type
plots = []
for sector_type, data in sorted_country_counts.groupby('sector_type'):
    fig = px.bar(data, x='country_of_company_hq', y='project_count', color='country_of_company_hq')
    fig.update_layout(
        title=f'Where do these companies come from? ({sector_type} sector)',
        xaxis=dict(title='Country of company HQ'),
        yaxis=dict(title='Project count'),
        #barmode='group',
        paper_bgcolor='#FFFFFF',
        showlegend=False,
        width=600,
        height=400
    )

    # Adjust the bar width and gap
    fig.update_traces(marker=dict(line=dict(width=0.01)))  # Adjust the width as needed
    fig.update_layout(bargap=0.2) 

    plots.append(fig)

for fig in plots:
    fig.show()

In [42]:
df.head()


Unnamed: 0,project_name,sector_type,country,installed_capacity_mw,operational_status,owner,equity_stake_owner,equity_stake_operator,country_of_company_hq,company_type,company_type_for_label,is_operator,lender_type,substances,deal_type,parent_org,pid
0,ambrosia,hydropower,bolivia,84.9,permitting,empresa_electrica_corani,1.0,1.0,bolivia,soe,stateowned_enterprise,yes,multilateral_development_bank,,greenfield,ende,kl0fxe
1,banda_azul,hydropower,bolivia,146.0,permitting,empresa_electrica_corani,1.0,1.0,bolivia,soe,stateowned_enterprise,yes,foreign_development_bankfund,,greenfield,ende,ffjpph
2,chojlla,hydropower,bolivia,37.0,in_operation,hidroelectrica_boliviana,1.0,1.0,bolivia,local_power_company,local_power_company,yes,,,greenfield,hidroelectrica_boliviana,1cxvzq
3,corani,hydropower,bolivia,54.0,in_operation,empresa_electrica_corani,1.0,1.0,bolivia,soe,stateowned_enterprise,yes,multilateral_development_bank,,greenfield,ende,eh6ntx
4,ivirizu,hydropower,bolivia,279.8,under_construction,ende_valle_hermoso,1.0,0.0,bolivia,soe,stateowned_enterprise,no,,,greenfield,ende,l1dro7


### Other questions to answer:

#### General
1. What is the distribution of deal types? lender type? company type? operational status?
2. How are major owners distributed by country?

#### Relationships
1. How many of these companies are operating/working on foreign soil? 
2. How many of these companies come from outside South America?
3. Are there any trends between project country and company country?
4. Is there a company, host country, or country of company HQ that is dominant over others? Any trends in new projects?
5. How many major owners operate in foreign soil?
6. Find the correlation between owner type and projects per country -- mosaic plots

#### Geospatial
1. Where are these projects distributed within each country?
2. How are these projects distributed within the Amazon River Basin?