<a href="https://colab.research.google.com/github/gabrielborja/python_data_analysis/blob/main/ecuador_analytics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Ecuador Volcanoes Analysis

Ecuador is located at the center of the world and is also known as the land of volcanoes. Data about Ecuador's volcanoes can be found [here](https://en.wikipedia.org/wiki/List_of_volcanoes_in_Ecuador).

## Uploading packages and data

In [None]:
#!pip freeze

In [None]:
#Importing data manipulation packages
import numpy as np
import pandas as pd

## Retrieving data from URL

In [None]:
#Parsing tables from URL
#Only tables whose class attribute is 'wikitable sortable' are kept;
#read_html returns them as a list of dataframes
ecu_tables = pd.read_html(
    'https://en.wikipedia.org/wiki/List_of_volcanoes_in_Ecuador',
    attrs={'class': 'wikitable sortable'},
)

In [None]:
#Saving table 1 (list index 0) to dataframe for mainland volcanoes
ecu_main = ecu_tables[0]
#Displaying the last rows as a quick sanity check of the parse
ecu_main.tail()

Unnamed: 0,Name,Meters,Feet,Coordinates,Last Eruption
28,Soche,3955,12972,0°33′07″N 77°34′48″W﻿ / ﻿0.552°N 77.580°W,-
29,Sumaco,3990,13087,0°32′S 77°37′W﻿ / ﻿0.53°S 77.62°W,1933
30,Tulabug,3336,10942,1°46′48″S 78°36′47″W﻿ / ﻿1.78°S 78.613°W,Holocene
31,Tungurahua,5023,16475,1°28′01″S 78°26′31″W﻿ / ﻿1.467°S 78.442°W,2016
32,Yanaurcu,4535,14879,0°29′55″N 78°20′02″W﻿ / ﻿0.49849°N 78.33389°W,"60,600 ± 20,000 years ago"


In [None]:
#Saving table 2 (list index 1) to dataframe for galapagos volcanoes
ecu_galap = ecu_tables[1]
#Displaying the last rows as a quick sanity check of the parse
ecu_galap.tail()

Unnamed: 0,Name,Meters,Feet,Coordinates,Last Eruption
10,San Cristóbal,759,2490,0°53′S 89°30′W﻿ / ﻿0.88°S 89.50°W,-
11,Santa Cruz,964,2834,0°37′S 90°20′W﻿ / ﻿0.62°S 90.33°W,-
12,Santiago,920,3018,0°13′S 90°46′W﻿ / ﻿0.22°S 90.77°W,1906
13,Sierra Negra,1124,3687,0°50′S 91°10′W﻿ / ﻿0.83°S 91.17°W,2018
14,Wolf,1710,5609,0°01′N 91°21′W﻿ / ﻿0.02°N 91.35°W,2015


In [None]:
#Creating categories for volcanoes location
#Each frame gets a constant 'Category' column so rows remain attributable
#to their source table once both frames are concatenated
ecu_main['Category'] = 'Mainland'
ecu_galap['Category'] = 'Galapagos'

In [None]:
#Concatenating both dataframes
#ignore_index=True discards the per-table row labels and renumbers the combined rows from 0
ecu_volc = pd.concat([ecu_main, ecu_galap], ignore_index=True)
#Displaying (rows, columns) of the combined dataframe
ecu_volc.shape

(48, 6)

## Data Cleaning

In [None]:
#Importing regular expression package
import re

In [None]:
#Checking the dataframe head
#Row 0 is worth inspecting: its Coordinates cell holds scraped CSS text
#('.mw-parser-output ...') instead of a coordinate string
ecu_volc.head()

Unnamed: 0,Name,Meters,Feet,Coordinates,Last Eruption,Category
0,El Altar / Kapak Urku,5405,17730,".mw-parser-output .geo-default,.mw-parser-outp...",Unknown,Mainland
1,Antisana,5753,18870,0°28′52″S 78°08′24″W﻿ / ﻿0.481°S 78.14°W,1802,Mainland
2,Atacazo,4463,14639,0°21′11″S 78°37′01″W﻿ / ﻿0.353°S 78.617°W,-,Mainland
3,Carihuairazo,5018,16463,01°24′25″S 78°45′00″W﻿ / ﻿1.40694°S 78.75000°W,Unknown,Mainland
4,Cayambe,5790,18991,0°01′44″N 77°59′10″W﻿ / ﻿0.029°N 77.986°W,1786,Mainland


In [None]:
#Replacing preliminary missing/incorrect values
#Each fix is a single .loc[row, column] assignment. The chained form
#ecu_volc['Name'][0] = ... may write to a temporary copy (SettingWithCopyWarning)
#and has no effect at all when pandas Copy-on-Write is enabled.
ecu_volc.loc[0, 'Name'] = 'El Altar'
ecu_volc.loc[0, 'Coordinates'] = '1.68°S 78.42°W / 1.68°S 78.42°W' #==> El Altar
ecu_volc.loc[12, 'Meters'] = 3719 #==> Cusin
ecu_volc.loc[17, 'Meters'] = 4453 #==> Ninahuilca
#NOTE(review): '2300 AD' lies in the future -- confirm the intended value/era
ecu_volc.loc[17, 'Last Eruption'] = '2300 AD'
ecu_volc.loc[17, 'Coordinates'] = '0.3535°S 78.617°W / 0.3535°S 78.617°W' #==> Ninahuilca
#NOTE(review): same elevation as Ninahuilca above -- confirm it is not a copy-paste leftover
ecu_volc.loc[21, 'Meters'] = 4453 #==> Pilavo
ecu_volc.loc[21, 'Coordinates'] = '0.53°N 78.37°W / 0.53°N 78.37°W' #==> Pilavo
#Row labels are positional (the frame was built with ignore_index=True), so this cell
#must run exactly once on a freshly concatenated frame: a re-run would drop a different row 39
ecu_volc = ecu_volc.drop(39).reset_index(drop=True) #==> Galapagos Rift -2430
ecu_volc.shape

(47, 6)

In [None]:
#Calculating missing values from Feet column
#to_numeric(errors='coerce') turns the '-' placeholder (and any other non-numeric text)
#into NaN and gives the column a single numeric dtype instead of a str/float mix.
#The gaps are then filled from Meters using 1 m = 3.28084 ft.
ecu_volc = ecu_volc.assign(
    Feet = lambda d: pd.to_numeric(d['Feet'], errors='coerce')
                       .fillna(d['Meters'].astype(int) * 3.28084))

In [None]:
#Extracting coordinates into separate list of lists

def split_coord(x):
  """Split the decimal-degree half of a coordinate string into its four parts.

  The input is expected to look like '<DMS form> / <decimal form>', e.g.
  '0°28′52″S 78°08′24″W / 0.481°S 78.14°W'. Only the text between the first
  and second '/' is used.

  Parameters
  ----------
  x : str
      Raw coordinate cell. Non-string values (e.g. NaN) are tolerated.

  Returns
  -------
  list of str
      [latitude, 'N'|'S', longitude, 'E'|'W'] as strings, e.g.
      ['0.481', 'S', '78.14', 'W']. If the value is not a string, has no '/',
      or does not split into exactly four parts, the placeholder
      ['NA', 'NA', 'NA', 'NA'] is returned so callers can always index 0..3.
  """
  placeholder = ['NA', 'NA', 'NA', 'NA']
  try:
    # Drop the byte-order marks first so that strip() can then remove any
    # whitespace that was sitting next to them
    val = x.split('/')[1].replace('\ufeff', '').strip()
  except (AttributeError, IndexError):
    # AttributeError: x is not a string; IndexError: there is no '/' separator
    return placeholder
  # Raw string: '°' needs no escaping and '\°' is an invalid escape sequence
  parts = re.split(r'[° ]', val)
  return parts if len(parts) == 4 else placeholder

#Storing the parsed [lat, lat_char, lon, lon_char] list of every row in a new column
ecu_volc = ecu_volc.assign(
    String_coord = lambda d: d['Coordinates'].apply(split_coord))
ecu_volc.head()

Unnamed: 0,Name,Meters,Feet,Coordinates,Last Eruption,Category,String_coord
0,El Altar,5405,17730,1.68°S 78.42°W / 1.68°S 78.42°W,Unknown,Mainland,"[1.68, S, 78.42, W]"
1,Antisana,5753,18870,0°28′52″S 78°08′24″W﻿ / ﻿0.481°S 78.14°W,1802,Mainland,"[0.481, S, 78.14, W]"
2,Atacazo,4463,14639,0°21′11″S 78°37′01″W﻿ / ﻿0.353°S 78.617°W,-,Mainland,"[0.353, S, 78.617, W]"
3,Carihuairazo,5018,16463,01°24′25″S 78°45′00″W﻿ / ﻿1.40694°S 78.75000°W,Unknown,Mainland,"[1.40694, S, 78.75000, W]"
4,Cayambe,5790,18991,0°01′44″N 77°59′10″W﻿ / ﻿0.029°N 77.986°W,1786,Mainland,"[0.029, N, 77.986, W]"


In [None]:
#Splitting list of lists into columns for latitude and longitude
#Position k of every String_coord list becomes the k-th column named below
ecu_volc = ecu_volc.assign(**{
    col_name: [parts[pos] for parts in ecu_volc['String_coord']]
    for pos, col_name in enumerate(['Lat', 'Lat_char', 'Lon', 'Lon_char'])
})
ecu_volc.head()

Unnamed: 0,Name,Meters,Feet,Coordinates,Last Eruption,Category,String_coord,Lat,Lat_char,Lon,Lon_char
0,El Altar,5405,17730,1.68°S 78.42°W / 1.68°S 78.42°W,Unknown,Mainland,"[1.68, S, 78.42, W]",1.68,S,78.42,W
1,Antisana,5753,18870,0°28′52″S 78°08′24″W﻿ / ﻿0.481°S 78.14°W,1802,Mainland,"[0.481, S, 78.14, W]",0.481,S,78.14,W
2,Atacazo,4463,14639,0°21′11″S 78°37′01″W﻿ / ﻿0.353°S 78.617°W,-,Mainland,"[0.353, S, 78.617, W]",0.353,S,78.617,W
3,Carihuairazo,5018,16463,01°24′25″S 78°45′00″W﻿ / ﻿1.40694°S 78.75000°W,Unknown,Mainland,"[1.40694, S, 78.75000, W]",1.40694,S,78.75,W
4,Cayambe,5790,18991,0°01′44″N 77°59′10″W﻿ / ﻿0.029°N 77.986°W,1786,Mainland,"[0.029, N, 77.986, W]",0.029,N,77.986,W


In [None]:
#Assigning negative values to latitude and longitude (If 'S' or 'W' = negative coordinate)
#Fix: the non-'W' branch of Lon previously returned the *latitude* values.
#The sign is applied to the absolute magnitude, so re-running this cell
#does not flip the coordinates back.
ecu_volc = ecu_volc.assign(
    Lat = lambda d: d['Lat'].astype(float).abs() * np.where(d['Lat_char'].eq('S'), -1, 1),
    Lon = lambda d: d['Lon'].astype(float).abs() * np.where(d['Lon_char'].eq('W'), -1, 1))
ecu_volc.head()

Unnamed: 0,Name,Meters,Feet,Coordinates,Last Eruption,Category,String_coord,Lat,Lat_char,Lon,Lon_char
0,El Altar,5405,17730,1.68°S 78.42°W / 1.68°S 78.42°W,Unknown,Mainland,"[1.68, S, 78.42, W]",-1.68,S,-78.42,W
1,Antisana,5753,18870,0°28′52″S 78°08′24″W﻿ / ﻿0.481°S 78.14°W,1802,Mainland,"[0.481, S, 78.14, W]",-0.481,S,-78.14,W
2,Atacazo,4463,14639,0°21′11″S 78°37′01″W﻿ / ﻿0.353°S 78.617°W,-,Mainland,"[0.353, S, 78.617, W]",-0.353,S,-78.617,W
3,Carihuairazo,5018,16463,01°24′25″S 78°45′00″W﻿ / ﻿1.40694°S 78.75000°W,Unknown,Mainland,"[1.40694, S, 78.75000, W]",-1.40694,S,-78.75,W
4,Cayambe,5790,18991,0°01′44″N 77°59′10″W﻿ / ﻿0.029°N 77.986°W,1786,Mainland,"[0.029, N, 77.986, W]",0.029,N,-77.986,W


In [None]:
#print([(a, b) for a, b in zip(ecu_volc['Lon'], ecu_volc['Lon_char'])])

[(-78.42, 'W'), (-78.14, 'W'), (-78.617, 'W'), (-78.75, 'W'), (-77.986, 'W'), (-78.25, 'W'), (-77.95083, 'W'), (-78.8175, 'W'), (-77.72, 'W'), (-78.34917, 'W'), (-78.436, 'W'), (-78.36389, 'W'), (-78.14, 'W'), (-78.714, 'W'), (-78.18, 'W'), (-78.613, 'W'), (-78.27, 'W'), (-78.617, 'W'), (-77.72, 'W'), (-78.48, 'W'), (-78.598, 'W'), (-78.37, 'W'), (-78.463, 'W'), (-78.9, 'W'), (-77.65, 'W'), (-78.53333, 'W'), (-78.34, 'W'), (-78.3665806, 'W'), (-77.58, 'W'), (-77.62, 'W'), (-78.613, 'W'), (-78.442, 'W'), (-78.33389, 'W'), (-91.12, 'W'), (-91.42, 'W'), (-91.28, 'W'), (-91.546, 'W'), (-91.55, 'W'), (-90.45, 'W'), (-89.958, 'W'), (-90.47, 'W'), (-90.75, 'W'), (-89.5, 'W'), (-90.33, 'W'), (-90.77, 'W'), (-91.17, 'W'), (-91.35, 'W')]


In [None]:
#Checking for missing values in the dataframe
#Counts NaN/None per column only; the string placeholder 'NA' that split_coord
#can return is an ordinary value and is not counted here
ecu_volc.isna().sum()

Name             0
Meters           0
Feet             0
Coordinates      0
Last Eruption    0
Category         0
String_coord     0
Lat              0
Lat_char         0
Lon              0
Lon_char         0
dtype: int64

In [None]:
#Exporting to csv in local disk
from google.colab import files
#Step 1: write the cleaned dataframe to a file in the current working directory
ecu_volc.to_csv('volcanoes_ecuador.csv', index=False) #==> Excluding index from file
#Step 2: pass that file to google.colab's files.download
files.download('volcanoes_ecuador.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Data Visualization

### Uploading data

In [1]:
#Importing data manipulation libraries
import numpy as np
import pandas as pd

In [2]:
#Uploading visualization libraries
import matplotlib.pyplot as plt
from ipywidgets import interact
import plotly.express as px
import plotly.graph_objects as go
import folium
#from folium import plugins
#from folium.plugins import HeatMap

In [None]:
#Remove previous versions of the uploaded file
!rm volcanoes_ecuador.csv

In [3]:
#Uploading file from local drive
from google.colab import files
#The result is indexed by file name further down ('volcanoes_ecuador.csv')
uploaded1 = files.upload()

Saving volcanoes_ecuador.csv to volcanoes_ecuador.csv


In [4]:
#Storing dataset in a Pandas Dataframe
import io
#The uploaded content is looked up by its exact file name and wrapped in BytesIO
#so that read_csv can consume it like a file
ecu_volc = pd.read_csv(io.BytesIO(uploaded1['volcanoes_ecuador.csv']))

In [5]:
#Checking the dataframe info (column names, non-null counts and dtypes)
ecu_volc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47 entries, 0 to 46
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Name           47 non-null     object 
 1   Meters         47 non-null     int64  
 2   Feet           47 non-null     float64
 3   Coordinates    47 non-null     object 
 4   Last Eruption  47 non-null     object 
 5   Category       47 non-null     object 
 6   String_coord   47 non-null     object 
 7   Lat            47 non-null     float64
 8   Lat_char       47 non-null     object 
 9   Lon            47 non-null     float64
 10  Lon_char       47 non-null     object 
dtypes: float64(3), int64(1), object(7)
memory usage: 4.2+ KB


In [36]:
#Keeping only the columns used by the maps, in this order
ecu_volc = ecu_volc.loc[:, ['Category', 'Name', 'Lat', 'Lon', 'Meters']]

In [37]:
#Checking the dataframe tail (last rows of the reduced, five-column frame)
ecu_volc.tail()

Unnamed: 0,Category,Name,Lat,Lon,Meters
42,Galapagos,San Cristóbal,-0.88,-89.5,759
43,Galapagos,Santa Cruz,-0.62,-90.33,964
44,Galapagos,Santiago,-0.22,-90.77,920
45,Galapagos,Sierra Negra,-0.83,-91.17,1124
46,Galapagos,Wolf,0.02,-91.35,1710


### Plotting volcanoes using Plotly Graph Objects

In [41]:
#Plotting interactive map of volcanoes
@interact(Cat=ecu_volc['Category'].unique())
def plot_graph(Cat):
  """Build a scatter-geo figure of the volcanoes belonging to one category.

  Parameters
  ----------
  Cat : value taken from ecu_volc['Category']
      Category to display; chosen through the widget created by @interact.

  Returns
  -------
  plotly.graph_objects.Figure
      Markers are sized and coloured by 'Meters'; the view is zoomed
      (projection_scale=30) and centred according to the category.
  """
  # 'Mainland' has its own centre; every other category uses the second centre
  if Cat == 'Mainland':
    view_center = dict(lat=-0.83, lon=-78.18)
  else:
    view_center = dict(lat=-0.95, lon=-90.97)

  selected = ecu_volc.loc[ecu_volc['Category'] == Cat].reset_index()
  scatter = px.scatter_geo(data_frame=selected, lat='Lat', lon='Lon',
                           size='Meters', color='Meters',
                           title='Volcanoes in Ecuador')
  fig = go.Figure(scatter)
  fig.update_layout(geo = dict(projection_scale=30, center=view_center))
  return fig

interactive(children=(Dropdown(description='Cat', options=('Mainland', 'Galapagos'), value='Mainland'), Output…

### Plotting volcanoes using Folium

In [29]:
#Creating a function to generate a basemap
def generate_basemap(default_loc=[-1.83, -76.18], default_zoom=7):
    """Create an empty folium map.

    Parameters
    ----------
    default_loc : list of two numbers
        Passed to folium.Map as `location`.
        NOTE(review): plot_graph centres the mainland view on (-0.83, -78.18);
        confirm that the different default used here is intended.
    default_zoom : int
        Passed to folium.Map as `zoom_start`.

    Returns
    -------
    folium.Map
        A new map with no markers added.
    """
    return folium.Map(location=default_loc, zoom_start=default_zoom)

In [46]:
#Plotting first basemap for mainland volcanoes
basemap_1 = generate_basemap()
df1_ecu = ecu_volc[ecu_volc['Category']=='Mainland'].copy()
#One marker per volcano: the popup shows the name, the tooltip shows the height in Meters
for lat, lon, name, meters in zip(df1_ecu['Lat'], df1_ecu['Lon'],
                                  df1_ecu['Name'], df1_ecu['Meters']):
   marker = folium.Marker(location=[lat, lon], popup=name, tooltip=meters)
   marker.add_to(basemap_1)
basemap_1

In [47]:
#Plotting second basemap for galapagos volcanoes
basemap_2 = generate_basemap([-0.95, -90.97], 8)
df1_gal = ecu_volc[ecu_volc['Category']=='Galapagos'].copy()
#One marker per volcano: the popup shows the name, the tooltip shows the height in Meters
for lat, lon, name, meters in zip(df1_gal['Lat'], df1_gal['Lon'],
                                  df1_gal['Name'], df1_gal['Meters']):
   marker = folium.Marker(location=[lat, lon], popup=name, tooltip=meters)
   marker.add_to(basemap_2)
basemap_2