# Análise Exploratória

* Requer `./data/tpr_co2_data.csv`;

* Verificar diretório `integrate` para maiores instruções de instalação.

# Bibliotecas

In [1]:
from pyspark.sql import SparkSession
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import seaborn as sns
import time
import warnings
warnings.filterwarnings('ignore')


pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.



# Leitura

In [None]:
spark = SparkSession.builder.getOrCreate()

data = spark.read.options(header='True').csv('./data/tpr_co2_data.csv')
data.show(20)

# Análise

In [4]:
data.describe().show(100)

+-------+----------+------------------+-----------------+--------+-----------+--------------------+------------------+------------------+
|summary|      date|               lat|              lon|    city|    country|                 tpr|           tpr_unc|               co2|
+-------+----------+------------------+-----------------+--------+-----------+--------------------+------------------+------------------+
|  count|   6861761|           6861761|          6861761| 6861761|    6861761|             6734803|           6734803|           6808706|
|   mean|      null|23.001327965194324|32.41362665651364|    null|       null|  17.513312067183502|0.6556305384136641|316.93265450915146|
| stddev|      null|23.212739900958823| 74.7539339596633|    null|       null|  10.109577976111598|0.5142646885087389|30.093303272470383|
|    min|1850-01-01|              -0.8|             -0.0|A Coruña|Afghanistan|-0.00099999999999...|             0.034| 274.9664001464844|
|    max|2013-09-01|              

In [39]:
dt = pd.read_csv('./data/tpr_co2_data.csv')

# Desconsiderando os países que são colônias (e não têm seus nomes marcados)
dt_clear = dt[~dt['country'].isin(
    ['Denmark', 'Antarctica', 'France', 'Europe', 'Netherlands',
     'United Kingdom', 'Africa', 'South America'])]

dt_clear = dt_clear.replace(
   ['Denmark (Europe)', 'France (Europe)', 'Netherlands (Europe)', 'United Kingdom (Europe)'],
   ['Denmark', 'France', 'Netherlands', 'United Kingdom'])

countries = np.unique(dt_clear['country'])
mean_temp = []
for country in countries:
    mean_temp.append(dt_clear[dt_clear['country'] == country]['tpr'].mean())
    
data_fig = [ dict(
        type = 'choropleth',
        locations = countries,
        z = mean_temp,
        locationmode = 'country names',
        text = countries,
        marker = dict(
            line = dict(color = 'rgb(0,0,0)', width = 1)),
            colorbar = dict(autotick = True, tickprefix = '', 
            title = '# Temperatura\nMédia,\n°C')
            )
       ]

layout = dict(
    title = 'Temperatura Média por País de 1850 a 2013',
    geo = dict(
        showframe = False,
        showocean = True,
        oceancolor = 'rgb(0,255,255)',
        projection = dict(
        type = 'orthographic',
            rotation = dict(
                    lon = 60,
                    lat = 10),
        ),
        lonaxis =  dict(
                showgrid = True,
                gridcolor = 'rgb(102, 102, 102)'
            ),
        lataxis = dict(
                showgrid = True,
                gridcolor = 'rgb(102, 102, 102)'
                )
            ),
        )

fig = dict(data=data_fig, layout=layout)
py.iplot(fig, validate=False, filename='worldmap')

In [47]:
anos = np.unique(dt_clear['date'].str.slice(stop=4))
mean_temp_world = []
mean_co2_world = []
for ano in anos:
    mean_temp_world.append(dt_clear[dt_clear['date'].str.slice(stop=4) == ano]['tpr'].mean())
    mean_co2_world.append(dt_clear[dt_clear['date'].str.slice(stop=4) == ano]['co2'].mean())

trace0 = go.Scatter(
    x = anos, 
    y = mean_temp_world,
    name='Temperatura Média',
    line=dict(
        color='rgb(199, 121, 093)',
    )
)
data = [trace0]

layout = go.Layout(
    xaxis=dict(title='Ano'),
    yaxis=dict(title='Temperatura Média, °C'),
    title='Temperatura Média Mundial',
    showlegend = False)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig)

In [45]:
mean_co2 = []
for country in countries:
    mean_co2.append(dt_clear[dt_clear['country'] == country]['co2'].mean())
    
data_fig = [ dict(
        type = 'choropleth',
        locations = countries,
        z = mean_co2,
        locationmode = 'country names',
        text = countries,
        marker = dict(
            line = dict(color = 'rgb(0,0,0)', width = 1)),
            colorbar = dict(autotick = True, tickprefix = '', 
            title = '# CO2\nMédio,\nppm')
            )
       ]

layout = dict(
    title = 'CO2 Médio por País de 1850 a 2013',
    geo = dict(
        showframe = False,
        showocean = True,
        oceancolor = 'rgb(0,255,255)',
        projection = dict(
        type = 'orthographic',
            rotation = dict(
                    lon = 60,
                    lat = 10),
        ),
        lonaxis =  dict(
                showgrid = True,
                gridcolor = 'rgb(102, 102, 102)'
            ),
        lataxis = dict(
                showgrid = True,
                gridcolor = 'rgb(102, 102, 102)'
                )
            ),
        )

fig = dict(data=data_fig, layout=layout)
py.iplot(fig, validate=False, filename='worldmap')

In [48]:
trace0 = go.Scatter(
    x = anos, 
    y = mean_co2_world,
    name='CO2 Médio',
    line=dict(
        color='rgb(199, 121, 093)',
    )
)
data = [trace0]

layout = go.Layout(
    xaxis=dict(title='Ano'),
    yaxis=dict(title='CO2 Médio, ppm'),
    title='CO2 Médio Mundial',
    showlegend = False)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig)