# Generating Visualisations with Plotly and Integrating IBM Cloud Object Storage and Apache Spark with the ibmos2spark Package

### 1 - Setup and Credentials Configuration

#### 1.1 - Dependencies and Spark context initialization

In [51]:
# Imports

import os
from getpass import getpass

from IPython.display import Image, HTML, display
import plotly as py
import plotly.graph_objs as go
import pandas as pd
import requests
requests.packages.urllib3.disable_warnings()

In [52]:
# Spark SQL context initialization

from pyspark.sql import SQLContext

# `sc` is not defined anywhere in this notebook; it is presumably the SparkContext
# injected by the Spark kernel -- confirm for your environment.
sqlContext = SQLContext(sc)

# Later cells refer to the SQL context as `sqlCtx`. Bind that name explicitly so the
# notebook survives "Restart & Run All" without relying on a kernel-provided global.
sqlCtx = sqlContext

#### 1.2 - Plotly credentials configuration

In [53]:
# Plotly credentials are taken from the PLOTLY_USERNAME and PLOTLY_API_KEY environment
# variables. If either one is unset you are prompted for it (the API key prompt does not
# echo), so the secret never has to be typed into a cell and saved with the notebook.

plotly_username = os.environ.get('PLOTLY_USERNAME') or input('Plotly username: ')
plotly_api_key = os.environ.get('PLOTLY_API_KEY') or getpass('Plotly API key: ')

# NOTE: set_credentials_file stores both values in Plotly's local credentials file.
py.tools.set_credentials_file(username=plotly_username, api_key=plotly_api_key)

#### 1.3 - Data extraction from .csv file on Cloud Object Storage to a SparkSession DataFrame

In [None]:
# Placeholder cell: replace the string below with the data-access code generated by the notebook UI.
#
# Click on this cell;
# Click on "Find and add Data" (top-right corner);
# Select desired .csv file (enem2016data.csv1.csv) and choose: "Insert as SparkSession Dataframe".
#
# The "Data Preparation" section reads the inserted DataFrame through the name `df_data_2`;
# if the generated code assigns a different variable name, rename it to `df_data_2`.

'<INSERT GENERATED CODE HERE>'

### 2 - Data Preparation

In [17]:
# Keep only the columns used by the rest of the notebook and preview the first rows.

selected_columns = [
    'TP_SEXO', 'Q001', 'Q002', 'TP_ESCOLA', 'Q006',
    'NU_NOTA_REDACAO', 'NU_NOTA_CN', 'NU_NOTA_CH', 'NU_NOTA_LC', 'NU_NOTA_MT',
]
df = df_data_2.select(*selected_columns)
df.show(5)

+-------+----+----+---------+----+---------------+----------+----------+----------+----------+
|TP_SEXO|Q001|Q002|TP_ESCOLA|Q006|NU_NOTA_REDACAO|NU_NOTA_CN|NU_NOTA_CH|NU_NOTA_LC|NU_NOTA_MT|
+-------+----+----+---------+----+---------------+----------+----------+----------+----------+
|      M|   E|   E|        1|   B|            580|       550|     629.4|       574|     462.9|
|      M|   H|   H|        1|   D|            560|     576.6|     669.8|     610.1|     423.6|
|      M|   B|   E|        1|   D|            480|     485.8|     576.9|     637.4|     414.6|
|      M|   E|   E|        1|   B|            620|     571.4|     628.6|     646.2|     709.4|
|      F|   A|   A|        1|   A|           null|      null|      null|      null|      null|
+-------+----+----+---------+----+---------------+----------+----------+----------+----------+
only showing top 5 rows



In [18]:
# Drop every row that contains at least one null value, reporting the row count
# before and after so the amount of discarded data is visible.

rows_before = df.count()
print("Rows: ", rows_before)

df = df.dropna()

rows_after = df.count()
print("Rows: ", rows_after)

Rows:  9999
Rows:  7201


In [20]:
# Build the Plotly histogram inputs: each exam score column is pulled out of the Spark
# DataFrame with a SQL query, converted to pandas and wrapped in a Histogram trace.

def _select_score(column, alias):
    """Return a one-column Spark DataFrame with `column` of DFasTABLE renamed to `alias`."""
    return sqlCtx.sql("SELECT {} as {} from DFasTABLE".format(column, alias))


def _histogram_traces(score_df, alias):
    """Convert `score_df` to pandas and return a one-element list with its Histogram trace."""
    return [go.Histogram(x = score_df.toPandas()[alias])]


# Register `df` under the table name that the SQL queries (here and in later cells) use.
sqlCtx.registerDataFrameAsTable(df, "DFasTABLE")

df_rd = _select_score("NU_NOTA_REDACAO", "rd")
df_cn = _select_score("NU_NOTA_CN", "cn")
df_mt = _select_score("NU_NOTA_MT", "mt")
df_ch = _select_score("NU_NOTA_CH", "ch")
df_lc = _select_score("NU_NOTA_LC", "lc")

data_rd = _histogram_traces(df_rd, "rd")
data_cn = _histogram_traces(df_cn, "cn")
data_mt = _histogram_traces(df_mt, "mt")
data_ch = _histogram_traces(df_ch, "ch")
data_lc = _histogram_traces(df_lc, "lc")

### 3 - Visualisations: Histograms and Pie Charts

In [23]:
# Histogram of the text writing exam scores (trace built from NU_NOTA_REDACAO).
layout = go.Layout(
    title = "Histogram: ENEM 2016 Text Writing Exam",
    xaxis = dict(
        title = "Score",
        range = [0, 1000]  # same fixed range as the other score histograms
    ),
    yaxis = dict(
        title = "Number of candidates"
    )
)

fig = go.Figure(data = data_rd, layout = layout)
# iplot sends the figure to the Plotly service (credentials required) and embeds it inline.
py.plotly.iplot(fig, filename="enem2016_redacao")

In [24]:
# Histogram of the natural sciences exam scores (trace built from NU_NOTA_CN).
layout = go.Layout(
    title = "Histogram: ENEM 2016 Biology, Chemistry and Physics Exam",
    xaxis = dict(
        title = "Score",
        range = [0, 1000]  # same fixed range as the other score histograms
    ),
    yaxis = dict(
        title = "Number of candidates"
    )
)

fig = go.Figure(data = data_cn, layout = layout)
# iplot sends the figure to the Plotly service (credentials required) and embeds it inline.
py.plotly.iplot(fig, filename="enem2016_ciencias_da_natureza")

In [25]:
# Histogram of the mathematics exam scores (trace built from NU_NOTA_MT).
layout = go.Layout(
    title = "Histogram: ENEM 2016 Mathematics Exam",
    xaxis = dict(
        title = "Score",
        range = [0, 1000]  # same fixed range as the other score histograms
    ),
    yaxis = dict(
        title = "Number of candidates"
    )
)

fig = go.Figure(data = data_mt, layout = layout)
# iplot sends the figure to the Plotly service (credentials required) and embeds it inline.
py.plotly.iplot(fig, filename="enem2016_matematica")

In [26]:
# Histogram of the humanities exam scores (trace built from NU_NOTA_CH).
layout = go.Layout(
    title = "Histogram: ENEM 2016 Humanities Exam",
    xaxis = dict(
        title = "Score",
        range = [0, 1000]  # same fixed range as the other score histograms
    ),
    yaxis = dict(
        title = "Number of candidates"
    )
)

fig = go.Figure(data = data_ch, layout = layout)
# iplot sends the figure to the Plotly service (credentials required) and embeds it inline.
py.plotly.iplot(fig, filename="enem2016_ciencias_humanas")

In [27]:
# Histogram of the literature exam scores (trace built from NU_NOTA_LC).
layout = go.Layout(
    title = "Histogram: ENEM 2016 Literature Exam",
    xaxis = dict(
        title = "Score",
        range = [0, 1000]  # same fixed range as the other score histograms
    ),
    yaxis = dict(
        title = "Number of candidates"
    )
)

fig = go.Figure(data = data_lc, layout = layout)
# iplot sends the figure to the Plotly service (credentials required) and embeds it inline.
py.plotly.iplot(fig, filename="enem2016_linguagem_e_comunicacao")

In [29]:
# Pie chart of the family income of the ENEM 2016 candidates.
# 1 SM = one Brazilian minimum salary (330$ USD per month)
#
# NOTE(review): the counts below are hard-coded. They add up to 7201, the row count
# printed after the null rows were dropped, so they were presumably computed from the
# cleaned data -- confirm the source before reusing this cell with another dataset.

income_counts = [138, 1264, 1759, 1021, 740, 484, 536, 394, 250, 129, 82, 45, 54, 95, 82, 60, 68]

# One label per income bracket: '1 SM' ... '17 SM'.
labels = ['{} SM'.format(bracket) for bracket in range(1, len(income_counts) + 1)]
values = list(income_counts)

trace = go.Pie(labels = labels, values = values)

py.plotly.iplot([trace], filename='basic_pie_chart')

In [31]:
# Overlaid histogram comparing the education level of the ENEM 2016 candidates' parents.
# Levels (starting from lowest level of education - A = incomplete basic schooling): A, B, C, D, E, F, G, H

# Q001 is plotted as the father's level and Q002 as the mother's level.
df_q1 = sqlCtx.sql("SELECT Q001 as q1 from DFasTABLE")
df_q2 = sqlCtx.sql("SELECT Q002 as q2 from DFasTABLE")

trace1 = go.Histogram(
    x=df_q1.toPandas()['q1'],
    opacity=0.3,  # translucent so both overlaid distributions stay visible
    name="Father education level"
)
trace2 = go.Histogram(
    x=df_q2.toPandas()['q2'],
    opacity=0.3,
    name="Mother education level"
)

data = [trace1, trace2]
layout = go.Layout(
    title="Histogram: ENEM 2016 Candidates' Parents Education Level",
    barmode='overlay',
    xaxis=dict(
        title="Education level",
        # Categorical axes default to first-appearance order; sort so levels read A -> H.
        categoryorder='category ascending'
    ),
    yaxis=dict(
        title="Number of candidates"
    )
)
fig = go.Figure(data=data, layout=layout)

# iplot sends the figure to the Plotly service (credentials required) and embeds it inline.
py.plotly.iplot(fig, filename='overlaid histogram')

In [33]:
# Pie chart of the gender split of the ENEM 2016 candidates.
#
# NOTE(review): the counts are hard-coded. They add up to 7201, the row count printed
# after the null rows were dropped, so they were presumably computed from the cleaned
# data -- confirm the source before reusing this cell with another dataset.

gender_counts = [('MEN', 2906), ('WOMEN', 4295)]

labels = [gender for gender, _ in gender_counts]
values = [count for _, count in gender_counts]

trace = go.Pie(labels = labels, values = values)

py.plotly.iplot([trace], filename='basic_pie_chart2')