In [1]:
from pyspark import SparkContext
from pyspark.sql import SparkSession, SQLContext
import plotly.graph_objects as graph_objects
import findspark

# Let findspark locate the local Spark installation.
# NOTE(review): findspark.init() is normally called *before* `import pyspark`;
# here pyspark is already imported above, so this call is likely redundant --
# confirm against the runtime environment.
findspark.init()

# Build (or reuse) the session through the builder. Constructing
# SparkContext('local', 'R1_A3') directly raises
# "Cannot run multiple SparkContexts at once" whenever this cell is re-run
# in a kernel that already holds a context; getOrCreate() makes the cell
# idempotent while still using the same master and application name on a
# fresh kernel.
spark_session = (
    SparkSession.builder
    .master('local')
    .appName('R1_A3')
    .getOrCreate()
)

# Derive the context and the SQLContext from that single session so all three
# handles used by the following cells point at the same Spark application.
spark_context = spark_session.sparkContext
sqlContext = SQLContext(spark_context, spark_session)



In [2]:
# Positions of the two fields kept from every CSV record.
# NOTE(review): judging by the aggregated output, index 7 holds the country
# name and index 5 the number of cases -- confirm against the CSV header.
COUNTRY_IDX = 7
CASES_IDX = 5

# Read the dataset with Spark's built-in CSV reader (the
# 'com.databricks.spark.csv' name is the legacy Spark 1.x external package).
# The first line is used as header and column types are inferred.
text_file = sqlContext.read.csv(
    './work/data/Covid19.csv',
    header=True,
    inferSchema=True,
    quote='"',
    sep=',',
)

# Convert each Row into a plain tuple so fields can be addressed by position.
rddfiltro = text_file.rdd.map(tuple)

# Project every record to a (country, cases) pair for the later aggregation.
rddGen = rddfiltro.map(lambda word: (word[COUNTRY_IDX], word[CASES_IDX]))
# rddGen.take(10)

In [3]:
# Add up the second element of every pair that shares the same key,
# producing one (key, total) pair per distinct key.
rddConteo = rddGen.reduceByKey(lambda acumulado, casos: acumulado + casos)

# Bring the aggregated pairs to the driver and show them.
conteo_total = rddConteo.collect()
print(f'Conteo total -> {conteo_total}')

Conteo total -> [('Afghanistan', 31391), ('Antigua_and_Barbuda', 1), ('Argentina', 622921), ('Armenia', 47431), ('Aruba', 3551), ('Australia', 26898), ('Austria', 38557), ('Azerbaijan', 39188), ('Bahamas', 3370), ('Bahrain', 64499), ('Bangladesh', 348916), ('Barbados', 189), ('Belarus', 75674), ('Belgium', 102201), ('Belize', 1627), ('Benin', 2280), ('Bermuda', 180), ('Bhutan', 261), ('Bolivia', 130676), ('Bonaire, Saint Eustatius and Saba', 36), ('Bosnia_and_Herzegovina', 25424), ('Botswana', 2567), ('Brazil', 4544629), ('British_Virgin_Islands', 71), ('Brunei_Darussalam', 145), ('Bulgaria', 18863), ('Burkina_Faso', 1846), ('Burundi', 473), ('Cambodia', 275), ('Cameroon', 20431), ('Canada', 143649), ('Cape_Verde', 5257), ('Cases_on_an_international_conveyance_Japan', 696), ('Cayman_Islands', 209), ('Central_African_Republic', 4793), ('Chad', 1151), ('Chile', 446274), ('China', 90369), ('Colombia', 765076), ('Comoros', 470), ('Congo', 4986), ('Costa_Rica', 63712), ('Cote_dIvoire', 1932

In [4]:
# Countries to report on; each entry is compared against the key of the
# aggregated pairs.
filtros = ['Cuba', 'France', 'Canada', 'Singapore', 'South_Korea']

# Keep only the selected countries. The previous take(5) + parallelize()
# round trip pulled the data to the driver and hard-coded the list length,
# so adding a sixth country to `filtros` would silently drop one result.
# Filtering on the cluster has no such cap. sortByKey() gives a deterministic
# (alphabetical) order so that separate collect() calls on RDDs derived from
# this one stay aligned, and cache() avoids recomputing the whole lineage on
# each of those actions.
rddOrden = (
    rddConteo
    .filter(lambda a: a[0] in filtros)
    .sortByKey()
    .cache()
)
print(f'Total de casos de COVID-19 en {filtros} -> {rddOrden.collect()}')

Total de casos de COVID-19 en ['Cuba', 'France', 'Canada', 'Singapore', 'South_Korea'] -> [('Canada', 143649), ('Cuba', 5091), ('France', 453763), ('Singapore', 57576), ('South_Korea', 23045)]


In [5]:
# Split the (name, total) pairs into two RDDs: one with the first element of
# each pair and one with the second.
rddNombres = rddOrden.keys()
rddTotales = rddOrden.values()

# Show both projections; names first, then totals.
print(rddNombres.collect())

print(rddTotales.collect())

['Canada', 'Cuba', 'France', 'Singapore', 'South_Korea']
[143649, 5091, 453763, 57576, 23045]


In [7]:
# Materialise the axis data on the driver: names on x, totals on y.
nombres = rddNombres.collect()
totales = rddTotales.collect()

# Single bar trace wrapped in a figure.
barras = graph_objects.Bar(x=nombres, y=totales)
graph = graph_objects.Figure(data=barras)

# Figure title and axis captions, with their font sizes.
graph.update_layout(
    title_text='Total de casos de COVID-19',
    title_font_size=30,
    yaxis={'title': 'No. Casos', 'title_font_size': 25},
    xaxis={'title': 'País', 'title_font_size': 25},
)

# Render the bars semi-transparent, replacing any existing marker settings.
graph.update_traces(overwrite=True, marker={"opacity": 0.5})

# Export the figure as a standalone HTML report and open it when done.
graph.write_html('./work/reports/R1_A3.html', auto_open=True)