# PROYECTO BIG DATA PROCESSING

### Análisis de la felicidad en el mundo

In [1]:
!pip install findspark



In [2]:
!pip install pyspark



In [208]:
import findspark
findspark.init()

import pandas as pd

import pyspark
from pyspark.sql import functions as f
from pyspark.sql.types import *
from pyspark.sql.functions import col, max, avg, first, dense_rank, rank, row_number
from pyspark.sql import Window

import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

### Inicio de sesión en Spark

In [11]:
from pyspark.sql import SparkSession

# Configure a session against the `local[*]` master and reuse an existing one if it is already running
session_builder = SparkSession.builder.master("local[*]").appName('PySpark_Tutorial')
spark = session_builder.getOrCreate()

### Carga de datos felicidad en el año 2021

In [9]:
# First load of the 2021 report, before designing a schema, to inspect how Spark reads the columns
data_happy_21 = (
    spark.read
    .option('sep', ',')
    .option('header', True)
    .csv('/home/lucia/BigDataProcessing/Datasets/world-happiness-report-2021.csv')
)

data_happy_21.printSchema()

root
 |-- Country name: string (nullable = true)
 |-- Regional indicator: string (nullable = true)
 |-- Ladder score: string (nullable = true)
 |-- Standard error of ladder score: string (nullable = true)
 |-- upperwhisker: string (nullable = true)
 |-- lowerwhisker: string (nullable = true)
 |-- Logged GDP per capita: string (nullable = true)
 |-- Social support: string (nullable = true)
 |-- Healthy life expectancy: string (nullable = true)
 |-- Freedom to make life choices: string (nullable = true)
 |-- Generosity: string (nullable = true)
 |-- Perceptions of corruption: string (nullable = true)
 |-- Ladder score in Dystopia: string (nullable = true)
 |-- Explained by: Log GDP per capita: string (nullable = true)
 |-- Explained by: Social support: string (nullable = true)
 |-- Explained by: Healthy life expectancy: string (nullable = true)
 |-- Explained by: Freedom to make life choices: string (nullable = true)
 |-- Explained by: Generosity: string (nullable = true)
 |-- Explained 

In [17]:
# Explicit schema for the 2021 report, because the plain load reads every column as a string.
# The first two columns are text; all remaining columns are nullable doubles.
_text_columns_21 = ['Country name', 'Regional indicator']
_numeric_columns_21 = [
    'Ladder score',
    'Standard error of ladder score',
    'upperwhisker',
    'lowerwhisker',
    'Logged GDP per capita',
    'Social support',
    'Healthy life expectancy',
    'Freedom to make life choices',
    'Generosity',
    'Perceptions of corruption',
    'Ladder score in Dystopia',
    'Explained by: Log GDP per capita',
    'Explained by: Social support',
    'Explained by: Healthy life expectancy',
    'Explained by: Freedom to make life choices',
    'Explained by: Generosity',
    'Explained by: Perceptions of corruption',
    'Dystopia + residual',
]

data21_schema = (
    [StructField(name, StringType(), True) for name in _text_columns_21]
    + [StructField(name, DoubleType(), True) for name in _numeric_columns_21]
)
final_struc_21 = StructType(fields=data21_schema)

In [18]:
# Reload the 2021 report, this time applying the explicit schema so numeric columns are doubles
data_happy_21 = (
    spark.read
    .schema(final_struc_21)
    .option('sep', ',')
    .option('header', True)
    .csv('/home/lucia/BigDataProcessing/Datasets/world-happiness-report-2021.csv')
)

data_happy_21.printSchema()

root
 |-- Country name: string (nullable = true)
 |-- Regional indicator: string (nullable = true)
 |-- Ladder score: double (nullable = true)
 |-- Standard error of ladder score: double (nullable = true)
 |-- upperwhisker: double (nullable = true)
 |-- lowerwhisker: double (nullable = true)
 |-- Logged GDP per capita: double (nullable = true)
 |-- Social support: double (nullable = true)
 |-- Healthy life expectancy: double (nullable = true)
 |-- Freedom to make life choices: double (nullable = true)
 |-- Generosity: double (nullable = true)
 |-- Perceptions of corruption: double (nullable = true)
 |-- Ladder score in Dystopia: double (nullable = true)
 |-- Explained by: Log GDP per capita: double (nullable = true)
 |-- Explained by: Social support: double (nullable = true)
 |-- Explained by: Healthy life expectancy: double (nullable = true)
 |-- Explained by: Freedom to make life choices: double (nullable = true)
 |-- Explained by: Generosity: double (nullable = true)
 |-- Explained 

In [22]:
# Sanity check of the loaded file: preview the first five rows

data_happy_21.head(5)

[Row(Country name='Finland', Regional indicator='Western Europe', Ladder score=7.842, Standard error of ladder score=0.032, upperwhisker=7.904, lowerwhisker=7.78, Logged GDP per capita=10.775, Social support=0.954, Healthy life expectancy=72.0, Freedom to make life choices=0.949, Generosity=-0.098, Perceptions of corruption=0.186, Ladder score in Dystopia=2.43, Explained by: Log GDP per capita=1.446, Explained by: Social support=1.106, Explained by: Healthy life expectancy=0.741, Explained by: Freedom to make life choices=0.691, Explained by: Generosity=0.124, Explained by: Perceptions of corruption=0.481, Dystopia + residual=3.253),
 Row(Country name='Denmark', Regional indicator='Western Europe', Ladder score=7.62, Standard error of ladder score=0.035, upperwhisker=7.687, lowerwhisker=7.552, Logged GDP per capita=10.933, Social support=0.954, Healthy life expectancy=72.7, Freedom to make life choices=0.946, Generosity=0.03, Perceptions of corruption=0.179, Ladder score in Dystopia=2.

In [23]:
# Sanity check of the loaded file: preview the last five rows
data_happy_21.tail(5)

[Row(Country name='Lesotho', Regional indicator='Sub-Saharan Africa', Ladder score=3.512, Standard error of ladder score=0.12, upperwhisker=3.748, lowerwhisker=3.276, Logged GDP per capita=7.926, Social support=0.787, Healthy life expectancy=48.7, Freedom to make life choices=0.715, Generosity=-0.131, Perceptions of corruption=0.915, Ladder score in Dystopia=2.43, Explained by: Log GDP per capita=0.451, Explained by: Social support=0.731, Explained by: Healthy life expectancy=0.007, Explained by: Freedom to make life choices=0.405, Explained by: Generosity=0.103, Explained by: Perceptions of corruption=0.015, Dystopia + residual=1.8),
 Row(Country name='Botswana', Regional indicator='Sub-Saharan Africa', Ladder score=3.467, Standard error of ladder score=0.074, upperwhisker=3.611, lowerwhisker=3.322, Logged GDP per capita=9.782, Social support=0.784, Healthy life expectancy=59.269, Freedom to make life choices=0.824, Generosity=-0.246, Perceptions of corruption=0.801, Ladder score in D

In [27]:
# Number of rows loaded from the 2021 report
data_happy_21.count()

149

### Carga de datos felicidad hasta el año 2020

In [19]:
# First load of the multi-year report, without a schema, to inspect how Spark reads the columns
data_happy_years = (
    spark.read
    .option('sep', ',')
    .option('header', True)
    .csv('/home/lucia/BigDataProcessing/Datasets/world-happiness-report.csv')
)

data_happy_years.printSchema()

root
 |-- Country name: string (nullable = true)
 |-- year: string (nullable = true)
 |-- Life Ladder: string (nullable = true)
 |-- Log GDP per capita: string (nullable = true)
 |-- Social support: string (nullable = true)
 |-- Healthy life expectancy at birth: string (nullable = true)
 |-- Freedom to make life choices: string (nullable = true)
 |-- Generosity: string (nullable = true)
 |-- Perceptions of corruption: string (nullable = true)
 |-- Positive affect: string (nullable = true)
 |-- Negative affect: string (nullable = true)



In [35]:
# Explicit schema for the multi-year report, because the plain load reads every column as a string.
# Each entry is (column name, Spark type); every field is declared nullable.
_years_columns = [
    ('Country name', StringType()),
    ('year', IntegerType()),
    ('Life Ladder', DoubleType()),
    ('Log GDP per capita', DoubleType()),
    ('Social support', DoubleType()),
    ('Healthy life expectancy at birth', DoubleType()),
    ('Freedom to make life choices', DoubleType()),
    ('Generosity', DoubleType()),
    ('Perceptions of corruption', DoubleType()),
    ('Positive affect', DoubleType()),
    ('Negative affect', DoubleType()),
]
data_years_schema = [StructField(name, dtype, True) for name, dtype in _years_columns]

final_struc_years = StructType(fields=data_years_schema)

In [36]:
# Reload the multi-year report with the explicit schema so `year` is an integer and indicators are doubles
data_happy_years = (
    spark.read
    .schema(final_struc_years)
    .option('sep', ',')
    .option('header', True)
    .csv('/home/lucia/BigDataProcessing/Datasets/world-happiness-report.csv')
)

data_happy_years.printSchema()

root
 |-- Country name: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- Life Ladder: double (nullable = true)
 |-- Log GDP per capita: double (nullable = true)
 |-- Social support: double (nullable = true)
 |-- Healthy life expectancy at birth: double (nullable = true)
 |-- Freedom to make life choices: double (nullable = true)
 |-- Generosity: double (nullable = true)
 |-- Perceptions of corruption: double (nullable = true)
 |-- Positive affect: double (nullable = true)
 |-- Negative affect: double (nullable = true)



In [37]:
# Sanity check of the loaded data: preview the first five rows
data_happy_years.head(5)

[Row(Country name='Afghanistan', year=2008, Life Ladder=3.724, Log GDP per capita=7.37, Social support=0.451, Healthy life expectancy at birth=50.8, Freedom to make life choices=0.718, Generosity=0.168, Perceptions of corruption=0.882, Positive affect=0.518, Negative affect=0.258),
 Row(Country name='Afghanistan', year=2009, Life Ladder=4.402, Log GDP per capita=7.54, Social support=0.552, Healthy life expectancy at birth=51.2, Freedom to make life choices=0.679, Generosity=0.19, Perceptions of corruption=0.85, Positive affect=0.584, Negative affect=0.237),
 Row(Country name='Afghanistan', year=2010, Life Ladder=4.758, Log GDP per capita=7.647, Social support=0.539, Healthy life expectancy at birth=51.6, Freedom to make life choices=0.6, Generosity=0.121, Perceptions of corruption=0.707, Positive affect=0.618, Negative affect=0.275),
 Row(Country name='Afghanistan', year=2011, Life Ladder=3.832, Log GDP per capita=7.62, Social support=0.521, Healthy life expectancy at birth=51.92, Free

In [39]:
# Sanity check of the loaded data: preview the last five rows
data_happy_years.tail(5)

[Row(Country name='Zimbabwe', year=2016, Life Ladder=3.735, Log GDP per capita=7.984, Social support=0.768, Healthy life expectancy at birth=54.4, Freedom to make life choices=0.733, Generosity=-0.095, Perceptions of corruption=0.724, Positive affect=0.738, Negative affect=0.209),
 Row(Country name='Zimbabwe', year=2017, Life Ladder=3.638, Log GDP per capita=8.016, Social support=0.754, Healthy life expectancy at birth=55.0, Freedom to make life choices=0.753, Generosity=-0.098, Perceptions of corruption=0.751, Positive affect=0.806, Negative affect=0.224),
 Row(Country name='Zimbabwe', year=2018, Life Ladder=3.616, Log GDP per capita=8.049, Social support=0.775, Healthy life expectancy at birth=55.6, Freedom to make life choices=0.763, Generosity=-0.068, Perceptions of corruption=0.844, Positive affect=0.71, Negative affect=0.212),
 Row(Country name='Zimbabwe', year=2019, Life Ladder=2.694, Log GDP per capita=7.95, Social support=0.759, Healthy life expectancy at birth=56.2, Freedom t

In [40]:
# Number of rows loaded from the multi-year report
data_happy_years.count()

1949

### PREGUNTAS

1. ¿Cuál es el país más “feliz” del 2021 según la data? (considerar que la columna “Ladder score” mayor número más feliz es el país)

In [145]:
# Q1: highest 'Ladder score' per country, sorted best-first; only the top row is kept.
# The result keeps the columns 'Country name' and 'Max_ladder_score'.
ladder_by_country_21 = data_happy_21.groupBy('Country name').agg(
    f.max('Ladder score').alias('Max_ladder_score')
)
max_ladder_21 = ladder_by_country_21.orderBy(f.desc('Max_ladder_score')).limit(1)
max_ladder_21.show(truncate=False)

+------------+----------------+
|Country name|Max_ladder_score|
+------------+----------------+
|Finland     |7.842           |
+------------+----------------+



### El país con mayor nivel de felicidad según el indicador "Ladder score", con un valor de 7.842, es FINLAND.

2. ¿Cuál es el país más “feliz” del 2021 por continente según la data?

In [72]:
# Q2: the "Regional indicator" column is used as the grouping for "continent".
#
# The country shown for each region must come from the same row as the maximum score.
# Aggregating with first('Country name') next to max('Ladder score') does not guarantee that:
# first() simply returns whichever row of the group Spark sees first. Ranking rows inside each
# region and keeping rank 1 ties the country to its own score (ties yield one row per tied country).
region_by_score = Window.partitionBy('Regional indicator').orderBy(col('Ladder score').desc())

data_happy_21 \
    .withColumn('region_rank', rank().over(region_by_score)) \
    .filter(col('region_rank') == 1) \
    .select(
        'Regional indicator',
        col('Country name').alias('Country'),
        col('Ladder score').alias('Max_ladder_score')
        ) \
    .orderBy(col('Max_ladder_score').desc()) \
    .show(truncate = False)

+----------------------------------+------------------------+----------------+
|Regional indicator                |Country                 |Max_ladder_score|
+----------------------------------+------------------------+----------------+
|Western Europe                    |Finland                 |7.842           |
|North America and ANZ             |New Zealand             |7.277           |
|Middle East and North Africa      |Israel                  |7.157           |
|Latin America and Caribbean       |Costa Rica              |7.069           |
|Central and Eastern Europe        |Czech Republic          |6.965           |
|East Asia                         |Taiwan Province of China|6.584           |
|Southeast Asia                    |Singapore               |6.377           |
|Commonwealth of Independent States|Uzbekistan              |6.179           |
|Sub-Saharan Africa                |Mauritius               |6.049           |
|South Asia                        |Nepal           

### En la tabla previa podemos ver el país por continente con mayor nivel de felicidad según el indicador "Ladder score".

3. ¿Cuál es el país que más veces ocupó el primer lugar en todos los años?

In [147]:
# Q3 (step 1): extend the historical data (up to 2020) with the happiest country of 2021.
# Keep only the columns needed to rank countries per year.
df_happy_yearsG = data_happy_years.select('Country name', 'year', 'Life Ladder')

# Read the country and score from the single-row result of question 1 (`max_ladder_21`).
first_row = max_ladder_21.collect()[0]
country = first_row['Country name']
max_ladder = first_row['Max_ladder_score']

# Print the values. These must be the names assigned just above: printing names that were
# never defined in the notebook raises NameError on a fresh kernel.
print("Country:", country)
print("Max Ladder Score:", max_ladder)

# New record for 2021. A plain tuple is enough for createDataFrame and avoids depending on
# `Row` being in scope.
new_register = (country, 2021, max_ladder)

# Build a one-row DF reusing the schema of the historical DF, so column names and types
# (integer year, double score) match exactly for the union.
df_new_register = spark.createDataFrame([new_register], schema=df_happy_yearsG.schema)

# Append the 2021 record to the historical data
df_happy_yearsGUpdate = df_happy_yearsG.union(df_new_register)

# Show the result
df_happy_yearsGUpdate.show(truncate=False)

Country: Finland
Max Ladder Score: 7.842
+------------+----+-----------+
|Country name|year|Life Ladder|
+------------+----+-----------+
|Afghanistan |2008|3.724      |
|Afghanistan |2009|4.402      |
|Afghanistan |2010|4.758      |
|Afghanistan |2011|3.832      |
|Afghanistan |2012|3.783      |
|Afghanistan |2013|3.572      |
|Afghanistan |2014|3.131      |
|Afghanistan |2015|3.983      |
|Afghanistan |2016|4.22       |
|Afghanistan |2017|2.662      |
|Afghanistan |2018|2.694      |
|Afghanistan |2019|2.375      |
|Albania     |2007|4.634      |
|Albania     |2009|5.485      |
|Albania     |2010|5.269      |
|Albania     |2011|5.867      |
|Albania     |2012|5.51       |
|Albania     |2013|4.551      |
|Albania     |2014|4.814      |
|Albania     |2015|4.607      |
+------------+----+-----------+
only showing top 20 rows



In [148]:
# Q3 (step 2): rank the countries within each year by 'Life Ladder', highest value first,
# so that Ranking == 1 marks the happiest country (or countries, on a tie) of that year.
yearly_ladder_order = Window.partitionBy("year").orderBy(col("Life Ladder").desc())
ranked_data_happy = df_happy_yearsGUpdate.withColumn("Ranking", dense_rank().over(yearly_ladder_order))

# Keep only the first place of every year
first_places = ranked_data_happy.filter(col("Ranking") == 1)

# Count how many times each country was first
wins_by_country = first_places.groupBy("Country name").count()

# Keep every country tied at the highest count instead of hard-coding how many rows to show:
# the number of tied countries depends on the data.
most_wins = wins_by_country.agg(f.max("count").alias("most_wins")).first()["most_wins"]
wins_by_country.filter(col("count") == most_wins).orderBy("Country name").show()


+------------+-----+
|Country name|count|
+------------+-----+
|     Finland|    7|
|     Denmark|    7|
+------------+-----+



### Hay dos países empatados en el primer puesto con 7 veces como primeros en ranking de felicidad 
### FINLAND y DENMARK

4. ¿Qué puesto de Felicidad tiene el país con mayor GDP del 2020?

In [182]:
# Q4: keep only the 2020 rows, with the columns needed to rank happiness and compare GDP
rows_2020 = data_happy_years.filter(col('year') == 2020)
df_happy20GDP = rows_2020.select('Country name', 'year', 'Log GDP per capita', 'Life Ladder')

# Happiness position within the year: rank 1 is the highest 'Life Ladder'
happiness_order_2020 = Window.partitionBy('year').orderBy(f.desc('Life Ladder'))
df_happy20GPD_ranked = df_happy20GDP.withColumn('Ranking', rank().over(happiness_order_2020))

df_happy20GPD_ranked.show()

# Sort by 'Log GDP per capita', highest first, and keep the top row;
# its 'Ranking' column is the happiness position of that country.
df_happy20GDP_sorted = df_happy20GPD_ranked.orderBy(f.desc('Log GDP per capita')).limit(1)

df_happy20GDP_sorted.show()




# Mostrar el DataFrame resultante




+--------------------+----+------------------+-----------+-------+
|        Country name|year|Log GDP per capita|Life Ladder|Ranking|
+--------------------+----+------------------+-----------+-------+
|             Finland|2020|             10.75|      7.889|      1|
|             Iceland|2020|            10.824|      7.575|      2|
|             Denmark|2020|             10.91|      7.515|      3|
|         Switzerland|2020|            11.081|      7.508|      4|
|         Netherlands|2020|            10.901|      7.504|      5|
|              Sweden|2020|            10.838|      7.314|      6|
|             Germany|2020|            10.833|      7.312|      7|
|              Norway|2020|            11.042|       7.29|      8|
|         New Zealand|2020|              10.6|      7.257|      9|
|             Austria|2020|            10.851|      7.213|     10|
|              Israel|2020|            10.538|      7.195|     11|
|           Australia|2020|             10.76|      7.137|    

### El país con mayor GDP en el año 2020 es IRELAND, cuyo puesto en el ranking de la felicidad fue el n.º 13


5. ¿En qué porcentaje ha variado a nivel mundial el GDP promedio del 2020 respecto al 2021? ¿Aumentó o disminuyó?

In [194]:
# En el dataset de happy_years hay valores nulos en Log GPD per capita los eleminamos
data_happy_yearsCleaned = data_happy_years.na.drop(subset=['Log GDP per capita'])


In [195]:


# Q5: world average of the GDP indicator in each year.
# Both source columns are logged GDP per capita ('Logged GDP per capita' / 'Log GDP per capita'),
# so the variation below is computed on the averages of the logged values.
avg_row_21 = data_happy_21.agg(avg(col('Logged GDP per capita')).alias('Avg_2021')).first()
gdp_avg_21 = avg_row_21['Avg_2021']

rows_2020_clean = data_happy_yearsCleaned.filter(col('year')==2020)
avg_row_20 = rows_2020_clean.agg(avg(col('Log GDP per capita')).alias('Avg_2020')).first()
gdp_avg_20 = avg_row_20['Avg_2020']

# Relative change of 2021 with respect to 2020, as a percentage
variacion_21_20 = ((gdp_avg_21-gdp_avg_20)/gdp_avg_20)*100

# Describe the direction of the change
result = (
    "AUMENTÓ" if variacion_21_20 > 0
    else "DISMINUYÓ" if variacion_21_20 < 0
    else "SE MANTUVO IGUAL"
)

print(f"El promedio de GDP en 2021 fue: {gdp_avg_21:.2f}.")
print(f"El promedio de GDP en 2020 fue: {gdp_avg_20:.2f}.")
print(f"El GDP promedio varió en un {variacion_21_20:.2f}% a nivel mundial del 2020 al 2021: {result}.")


El promedio de GDP en 2021 fue: 9.43.
El promedio de GDP en 2020 fue: 9.75.
El GDP promedio varió en un -3.27% a nivel mundial del 2020 al 2021: DISMINUYÓ.


### El promedio del indicador GDP DISMINUYÓ en 2021 con respecto al 2020 en un 3.27% (variación de -3.27%)


6. ¿Cuál es el país con mayor expectativa de vida (“Healthy life expectancy at birth”)? Y ¿Cuánto tenía en ese indicador en el 2019?

In [216]:
# Q6: find the country with the highest healthy life expectancy, then report the value
# that same country had in 2019.

# Country with the highest "Healthy life expectancy" in the 2021 report
healthy21 = data_happy_21.groupBy('Country name') \
    .agg(
        max(col('Healthy life expectancy')).alias('Healthy')
        )\
    .orderBy(col('Healthy').desc()) \
    .limit(1)

# From the data up to 2020 keep only country, year and the indicator
df_happy_yearsH = data_happy_years.select('Country name', 'year', 'Healthy life expectancy at birth')

# Values of the 2021 leader
first_row_Healthy = healthy21.collect()[0]
country_healthy = first_row_Healthy['Country name']
max_ladder_healthy = first_row_Healthy['Healthy']

# New record for 2021. A plain tuple is enough for createDataFrame and avoids depending on
# `Row` being in scope.
new_registerH = (country_healthy, 2021, max_ladder_healthy)

# One-row DF reusing the schema of the historical DF so names and types match for the union
df_new_registerH = spark.createDataFrame([new_registerH], schema=df_happy_yearsH.schema)

# Historical data plus the 2021 record
df_happy_yearsHUpdate = df_happy_yearsH.union(df_new_registerH)

# Row number per year, highest indicator first, to get the maximum of every year
yearly_healthy_order = Window.partitionBy('year').orderBy(col('Healthy life expectancy at birth').desc())
max_healthy_with_rank = df_happy_yearsHUpdate.withColumn('rank', row_number().over(yearly_healthy_order))

# Keep only the row with rank 1 (maximum of each year)
max_healthy_per_year = max_healthy_with_rank.filter(col('rank') == 1)

# Country holding the highest value across all years
top_healthy_row = max_healthy_per_year \
    .orderBy(col('Healthy life expectancy at birth').desc()) \
    .first()
top_healthy_country = top_healthy_row['Country name']

# Look up that country's 2019 value explicitly. Taking the overall maximum row only answers the
# 2019 part of the question when the maximum happens to fall in 2019.
result_df = df_happy_yearsHUpdate \
    .filter((col('Country name') == top_healthy_country) & (col('year') == 2019)) \
    .select('year', 'Country name', 'Healthy life expectancy at birth')

# Show the resulting DataFrame
result_df.show(truncate=False)




+----+------------+--------------------------------+
|year|Country name|Healthy life expectancy at birth|
+----+------------+--------------------------------+
|2019|Singapore   |77.1                            |
+----+------------+--------------------------------+



### El país con la mayor esperanza de vida al nacimiento es Singapur con 77.1 años en el año 2019