### Import required libraries, classes, etc. and initiate spark instance.

In [1]:
# Make the local Spark installation importable before `pyspark` is imported.
# NOTE(review): the path is hard-coded to one machine's layout; adjust it (or
# read it from an environment variable) when running elsewhere.
import findspark
findspark.init('/home/ubuntu/spark-2.1.1-bin-hadoop2.7')
import pyspark
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StringIndexer
#from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression
# NOTE: importing `sum` from pyspark.sql.functions shadows Python's built-in
# `sum` for every cell that runs after this one.
from pyspark.sql.functions import asc, desc, count, sum, mean, corr, col


# Reuse the active SparkSession if there is one, otherwise start one named 'hr'.
spark = SparkSession.builder.appName('hr').getOrCreate()

### Create dataframes by reading CSV files and use first row as header and infer schema

In [2]:
# One DataFrame per report year. header=True takes the column names from the
# first CSV row; inferSchema=True lets Spark infer the column types.
wh2015 = spark.read.csv('WH_2015.csv', header=True, inferSchema=True)
wh2016 = spark.read.csv('WH_2016.csv', header=True, inferSchema=True)
wh2017 = spark.read.csv('WH_20171.csv', header=True, inferSchema=True)

### Inspect data in dataframes

In [3]:
wh2015.describe().toPandas()

Unnamed: 0,summary,Country,Region,Happiness Rank,Happiness Score,Standard Error,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Dystopia Residual
0,count,158,158,158.0,158.0,158.0,158.0,158.0,158.0,158.0,158.0,158.0,158.0
1,mean,,,79.49367088607595,5.375734177215191,0.047884746835443,0.8461372151898726,0.9910459493670888,0.6302593670886079,0.4286149367088611,0.1434218354430379,0.237295506329114,2.098976772151899
2,stddev,,,45.75436310480852,1.145010134952066,0.0171461785569693,0.4031207785379107,0.2723690860079153,0.2470777663021721,0.150692783937678,0.1200340735745592,0.1266849340202053,0.5535497923037985
3,min,Afghanistan,Australia and New Zealand,1.0,2.839,0.01848,0.0,0.0,0.0,0.0,0.0,0.0,0.32858
4,max,Zimbabwe,Western Europe,158.0,7.587,0.13693,1.69042,1.40223,1.02525,0.66973,0.55191,0.79588,3.60214


In [4]:
wh2016.describe().toPandas()

Unnamed: 0,summary,Country,Region,Happiness Rank,Happiness Score,Lower Confidence Interval,Upper Confidence Interval,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Dystopia Residual
0,count,157,157,157.0,157.0,157.0,157.0,157.0,157.0,157.0,157.0,157.0,157.0,157.0
1,mean,,,78.98089171974522,5.382184713375795,5.282394904458597,5.481974522292995,0.9538798089171971,0.7936210828025482,0.5576189808917195,0.3709938853503184,0.1376237579617834,0.2426349044585987,2.3258065605095544
2,stddev,,,45.46603005479287,1.14167351760057,1.1480426137782471,1.1364932228133051,0.4125954051030415,0.2667057461995736,0.2293491914878578,0.1455067674417439,0.111037909918155,0.133755691298121,0.5422199984864142
3,min,Afghanistan,Australia and New Zealand,1.0,2.905,2.732,3.078,0.0,0.0,0.0,0.0,0.0,0.0,0.81789
4,max,Zimbabwe,Western Europe,157.0,7.526,7.46,7.669,1.82427,1.18326,0.95277,0.60848,0.50521,0.81971,3.83772


In [5]:
wh2017.show()

+--------------+--------------+---------------+-------------------------+-------------------------+------------------------+-----------+------------------------+-----------+-----------+-----------------------------+-----------------+
|       Country|Happiness Rank|Happiness Score|Lower Confidence Interval|Upper Confidence Interval|Economy (GDP per Capita)|     Family|Health (Life Expectancy)|    Freedom| Generosity|Trust (Government Corruption)|Dystopia Residual|
+--------------+--------------+---------------+-------------------------+-------------------------+------------------------+-----------+------------------------+-----------+-----------+-----------------------------+-----------------+
|        Norway|             1|    7.537000179|              7.594444821|              7.479555538|             1.616463184| 1.53352356|             0.796666503|0.635422587|0.362012237|                  0.315963835|      2.277026653|
|       Denmark|             2|    7.521999836|              7.5

In [6]:
wh2016.printSchema()

root
 |-- Country: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- Happiness Rank: integer (nullable = true)
 |-- Happiness Score: double (nullable = true)
 |-- Lower Confidence Interval: double (nullable = true)
 |-- Upper Confidence Interval: double (nullable = true)
 |-- Economy (GDP per Capita): double (nullable = true)
 |-- Family: double (nullable = true)
 |-- Health (Life Expectancy): double (nullable = true)
 |-- Freedom: double (nullable = true)
 |-- Trust (Government Corruption): double (nullable = true)
 |-- Generosity: double (nullable = true)
 |-- Dystopia Residual: double (nullable = true)



### Index the feature region using numeric values

In [7]:
# Encode the string `Region` column as a numeric `region_index` column,
# learning the label -> index mapping from the 2015 data.
indexer = StringIndexer(outputCol='region_index', inputCol='Region')
indexed_wh2015 = (
    indexer
    .fit(wh2015)
    .transform(wh2015)
)

In [8]:
# Encode `Region` for the 2016 data with the mapping learned from the 2015
# (training) data. StringIndexer numbers labels by how often they occur, so
# fitting it again on the 2016 frame could give the same region a different
# `region_index` than the one the models are trained on.
# With the default handleInvalid='error', a Region that never appears in the
# 2015 data raises at transform time instead of being silently mis-encoded.
indexer = StringIndexer(inputCol='Region', outputCol='region_index')
indexed_wh2016 = indexer.fit(wh2015).transform(wh2016)

In [9]:
# Country -> Region lookup built from the 2016 data. The country column is
# renamed to `tmp_name` so it cannot clash with a `Country` column in a join.
tmp = (
    wh2016
    .select('Region', 'Country')
    .withColumnRenamed('Country', 'tmp_name')
)

In [10]:
# Attach each country's Region (from the 2016 lookup `tmp`) to the 2017 data.
# The guard keeps this cell idempotent: `wh2017` is overwritten by the join, so
# re-running it unguarded would join the lookup onto an already-joined frame and
# produce duplicate `Region` / `tmp_name` columns that break later cells.
# NOTE(review): this is an inner join, so 2017 countries without an exact name
# match in the 2016 data are dropped from `wh2017`.
if 'Region' not in wh2017.columns:
    wh2017 = wh2017.join(tmp, tmp['tmp_name'] == wh2017['Country'])

In [11]:
# Encode `Region` for the 2017 data with the mapping learned from the 2015
# (training) data. StringIndexer numbers labels by how often they occur, so
# fitting it again on the 2017 frame could give the same region a different
# `region_index` than the one the models are trained on.
# With the default handleInvalid='error', a Region that never appears in the
# 2015 data raises at transform time instead of being silently mis-encoded.
indexer = StringIndexer(inputCol='Region', outputCol='region_index')
indexed_wh2017 = indexer.fit(wh2015).transform(wh2017)

In [12]:

indexed_wh2017.show()

+--------------+--------------+---------------+-------------------------+-------------------------+------------------------+-----------+------------------------+-----------+-----------+-----------------------------+-----------------+--------------------+--------------+------------+
|       Country|Happiness Rank|Happiness Score|Lower Confidence Interval|Upper Confidence Interval|Economy (GDP per Capita)|     Family|Health (Life Expectancy)|    Freedom| Generosity|Trust (Government Corruption)|Dystopia Residual|              Region|      tmp_name|region_index|
+--------------+--------------+---------------+-------------------------+-------------------------+------------------------+-----------+------------------------+-----------+-----------+-----------------------------+-----------------+--------------------+--------------+------------+
|        Norway|             1|    7.537000179|              7.594444821|              7.479555538|             1.616463184| 1.53352356|             0.

### Inspect the new schema of dataframe

In [13]:
indexed_wh2015.printSchema()

root
 |-- Country: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- Happiness Rank: integer (nullable = true)
 |-- Happiness Score: double (nullable = true)
 |-- Standard Error: double (nullable = true)
 |-- Economy (GDP per Capita): double (nullable = true)
 |-- Family: double (nullable = true)
 |-- Health (Life Expectancy): double (nullable = true)
 |-- Freedom: double (nullable = true)
 |-- Trust (Government Corruption): double (nullable = true)
 |-- Generosity: double (nullable = true)
 |-- Dystopia Residual: double (nullable = true)
 |-- region_index: double (nullable = true)



### Using the corr function, check how various features correlate with the Happiness Rank

In [14]:
indexed_wh2015.select(corr('Happiness Rank', 'Economy (GDP per Capita)')).show()

+----------------------------------------------+
|corr(Happiness Rank, Economy (GDP per Capita))|
+----------------------------------------------+
|                           -0.7852669153290183|
+----------------------------------------------+



In [15]:
indexed_wh2015.select(corr('Happiness Rank', 'Health (Life Expectancy)')).show()

+----------------------------------------------+
|corr(Happiness Rank, Health (Life Expectancy))|
+----------------------------------------------+
|                           -0.7356129584428032|
+----------------------------------------------+



### Assemble a feature vector from selected features using VectorAssembler

In [16]:
# assembler = VectorAssembler(inputCols=['Happiness Score',
#                                         'Standard Error',
#                                         'Economy (GDP per Capita)',
#                                         'Family',
#                                         'Health (Life Expectancy)',
#                                         'Freedom',
#                                         'Trust (Government Corruption)',
#                                         'Generosity',
#                                         'Dystopia Residual',
#                                         'region_index'
#                                         ], outputCol='features')

In [17]:
# Pack the selected numeric columns into a single `features` vector column.
# NOTE(review): `Happiness Rank` is presumably derived from `Happiness Score`,
# so using the score as a feature may leak the label — confirm this is intended.
assembler = VectorAssembler(
    inputCols=[
        'Happiness Score',
        'Health (Life Expectancy)',
        'Freedom',
        'Trust (Government Corruption)',
        'region_index',
    ],
    outputCol='features',
)

### Transform the dataframes using the assembler in order to add vector to the dataframe

In [18]:
data2015 = assembler.transform(indexed_wh2015)
data2016 = assembler.transform(indexed_wh2016)

In [19]:
indexed_wh2017

DataFrame[Country: string, Happiness Rank: int, Happiness Score: double, Lower Confidence Interval: double, Upper Confidence Interval: double, Economy (GDP per Capita): double, Family: double, Health (Life Expectancy): double, Freedom: double, Generosity: double, Trust (Government Corruption): double, Dystopia Residual: double, Region: string, tmp_name: string, region_index: double]

In [20]:
data2017 = assembler.transform(indexed_wh2017)

### Inspect prepared dataframes, check the features vector and happiness rank

In [21]:
data2015.select('features', 'Happiness Rank').show()

+--------------------+--------------+
|            features|Happiness Rank|
+--------------------+--------------+
|[7.587,0.94143,0....|             1|
|[7.561,0.94784,0....|             2|
|[7.527,0.87464,0....|             3|
|[7.522,0.88521,0....|             4|
|[7.427,0.90563,0....|             5|
|[7.406,0.88911,0....|             6|
|[7.378,0.89284,0....|             7|
|[7.364,0.91087,0....|             8|
|[7.286,0.90837,0....|             9|
|[7.284,0.93156,0....|            10|
|[7.278,0.91387,0....|            11|
|[7.226,0.86027,0....|            12|
|[7.2,0.89042,0.62...|            13|
|[7.187,0.81444,0....|            14|
|[7.119,0.86179,0....|            15|
|[6.983,0.69702,0....|            16|
|[6.946,0.91894,0....|            17|
|[6.94,0.89533,0.6...|            18|
|[6.937,0.89667,0....|            19|
|[6.901,0.80925,0....|            20|
+--------------------+--------------+
only showing top 20 rows



### Define regression model for predicting happiness rank value

In [22]:
hr_lr = LinearRegression(labelCol='Happiness Rank')

In [23]:
#hr_lr.setRegParam(0)

In [24]:
#hr_lr.setElasticNetParam(0)

In [25]:
#train_leftLR, test_leftLR = data2015.randomSplit([0.7,0.3])

In [26]:
#trained_hr_lr1 = hr_lr.fit(train_leftLR)

In [27]:
#hr_results2016 = trained_hr_lr1.evaluate(test_leftLR)

In [28]:
#hr_results2016.rootMeanSquaredError

### Train the regression model using 2015 data.

In [29]:
trained_hr_lr = hr_lr.fit(data2015)

### Evaluate the trained model using 2016 data

In [30]:
hr_results2016 = trained_hr_lr.evaluate(data2016)

#### View happiness ranks and predictions

In [31]:
hr_results2016.predictions.select('Country', 'Happiness Rank', 'prediction').show()

+-------------+--------------+------------------+
|      Country|Happiness Rank|        prediction|
+-------------+--------------+------------------+
|      Denmark|             1|-2.937331602521567|
|  Switzerland|             2|-3.038040887740351|
|      Iceland|             3|-4.901830875183975|
|       Norway|             4|-2.576715683418513|
|      Finland|             5|1.0693075559815952|
|       Canada|             6|1.3321266199043293|
|  Netherlands|             7| 3.041653312918811|
|  New Zealand|             8|  5.06383471991353|
|    Australia|             9| 4.956420951692053|
|       Sweden|            10|  5.65417616860293|
|       Israel|            11| 4.121660270595896|
|      Austria|            12|10.955426111376028|
|United States|            13|12.069133527134227|
|   Costa Rica|            14|11.461014016236277|
|  Puerto Rico|            15|13.409189519983215|
|      Germany|            16|16.353518129963277|
|       Brazil|            17|18.174154863733918|


### Get the root mean squared error from the evaluator

In [32]:
hr_results2016.rootMeanSquaredError

4.23530474976203

### Evaluate the trained model using 2017 data

In [33]:
hr_results2017 = trained_hr_lr.evaluate(data2017)

#### View happiness ranks and predictions

In [34]:
hr_results2017.predictions.select('Country', 'Happiness Rank', 'prediction').show()

+--------------+--------------+-------------------+
|       Country|Happiness Rank|         prediction|
+--------------+--------------+-------------------+
|        Norway|             1| -4.470325333664277|
|       Denmark|             2|-3.1533260203965483|
|       Iceland|             3|-4.7824267038423045|
|   Switzerland|             4|-2.8108836558611188|
|       Finland|             5|-1.3538852413406062|
|   Netherlands|             6|  1.415971062999347|
|        Canada|             7|  4.471017343415554|
|   New Zealand|             8|  5.626460716321503|
|        Sweden|             9|  5.709901842052659|
|     Australia|            10|  5.935182363821355|
|        Israel|            11|  6.258518669936166|
|    Costa Rica|            12| 11.724648786625778|
|       Austria|            13| 15.331882094559603|
| United States|            14|  16.30225405349495|
|       Ireland|            15|  17.13344962522052|
|       Germany|            16| 18.052752075855892|
|       Belg

### Get the root mean squared error from the evaluator

In [35]:
hr_results2017.rootMeanSquaredError

5.43960405681417

## Gradient-boosted tree regression

### Import required classes for evaluators, indexers for gradient boosted tree regression

In [36]:
from pyspark.ml.regression import GBTRegressor
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator

### Build a VectorIndexer on the feature vector to automatically identify categorical features and index them.
### Set maxCategories=5 so features with > 5 distinct values are treated as continuous.

In [37]:
# Fit on the 2015 feature vectors: features with at most `maxCategories`
# distinct values are indexed as categorical, the rest are left continuous.
featureIndexer = VectorIndexer(
    inputCol="features",
    outputCol="indexedFeatures",
    maxCategories=5,
).fit(data2015)

### Transform the dataframes for each year and collect new dataframes

In [38]:
data2015wfi = featureIndexer.transform(data2015)
data2016wfi = featureIndexer.transform(data2016)
data2017wfi = featureIndexer.transform(data2017)

### Define gradient boosted tree regression model for happiness rank prediction

In [39]:
gbt = GBTRegressor(labelCol="Happiness Rank",featuresCol="indexedFeatures")

### Fit the training data in model for training

In [40]:
model = gbt.fit(data2015wfi)

### Test the model using 2016 data

In [41]:
gbtPreds16 = model.transform(data2016wfi)

### Evaluate the result using regression evaluator, compare the rank with the predicted values and collect root mean square error

In [42]:
evaluator = RegressionEvaluator(labelCol="Happiness Rank", predictionCol="prediction", metricName="rmse")

### Collect the RMSE value from evaluator

In [43]:
rmse16 = evaluator.evaluate(gbtPreds16)

In [44]:
gbtPreds16.select('Country','Happiness Rank', 'prediction').show()

+-------------+--------------+------------------+
|      Country|Happiness Rank|        prediction|
+-------------+--------------+------------------+
|      Denmark|             1|  3.66781951245897|
|  Switzerland|             2| 2.101162957078867|
|      Iceland|             3|3.3104469522954463|
|       Norway|             4| 2.679695796692385|
|      Finland|             5|  3.37509178095679|
|       Canada|             6| 9.971964045710617|
|  Netherlands|             7| 9.038330535673722|
|  New Zealand|             8|  9.43026303694145|
|    Australia|             9| 9.950523552699197|
|       Sweden|            10|  8.59213951151116|
|       Israel|            11|13.117446460530703|
|      Austria|            12|13.701802214401763|
|United States|            13|14.520500989081933|
|   Costa Rica|            14|12.700755430351565|
|  Puerto Rico|            15|13.044459245723415|
|      Germany|            16|13.755897652535502|
|       Brazil|            17|17.368277378097247|


### Print the RMSE

In [45]:
rmse16

3.883400779923485

In [46]:
gbtPreds17 = model.transform(data2017wfi)

In [47]:
gbtPreds17.select('Country','Happiness Rank', 'prediction').show()

+--------------+--------------+------------------+
|       Country|Happiness Rank|        prediction|
+--------------+--------------+------------------+
|        Norway|             1| 2.679695796692385|
|       Denmark|             2|2.4649825976755495|
|       Iceland|             3|2.9293360166597635|
|   Switzerland|             4|2.5246423971219345|
|       Finland|             5| 2.679695796692385|
|   Netherlands|             6| 7.716001837078553|
|        Canada|             7| 9.742269479863953|
|   New Zealand|             8|  9.71920171235674|
|        Sweden|             9| 7.350069945512941|
|     Australia|            10| 9.720828986852533|
|        Israel|            11|13.810484551359421|
|    Costa Rica|            12|13.499734815130244|
|       Austria|            13|14.038330535673722|
| United States|            14|14.819924026571424|
|       Ireland|            15| 18.70180221440175|
|       Germany|            16| 18.70180221440175|
|       Belgium|            17|

In [48]:
rmse17 = evaluator.evaluate(gbtPreds17)

In [49]:
rmse17

4.472366693469931

## From 2015 to 2017, which country’s happiness ranking increased the most?

### Select countries and happiness rank from the 2015 dataframe, then rename the selected columns to avoid ambiguous column names later on

In [50]:
# 2015 country/rank pairs under year-specific column names (`C15`, `HR15`),
# so they stay unambiguous once joined to another year's frame.
new2015 = (
    wh2015
    .select('Country', 'Happiness Rank')
    .withColumnRenamed('Happiness Rank', 'HR15')
    .withColumnRenamed('Country', 'C15')
)

### Perform a join on 2015 and 2017 based on equality and using a predicate where country names from both dataframes should match

In [51]:
happyDF = wh2017.join(new2015, new2015['C15'] == wh2017['Country'])

In [52]:
happyDF.columns

['Country',
 'Happiness Rank',
 'Happiness Score',
 'Lower Confidence Interval',
 'Upper Confidence Interval',
 'Economy (GDP per Capita)',
 'Family',
 'Health (Life Expectancy)',
 'Freedom',
 'Generosity',
 'Trust (Government Corruption)',
 'Dystopia Residual',
 'Region',
 'tmp_name',
 'C15',
 'HR15']

### Append a column to the dataframe that subtracts the 2017 rank from the 2015 rank.

In [54]:
# Gain = 2015 rank minus 2017 rank; a positive value means the country moved
# to a numerically lower (better) rank by 2017.
happyDF = happyDF.withColumn(
    "Gain",
    col('HR15') - col('Happiness Rank'),
)

### Display the highest values of gain; a positive gain is interpreted as an increase in rank

In [55]:
happyDF.select('Country', 'Gain').orderBy(desc('Gain')).toPandas().head(10)

Unnamed: 0,Country,Gain
0,Latvia,35
1,Egypt,31
2,Romania,29
3,Hungary,29
4,Bulgaria,29
5,Senegal,27
6,Cameroon,26
7,Gabon,25
8,Ivory Coast,23
9,Nepal,22


## From 2015 to 2017, which country’s happiness ranking decreased the most?

### Display the lowest values of gain; a negative gain is interpreted as a decrease in rank

In [56]:
happyDF.select('Country', 'Gain').orderBy(asc('Gain')).toPandas().head(10)

Unnamed: 0,Country,Gain
0,Venezuela,-59
1,Liberia,-32
2,Zambia,-31
3,Haiti,-26
4,Zimbabwe,-23
5,Kyrgyzstan,-21
6,Ukraine,-21
7,Vietnam,-19
8,Bhutan,-18
9,Paraguay,-17


Some countries see a drastic difference in their happiness rankings. This change is not necessarily reflected in metrics related to health, livability, GDP, etc. For example, Venezuela is facing a severe economic meltdown and devaluation of its currency. From 2013 to 2016 Liberia witnessed a severe Ebola outbreak. Meanwhile, the introduction of a new constitution and improvements in democratic institutions helped countries like Nepal and Ivory Coast gain happiness.

## Happiest Regions in descending orders

### Group data for each year by region, aggregate group members by average Happiness Score and order in descending order

In [64]:
wh2015.groupBy('Region').agg(mean('Happiness Score')).orderBy(desc('avg(Happiness Score)')).toPandas()

Unnamed: 0,Region,avg(Happiness Score)
0,Australia and New Zealand,7.285
1,North America,7.273
2,Western Europe,6.689619
3,Latin America and Caribbean,6.144682
4,Eastern Asia,5.626167
5,Middle East and Northern Africa,5.4069
6,Central and Eastern Europe,5.332931
7,Southeastern Asia,5.317444
8,Southern Asia,4.580857
9,Sub-Saharan Africa,4.2028


### Happiest to least happy regions in descending order

In [65]:
wh2016.groupBy('Region').agg(mean('Happiness Score')).orderBy(desc('avg(Happiness Score)')).toPandas()

Unnamed: 0,Region,avg(Happiness Score)
0,Australia and New Zealand,7.3235
1,North America,7.254
2,Western Europe,6.685667
3,Latin America and Caribbean,6.10175
4,Eastern Asia,5.624167
5,Middle East and Northern Africa,5.386053
6,Central and Eastern Europe,5.37069
7,Southeastern Asia,5.338889
8,Southern Asia,4.563286
9,Sub-Saharan Africa,4.136421


In [58]:
wh2017.groupBy('Region').agg(mean('Happiness Score')).orderBy(desc('avg(Happiness Score)')).toPandas()

Unnamed: 0,Region,avg(Happiness Score)
0,Australia and New Zealand,7.299
1,North America,7.1545
2,Western Europe,6.703714
3,Latin America and Caribbean,5.957818
4,Eastern Asia,5.4965
5,Southeastern Asia,5.444875
6,Central and Eastern Europe,5.409931
7,Middle East and Northern Africa,5.369684
8,Southern Asia,4.628429
9,Sub-Saharan Africa,4.147639


### In the original dataset the region is provided, but these regions are not based on geographical continental boundaries. To tackle this issue, I'm using another dataset with countries and their country codes, continents, regions and sub-regions

### Import some additional data to dataframe, In order to obtain continents information and country codes.
**It is important that this data uses the same country name format/names as the world happiness dataset.**

Country codes will be required for choropleth.<br/>
Continents are required for grouping countries further by continents, instead of regions

In [59]:
countries = spark.read.csv('countries.csv', inferSchema=True, header=True)

### Select country name and continents

In [60]:
continents = countries.select('name', 'continent')

### Show continents dataframe

In [61]:
continents.show()

+-------------------+--------------------+
|               name|           continent|
+-------------------+--------------------+
|        Afghanistan|                Asia|
|      Åland Islands|              Europe|
|            Albania|              Europe|
|            Algeria|              Africa|
|     American Samoa|Australia and Oce...|
|            Andorra|              Europe|
|             Angola|              Africa|
|           Anguilla|       North America|
|         Antarctica|                null|
|Antigua and Barbuda|       North America|
|          Argentina|       South America|
|            Armenia|                Asia|
|              Aruba|       North America|
|          Australia|Australia and Oce...|
|            Austria|              Europe|
|         Azerbaijan|                Asia|
|            Bahamas|       North America|
|            Bahrain|                Asia|
|         Bangladesh|                Asia|
|           Barbados|       North America|
+----------

### Join continents dataframe with dataframe for each year on equality with predicate of country name being equal.
We perform a left outer join so we do not lose out on any information from world happiness dataframe, if there is any mismatch between names

In [62]:
# Left outer joins keep every happiness row; countries whose name has no exact
# match in `continents` simply get a null `continent`.
wh2015gp = wh2015.join(continents, wh2015['Country'] == continents['name'], how="left_outer")
wh2016gp = wh2016.join(continents, wh2016['Country'] == continents['name'], how="left_outer")
wh2017gp = wh2017.join(continents, wh2017['Country'] == continents['name'], how="left_outer")

### Group data for each year by continent, aggregate group members by average Happiness Score and order in descending order

In [63]:
wh2015gp.groupBy('continent').agg(mean('Happiness Score')).orderBy(desc('avg(Happiness Score)')).toPandas()

Unnamed: 0,continent,avg(Happiness Score)
0,Australia and Oceania,7.285
1,South America,6.348636
2,North America,6.145692
3,Europe,6.0604
4,Asia,5.302851
5,Africa,4.298156


#### Continents happiest to least happiest in descending order

In [64]:
wh2016gp.groupBy('continent').agg(mean('Happiness Score')).orderBy(desc('avg(Happiness Score)')).toPandas()

Unnamed: 0,continent,avg(Happiness Score)
0,Australia and Oceania,7.3235
1,South America,6.251364
2,North America,6.145667
3,Europe,6.0796
4,Asia,5.271957
5,Africa,4.272372


In [65]:
wh2017gp.groupBy('continent').agg(mean('Happiness Score')).orderBy(desc('avg(Happiness Score)')).toPandas()

Unnamed: 0,continent,avg(Happiness Score)
0,Australia and Oceania,7.299
1,Europe,6.11705
2,South America,6.0986
3,North America,6.028214
4,Asia,5.273953
5,Africa,4.280171


## Visualizing data on a map

requires plotly

**pip install plotly**

### Selecting country name and country code from countries dataset.

In [66]:
codes = countries.select('name','alpha-3')

### Display newly created dataframe

In [67]:
codes.show()

+-------------------+-------+
|               name|alpha-3|
+-------------------+-------+
|        Afghanistan|    AFG|
|      Åland Islands|    ALA|
|            Albania|    ALB|
|            Algeria|    DZA|
|     American Samoa|    ASM|
|            Andorra|    AND|
|             Angola|    AGO|
|           Anguilla|    AIA|
|         Antarctica|    ATA|
|Antigua and Barbuda|    ATG|
|          Argentina|    ARG|
|            Armenia|    ARM|
|              Aruba|    ABW|
|          Australia|    AUS|
|            Austria|    AUT|
|         Azerbaijan|    AZE|
|            Bahamas|    BHS|
|            Bahrain|    BHR|
|         Bangladesh|    BGD|
|           Barbados|    BRB|
+-------------------+-------+
only showing top 20 rows



### Perform a left outer join on WH2017 and country codes dataframes in order to add country codes to world happiness dataframes

In [68]:
dfwCode = wh2017.join(codes, codes['name'] == wh2017['Country'], "left_outer")

### Convert new spark dataframe to a pandas dataframe

In [69]:
df = dfwCode.toPandas()

### Import required libraries for choropleth

In [70]:
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot

ImportError: No module named 'plotly'

#### Initialize plotly for notebook mode

In [71]:
init_notebook_mode(connected=True)

NameError: name 'init_notebook_mode' is not defined

### Construct data dictionary to pass an argument to plotly

In [72]:
# Trace definition for the plotly choropleth.
# `locations` holds ISO 3166-1 alpha-3 codes, so `locationmode` must be 'ISO-3';
# the previous value 'Country Names' did not describe these codes and is not one
# of plotly's accepted modes ('ISO-3', 'USA-states', 'country names').
data = dict(type='choropleth',
            locations=df['alpha-3'],
            locationmode='ISO-3',
            z=df['Happiness Score'],             # value that drives the colour
            text=df['Country'],                  # hover label
            colorbar={'title': 'Happiness Score'},  # fixed 'Happyness' typo
            colorscale='Portland',
            reversescale=True
           )

### Configure additional configuration for choropleth, such as title, projection type.

In [73]:
# Figure-level settings: chart title plus the map frame and projection.
layout = {
    'title': '2017 World Happiness Scores',
    'geo': {
        'showframe': True,
        'projection': {'type': 'equirectangular'},
    },
}

# projection=dict(type='orthographic') # for fun

### Display the world map with populated data

In [74]:
choromap = go.Figure(data = [data], layout=layout)
iplot(choromap, validate=False)

NameError: name 'go' is not defined