# World Development Use Case

## Contents

### Part 1: # Questions only concerning temperature data
### Part 2: # Questions only concerning commodities data
### Part 3: # Questions using both temperature and commodity data

In [1]:
# import
import configparser
import pandas as pd
import numpy as np
from tqdm import tqdm
from time import time
from sqlalchemy import create_engine
from functools import reduce
import operator


pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)

## Database connection

In order to execute this code you need the target PostgreSQL DBs container IP-Address.
You get it entering:

```
> docker ps
# find the ID (first column) of the docker_db_1 container, make sure its images is form postgres:10.10-alpine
> docker inspect <ID-of-spark-master-container>
```

Now, set the value of the variable ip_adress to: IP-Address entry you find at the bottom of the output (in: "Networks" => ... => "IPAddress"L <IP_ADRESS>)

In [2]:
IP_ADDRESS = "172.25.0.3"

config = configparser.ConfigParser()
config.read('../config.cfg')

db_prop = config['POSTGRESQL']
user = db_prop['username']
password = db_prop['password']
dbname = db_prop['dbname']

engine_connection_string = f'postgresql://{user}:{password}@{IP_ADDRESS}:5432/{dbname}'
    
engine = create_engine(engine_connection_string)
conn = engine.connect().execution_options(isolation_level="AUTOCOMMIT")

# Questions only concerning temperature data

## What where the hottest contries for all years in DB?

In [22]:
query = """SELECT t.year, c.country_or_area 
           FROM temperatures t JOIN 
                country_or_area c ON t.country_or_area_id = c.country_or_area_id
           WHERE t.rank = 1.0;
        """

df = pd.read_sql(query, conn)
df

Unnamed: 0,year,country_or_area
0,1743,San Marino
1,1744,Greece
2,1745,Spain
3,1750,Greece
4,1751,Greece
5,1752,Greece
6,1753,Tunisia
7,1754,Tunisia
8,1755,Tunisia
9,1756,Tunisia


### Which countries where the hottest how often between 1988 and 2013?

In [24]:
query = """SELECT c.country_or_area, count(*)
           FROM temperatures t JOIN 
                country_or_area c ON t.country_or_area_id = c.country_or_area_id
Countries           WHERE t.rank = 1.0 AND year BETWEEN 1988 AND 2013
           GROUP BY c.country_or_area;
        """

df = pd.read_sql(query, conn)
df

Unnamed: 0,country_or_area,count
0,Djibouti,22
1,Mali,4


### Which countries where the hottest for each year? (1988 to 2013)

In [42]:
query = '''
 SELECT rt.year, rt.temperature, c.country_or_area FROM (
    SELECT temperatures.*, 
    rank() OVER (
        PARTITION BY year, temperature
    )
    FROM temperatures
    WHERE rank = 1
  ) rt JOIN country_or_area c ON rt.country_or_area_id = c.country_or_area_id
  WHERE year BETWEEN 1988 AND 2013
  ORDER BY year;
    
'''
df = pd.read_sql(query, conn)
df

Unnamed: 0,year,temperature,country_or_area
0,1988,29.531833,Djibouti
1,1989,28.797,Djibouti
2,1990,29.321083,Mali
3,1991,29.289417,Djibouti
4,1992,28.999,Djibouti
5,1993,28.9665,Mali
6,1994,29.062333,Djibouti
7,1995,29.25975,Djibouti
8,1996,29.252583,Mali
9,1997,29.269333,Djibouti


# Questions only concerning commodity data

## What is the top exported/imported commodity for specific country and year?

In [45]:
year = 2010
country = 'Mali'
flow = flow

query = """SELECT ca.category_name, commodity_name, trade_usd
           FROM trades t JOIN commodities c 
                           ON t.commodity_id = c.commodity_id
                         JOIN flows f
                           ON t.flow_id = f.flow_id
                         JOIN categories ca 
                           ON c.category_id = ca.category_id
                         JOIN country_or_area co
                           ON t.country_or_area_id = co.country_or_area_id
           WHERE t.year = {}                 AND 
                 co.country_or_area = '{}' AND 
                 f.flow_type = '{}'      AND
                 ca.category_name != 'all_commodities'
           ORDER BY trade_usd DESC
           LIMIT 1
        
""".format(year, country, flow)
df = pd.read_sql(query, conn)
df

Unnamed: 0,category_name,commodity_name,trade_usd
0,01_live_animals,"Bovine animals, live, except pure-bred breeding",48626300.0


# Questions using both temperature and commodity data

##  What is the main commodity imported/exported by trade_usd by the hottest countries? 

In [39]:
flow = 'Import'
query = '''
SELECT t.year, c.country_or_area, te.rank, co.commodity_name, t.trade_usd
FROM
( SELECT * 
    FROM trades WHERE (year, country_or_area_id, trade_usd) IN 
    ( SELECT year, country_or_area_id, MAX(trade_usd)
      FROM trades
      WHERE commodity_id != 21
      GROUP BY year, country_or_area_id
    )
) as t JOIN country_or_area c ON t.country_or_area_id = c.country_or_area_id
       JOIN temperatures te ON t.temperature_id = te.temperature_id
       JOIN commodities co ON t.commodity_id = co.commodity_id
       JOIN flows f ON t.flow_id = f.flow_id
WHERE te.rank BETWEEN 1 AND 2 AND flow_type = '{}'
ORDER BY t.year;
'''.format(flow)


df = pd.read_sql(query, conn)
print(len(df))
df

14


Unnamed: 0,year,country_or_area,rank,commodity_name,trade_usd
0,1996,Mali,1.0,"Oils petroleum, bituminous, distillates, excep...",120789300.0
1,1997,Mali,2.0,"Oils petroleum, bituminous, distillates, excep...",139092500.0
2,1998,Mali,2.0,"Oils petroleum, bituminous, distillates, excep...",128707900.0
3,2000,Mali,2.0,"Oils petroleum, bituminous, distillates, excep...",187484100.0
4,2001,Mali,2.0,"Oils petroleum, bituminous, distillates, excep...",184211600.0
5,2002,Mali,2.0,"Oils petroleum, bituminous, distillates, excep...",156912500.0
6,2003,Mali,2.0,"Oils petroleum, bituminous, distillates, excep...",216699100.0
7,2004,Mali,2.0,"Oils petroleum, bituminous, distillates, excep...",281833300.0
8,2005,Mali,2.0,"Oils petroleum, bituminous, distillates, excep...",365683100.0
9,2006,Mali,2.0,"Oils petroleum, bituminous, distillates, excep...",424866000.0


##  What is the main commodity imported/exported by trade_usd by the coldest countries? 

In [40]:
flow = 'Export'

query = '''
SELECT t.year, c.country_or_area, te.rank, co.commodity_name, t.trade_usd
FROM
( SELECT * 
    FROM trades WHERE (year, country_or_area_id, trade_usd) IN 
    ( SELECT year, country_or_area_id, MAX(trade_usd)
      FROM trades
      WHERE commodity_id != 21
      GROUP BY year, country_or_area_id
    )
) as t JOIN country_or_area c ON t.country_or_area_id = c.country_or_area_id
       JOIN temperatures te ON t.temperature_id = te.temperature_id
       JOIN commodities co ON t.commodity_id = co.commodity_id
       JOIN flows f ON t.flow_id = f.flow_id
WHERE te.rank BETWEEN 235 AND 236 AND flow_type = '{}'
ORDER BY t.year;
'''.format(flow)
df = pd.read_sql(query, conn)
df

Unnamed: 0,year,country_or_area,rank,commodity_name,trade_usd
0,1994,Greenland,235.0,"Shrimps and prawns, prepared or preserved",92983504.0
1,1995,Greenland,235.0,"Shrimps and prawns, prepared or preserved",118630600.0
2,1996,Greenland,235.0,"Shrimps and prawns, prepared or preserved",114697712.0
3,1997,Greenland,235.0,"Shrimps and prawns, prepared or preserved",78629600.0
4,1998,Greenland,235.0,"Shrimps and prawns, prepared or preserved",73013232.0
5,1999,Greenland,235.0,"Shrimps and prawns, prepared or preserved",70135949.0
6,2000,Greenland,235.0,"Shrimps and prawns, prepared or preserved",68821315.0
7,2001,Greenland,235.0,"Shrimps and prawns, prepared or preserved",60707450.0
8,2002,Greenland,235.0,"Shrimps and prawns, prepared or preserved",67786863.0
9,2003,Greenland,235.0,"Shrimps and prawns, prepared or preserved",71428840.0
