## Cleaning datasets 

Import dependencies:

In [18]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as plt
import json

---

### Dataset 1: Annual deforestation 
*type: json*


In [19]:
# Read the raw JSON export; inspecting it showed the top level is a list.
with open('annual-deforestation-of-change-1.json') as json_file:
    data = json.load(json_file)

# Element 0 turned out to be metadata only, so discard it and keep the
# entry under label 1, which is what the DataFrame is built from.
deforest_raw_data = pd.Series(data).drop(0)
deforest_data = deforest_raw_data[1]

deforest_df = pd.DataFrame(deforest_data)
deforest_df.columns

Index(['indicator', 'country', 'countryiso3code', 'date', 'value', 'unit',
       'obs_status', 'decimal'],
      dtype='object')

In [20]:
# Peek at the raw 'indicator' column to see what kind of entries it holds.
deforest_df['indicator']

0       {'id': 'ER.FST.DFST.ZG', 'value': 'Annual defo...
1       {'id': 'ER.FST.DFST.ZG', 'value': 'Annual defo...
2       {'id': 'ER.FST.DFST.ZG', 'value': 'Annual defo...
3       {'id': 'ER.FST.DFST.ZG', 'value': 'Annual defo...
4       {'id': 'ER.FST.DFST.ZG', 'value': 'Annual defo...
                              ...                        
2571    {'id': 'ER.FST.DFST.ZG', 'value': 'Annual defo...
2572    {'id': 'ER.FST.DFST.ZG', 'value': 'Annual defo...
2573    {'id': 'ER.FST.DFST.ZG', 'value': 'Annual defo...
2574    {'id': 'ER.FST.DFST.ZG', 'value': 'Annual defo...
2575    {'id': 'ER.FST.DFST.ZG', 'value': 'Annual defo...
Name: indicator, Length: 2576, dtype: object

In [21]:
# Show one 'indicator' entry in full, since the column display truncates it.
deforest_df['indicator'][0]

{'id': 'ER.FST.DFST.ZG', 'value': 'Annual deforestation (% of change)'}

In [22]:
# Preview the first rows of the frame before any columns are removed.
deforest_df.head()

Unnamed: 0,indicator,country,countryiso3code,date,value,unit,obs_status,decimal
0,"{'id': 'ER.FST.DFST.ZG', 'value': 'Annual defo...","{'id': '', 'value': 'East Asia & Pacific (IBRD...",,2017,,,,0
1,"{'id': 'ER.FST.DFST.ZG', 'value': 'Annual defo...","{'id': '', 'value': 'East Asia & Pacific (IBRD...",,2016,,,,0
2,"{'id': 'ER.FST.DFST.ZG', 'value': 'Annual defo...","{'id': '', 'value': 'East Asia & Pacific (IBRD...",,2015,-0.467377,,,0
3,"{'id': 'ER.FST.DFST.ZG', 'value': 'Annual defo...","{'id': '', 'value': 'East Asia & Pacific (IBRD...",,2014,,,,0
4,"{'id': 'ER.FST.DFST.ZG', 'value': 'Annual defo...","{'id': '', 'value': 'East Asia & Pacific (IBRD...",,2013,,,,0


In [23]:
# Drop the indicator, ISO-code, observation-status and decimal columns.
# `errors='ignore'` makes this cell safe to re-run: once the columns are gone,
# running it again is a no-op instead of raising KeyError.
deforest_df = deforest_df.drop(
    columns=['indicator', 'countryiso3code', 'obs_status', 'decimal'],
    errors='ignore',
)

In [24]:
# Display the frame to check which columns remain after the drop.
deforest_df

Unnamed: 0,country,date,value,unit
0,"{'id': '', 'value': 'East Asia & Pacific (IBRD...",2017,,
1,"{'id': '', 'value': 'East Asia & Pacific (IBRD...",2016,,
2,"{'id': '', 'value': 'East Asia & Pacific (IBRD...",2015,-0.467377,
3,"{'id': '', 'value': 'East Asia & Pacific (IBRD...",2014,,
4,"{'id': '', 'value': 'East Asia & Pacific (IBRD...",2013,,
...,...,...,...,...
2571,"{'id': '', 'value': 'Zimbabwe'}",1994,,
2572,"{'id': '', 'value': 'Zimbabwe'}",1993,,
2573,"{'id': '', 'value': 'Zimbabwe'}",1992,,
2574,"{'id': '', 'value': 'Zimbabwe'}",1991,,


In [25]:
# Next step: the 'country' column still holds raw dicts, so extract the plain country name from each entry.

In [26]:
# Check the Python type of the first 'country' entry.
type(deforest_df['country'][0])

dict

In [27]:
def extract_name(row):
    '''
    Return the country name held in one entry of `deforest_df['country']`.

    Parameters
    ----------
    row : dict or any
        A single element of the series (not a whole DataFrame row). In the raw
        data each element is a dict such as ``{'id': '', 'value': 'Zimbabwe'}``.

    Returns
    -------
    The dict's ``'value'`` item when `row` is a dict; otherwise `row` itself,
    unchanged.

    Non-dict input is passed through so that applying this function to a column
    that has already been cleaned (e.g. when the cell is re-run) or that contains
    missing values leaves those entries alone instead of raising.
    '''
    if isinstance(row, dict):
        return row['value']
    return row
    

In [28]:
# Swap every entry of the 'country' column for the result of extract_name,
# applied element by element.
deforest_df['country'] = deforest_df['country'].map(extract_name)

In [29]:
# Preview the first rows to check the 'country' column after the extraction step.
deforest_df.head()

Unnamed: 0,country,date,value,unit
0,East Asia & Pacific (IBRD only),2017,,
1,East Asia & Pacific (IBRD only),2016,,
2,East Asia & Pacific (IBRD only),2015,-0.467377,
3,East Asia & Pacific (IBRD only),2014,,
4,East Asia & Pacific (IBRD only),2013,,


In [30]:
# Display the frame (first and last rows) to check the end of the 'country' column too.
deforest_df

Unnamed: 0,country,date,value,unit
0,East Asia & Pacific (IBRD only),2017,,
1,East Asia & Pacific (IBRD only),2016,,
2,East Asia & Pacific (IBRD only),2015,-0.467377,
3,East Asia & Pacific (IBRD only),2014,,
4,East Asia & Pacific (IBRD only),2013,,
...,...,...,...,...
2571,Zimbabwe,1994,,
2572,Zimbabwe,1993,,
2573,Zimbabwe,1992,,
2574,Zimbabwe,1991,,


In [31]:
# Summary statistics. In the recorded output only `value` is summarised, and its
# count (270) is far below the 2576 rows of the frame, so most observations are
# missing. `date` does not appear, so it is presumably stored as text rather
# than as numbers -- TODO confirm and convert if needed.
deforest_df.describe()

Unnamed: 0,value
count,270.0
mean,0.467385
std,1.147634
min,-3.579504
25%,0.0
50%,0.305847
75%,0.823395
max,6.135485
