In [6]:
import pandas as pd
import numpy as np
import os
import csv
from sklearn.linear_model import LinearRegression

# Render floats with two decimal places when pandas displays them.
pd.set_option('display.float_format', lambda x: '%.2f' % x)
# Raise the column display limit to 500 so wide frames are not truncated early.
pd.set_option('display.max_columns', 500)

# Create Deforestation DF, 1990-2016

## 1. Read in data, wrangle all dataframes

In [7]:
def wrangle(df):
  '''
  Reformat one raw World Bank indicator table into tidy (long) format
  for feature engineering.

  The input frame is left untouched; every step works on a new frame.

  Parameters
  ----------
  df : pandas.DataFrame
    Raw indicator table. Assumed layout (TODO confirm against the csv
    files): 'Country Name', 'Country Code', the indicator name at
    position 2, one further column at position 3, then one column per
    year starting at 1960 and including '2017' and '2018'.

  Returns
  -------
  pandas.DataFrame
    One row per country and year (1990-2016) with the columns
    'Country Name', 'Country Code', 'Year' (year label as a string) and
    a value column named after the indicator found in the first row.
    Rows are ordered by year, then by the original row order, and the
    index is a fresh 0..n-1 range.

  Raises
  ------
  KeyError
    If '2017'/'2018' or any of the '1990'..'2016' columns are missing.
  IndexError
    If the frame has no rows (the indicator name cannot be read).
  '''

  # drop the columns at positions 3-33 (per the layout above: the column
  # after the indicator name plus the years 1960-1989); drop() without
  # inplace returns a new frame, so the caller's data is not modified
  trimmed = df.drop(columns=df.columns[3:34])

  # drop 2017-2018 as NANs
  trimmed = trimmed.drop(columns=['2017', '2018'])

  # deal with nulls: fill each numeric column with that column's median
  for col in trimmed.select_dtypes(include=np.number):
    trimmed[col] = trimmed[col].fillna(trimmed[col].median())

  # the indicator name (first row, third column) becomes the value column name
  feature = trimmed.iloc[0, 2]

  # melt all year columns in a single pass
  years = [str(year) for year in range(1990, 2017)]
  tidy = pd.melt(
      trimmed,
      id_vars=['Country Name', 'Country Code'],
      value_vars=years,
      var_name='Year',
      value_name=feature,
  )

  return tidy

In [8]:
# import the target dataframe from /raw_data/target

# change directory
os.chdir('/Users/elliotgunn/Desktop/Lambda/deforestation/raw_data/target')

# list files
path = os.getcwd()
files = os.listdir(path)

# Pick out the '.csv' files for the target. os.listdir returns names in
# arbitrary order, so sort them to make the [0] selection deterministic.
target_csv = sorted(f for f in files if f.endswith('.csv'))

# read (skiprows=3 skips the first three lines of the file before the header)
forest_pct = pd.read_csv(target_csv[0], skiprows=3)

# wrangle into tidy format
forest_pct = wrangle(forest_pct)

In [9]:
# import 6 feature dataframes from /raw_data/features

# change directory
os.chdir('/Users/elliotgunn/Desktop/Lambda/deforestation/raw_data/features')

# list files
path = os.getcwd()
files = os.listdir(path)

# Pick out the '.csv' files for features. os.listdir returns names in
# arbitrary order, so sort them to make the file -> variable mapping stable.
features_csv = sorted(f for f in files if f.endswith('.csv'))

# read in and wrangle the first six feature files
# (skiprows=3 skips the first three lines of each file before the header)
# NOTE(review): the variable names assume the sorted file names line up with
# gdp, urban_population, agriculture, livestock, electricity, mining in that
# order - confirm against the actual file names. The value column of each
# frame is named from the file's own indicator name, not from the variable.
gdp, urban_population, agriculture, livestock, electricity, mining = [
    wrangle(pd.read_csv(csv_file, skiprows=3)) for csv_file in features_csv[:6]
]

# create list of dfs
feature_dfs = [gdp, urban_population, agriculture, livestock, electricity, mining]

## 2. Merge features and target

In [10]:
def merge_features(list_dfs):
  '''
  Merge tidy dataframes (one per feature) into a single tidy dataframe.

  Frames are joined one after another on 'Country Name', 'Country Code'
  and 'Year' using pandas' default (inner) join, so only country/year
  combinations present in every frame are kept.

  Parameters
  ----------
  list_dfs : list of pandas.DataFrame
    Tidy frames that each contain the three key columns. The list itself
    is not modified.

  Returns
  -------
  pandas.DataFrame
    The merged frame. If the list holds a single frame, that frame is
    returned as-is.

  Raises
  ------
  IndexError
    If list_dfs is empty.
  '''

  # work on a copy of the list so the caller's list keeps all its elements
  frames = list(list_dfs)

  # use the first frame as base for merging
  merged = frames[0]

  for df in frames[1:]:
    merged = merged.merge(df, on=['Country Name', 'Country Code', 'Year'])

  return merged

In [11]:
# Combine the wrangled feature frames into one tidy frame keyed on country and year.
features = merge_features(feature_dfs)

In [12]:
# Attach the forest_pct target to the merged features, matching rows on
# country name, country code and year.
merged_1990_2016 = pd.merge(
    features,
    forest_pct,
    on=['Country Name', 'Country Code', 'Year'],
)

# (rows, columns) of the combined 1990-2016 frame
merged_1990_2016.shape

(7128, 10)

In [13]:
# export as csv

# change directory
# (the to_csv call below is given an absolute path, so the write itself
# does not depend on this working directory)
os.chdir('/Users/elliotgunn/Desktop/Lambda/deforestation/wrangled_data')

# NOTE(review): no index= argument is passed, so pandas' default index
# handling applies to the written file - confirm downstream readers expect it.
merged_1990_2016.to_csv(r'/Users/elliotgunn/Desktop/Lambda/deforestation/wrangled_data/merged_1990_2016.csv')

# Feature Engineering: Create Forecast Dataframe for Features, 2017-2025

In [14]:
def extend_df(df, start_year=2017, end_year=2025):
  '''
  Forecast one feature for every country with a per-country linear trend.

  For each country code, a LinearRegression of the feature on the year is
  fitted to the rows of `df`, then evaluated for every year from
  start_year to end_year (inclusive).

  Parameters
  ----------
  df : pandas.DataFrame
    Wrangled data in tidy format. Must contain 'Country Code' and 'Year'
    columns; the feature to forecast is the column at position 3. 'Year'
    values must be convertible to float (e.g. '1990').
  start_year : int, default 2017
    First year to predict.
  end_year : int, default 2025
    Last year to predict (inclusive).

  Returns
  -------
  pandas.DataFrame
    Tidy frame with columns 'Country Code', 'Year' (year label as a
    string) and the feature column. Rows are ordered by year, then by
    first appearance of the country code in `df`.
  '''
  # name of the feature column being forecast
  feature = df.columns[3]

  # years to predict, as labels and as an (n_years, 1) float matrix
  future_years = list(range(start_year, end_year + 1))
  year_labels = [str(year) for year in future_years]
  future_X = np.array(future_years, dtype=float).reshape(-1, 1)

  # one estimator, refitted per country
  model = LinearRegression()

  rows = []
  # sort=False keeps countries in order of first appearance
  for code, group in df.groupby('Country Code', sort=False):
    # 'Year' holds string labels; convert explicitly instead of relying on
    # the estimator's implicit coercion. Fitting and predicting on plain
    # arrays also keeps both calls consistent about feature names.
    history_X = group['Year'].astype(float).to_numpy().reshape(-1, 1)
    model.fit(history_X, group[feature].to_numpy())

    # predict every future year for this country in one call
    row = {'Country Code': code}
    row.update(zip(year_labels, model.predict(future_X)))
    rows.append(row)

  # wide frame: one row per country, one column per predicted year
  df_predictions = pd.DataFrame(rows, columns=['Country Code'] + year_labels)

  # melt df_predictions to tidy format in a single pass
  df_predictions = pd.melt(
      df_predictions,
      id_vars=['Country Code'],
      value_vars=year_labels,
      var_name='Year',
      value_name=feature,
  )

  return df_predictions

In [15]:
# re-import the 6 feature dataframes from /raw_data/features and forecast them

# change directory
os.chdir('/Users/elliotgunn/Desktop/Lambda/deforestation/raw_data/features')

# list files
path = os.getcwd()
files = os.listdir(path)

# Pick out the '.csv' files for features. os.listdir returns names in
# arbitrary order, so sort them to make the file -> variable mapping stable.
features_csv = sorted(f for f in files if f.endswith('.csv'))

# read in, wrangle and extend the first six feature files
# (skiprows=3 skips the first three lines of each file before the header)
# NOTE(review): the variable names assume the sorted file names line up with
# gdp, urban_population, agriculture, livestock, electricity, mining in that
# order - confirm against the actual file names. The value column of each
# frame is named from the file's own indicator name, not from the variable.
gdp, urban_population, agriculture, livestock, electricity, mining = [
    extend_df(wrangle(pd.read_csv(csv_file, skiprows=3)))
    for csv_file in features_csv[:6]
]

# create list of dfs
feature_dfs_pred = [gdp, urban_population, agriculture, livestock, electricity, mining]

In [16]:
def merge_pred_features(list_dfs_pred):
  '''
  Merge predicted (forecast) dataframes, one per feature, into a single
  tidy dataframe.

  Frames are joined one after another on 'Country Code' and 'Year' using
  pandas' default (inner) join, so only code/year combinations present in
  every frame are kept.

  Parameters
  ----------
  list_dfs_pred : list of pandas.DataFrame
    Tidy frames that each contain 'Country Code' and 'Year'. The list
    itself is not modified.

  Returns
  -------
  pandas.DataFrame
    The merged frame in tidy format. If the list holds a single frame,
    that frame is returned as-is.

  Raises
  ------
  IndexError
    If list_dfs_pred is empty.
  '''
  # work on a copy of the list so the caller's list keeps all its elements
  frames = list(list_dfs_pred)

  # use the first frame as base for merging
  merged = frames[0]

  for df in frames[1:]:
    merged = merged.merge(df, on=['Country Code', 'Year'])

  return merged

In [19]:
# Combine the per-feature forecast frames into one frame keyed on country code and year.
merged_features_only_2017_2025 = merge_pred_features(feature_dfs_pred)

In [21]:
# Export the merged forecast features as csv (absolute path, so the write does
# not depend on the current working directory).
# NOTE(review): no index= argument is passed, so pandas' default index
# handling applies to the written file - confirm downstream readers expect it.
merged_features_only_2017_2025.to_csv(r'/Users/elliotgunn/Desktop/Lambda/deforestation/wrangled_data/merged_features_only_2017_2025.csv')
