# Data cleaning Red List Species 2016

1. Import the relevant libraries

In [1]:
import pandas as pd
import numpy as np
import re
import math
import requests
import json
import matplotlib
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
from datetime import datetime

2. Loading the dataset

In [2]:
# Load the raw Red List 2016 table; the file is decoded as ISO-8859-1.
red016 = pd.read_csv(
    "data/Red_list_2016/data.csv",
    encoding="ISO-8859-1",
)

In [3]:
# Preview the first five rows of the loaded frame
red016.head(n=5)

Unnamed: 0,Index,Country,Region,CC,Mammals,Birds,Reptiles,Amphibians,Fish,Molluscs,...,A_TH_ED,A_EX,A_EW,A_CR,A_EN,A_VU,A_NT,A_LC,A_DD,A_SP
0,0,,North America,,53,96,43,57,295,306,...,50,2,0,2,17,38,34,210,14,318
1,1,Canada,North America,CAN,14,15,6,1,43,5,...,0,0,0,0,0,1,2,43,0,46
2,2,Saint Pierre and Miquelon,North America,SPM,4,4,0,0,3,0,...,0,0,0,0,0,0,0,0,0,?
3,3,United States,North America,USA,35,77,37,56,249,301,...,50,2,0,2,17,37,32,167,14,272
4,4,,Caribbean Islands,,87,107,209,182,847,6,...,0,0,0,0,0,0,0,0,0,1


3. Dropping the `Index` column

In [4]:
# Remove the "Index" column. Rebinding the name (instead of dropping in place)
# and ignoring an already-missing column keeps this cell safe to re-run:
# the original in-place drop raised KeyError on a second execution.
red016 = red016.drop(columns="Index", errors="ignore")

4. Rename columns

In [5]:
# Lookup table: raw column code in the CSV -> descriptive column name.
# NOTE(review): the target names mix capitalisation and prefixes
# (e.g. "Mammals_extinct" vs "mammals_endangered"); they are kept exactly
# as they are so the renamed/exported column names do not change.
dict_cols = {
    # Country code and the headline mammals column
    "CC": "Country_code",
    "Mammals": "Mammals_CR+EN+VU",
    # T_* codes
    "T_EX": "extinct",
    "T_EW": "extinct_wild",
    "T_CR": "critically_endangered",
    "T_EN": "endangered",
    "T_VU": "vulnerable",
    "T_NT": "near_threatened",
    "T_DD": "data_deficient",
    "T_LC": "least_concern",
    "T_SP": "no_native_species",
    # M_* codes
    "M_ED": "no_endemic_species",
    "M_TH_ED": "no_endemics_red_list",
    "M_EX": "Mammals_extinct",
    "M_EW": "Mammals_extinct_wild",
    "M_CR": "mammals_critically_endangered",
    "M_EN": "mammals_endangered",
    "M_VU": "mammals_vulnerable",
    "M_NT": "mammals_near_threatened",
    "M_DD": "mammals_data_deficient",
    "M_LC": "mammals_least_concern",
    "M_SP": "mammals_no_native_species",
    # A_* codes
    "A_ED": "Amphibians_no_endemic_species",
    "A_TH_ED": "Amphi_no_endemics_red_list",
    "A_EX": "Amphi_extinct",
    "A_EW": "Amphi_extinct_wild",
    "A_CR": "Amphi_critically_endangered",
    "A_EN": "Amphi_endangered",
    "A_VU": "Amphi_vulnerable",
    "A_NT": "Amphi_near_threatened",
    "A_DD": "Amphi_data_deficient",
    "A_LC": "Amphi_least_concern",
    "A_SP": "Amphi_no_native_species",
}

In [6]:
# Apply the code -> name mapping to the column labels
red016 = red016.rename(dict_cols, axis="columns")

In [7]:
# Collect the index labels of every row whose "Country" value is missing
indexNames = red016.loc[red016["Country"].isna()].index

# Remove those rows from the DataFrame in place
red016.drop(index=indexNames, inplace=True)

In [8]:
# Write the cleaned table to disk as CSV
red016.to_csv(path_or_buf="data/data_clean.csv")

5. Create a derived data set: species counts per region and species group

In [9]:
# Sum the species-group count columns within each region
df_regions = (
    red016
    .groupby("Region")[
        [
            "Mammals_CR+EN+VU",
            "Birds",
            "Reptiles",
            "Amphibians",
            "Fish",
            "Molluscs",
            "Other_Inverts",
        ]
    ]
    .sum()
)

In [10]:
# Preview the first five regions of the aggregated table
df_regions.head(n=5)

Unnamed: 0_level_0,Mammals_CR+EN+VU,Birds,Reptiles,Amphibians,Fish,Molluscs,Other_Inverts
Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Antarctic,10,33,4,0,7,0,0
Caribbean Islands,87,107,209,182,847,6,311
East Asia,147,260,77,127,342,50,345
Europe,207,380,72,32,960,879,789
Mesoamerica,166,157,218,486,465,12,219


In [11]:
# Turn "Region" back into a regular column, flip rows and columns, then move
# the former column labels into an "index" column.
# NOTE(review): this cell overwrites df_regions, so it must be run exactly
# once after the groupby cell; re-running it transposes the frame again.
df_regions = df_regions.reset_index().T.reset_index()
df_regions.head(n=5)

Unnamed: 0,index,0,1,2,3,4,5,6,7,8,9,10,11,12
0,Region,Antarctic,Caribbean Islands,East Asia,Europe,Mesoamerica,North Africa,North America,North Asia,Oceania,South & Southeast Asia,South America,Sub-Saharan Africa,West & Central Asia
1,Mammals_CR+EN+VU,10,87,147,207,166,86,53,51,186,808,397,936,272
2,Birds,33,107,260,380,157,62,96,93,390,702,773,915,425
3,Reptiles,4,209,77,72,218,46,43,12,216,370,186,447,157
4,Amphibians,0,182,127,32,486,6,57,0,64,318,716,402,24


In [12]:
# Row 0 of the transposed frame holds the label "Region" followed by the
# region names (one per positional column 0, 1, 2, ...).
cols = list(df_regions.iloc[0])

# Map each positional column label to its region name, skipping the leading
# "Region" entry. The mapping is derived from the data rather than from a
# hard-coded count of 13, so it stays correct if the number of regions changes.
col_dic = dict(enumerate(cols[1:]))

In [13]:
# Give the label column and the positional region columns their final names
# in a single pass; df_regions itself is left untouched.
df = df_regions.rename(columns={"index": "Species_group", **col_dic})

In [14]:
# Discard the row labelled 0, which only repeats the region names that are
# now used as column headers
df = df.drop(index=0)

In [15]:
# Preview the first five species groups of the final table
df.head(n=5)

Unnamed: 0,Species_group,Antarctic,Caribbean Islands,East Asia,Europe,Mesoamerica,North Africa,North America,North Asia,Oceania,South & Southeast Asia,South America,Sub-Saharan Africa,West & Central Asia
1,Mammals_CR+EN+VU,10,87,147,207,166,86,53,51,186,808,397,936,272
2,Birds,33,107,260,380,157,62,96,93,390,702,773,915,425
3,Reptiles,4,209,77,72,218,46,43,12,216,370,186,447,157
4,Amphibians,0,182,127,32,486,6,57,0,64,318,716,402,24
5,Fish,7,847,342,960,465,247,295,73,586,1065,541,2075,621


In [16]:
# Save the per-region species-group table as CSV
df.to_csv(path_or_buf="data/species_group.csv")