In [None]:
# Dependencies imported
import json
import os
from pprint import pprint

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from pymongo import MongoClient

Import the source datasets into MongoDB by running the following `mongoimport` commands from a terminal, in the directory that contains the CSV files:

mongoimport --type csv -d project3 -c emissions --headerline --drop GCB2022v27_MtCO2_flat.csv

mongoimport --type csv -d project3 -c population --headerline --drop world_population.csv

mongoimport --type csv -d project3 -c gdp --headerline --drop gdp.csv

In [None]:
# Create a MongoClient for the MongoDB server on port 27017 (no host is passed, so PyMongo's default host applies)
mongo = MongoClient(port=27017)

In [None]:
# List the database names on the server to confirm that the "project3" database was created by the mongoimport step
mongo.list_database_names()

In [None]:
# Keep a handle to the "project3" database, which the mongoimport commands populate
db = mongo["project3"]

In [None]:
# List the collection names to confirm that the three datasets (emissions, population, gdp) were imported
db.list_collection_names()

In [None]:
# Bind one handle per imported collection, in the order emissions, population, gdp
emissions, population, gdp = (
    db[collection_name] for collection_name in ("emissions", "population", "gdp")
)

In [None]:
# Print a single record from the emissions collection to inspect its field names and value types
pprint(emissions.find_one())

In [None]:
# The dataset goes back as far as 1700; this filter keeps only records from 1922 onwards (i.e. years after 1921)
emissions_query = {
    "Year": {"$gte": 1922},
}

# Projection: the fields to return for each matching record
emissions_fields = {
    "Country",
    "ISO 3166-1 alpha-3",
    "Year",
    "Total",
    "Coal",
    "Oil",
    "Gas",
    "Cement",
    "Flaring",
    "Other",
}

In [None]:
# Load the filtered, projected emissions records into a DataFrame, then tidy it in one chain
emissions_df = (
    pd.DataFrame(emissions.find(emissions_query, emissions_fields))
    # MongoDB's "_id" column is not needed
    .drop(columns="_id")
    # Use the shorter "Code" name for the ISO country-code column
    .rename(columns={"ISO 3166-1 alpha-3": "Code"})
    # Later inspection showed St. Kitts and Nevis duplicated under a second name; drop the rows carrying that name
    .loc[lambda frame: frame["Country"] != "St. Kitts-Nevis-Anguilla"]
)

emissions_df.head()

In [None]:
# Print a single record from the population collection to inspect its field names and value types
pprint(population.find_one())

In [None]:
# An empty filter matches every population record
population_query = {}

# Projection: identifying fields plus one population column per decade from 1970 to 2020
population_fields = {
    "Country/Territory",
    "CCA3",
    "Continent",
    "1970 Population",
    "1980 Population",
    "1990 Population",
    "2000 Population",
    "2010 Population",
    "2020 Population",
}

In [None]:
# Build the population DataFrame from every record, restricted to the projected fields
population_df = (
    pd.DataFrame(population.find(population_query, population_fields))
    # MongoDB's "_id" column is not needed
    .drop(columns="_id")
    # Rename the country-code column to "Code", the name used for the later merge
    .rename(columns={"CCA3": "Code"})
)

population_df.head()

In [None]:
# Print a single record from the gdp collection to inspect its field names and value types
pprint(gdp.find_one())

In [None]:
# An empty filter matches every gdp record
gdp_query = {}

# Projection: identifying fields plus one column per decade from 1970 to 2020
gdp_fields = {
    "Country Name",
    "Code",
    "1970",
    "1980",
    "1990",
    "2000",
    "2010",
    "2020",
}

In [None]:
# Build the gdp DataFrame from every record, restricted to the projected fields,
# and discard MongoDB's "_id" column in the same expression
gdp_df = pd.DataFrame(gdp.find(gdp_query, gdp_fields)).drop(columns="_id")

gdp_df.head()

In [None]:
# The three datasets spell country names differently, so they are merged in order to standardise
# the names across all three. They are joined on the "Code" column, as these codes are universally
# recognised; the inner joins keep only codes present in every dataset.
merged_df = (
    population_df
    .merge(gdp_df, on="Code", how="inner")
    .merge(emissions_df, on="Code", how="inner")
)

merged_df.head()

In [None]:
# Derive the cleaned emissions DataFrame from the merged frame: keep the emissions columns,
# use the population dataset's country name as "Country", and order rows by country then year
emissions_df_cleaned = (
    pd.DataFrame(
        merged_df[["Country/Territory", "Code", "Year", "Total", "Coal", "Oil", "Gas", "Cement", "Flaring", "Other"]]
    )
    .rename(columns={"Country/Territory": "Country"})
    .sort_values(["Country", "Year"])
    .reset_index(drop=True)
)

emissions_df_cleaned.head()

In [None]:
# Inspect the dtypes of the cleaned emissions DataFrame; the author found the emissions columns were not numeric at this point
emissions_df_cleaned.dtypes         

In [None]:
# Investigation showed the incorrect dtypes were caused by non-numeric cells: empty strings ('').
# Replace every empty-string cell with 0 so the columns can be treated as numbers.
# NOTE(review): this records a missing value as zero emissions - confirm that is the intended meaning.
emissions_df_cleaned = emissions_df_cleaned.replace('', 0)

# Re-check the dtypes of the frame that was just modified (not the raw emissions_df).
# Whether the columns are numeric here depends on the pandas version, so the next cell casts them explicitly.
emissions_df_cleaned.dtypes

In [None]:
# Force every emissions measure column to float with a single astype call (column -> dtype mapping)
emissions_df_cleaned = emissions_df_cleaned.astype(
    dict.fromkeys(["Total", "Coal", "Oil", "Gas", "Cement", "Flaring", "Other"], "float")
)

# Check the dtypes once more; the measure columns should now be float
emissions_df_cleaned.dtypes

In [None]:
# Derive the cleaned population DataFrame from the merged frame: one row per country code,
# population columns only, renamed and ordered alphabetically by country
population_df_cleaned = (
    merged_df
    .drop_duplicates("Code")
    .loc[:, ["Country/Territory", "Code", "Continent", "1970 Population", "1980 Population", "1990 Population", "2000 Population", "2010 Population", "2020 Population"]]
    .rename(columns={"Country/Territory": "Country"})
    .sort_values("Country")
    .reset_index(drop=True)
)

population_df_cleaned.head()

In [None]:
# Inspect the dtypes of the cleaned population DataFrame; the author found them correct, so no conversion follows
population_df_cleaned.dtypes

In [None]:
# Derive the cleaned gdp DataFrame from the merged frame: one row per country code,
# the decade GDP columns only, renamed and ordered alphabetically by country
gdp_df_cleaned = (
    merged_df
    .drop_duplicates("Code")
    .loc[:, ["Country/Territory", "Code", "Continent", "1970", "1980", "1990", "2000", "2010", "2020"]]
    .rename(columns={"Country/Territory": "Country"})
    .sort_values("Country")
    .reset_index(drop=True)
)

gdp_df_cleaned.head()

In [None]:
# Inspect the dtypes of the cleaned gdp DataFrame; the author found them incorrect at this point
gdp_df_cleaned.dtypes

In [None]:
# Investigation showed the incorrect dtypes were caused by non-numeric cells: empty strings ('').
# Replace every empty-string cell with 0 so the columns can be treated as numbers.
# NOTE(review): this records a missing GDP value as 0 - confirm that is the intended meaning.
gdp_df_cleaned = gdp_df_cleaned.replace('',0)

# Re-check the dtypes; the author reports they are correct after the replacement
gdp_df_cleaned.dtypes

In [None]:
# Preview the first rows of the cleaned gdp DataFrame after the empty-string replacement
gdp_df_cleaned.head()

In [None]:
# Build the countries lookup table (name, code, continent) from the cleaned population DataFrame,
# ordered alphabetically by country with a fresh index
countries_df = (
    population_df_cleaned[["Country", "Code", "Continent"]]
    .sort_values("Country")
    .reset_index(drop=True)
)

countries_df.head()

In [None]:
# Inspect the dtypes of the countries DataFrame; the author found them correct
countries_df.dtypes

In [None]:
# Four DataFrames have now been created; each is built from the same merged frame, so they share
# the same set of countries and country names.
# Export them as CSVs to the output folder so they can be reused in future projects.

# to_csv does not create missing directories, so make sure the output folder exists first
os.makedirs("output", exist_ok=True)

emissions_df_cleaned.to_csv("output/emissions.csv", index=False)
population_df_cleaned.to_csv("output/population.csv", index=False)
gdp_df_cleaned.to_csv("output/gdp.csv", index=False)
countries_df.to_csv("output/countries.csv", index=False)

In [None]:
# Serialise each of the four DataFrames to a JSON string (one object per row) for the JavaScript dashboard
emissions_json, population_json, gdp_json, countries_json = (
    frame.to_json(orient="records")
    for frame in (emissions_df_cleaned, population_df_cleaned, gdp_df_cleaned, countries_df)
)

In [None]:
# Parse the four JSON strings back into Python lists of records with json.loads
emissions_parsed, population_parsed, gdp_parsed, countries_parsed = map(
    json.loads,
    (emissions_json, population_json, gdp_json, countries_json),
)

In [None]:
# Combine the four parsed record lists under one object and write it to output/data.json,
# which the JavaScript part of this project reads
combined_data = {
    "emissions" : emissions_parsed,
    "gdp" : gdp_parsed,
    "population" : population_parsed,
    "countries" : countries_parsed,
}

with open("output/data.json", "w", encoding="utf8") as output:
    json.dump(combined_data, output, indent=4)

In [None]:
# Function to display a Top 10 bar plot in the notebook, and export it.
def print_graph(year=None, emission_type=None):
    """Plot, save and show the ten countries with the highest emissions for one year.

    Parameters
    ----------
    year : int or str, optional
        Year to plot. When omitted, the notebook-level ``barplot_year`` is used,
        so existing ``print_graph()`` calls keep working.
    emission_type : str, optional
        Name of the ``emissions_df_cleaned`` column to rank by (for example
        "Coal" or "Total"). When omitted, the notebook-level
        ``barplot_emission_type`` is used.

    Raises
    ------
    ValueError
        If ``year`` cannot be converted to an integer.
    KeyError
        If ``emission_type`` is not a column of ``emissions_df_cleaned``.

    Side effects
    ------------
    Writes ``output/top10_<type>_emissions_<year>.png`` and displays the figure.
    """
    # Fall back to the notebook-level variables set by the interactive prompt
    if year is None:
        year = barplot_year
    if emission_type is None:
        emission_type = barplot_emission_type

    # Normalise once so the filter, title and file name all use the same value
    year = int(year)

    # Rows for the requested year, ranked by the chosen emission column, top ten only
    top10_df = (
        emissions_df_cleaned
        .loc[emissions_df_cleaned["Year"] == year, ["Country", "Code", emission_type]]
        .sort_values(by=emission_type, ascending=False)
        .head(10)
        .reset_index(drop=True)
    )

    # Bar plot is created with Seaborn on an explicit figure/axes pair
    fig, ax = plt.subplots(figsize=(8, 10))
    sns.barplot(x=top10_df["Country"], y=top10_df[emission_type], color="b", ax=ax)

    ax.set(xlabel="", ylabel="Emissions (MtCO2)")
    ax.set(title=f'Top 10 {emission_type} Emissions in {year}')
    # Rotate the existing tick labels; unlike set_xticklabels this does not require fixed tick positions
    ax.tick_params(axis="x", labelrotation=45)

    # Bar plot exported to output folder (created if it does not exist yet)
    os.makedirs("output", exist_ok=True)
    fig.savefig(f'output/top10_{emission_type.lower()}_emissions_{year}.png', dpi=100)

    plt.show()

In [None]:
# Interactive loop: ask the user for a year and an emission type, validate both, then call
# print_graph to display and export the bar plot. Each answer is re-prompted until it is valid,
# so invalid input can no longer raise an uncaught exception from a retry.

# Emission columns the user may choose from
valid_emission_types = ("Coal", "Oil", "Gas", "Cement", "Flaring", "Other", "Total")

# Years actually present in the cleaned data; the emissions query only keeps 1922 onwards,
# so the prompt is built from the data instead of a hard-coded range
available_years = set(emissions_df_cleaned["Year"].astype(int).tolist())
first_year, last_year = min(available_years), max(available_years)

print_another = "yes"

while print_another == "yes":

    # Keep asking until the answer is an integer year that exists in the data
    while True:
        barplot_year = input(f"What year would you like to make a Top 10 graph for? Please enter a valid year ({first_year} - {last_year})").strip()
        try:
            if int(barplot_year) in available_years:
                break
        except ValueError:
            # Not a number at all; fall through to the message below and ask again
            pass
        print(f"'{barplot_year}' is not a valid year.")

    # Keep asking until the answer names one of the emission columns (case-insensitive)
    while True:
        barplot_emission_type = input("What emission type would you like to display? Please enter a valid emission type (Coal, Oil, Gas, Cement, Flaring, Other, Total)").strip().title()
        if barplot_emission_type in valid_emission_types:
            break
        print(f"'{barplot_emission_type}' is not a valid emission type.")

    # print_graph reads barplot_year and barplot_emission_type from the notebook namespace
    print_graph()

    print_another = input("would you like to make another graph?").strip().lower()