In [1216]:
import pandas as pd
import numpy as np
import json
import pprint
import pymongo
from pymongo import MongoClient

In [1217]:
co_df = pd.read_csv('data/CO2_flat.csv')
pc_df = pd.read_csv('data/percapita_flat.csv')


In [1218]:
# Adding identifier columns to the dataframes so that we can merge them later
# Assuming df_totals and df_percapita are your two data frames
co_df['Type'] = 'Total CO2'
pc_df['Type'] = 'PerCapita CO2'

In [1219]:
# Drop the per capita column from the co_df as it will be maintained in the pc_df
co_df = co_df.drop(columns = ['Per Capita'])

In [1220]:
# Look at nulls
co_df.isnull().sum()

# Where is the iso code missing?
co_df[co_df['ISO 3166-1 alpha-3'].isnull()]

Unnamed: 0,Country,ISO 3166-1 alpha-3,Year,Total,Coal,Oil,Gas,Cement,Flaring,Other,Type
19312,French Equatorial Africa,,1750,0.0,,,,,,,Total CO2
19313,French Equatorial Africa,,1751,0.0,,,,,,,Total CO2
19314,French Equatorial Africa,,1752,0.0,,,,,,,Total CO2
19315,French Equatorial Africa,,1753,0.0,,,,,,,Total CO2
19316,French Equatorial Africa,,1754,0.0,,,,,,,Total CO2
...,...,...,...,...,...,...,...,...,...,...,...
47867,Ryukyu Islands,,2017,0.0,,,,,,,Total CO2
47868,Ryukyu Islands,,2018,0.0,,,,,,,Total CO2
47869,Ryukyu Islands,,2019,0.0,,,,,,,Total CO2
47870,Ryukyu Islands,,2020,0.0,,,,,,,Total CO2


In [1221]:
# Look at nulls in the other dataframe - both CO2 dataframes have the same country codes and data missing
pc_df.isnull().sum()

Country                   0
ISO 3166-1 alpha-3     1632
Year                      0
Total                 44132
Coal                  45966
Oil                   46065
Gas                   46092
Cement                47656
Flaring               46160
Other                 61484
Type                      0
dtype: int64

In [1222]:
# Drop the rows with null country codes
co_df = co_df.dropna(subset=['ISO 3166-1 alpha-3'])
pc_df = pc_df.dropna(subset=['ISO 3166-1 alpha-3'])

In [1223]:
# Concatenate the data frames
all_co_df = pd.concat([co_df, pc_df], ignore_index=True)

# Melt the data frame, id_vars is what columns to keep, value_vars is what columns to melt
all_co_melted = pd.melt(all_co_df, id_vars=['Country', 'ISO 3166-1 alpha-3', 'Year', 'Type'],
                    value_vars=['Total', 'Coal', 'Oil', 'Gas', 'Cement', 'Flaring', 'Other'],
                    var_name='Category', value_name='Emission')

In [1224]:
# Renaming columns for clarity
all_co_melted = all_co_melted.rename(columns = {'ISO 3166-1 alpha-3':'Country Code', 'Emission':'Value'})

all_co_melted.head()

Unnamed: 0,Country,Country Code,Year,Type,Category,Value
0,Afghanistan,AFG,1750,Total CO2,Total,0.0
1,Afghanistan,AFG,1751,Total CO2,Total,0.0
2,Afghanistan,AFG,1752,Total CO2,Total,0.0
3,Afghanistan,AFG,1753,Total CO2,Total,0.0
4,Afghanistan,AFG,1754,Total CO2,Total,0.0


In [1225]:
# Shortening the time frame from 1999-2019
all_co_melted_short = all_co_melted[all_co_melted['Year'] >= 1999]

In [1226]:
all_co_melted_short = all_co_melted_short.copy()

# Converting the data types to float
all_co_melted_short[['Value']] = all_co_melted_short[['Value']].astype(float)


In [1227]:
# Find where there are World or International Transport values
# all_co_melted[all_co_melted['Country'] == 'World']
# all_co_melted[all_co_melted['Country'] == 'International transport']

# Drop the rows with World and International Transport values
all_co_ready = all_co_melted_short[all_co_melted['Country'] != 'World']
all_co_ready = all_co_melted_short[all_co_melted['Country'] != 'International transport']


  
  import sys


In [1228]:
all_co_ready.head()


Unnamed: 0,Country,Country Code,Year,Type,Category,Value
249,Afghanistan,AFG,1999,Total CO2,Total,1.09164
250,Afghanistan,AFG,2000,Total CO2,Total,1.047128
251,Afghanistan,AFG,2001,Total CO2,Total,1.069098
252,Afghanistan,AFG,2002,Total CO2,Total,1.340995
253,Afghanistan,AFG,2003,Total CO2,Total,1.559602


### Bringing in the GDP and Population data
These are from the same source, in the same format just pulled as different times

In [1229]:
# Read in the GDP data
gdp_df= pd.read_csv('data/gdp_data.csv')
pop_df = pd.read_csv('data/population.csv')

# Concatenate the data frames
all_gdp_df = pd.concat([gdp_df, pop_df], ignore_index=True)

In [1230]:
# Cleaning up the column names
new_columns = []
for column in all_gdp_df.columns:
    new_columns.append(column.split('[')[0].strip())

all_gdp_df.columns = new_columns

# Renaming the series to match Type from the CO2 data
all_gdp_df = all_gdp_df.rename(columns = {'Series Name':'Category', 'Country Name':'Country'})

# Adding the Type column for merging later with the CO2 data, add it in column 2
all_gdp_df.insert(2, 'Type', 'GDP')

# Dropping the series code column
all_gdp_df_formatted = all_gdp_df.drop(columns = ['Series Code'])

all_gdp_df_formatted.head()

Unnamed: 0,Country,Country Code,Type,Category,1960,1961,1962,1963,1964,1965,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
0,Afghanistan,AFG,GDP,GDP (current US$),537777811.111111,548888895.555556,546666677.777778,751111191.111111,800000044.444444,1006666637.77778,...,15633856787.0425,18190410820.6235,20203572959.5023,20564485419.1684,20550582746.8448,19998156214.3988,18019558181.9602,18896352021.94,18418848299.5825,18904490262.913
1,Afghanistan,AFG,GDP,GDP per capita (current US$),62.3693745050559,62.4437034626929,60.9503638210144,82.0217375781519,85.5110734102311,105.243195716467,...,554.595200222354,621.912310861592,663.141052810937,651.987861948108,628.146803888496,592.476537451681,520.252064031151,530.149830802984,502.056770622973,500.522664145294
2,Africa Eastern and Southern,AFE,GDP,GDP (current US$),21291524631.3606,21809435284.9773,23708060554.431,28211280561.2905,26119938954.6996,29683481336.0513,...,860361207235.72,964213016259.692,972002199045.518,982677082467.187,1003403000510.52,923143900034.922,889859250365.411,1030482257898.25,1016696893390.38,1009051505055.59
3,Africa Eastern and Southern,AFE,GDP,GDP per capita (current US$),162.913034498773,162.551683028333,172.002459991871,199.189238183763,179.387798934375,198.230367947484,...,1643.60556870139,1792.90750512756,1759.18239469392,1730.39468555029,1719.18372097247,1538.55226805656,1443.69237090812,1628.58678812124,1564.73433998697,1512.2705529912
4,Africa Western and Central,AFW,GDP,GDP (current US$),10404135069.15,11127894641.0191,11943187848.3043,12676330764.6917,13838369295.2313,14862225759.9135,...,597129288864.775,680455985950.414,736039861278.13,832216894624.603,892497905712.366,766957955078.105,690545418736.157,683748014299.694,766359667820.703,794719102944.191


In [1231]:
all_gdp_df_formatted['Category'].value_counts()

GDP per capita (current US$)    266
Population, total               266
GDP (current US$)               266
Name: Category, dtype: int64

In [1232]:
# Paring down the countries contain to match the all_co_melted
pared_gdp_df = all_gdp_df_formatted[all_gdp_df_formatted['Country Code'].isin(all_co_melted['Country Code'])]

In [1233]:
# Melt the data frame, id_vars is what columns to keep, value_vars is what columns to melt
gdp_melted = pd.melt(pared_gdp_df, id_vars=['Country', 'Country Code', 'Type', 'Category'],
                     var_name='Year', value_name='Value')

gdp_melted.head()

Unnamed: 0,Country,Country Code,Type,Category,Year,Value
0,Afghanistan,AFG,GDP,GDP (current US$),1960,537777811.111111
1,Afghanistan,AFG,GDP,GDP per capita (current US$),1960,62.3693745050559
2,Albania,ALB,GDP,GDP (current US$),1960,..
3,Albania,ALB,GDP,GDP per capita (current US$),1960,..
4,Algeria,DZA,GDP,GDP (current US$),1960,2723593384.78054


In [1234]:
gdp_melted = gdp_melted[gdp_melted['Country'] != 'World']
gdp_melted = gdp_melted[gdp_melted['Country'] != 'International transport']

In [1235]:
gdp_melted.isnull().sum()

Country         0
Country Code    0
Type            0
Category        0
Year            0
Value           0
dtype: int64

In [1236]:
gdp_melted.dtypes

Country         object
Country Code    object
Type            object
Category        object
Year            object
Value           object
dtype: object

In [1237]:
# Converting year to Int and GDP_Value to float, replace ".." with NaN
gdp_melted[['Year']] = gdp_melted[['Year']].astype(int).copy()
gdp_melted['Value'] = pd.to_numeric(gdp_melted['Value'], errors='coerce').copy()


In [1238]:
# Shortening the time frame from 1999-2019
gdp_melted_short = gdp_melted[gdp_melted['Year'] >= 1999]

In [1239]:
# reorder the columns
gdp_melted_ready = gdp_melted_short[['Country', 'Country Code', 'Year', 'Type', 'Category', 'Value']]

In [1240]:
gdp_melted_ready.head()

Unnamed: 0,Country,Country Code,Year,Type,Category,Value
24102,Afghanistan,AFG,1999,GDP,GDP (current US$),
24103,Afghanistan,AFG,1999,GDP,GDP per capita (current US$),
24104,Albania,ALB,1999,GDP,GDP (current US$),3212122000.0
24105,Albania,ALB,1999,GDP,GDP per capita (current US$),1033.243
24106,Algeria,DZA,1999,GDP,GDP (current US$),48640650000.0


### Make sure the CO2 df and GDP df have the same countries in them

In [1241]:
# Get unique country codes for each df and compare
co_codes = all_co_ready['Country Code'].unique() 
gdp_codes = gdp_melted_ready['Country Code'].unique()

# Are there any countries in the CO2 df that are not in the per capita df?
co_not_in_gdp = []
for code in co_codes:
    if code not in gdp_codes:
        co_not_in_gdp.append(code)
print(f"There are {len(co_not_in_gdp)} countries in the CO2 df that are not in the GDP df.\
      The country not in the other data frame is: {co_not_in_gdp}")

# Show the countries that are not in the per capita df
co_not_in_gdp

# Are there any countries in the per capita df that are not in the CO2 df?
gdp_not_in_co = []   
for code in gdp_codes:
    if code not in co_codes:
        gdp_not_in_co.append(code)   
print(f"There are {len(gdp_not_in_co)} countries in the GDP df that are not in the CO2 df.\
      The country not in the other data frame is: {gdp_not_in_co}")

There are 20 countries in the CO2 df that are not in the GDP df.      The country not in the other data frame is: ['AIA', 'ATA', 'BES', 'CXR', 'COK', 'PCZ', 'GUF', 'GLP', 'KSV', 'MTQ', 'MYT', 'MSR', 'NIU', 'REU', 'SHN', 'SPM', 'TWN', 'WLF', 'XIT', 'WLD']
There are 0 countries in the GDP df that are not in the CO2 df.      The country not in the other data frame is: []


In [1242]:
# Drop the countries in the CO2 df that are not in the GDP df
all_co_ready = all_co_ready[~all_co_ready['Country Code'].isin(co_not_in_gdp)]


### Merge the CO2 and GDP dataframes

In [1243]:
all_co_ready.head()

Unnamed: 0,Country,Country Code,Year,Type,Category,Value
249,Afghanistan,AFG,1999,Total CO2,Total,1.09164
250,Afghanistan,AFG,2000,Total CO2,Total,1.047128
251,Afghanistan,AFG,2001,Total CO2,Total,1.069098
252,Afghanistan,AFG,2002,Total CO2,Total,1.340995
253,Afghanistan,AFG,2003,Total CO2,Total,1.559602


In [1244]:
gdp_melted_ready.head()

Unnamed: 0,Country,Country Code,Year,Type,Category,Value
24102,Afghanistan,AFG,1999,GDP,GDP (current US$),
24103,Afghanistan,AFG,1999,GDP,GDP per capita (current US$),
24104,Albania,ALB,1999,GDP,GDP (current US$),3212122000.0
24105,Albania,ALB,1999,GDP,GDP per capita (current US$),1033.243
24106,Algeria,DZA,1999,GDP,GDP (current US$),48640650000.0


In [1245]:
# Merge the CO2 and GDP data frames
co_gdp_df = pd.merge(all_co_ready, gdp_melted_ready, how='outer', on=['Country', 'Country Code', 'Year', 'Type', 'Category', 'Value'])

In [1246]:
co_gdp_df.head()

Unnamed: 0,Country,Country Code,Year,Type,Category,Value
0,Afghanistan,AFG,1999,Total CO2,Total,1.09164
1,Afghanistan,AFG,2000,Total CO2,Total,1.047128
2,Afghanistan,AFG,2001,Total CO2,Total,1.069098
3,Afghanistan,AFG,2002,Total CO2,Total,1.340995
4,Afghanistan,AFG,2003,Total CO2,Total,1.559602


In [1247]:
# Test
test = co_gdp_df[co_gdp_df['Country Code'] == 'USA']

test['Category'].value_counts()

Total                           46
Coal                            46
Cement                          46
Oil                             46
Flaring                         46
Other                           46
Gas                             46
Population, total               21
GDP (current US$)               21
GDP per capita (current US$)    21
Name: Category, dtype: int64

#### Bringing in continent data onto the CO2/GDP/POP dataframe

In [1248]:
cont_df = pd.read_csv('data/country-and-continent-codes-list-csv.csv')

cont_df.head()

Unnamed: 0,Continent_Name,Continent_Code,Country_Name,Two_Letter_Country_Code,Three_Letter_Country_Code,Country_Number
0,Asia,AS,"Afghanistan, Islamic Republic of",AF,AFG,4.0
1,Europe,EU,"Albania, Republic of",AL,ALB,8.0
2,Antarctica,AN,Antarctica (the territory South of 60 deg S),AQ,ATA,10.0
3,Africa,AF,"Algeria, People's Democratic Republic of",DZ,DZA,12.0
4,Oceania,OC,American Samoa,AS,ASM,16.0


In [1249]:
print("Missing values in 'Two_Letter_Country_Code' column:", cont_df['Two_Letter_Country_Code'].isna().sum())


Missing values in 'Two_Letter_Country_Code' column: 1


In [1250]:
# Drop the rows with missing values
cont_df = cont_df.dropna(subset=['Two_Letter_Country_Code'])

In [1251]:
# Rename the columns to match the CO2 data
cont_df = cont_df.rename(columns = {'Three_Letter_Country_Code':'Country Code', 'Continent_Name':'Continent', 'Continent_Code':'Continent Code'})

In [1252]:
# Check for duplicate country codes in the continent data frame
cont_df['Country Code'].value_counts()
# Drop the duplicate country codes
cont_df = cont_df.drop_duplicates(subset=['Country Code'])

In [1253]:
cont_df.head()

Unnamed: 0,Continent,Continent Code,Country_Name,Two_Letter_Country_Code,Country Code,Country_Number
0,Asia,AS,"Afghanistan, Islamic Republic of",AF,AFG,4.0
1,Europe,EU,"Albania, Republic of",AL,ALB,8.0
2,Antarctica,AN,Antarctica (the territory South of 60 deg S),AQ,ATA,10.0
3,Africa,AF,"Algeria, People's Democratic Republic of",DZ,DZA,12.0
4,Oceania,OC,American Samoa,AS,ASM,16.0


In [1254]:
print("Columns in co_gdp_df:", co_gdp_df.columns)
print("Columns in cont_df:", cont_df.columns)

Columns in co_gdp_df: Index(['Country', 'Country Code', 'Year', 'Type', 'Category', 'Value'], dtype='object')
Columns in cont_df: Index(['Continent', 'Continent Code', 'Country_Name',
       'Two_Letter_Country_Code', 'Country Code', 'Country_Number'],
      dtype='object')


In [1255]:
# Bringing in continent data on to the co2/gdp/pop data frame, only bringing the Continent_Name column and the continent_Code column 
co_gdp_cont_df = pd.merge(co_gdp_df, cont_df[['Continent', 'Continent Code', 'Country Code']], how='left', left_on='Country Code', right_on='Country Code')


In [1256]:
co_gdp_cont_df.head()

Unnamed: 0,Country,Country Code,Year,Type,Category,Value,Continent,Continent Code
0,Afghanistan,AFG,1999,Total CO2,Total,1.09164,Asia,AS
1,Afghanistan,AFG,2000,Total CO2,Total,1.047128,Asia,AS
2,Afghanistan,AFG,2001,Total CO2,Total,1.069098,Asia,AS
3,Afghanistan,AFG,2002,Total CO2,Total,1.340995,Asia,AS
4,Afghanistan,AFG,2003,Total CO2,Total,1.559602,Asia,AS


In [1257]:
# Find where continet code is null
missing_conts = co_gdp_cont_df[co_gdp_cont_df['Continent Code'].isnull()]
missing_conts['Continent'].value_counts()


North America    12257
Name: Continent, dtype: int64

In [1258]:
# Fill the Continent Code column for all North America countries with NA
co_gdp_cont_df['Continent Code'] = co_gdp_cont_df['Continent Code'].fillna('NA')

### Building the country GeoJSON dataframe
Because we are using boundaries and those coordinates are very lengthy, these will be housed in their own collection

In [1259]:
# Function to load geojson to a dataframe, so we can potentially load more than one
filepath = 'data/countries.geojson'

with open(filepath, 'r') as file:
    geojson_data = json.load(file)
    
# Extract the features from the GeoJSON data
features = geojson_data['features']

# Create a list to store the data for the DataFrame
data = []

# Iterate through the features and extract the relevant information
for feature in features:
    country_code = feature['properties']['ISO_A3']
    geometry = feature['geometry']
    data.append([country_code, geometry])

# Create a DataFrame from the extracted data
geojson_df = pd.DataFrame(data, columns=['country_code', 'geometry'])

In [1260]:
geojson_df.head()

Unnamed: 0,country_code,geometry
0,ABW,"{'type': 'Polygon', 'coordinates': [[[-69.9969..."
1,AFG,"{'type': 'Polygon', 'coordinates': [[[71.04980..."
2,AGO,"{'type': 'MultiPolygon', 'coordinates': [[[[11..."
3,AIA,"{'type': 'MultiPolygon', 'coordinates': [[[[-6..."
4,ALB,"{'type': 'Polygon', 'coordinates': [[[19.74776..."


In [1261]:
# merge the continent data from the co_gdp_cont_df to the geojson_df
geojson_cont_df = pd.merge(geojson_df, co_gdp_cont_df[['Country Code', 'Continent','Continent Code']], how='left', left_on='country_code', right_on='Country Code')

In [1262]:
geojson_cont_df.head()

Unnamed: 0,country_code,geometry,Country Code,Continent,Continent Code
0,ABW,"{'type': 'Polygon', 'coordinates': [[[-69.9969...",ABW,North America,
1,ABW,"{'type': 'Polygon', 'coordinates': [[[-69.9969...",ABW,North America,
2,ABW,"{'type': 'Polygon', 'coordinates': [[[-69.9969...",ABW,North America,
3,ABW,"{'type': 'Polygon', 'coordinates': [[[-69.9969...",ABW,North America,
4,ABW,"{'type': 'Polygon', 'coordinates': [[[-69.9969...",ABW,North America,


In [1263]:
# Drop the duplicate columns
geojson_cont_df = geojson_cont_df.drop(columns=['country_code'])


In [1264]:
# Reorder the columns
geojson_cont_df = geojson_cont_df[['Country Code', 'Continent', 'Continent Code', 'geometry']]
geojson_cont_df.head()

Unnamed: 0,Country Code,Continent,Continent Code,geometry
0,ABW,North America,,"{'type': 'Polygon', 'coordinates': [[[-69.9969..."
1,ABW,North America,,"{'type': 'Polygon', 'coordinates': [[[-69.9969..."
2,ABW,North America,,"{'type': 'Polygon', 'coordinates': [[[-69.9969..."
3,ABW,North America,,"{'type': 'Polygon', 'coordinates': [[[-69.9969..."
4,ABW,North America,,"{'type': 'Polygon', 'coordinates': [[[-69.9969..."


In [1265]:
# Drop the duplicate rows
geojson_cont_short_df = geojson_cont_df.drop_duplicates(subset=['Country Code'])

In [1266]:
geojson_cont_short_df.head()

Unnamed: 0,Country Code,Continent,Continent Code,geometry
0,ABW,North America,,"{'type': 'Polygon', 'coordinates': [[[-69.9969..."
385,AFG,Asia,AS,"{'type': 'Polygon', 'coordinates': [[[71.04980..."
770,AGO,Africa,AF,"{'type': 'MultiPolygon', 'coordinates': [[[[11..."
1155,,,,"{'type': 'MultiPolygon', 'coordinates': [[[[-6..."
1156,ALB,Europe,EU,"{'type': 'Polygon', 'coordinates': [[[19.74776..."


In [1267]:
# Split the 'geometry' column
geojson_cont_short_df[['Type', 'Coordinates']] = geojson_cont_short_df['geometry'].apply(lambda x: pd.Series({'Type': x['type'], 'Coordinates': x['coordinates']}))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [1268]:
# Drop the original 'geometry' column
geojson_cont_short_df = geojson_cont_short_df.drop(columns=['geometry'])

In [1269]:
geojson_cont_short_df.head()

Unnamed: 0,Country Code,Continent,Continent Code,Type,Coordinates
0,ABW,North America,,Polygon,"[[[-69.99693762899992, 12.577582098000036], [-..."
385,AFG,Asia,AS,Polygon,"[[[71.04980228700009, 38.40866445000009], [71...."
770,AGO,Africa,AF,MultiPolygon,"[[[[11.73751945100014, -16.692577982999836], [..."
1155,,,,MultiPolygon,"[[[[-63.037668423999946, 18.21295807500003], [..."
1156,ALB,Europe,EU,Polygon,"[[[19.747765747000074, 42.57890085900007], [19..."


In [1270]:
geojson_cont_short_df.isnull().sum()

Country Code      1
Continent         2
Continent Code    1
Type              0
Coordinates       0
dtype: int64

In [1271]:
# display where continent is null
geojson_cont_short_df[geojson_cont_short_df['Continent'].isnull()]

Unnamed: 0,Country Code,Continent,Continent Code,Type,Coordinates
1155,,,,MultiPolygon,"[[[[-63.037668423999946, 18.21295807500003], [..."
50403,NAM,,,Polygon,"[[[13.184910523000099, -16.9641830449999], [13..."


In [1272]:
# Drop the rows where country code is null
geo_countries_df = geojson_cont_short_df[~geojson_cont_short_df['Country Code'].isnull()]

In [1273]:
# This is ready to load into MongoDB
geo_countries_df.head()

Unnamed: 0,Country Code,Continent,Continent Code,Type,Coordinates
0,ABW,North America,,Polygon,"[[[-69.99693762899992, 12.577582098000036], [-..."
385,AFG,Asia,AS,Polygon,"[[[71.04980228700009, 38.40866445000009], [71...."
770,AGO,Africa,AF,MultiPolygon,"[[[[11.73751945100014, -16.692577982999836], [..."
1156,ALB,Europe,EU,Polygon,"[[[19.747765747000074, 42.57890085900007], [19..."
1542,AND,Europe,EU,Polygon,"[[[1.707006470000067, 42.5027814740001], [1.69..."


### Import to MongoDB

In [1274]:
"""
These are the two data frames that are ready to be loaded into MongoDB one with all the countries, their continent info, CO2, GDP, and population data, and the 
other with all the countries, continents, and geojson data.

co_gdp_cont_df - all co2, gdp, pop, and continent data
geo_countries_df - all country, continent, and geojson data
"""

'\nThese are the two data frames that are ready to be loaded into MongoDB one with all the countries, their continent info, CO2, GDP, and population data, and the \nother with all the countries, continents, and geojson data.\n\nco_gdp_cont_df - all co2, gdp, pop, and continent data\ngeo_countries_df - all country, continent, and geojson data\n'

In [1275]:
# Convert the dataframes to a list of dictionaries
co_gdp_cont_df = co_gdp_cont_df.to_dict('records')
geo_countries_df = geo_countries_df.to_dict('records')


In [1276]:
# connect to the MongoDB client and create the database and collection
client = MongoClient('mongodb://localhost:27017/')
db = client['global_emissions_db']
collection = db['CO2_gdp_population']

# Insert the data into the collection
collection.insert_many(co_gdp_cont_df)


<pymongo.results.InsertManyResult at 0x7fcb179ea1e0>

In [1277]:
db = client['global_emissions_db']
collection = db['country_geojson_data']

# Insert the data into the collection
collection.insert_many(geo_countries_df)

<pymongo.results.InsertManyResult at 0x7fcb178f9280>

### Test the Database   

In [1278]:
# Create a decorator to manage the connection to the MongoDB server
def with_mongo_client(func):
    def wrapper(*args, **kwargs):
        client = MongoClient("mongodb://localhost:27017/")
        try:
            return func(client, *args, **kwargs)
        finally:
            client.close()

    return wrapper

# Test fuctions for the decorator and databse 

@with_mongo_client
def list_all_collections(client):
    db = client['global_emissions_db']
    return db.list_collection_names()


@with_mongo_client
def find_five_countries_documents(client):
    db = client['global_emissions_db']
    collection = db['CO2_gdp_population']
    return list(collection.find().limit(5))

In [1279]:
find_five_countries_documents()

[{'_id': ObjectId('6436cec9fa5fdf5c944885ec'),
  'Country': 'Afghanistan',
  'Country Code': 'AFG',
  'Year': 1999,
  'Type': 'Total CO2',
  'Category': 'Total',
  'Value': 1.09164,
  'Continent': 'Asia',
  'Continent Code': 'AS'},
 {'_id': ObjectId('6436cec9fa5fdf5c944885ed'),
  'Country': 'Afghanistan',
  'Country Code': 'AFG',
  'Year': 2000,
  'Type': 'Total CO2',
  'Category': 'Total',
  'Value': 1.047128,
  'Continent': 'Asia',
  'Continent Code': 'AS'},
 {'_id': ObjectId('6436cec9fa5fdf5c944885ee'),
  'Country': 'Afghanistan',
  'Country Code': 'AFG',
  'Year': 2001,
  'Type': 'Total CO2',
  'Category': 'Total',
  'Value': 1.0690979999999999,
  'Continent': 'Asia',
  'Continent Code': 'AS'},
 {'_id': ObjectId('6436cec9fa5fdf5c944885ef'),
  'Country': 'Afghanistan',
  'Country Code': 'AFG',
  'Year': 2002,
  'Type': 'Total CO2',
  'Category': 'Total',
  'Value': 1.3409950000000002,
  'Continent': 'Asia',
  'Continent Code': 'AS'},
 {'_id': ObjectId('6436cec9fa5fdf5c944885f0'),
  

In [1280]:
list_all_collections()

['CO2_gdp_population', 'country_geojson_data']

In [1282]:
client = MongoClient('mongodb://localhost:27017/')
db = client['global_emissions_db']

db.list_collection_names()

['CO2_gdp_population', 'country_geojson_data']

In [1283]:
client = MongoClient('mongodb://localhost:27017/')
db = client['global_emissions_db']

# How many countries are in the country_geojson_data collection?
db.country_geojson_data.count_documents({})


205

In [1284]:
client = MongoClient('mongodb://localhost:27017/')
db = client['global_emissions_db']

# How many documents are in the CO2_gdp_population collection?
total_documents = db.CO2_gdp_population.count_documents({})
print(f'Total documents: {total_documents}')

# Count the number of unique countries in the CO2_gdp_population collection
unique_countries = len(db.CO2_gdp_population.distinct("Country Code"))
print(f'Number of unique countries: {unique_countries}')



Total documents: 79247
Number of unique countries: 205


In [1285]:
# client = MongoClient('mongodb://localhost:27017/')
# db = client['global_emissions_db']

# # Delete all the collections
# for collection_name in db.list_collection_names():
#     db[collection_name].drop() 