In [5]:
# Dependencies and Setup
import pandas as pd
import numpy as np
import sqlalchemy


# Storing columns to keep in a list to re-iterate over multiple csv's
columns_to_keep = ["Country Name", "Country Code", "Indicator Name", "2015"]
# columns_to_keep = ["Country Name", "Country Code", "Indicator Name", "2000", "2001", "2002", "2003", "2004", "2005", "2006", "2007", "2008", "2009", "2010", "2011", "2012", "2013", "2014", "2015"]

rows_to_remove = ["WLD", "IBT", "LMY", "MIC", "IBD", "EAP", "EAR", "EAS", "ECA", "ECS", "EMU", "FCS", "HIC", "HPC", "IDB", "IDA", "IDX", "LAC", "LIC", "LDC", "LCN", "LMC", "LMY", "LTE", "MEA", "MNA", "NAC", "OED", "OSS", "PRE", "PST", "SAS", "SSA", "SSF", "SST", "TEA", "TEC", "TLA", "TMN", "TSA", "TSS", "UMC", "EUU", "ARB"]

## CO2 emissions cleaning

In [6]:
# Loading raw CO2 emissions data and cleaning
data = pd.read_csv('raw_data/CO2emissions_data.csv', error_bad_lines=False)

In [7]:
# Filtering to only columns to keep 
Organized_df = data[columns_to_keep]

In [8]:
# Drop NANs
CO2_data = Organized_df.dropna()

In [9]:
# Renaming columns 
CO2_data.rename(columns={"country_name":"Country Name","country_code":"Country Code","indicator_name":"Indicator Name"}, inplace = True)

# Resetting index to country code
CO2_data.set_index("Country Code", inplace=True)

# Counting initial amount of rows before dropping non-countries
CO2_data.count()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


Country Name      250
Indicator Name    250
2015              250
dtype: int64

In [10]:
# Dropping any indices that are non-countries ex. WLD is for the entire World CO2 emissions
CO2_data = CO2_data.drop(index=["WLD", "IBT", "LMY", "MIC", "IBD", "EAP", "EAR", "EAS", "ECA", "ECS", "EMU", "FCS", "HIC", "HPC", "IDB", "IDA", "IDX", "LAC", "LIC", "LDC", "LCN", "LMC", "LMY", "LTE", "MEA", "MNA", "NAC", "OED", "OSS", "PRE", "PST", "SAS", "SSA", "SSF", "SST", "TEA", "TEC", "TLA", "TMN", "TSA", "TSS", "UMC", "EUU", "ARB", "CEB"])


In [11]:
# Confirming indices were dropped 
CO2_data.count()

Country Name      206
Indicator Name    206
2015              206
dtype: int64

In [278]:
CO2_data.head(5)

Unnamed: 0_level_0,Country Name,Indicator Name,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015
Country Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
ABW,Aruba,CO2 emissions (kt),2379.883,2409.219,2438.555,2563.233,2618.238,2720.914,2717.247,2823.59,2658.575,2629.239,2508.228,2500.894,1349.456,861.745,872.746,898.415
AFG,Afghanistan,CO2 emissions (kt),773.737,817.741,1070.764,1213.777,916.75,1327.454,1650.15,2273.54,4206.049,6769.282,8463.436,12240.446,10755.311,9050.156,8467.103,9035.488
AGO,Angola,CO2 emissions (kt),9541.534,9732.218,12665.818,9064.824,18793.375,19156.408,22266.024,25151.953,25709.337,27792.193,29057.308,30586.447,34176.44,33692.396,44851.077,34583.477
ALB,Albania,CO2 emissions (kt),3021.608,3223.293,3751.341,4294.057,4165.712,4253.72,3898.021,3927.357,4374.731,4378.398,4598.418,5240.143,4924.781,4913.78,5489.499,4616.753
AND,Andorra,CO2 emissions (kt),524.381,524.381,531.715,535.382,561.051,575.719,546.383,539.049,539.049,517.047,517.047,491.378,487.711,476.71,462.042,465.709


## GDP Cleaning

In [279]:
# Loading raw GDP data and cleaning
data = pd.read_csv('raw_data/GDPpercapita-data.csv', error_bad_lines=False)

In [280]:
# Filtering to only columns to keep 
Organized_df = data[columns_to_keep]

In [281]:
# Drop NANs
GDP_data = Organized_df.dropna()

In [282]:
# Renaming columns 
GDP_data.rename(columns={"country_name":"Country Name","country_code":"Country Code","indicator_name":"Indicator Name"}, inplace = True)

# Resetting index
GDP_data.set_index("Country Code", inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


In [283]:
# Dropping any indices that are non-countries ex. WLD is for the entire World CO2 emissions
GDP_data = GDP_data.drop(index=["WLD", "IBT", "LMY", "MIC", "IBD", "EAP", "EAR", "EAS", "ECA", "ECS", "EMU", "FCS", "HIC", "HPC", "IDB", "IDA", "IDX", "LAC", "LIC", "LDC", "LCN", "LMC", "LMY", "LTE", "MEA", "MNA", "NAC", "OED", "PRE", "PST", "SAS", "SSA", "SSF", "SST", "TEA", "TEC", "TLA", "TMN", "TSA", "TSS", "UMC", "EUU", "ARB", "CEB"])

In [284]:
GDP_data.head()

Unnamed: 0_level_0,Country Name,Indicator Name,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015
Country Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
ABW,Aruba,GDP per capita growth (annual %),5.427608,-5.107188,-5.405878,-0.152951,6.031742,-0.094958,0.245883,1.41001,-0.224764,-10.6053,-3.88776,3.063882,-1.864168,3.593198,-0.294412,5.125616
AGO,Angola,GDP per capita growth (annual %),-0.267945,0.822114,9.943764,-0.431851,7.187036,11.030836,7.582329,9.890012,7.116873,-2.808634,0.640294,-0.220851,4.706498,1.292041,1.219835,-2.468719
ALB,Albania,GDP per capita growth (annual %),7.633866,9.311124,4.853922,5.92563,5.951881,6.071391,6.570332,6.783927,8.328036,4.048888,4.223038,2.821558,1.585156,1.187204,1.985426,2.516853
AND,Andorra,GDP per capita growth (annual %),1.913452,4.986929,0.504741,4.040888,3.792889,1.892421,2.057546,-0.523712,-6.885786,-5.976668,-1.958707,0.830102,-3.452688,-1.573746,4.524456,2.997046
ARE,United Arab Emirates,GDP per capita growth (annual %),4.909504,-3.778971,-2.750615,1.966528,-0.037811,-7.020584,-4.916643,-11.345504,-10.208775,-15.151256,-5.91481,2.187921,2.257926,4.410159,4.100219,4.553055


## Renewable Energy Cleaning

In [285]:
# Loading raw Renewable Energy data and cleaning
data = pd.read_csv('raw_data/Renewable_energy.csv')

In [286]:
# Filtering to only columns to keep 
Organized_df = data[columns_to_keep]

In [287]:
# Drop NANs
Renewable_data = Organized_df.dropna()

In [288]:
# Renaming columns
Renewable_data.rename(columns={"country_name":"Country Name","country_code":"Country Code","indicator_name":"Indicator Name"}, inplace = True)

# Resetting index
Renewable_data.set_index("Country Code", inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


In [289]:
# Dropping any indices that are non-countries ex. WLD is for the entire World CO2 emissions
Renewable_data = Renewable_data.drop(index=["WLD", "IBT", "LMY", "MIC", "IBD", "EAP", "EAR", "EAS", "ECA", "ECS", "EMU", "FCS", "HIC", "HPC", "IDB", "IDA", "IDX", "LAC", "LIC", "LDC", "LCN", "LMC", "LMY", "LTE", "MEA", "MNA", "NAC", "OED", "OSS", "PRE", "PST", "SAS", "SSA", "SSF", "SST", "TEA", "TEC", "TLA", "TMN", "TSA", "TSS", "UMC", "EUU", "ARB", "CEB"])

In [290]:
Renewable_data.head(5)

Unnamed: 0_level_0,Country Name,Indicator Name,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015
Country Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
ABW,Aruba,Renewable energy consumption (% of total final...,0.175266,0.180523,0.181391,0.18455,0.187055,0.186599,0.189937,0.190059,0.193546,0.299774,5.464716,5.661788,6.85585,6.889753,6.927502,6.726748
AFG,Afghanistan,Renewable energy consumption (% of total final...,54.243126,54.055055,43.771149,42.276141,49.843148,40.859171,37.137249,33.862579,21.343708,17.813855,14.839806,11.482706,13.973586,16.334293,19.314269,18.423477
AGO,Angola,Renewable energy consumption (% of total final...,74.618171,73.757859,72.125126,67.306117,65.493065,70.954202,65.022564,61.599704,58.107975,55.748977,54.193837,52.715679,52.245736,50.686116,50.797461,49.56821
ALB,Albania,Renewable energy consumption (% of total final...,41.445416,39.125664,35.896294,33.752729,35.935868,36.869489,31.710195,32.100937,35.912906,37.216638,37.11533,35.962532,40.0483,41.288974,38.689501,38.61521
AND,Andorra,Renewable energy consumption (% of total final...,14.890664,15.773868,16.221819,16.912308,16.874428,16.902437,17.485994,16.940777,17.422741,17.515948,19.09073,18.971546,19.195529,19.563698,19.886323,19.747809


## Exporting to clean CSVs

In [297]:
CO2_data.to_csv("Cleaned_CO2.csv", index=True)

GDP_data.to_csv("Cleaned_GDP.csv", index=True)

Renewable_data.to_csv("Cleaned_renewable.csv", index=True)


In [298]:
connection_string = "postgres:270504@localhost:5432/climate_change_db"
engine = create_engine(f'postgresql://{connection_string}')

NameError: name 'create_engine' is not defined

In [None]:
engine.table_names()

In [None]:
CO2_data.to_sql(name='carbon_dioxide', con=engine, if_exists='append', index=False)
GDP_data.to_sql(name='gdp', con=engine, if_exists='append', index=False)
Renewable_data.to_sql(name='urban_population', con=engine, if_exists='append', index=False)
Urban_data.to_sql(name='renewable_energy', con=engine, if_exists='append', index=False)