In [13]:
import pandas as pd # data science library to manipulate data
import matplotlib.pyplot as plt # visualization library
import seaborn as sb #visualization library specific for data science, based on matplotlib 
from sklearn.model_selection import train_test_split
from sklearn import  metrics
import statsmodels.api as sm
import pickle

In [14]:
# Load China's CO2 emissions (the original note says the file spans 1990-2021)
# and reduce it to a single emissions column indexed by calendar year.
carbon_emissions = (
    pd.read_csv('china_carbon_co2_emissions.csv')
    # keep only the year component of the parsed date
    .assign(date=lambda frame: pd.to_datetime(frame['date']).dt.year)
    # the emissions header in the CSV starts with a space
    .loc[:, ['date', ' Kilotons of Co2']]
    .set_index('date')
)
carbon_emissions

Unnamed: 0_level_0,Kilotons of Co2
date,Unnamed: 1_level_1
1990,2173360.0
1991,2302180.0
1992,2418180.0
1993,2645410.0
1994,2767670.0
1995,3088620.0
1996,3070510.0
1997,3134110.0
1998,3236280.0
1999,3153660.0


In [15]:
# Load China's GDP (the original note says the file spans 1990-2021),
# give the columns descriptive names, and index the values by year.
gdp = (
    pd.read_csv('china_gdp.csv')
    .rename(columns={'TIME': 'date', 'Value': 'GDP (Billions of $US)'})
    # parse the year with an explicit format, then keep it as a plain year number
    .assign(date=lambda frame: pd.to_datetime(frame['date'], format='%Y').dt.year)
    .loc[:, ['date', 'GDP (Billions of $US)']]
    .set_index('date')
)

In [16]:
# Load China's inflation (the original note says the file spans 1990-2021),
# rename the columns, and index the values by year.
#
# The date conversion is done with .assign(), which returns a new frame.
# Assigning a column directly onto the result of a column-subset selection
# (as this cell previously did) is the pattern that raises pandas'
# SettingWithCopyWarning.
inflation = (
    pd.read_csv('china_inflation.csv')
    .rename(columns={'TIME': 'date', 'Value': 'Inflation (%)'})
    .loc[:, ['date', 'Inflation (%)']]
    # parse the year with an explicit format, then keep it as a plain year number
    .assign(date=lambda frame: pd.to_datetime(frame['date'], format='%Y').dt.year)
    .set_index('date')
)

In [17]:
# Load China's per-capita power consumption (the original note says the file
# spans 1990-2021) and index it by year; all other columns are kept as they are.
power_consumption_capita = (
    pd.read_csv('china_Power_Consumptiom_per_capita.csv')
    # parse the year with an explicit format, then keep it as a plain year number
    .assign(date=lambda frame: pd.to_datetime(frame['date'], format='%Y').dt.year)
    .set_index('date')
)

In [18]:
# Load China's population (the original note says the file spans 1990-2021)
# and reduce it to a single population column indexed by calendar year.
#
# Changes from the previous version of this cell:
#   * removed a bare `population` expression in the middle of the cell; only
#     the last expression of a cell is displayed, so it had no effect;
#   * the year is derived with .assign(), which returns a new frame, instead
#     of assigning onto a column-subset selection (the pattern that raises
#     pandas' SettingWithCopyWarning);
#   * the year is taken with `.dt.year` before indexing, matching the other
#     data-cleaning cells in this notebook.
population = (
    pd.read_csv('china-population.csv')
    # the population header in the CSV starts with a space
    .loc[:, ['date', ' Population']]
    .assign(date=lambda frame: pd.to_datetime(frame['date']).dt.year)
    .set_index('date')
)



In [19]:
# Load the multi-country renewable-share table.
countries_renewable = pd.read_csv('renewable-share-energy.csv')

# Keep China's rows for years after 1989, then index the renewable share by year.
renewable_energy = (
    countries_renewable
    .loc[lambda frame: (frame['Entity'] == 'China') & (frame['Year'] > 1989)]
    .rename(columns={'Year': 'date'})
    # parse the year with an explicit format, then keep it as a plain year number
    .assign(date=lambda frame: pd.to_datetime(frame['date'], format='%Y').dt.year)
    .loc[:, ['date', 'Renewables (% equivalent primary energy)']]
    .set_index('date')
)

In [20]:
# Load China's GDP growth rate, drop the annual-change column, index by year,
# and keep the rows from 1990 onwards.
gdp_rate = (
    pd.read_csv('china_gdp_growth_rate.csv')
    # the dropped header starts with a space in the CSV
    .drop(columns=' Annual Change')
    .assign(date=lambda frame: pd.to_datetime(frame['date']).dt.year)
    .set_index('date')
    # label-based slice on the year index
    .loc[1990:]
)

In [21]:
# Load China's GDP per capita, drop the growth-rate column, index by year,
# and keep the rows from 1990 onwards.
gdp_capita = (
    pd.read_csv('china_gdp_per_capita.csv')
    # the dropped header starts with a space in the CSV
    .drop(columns=' Annual Growth Rate (%)')
    .assign(date=lambda frame: pd.to_datetime(frame['date']).dt.year)
    .set_index('date')
    # label-based slice on the year index
    .loc[1990:]
)

In [22]:
# Combine every cleaned indicator side by side (aligned on the shared 'date'
# index), then strip the leading space from the headers that carry one.
data = pd.concat(
    [
        carbon_emissions,
        population,
        gdp,
        gdp_rate,
        gdp_capita,
        inflation,
        power_consumption_capita,
        renewable_energy,
    ],
    axis=1,
).rename(
    columns={
        ' Kilotons of Co2': 'Kilotons of Co2',
        ' Population': 'Population',
        ' GDP Growth (%)': 'GDP Growth (%)',
        ' GDP Per Capita (US $)': 'GDP Per Capita (US $)',
    }
)

In [23]:
# Display the merged table as this cell's output.
data

Unnamed: 0_level_0,Kilotons of Co2,Population,GDP (Billions of $US),GDP Growth (%),GDP Per Capita (US $),Inflation (%),kWh per Capita,Renewables (% equivalent primary energy)
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1990,2173360.0,1153704252,974.429258,3.9203,317.8847,3.1,511,4.723668
1991,2302180.0,1170626171,1086.530756,9.2628,333.1421,3.4,549,4.424778
1992,2418180.0,1183813389,1254.764538,14.2245,366.4607,6.4,605,4.412702
1993,2645410.0,1195855558,1446.231016,13.8837,377.3898,14.7,663,4.757861
1994,2767670.0,1207286675,1651.11518,13.0368,473.4923,24.1,727,4.952474
1995,3088620.0,1218144426,1850.763485,10.954,609.6567,17.1,770,5.55382
1996,3070510.0,1228298836,2050.191538,9.9226,709.4138,8.3,821,5.143133
1997,3134110.0,1237801448,2255.387151,9.2368,781.7442,2.8,853,5.361734
1998,3236280.0,1246836105,2437.344683,7.846,828.5805,-0.8,871,5.427778
1999,3153660.0,1255433236,2639.381556,7.6617,873.2871,-1.4,914,5.188286


In [24]:
# Save the merged table to disk; index=True writes the 'date' index as a column.
data.to_csv(path_or_buf='data_clean_china.csv', index=True)