In [1]:
import pandas as pd
import matplotlib.pyplot as plt

## Population

In [2]:
# Load the raw population table from a CSV in the working directory.
# Kept untouched under the *_raw name so later cells can re-derive from it.
pop_raw = pd.read_csv(filepath_or_buffer='world_populations.csv')

In [3]:
# Keep only the merge keys plus the measurement, and give the generic
# 'Value' column a descriptive name so it stays distinguishable after merging.
# NOTE(review): the figures look like thousands of persons (Afghanistan 1950
# is 7752.118) -- confirm against the source's unit column before using them
# as head counts.
population = (
    pop_raw[['Area Code', 'Area', 'Year', 'Value']]
    .rename(columns={'Value': 'Population'})
)

In [4]:
# Preview the first ten rows to confirm the column selection and rename.
population.head(10)

Unnamed: 0,Area Code,Area,Year,Population
0,2,Afghanistan,1950,7752.118
1,2,Afghanistan,1951,7839.51
2,2,Afghanistan,1952,7934.98
3,2,Afghanistan,1953,8038.596
4,2,Afghanistan,1954,8150.447
5,2,Afghanistan,1955,8270.581
6,2,Afghanistan,1956,8399.03
7,2,Afghanistan,1957,8535.807
8,2,Afghanistan,1958,8680.946
9,2,Afghanistan,1959,8834.445


In [5]:
# Check dtypes and non-null counts of the trimmed population table.
population.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14867 entries, 0 to 14866
Data columns (total 4 columns):
Area Code     14867 non-null int64
Area          14867 non-null object
Year          14867 non-null int64
Population    14867 non-null float64
dtypes: float64(1), int64(2), object(1)
memory usage: 464.7+ KB


## Crops

In [6]:
# Load the raw crops table. It is decoded as Latin-1 instead of pandas'
# default UTF-8; presumably the file contains non-ASCII characters that
# fail under UTF-8 -- TODO confirm.
crops_raw = pd.read_csv(
    'world_crops.csv',
    encoding='latin1',
)

In [8]:
# Collapse the raw crops rows to one total per (area, element, year, unit),
# order them by area and year, and prefix the measurement columns with
# 'Agri_' so they do not collide with the emissions columns when merged.
# NOTE(review): every element is summed, including 'Yield' (hg/ha). A sum of
# per-row yields is not the yield of the area (Afghanistan 1961 comes out at
# 1,787,380 hg/ha); consider deriving it from the summed 'Production' and
# 'Area harvested' instead -- confirm intent.
crops = (
    crops_raw
    .groupby(['Area Code', 'Area', 'Element', 'Year', 'Unit'])
    .agg({'Value': 'sum'})
    .reset_index()
    .sort_values(['Area', 'Year'])
    .rename(columns={
        'Element': 'Agri_Element',
        'Unit': 'Agri_Unit',
        'Value': 'Agri_Value',
    })
)

In [9]:
# Preview the first ten aggregated crop rows (one row per area/element/year).
crops.head(10)

Unnamed: 0,Area Code,Area,Agri_Element,Year,Agri_Unit,Agri_Value
78,2,Afghanistan,Area harvested,1961,ha,11834963.0
135,2,Afghanistan,Production,1961,tonnes,14449553.0
192,2,Afghanistan,Yield,1961,hg/ha,1787380.0
79,2,Afghanistan,Area harvested,1962,ha,12229684.0
136,2,Afghanistan,Production,1962,tonnes,14552363.0
193,2,Afghanistan,Yield,1962,hg/ha,1828888.0
80,2,Afghanistan,Area harvested,1963,ha,12358092.0
137,2,Afghanistan,Production,1963,tonnes,13760181.0
194,2,Afghanistan,Yield,1963,hg/ha,1830173.0
81,2,Afghanistan,Area harvested,1964,ha,12444122.0


In [10]:
# Check dtypes and non-null counts of the aggregated crops table.
crops.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19918 entries, 78 to 18394
Data columns (total 6 columns):
Area Code       19918 non-null int64
Area            19918 non-null object
Agri_Element    19918 non-null object
Year            19918 non-null int64
Agri_Unit       19918 non-null object
Agri_Value      19918 non-null float64
dtypes: float64(1), int64(2), object(3)
memory usage: 1.1+ MB


## Emissions

In [11]:
# Load the raw emissions table from a CSV in the working directory.
# Kept untouched under the *_raw name so later cells can re-derive from it.
emissions_raw = pd.read_csv(filepath_or_buffer='emissions.csv')

In [12]:
# Collapse the raw emissions rows to one total per (area, element, year,
# unit), order them by area and year, and prefix the measurement columns with
# 'Emis_' so they do not collide with the crops columns when merged.
# NOTE(review): the 'Emissions (CO2eq)' rows appear to be derived from the
# CH4 and N2O rows (Afghanistan 1961: roughly 21*CH4 + 310*N2O), so adding
# Emis_Value across elements would double count -- confirm before totalling.
emissions = (
    emissions_raw
    .groupby(['Area Code', 'Area', 'Element', 'Year', 'Unit'])
    .agg({'Value': 'sum'})
    .reset_index()
    .sort_values(['Area', 'Year'])
    .rename(columns={
        'Element': 'Emis_Element',
        'Unit': 'Emis_Unit',
        'Value': 'Emis_Value',
    })
)

In [13]:
# Preview the first five aggregated emission rows.
emissions.head()

Unnamed: 0,Area Code,Area,Emis_Element,Year,Emis_Unit,Emis_Value
78,2,Afghanistan,Emissions (CH4),1961,gigagrams,285.7793
135,2,Afghanistan,Emissions (CO2eq),1961,gigagrams,9255.6779
192,2,Afghanistan,Emissions (N2O),1961,gigagrams,10.4978
79,2,Afghanistan,Emissions (CH4),1962,gigagrams,290.8671
136,2,Afghanistan,Emissions (CO2eq),1962,gigagrams,9331.0188


In [14]:
# Check dtypes and non-null counts of the aggregated emissions table.
emissions.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 36131 entries, 78 to 25796
Data columns (total 6 columns):
Area Code       36131 non-null int64
Area            36131 non-null object
Emis_Element    36131 non-null object
Year            36131 non-null int64
Emis_Unit       36131 non-null object
Emis_Value      36131 non-null float64
dtypes: float64(1), int64(2), object(3)
memory usage: 1.9+ MB


## Combined

In [15]:
# Build one long table keyed on area and year. Emissions is the left side of
# both left joins, so every emissions row is kept; Population and the Agri_*
# columns are NaN where the other tables have no matching area/year.
# Because emissions and crops each hold several element rows per area/year,
# the second join pairs every emission element with every crop element
# (e.g. 3 x 3 = 9 rows for Afghanistan 1961). Emis_Value, Agri_Value and
# Population are therefore repeated across rows -- de-duplicate before
# summing or averaging them.
# NOTE(review): the joins also match on the 'Area' text, and the crops file
# is decoded differently from the others; if an area name differs between
# files the row will not match even when 'Area Code' does -- confirm.
combined = (
    emissions
    .merge(population, how='left', on=['Area Code', 'Area', 'Year'])
    .merge(crops, how='left', on=['Area Code', 'Area', 'Year'])
    .loc[:, [
        'Area Code', 'Area', 'Year',
        'Emis_Element', 'Emis_Unit', 'Emis_Value',
        'Agri_Element', 'Agri_Unit', 'Agri_Value',
        'Population',
    ]]
)

In [16]:
# Preview the merged table; note how each emission element is paired with
# every crop element for the same area and year.
combined.head(10)

Unnamed: 0,Area Code,Area,Year,Emis_Element,Emis_Unit,Emis_Value,Agri_Element,Agri_Unit,Agri_Value,Population
0,2,Afghanistan,1961,Emissions (CH4),gigagrams,285.7793,Area harvested,ha,11834963.0,9166.764
1,2,Afghanistan,1961,Emissions (CH4),gigagrams,285.7793,Production,tonnes,14449553.0,9166.764
2,2,Afghanistan,1961,Emissions (CH4),gigagrams,285.7793,Yield,hg/ha,1787380.0,9166.764
3,2,Afghanistan,1961,Emissions (CO2eq),gigagrams,9255.6779,Area harvested,ha,11834963.0,9166.764
4,2,Afghanistan,1961,Emissions (CO2eq),gigagrams,9255.6779,Production,tonnes,14449553.0,9166.764
5,2,Afghanistan,1961,Emissions (CO2eq),gigagrams,9255.6779,Yield,hg/ha,1787380.0,9166.764
6,2,Afghanistan,1961,Emissions (N2O),gigagrams,10.4978,Area harvested,ha,11834963.0,9166.764
7,2,Afghanistan,1961,Emissions (N2O),gigagrams,10.4978,Production,tonnes,14449553.0,9166.764
8,2,Afghanistan,1961,Emissions (N2O),gigagrams,10.4978,Yield,hg/ha,1787380.0,9166.764
9,2,Afghanistan,1962,Emissions (CH4),gigagrams,290.8671,Area harvested,ha,12229684.0,9345.868


In [17]:
# Check non-null counts to see how many emission rows found no matching
# crops or population record in the left joins.
combined.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 75836 entries, 0 to 75835
Data columns (total 10 columns):
Area Code       75836 non-null int64
Area            75836 non-null object
Year            75836 non-null int64
Emis_Element    75836 non-null object
Emis_Unit       75836 non-null object
Emis_Value      75836 non-null float64
Agri_Element    59739 non-null object
Agri_Unit       59739 non-null object
Agri_Value      59739 non-null float64
Population      75575 non-null float64
dtypes: float64(3), int64(2), object(5)
memory usage: 6.4+ MB


In [18]:
# Write the merged table to CSV in the working directory.
# index=False leaves out the row index: after the merges it is only a
# 0..N-1 counter, and writing it would add an unnamed first column that
# pandas reads back as 'Unnamed: 0'.
combined.to_csv('combined_project_3_data.csv', index=False)