# NumPy

Allows us to perform mathematical operations on blocks of data

In [None]:
# to insert a library (a file consisting of Python code) use 'import'
# library must already be installed. numpy and pandas come pre-installed with Anaconda
import numpy as np

### Ranges & Basic Maths

In [None]:
# Integers from 0 up to (but not including) 10
np_range = np.arange(0, 10)
np_range

In [None]:
# Start at 0, stop before 20, advance in steps of 4
np_range = np.arange(start=0, stop=20, step=4)
np_range

In [None]:
# Multiplying by a scalar is applied to every element of the array
3 * np_range

In [None]:
# Array-by-array multiplication is element-wise (not a matrix product)
np.multiply(np_range, np_range)

### N-Dimensional array object.

In [None]:
# 3 x 3 array holding 1..9, filled row by row
arr2d = np.arange(1, 10).reshape(3, 3)
arr2d

In [None]:
# 2 x 3 array of floats; note that two of the entries are negative
data = np.array(
    [
        [1.9526, -0.246, -0.8856],
        [0.5639, 0.2379, 0.9104],
    ]
)
data

In [None]:
# Number of dimensions (axes) of the array
data.ndim

In [None]:
# Size of the array along each dimension, as a tuple
data.shape

In [None]:
# Data type shared by every element of the array
data.dtype

In [None]:
# Element-wise square root.
# NOTE(review): `data` as defined earlier contains negative entries; for those
# np.sqrt returns nan and emits a RuntimeWarning. Confirm this is the intended demo.
np.sqrt(data)

In [None]:
# Mean over all elements of the array (no axis given, so a single number)
data.mean()

### Boolean Indexing

In [None]:
# Use a seeded generator so the random values (and the boolean-indexing results
# derived from them) are the same on every Restart & Run All.
rng = np.random.default_rng(42)
# 5 rows x 3 columns of floats drawn uniformly from [0, 1)
rand_data = rng.random((5, 3))
rand_data

In [None]:
# Element-wise comparison: a boolean array with the same shape as rand_data
np.greater_equal(rand_data, 0.3)

In [None]:
# Indexing with a boolean mask returns a 1-D array of the elements where the mask is True
rand_data[rand_data >= 0.3]

In [None]:
# Positions (row indices, column indices) of the elements that satisfy the condition
np.nonzero(rand_data >= 0.3)

### <span style="color: red">Your Turn:</span> 

1) Create a 3 x 2 numpy array (called array1) with whatever numeric values you want. They might even be random<br>
2) Create a new numpy array (called array2) that doubles those values <br>
3) Find the mean values of your new array

# Pandas

In [None]:
import pandas as pd

### Dataframe
- like tables in Excel
- represents a tabular, spreadsheet-like data structure 
- each column can be a different value type (numeric, string, boolean, etc.)
- has both a row and column index

### Grabbing our CO2 data from the World Bank

Original source: https://data.worldbank.org/indicator/EN.ATM.CO2E.PC/ <br>
Re-uploaded to Box in Technical Community here: https://ibm.ent.box.com/folder/126399220371

In [None]:
# Load the CO2 CSV; skiprows=4 skips the first four lines of the file before parsing.
# NOTE: this rebinds `data`, which held a NumPy array in the NumPy section.
data = pd.read_csv(
    'https://ibm.box.com/shared/static/xv7vmn58q7y4gijzb742d43unsyfkx42.csv',
    skiprows=4,
)

In [None]:
# Display the loaded DataFrame (a bare name on the last line of a cell is rendered as output)
data

In [None]:
# Check what kind of object read_csv gave us
type(data)

Display the first 5 rows

In [None]:
# head(n) returns the first n rows; 5 is also the default when n is omitted
data.head(5)

<span style="color: red; font-size: large">Pro Tip</span>
Don't remember what a function does? Use `?`

In [None]:
# IPython/Jupyter-only syntax: a leading `?` shows the object's documentation
# (this line is not valid in a plain Python script)
?data.head

In [None]:
# tail(n) is the counterpart of head(n): it returns the last n rows
data.tail(n=3)

### Understanding the data

In [None]:
# .shape works on DataFrames as well: (number of rows, number of columns)
data.shape

In [None]:
# describe() gives summary statistics (count, mean, std, min, quartiles, max)
# for the numeric columns
data.describe()

In [None]:
# Index object holding the column labels
data.columns

In [None]:
# Label of the first column (positions start at 0)
data.columns[0]

In [None]:
# Print out the 2nd column



### Select Columns in a DataFrame

In [None]:
# Select a single column by its label; the result is a Series
data['Country Name']

In [None]:
# Use an Index of a column instead of a name

In [None]:
# Look up the first column's label by position, then select that column
data[data.columns[0]]

In [None]:
# Indexing .columns with a list of positions yields several labels;
# selecting with them returns a DataFrame containing just those columns
data[data.columns[[0, 1]]]

### Select Rows in a DataFrame

In [None]:
data[0:3] # first to third rows: positions 0, 1 and 2 (the slice end is excluded)

In [None]:
data[:3] # first to third rows again: an omitted slice start defaults to 0

### Combine Rows and Columns

In [None]:
# Pick the two columns by label, then keep only the first two rows
data[['Country Name', 'Country Code']].iloc[:2]

### Conditional Selects

In [None]:
# The comparison builds a boolean mask; indexing with it keeps only the rows
# whose 'Country Name' equals 'China'
data[data['Country Name'] == 'China']

In [None]:
# isin() marks rows whose 'Country Name' matches any value in the list
data[data['Country Name'].isin(['Afghanistan', 'China', 'United Kingdom', 'United States'])]

### <span style="color: red">Your Turn:</span> 

<h3>Find the CO2 Emission per capita for France and Germany in 2014, 2015, 2016</h3>

<img src="https://ibm.box.com/shared/static/eejuk241dgo4dl7w24r9ej3e8eti8ui2.png">

It's a 2-step process. Filtering by country, then returning particular columns

#### Tips
- Use `data[data['Column Name'].isin(['ValueToCheck#1', 'ValueToCheck#2', 'ValueToCheck#3'])]` to filter <br>
- Once you have a filtered dataframe, you can return a selection of columns using a list of column names<br>
`[['ColumnYouWant#1', 'ColumnYouWant#2', etc]]`

## Cleaning Data

1. Some rows are aggregates of countries rather than actual countries (e.g., "World").
1. Some columns are irrelevant and can be removed (e.g., "Indicator Name").
1. Some years have no data for any country (e.g., 2017 to 2020).

### Some rows are aggregates of countries

In [None]:
# Load the per-country metadata table that is merged with the CO2 data below
metadata = pd.read_csv(
    'https://ibm.box.com/shared/static/sr8g1f4e63n8sln77whrglo4cicbppm7.csv'
)

In [None]:
# Display the metadata table
metadata

How do we identify when a listed "Country Name" is a country or an aggregated region?

Notice when the row is an aggregate like "Arab World", the Region and IncomeGroup are consistently NaN (Not a Number). We can use this rule to remove all non-country regions.

In [None]:
# Join the CO2 table with the metadata on their shared "Country Code" column
# (default join type: only codes present in both tables are kept)
merged = data.merge(metadata, on="Country Code")

In [None]:
# Preview the first 10 rows of the merged table
merged.head(10)

We see that the region values are NaN when the row isn't an actual country

In [None]:
# Boolean Series: True where 'Region' holds a value, False where it is missing
merged['Region'].notna()

In [None]:
# Use the mask to show only the rows that have a 'Region' value
merged[merged['Region'].notna()]

In [None]:
# Keep only the rows with a 'Region' value, then preview the result
merged = merged.loc[merged['Region'].notna()]
merged.head(n=10)

### Get rid of irrelevant columns

In [None]:
# List the column labels to decide which ones can be dropped
merged.columns

In [None]:
# Remove the columns that carry no information for this analysis.
# errors='ignore' makes the cell safe to re-run: without it, a second execution
# raises KeyError because the columns are already gone from `merged`.
merged = merged.drop(
    columns=[
        'Indicator Name',
        'Indicator Code',
        'SpecialNotes',
        'TableName',
        'Unnamed: 5',
        'Unnamed: 65',
    ],
    errors='ignore',
)

### Some Years have no Data for any Country

In [None]:
# .count() returns the number of non-NA cells; with the default axis it is
# computed per column, so a count of 0 means that column has no data at all
merged.count()

In [None]:
# Drop the year columns that are removed as empty in this step.
# errors='ignore' keeps the cell re-runnable (no KeyError once they are gone).
merged = merged.drop(columns=['2017', '2018', '2019', '2020'], errors='ignore')

### Some countries have no data for any year

In [None]:
# .mean(axis=1) takes the mean across the columns of each row.
# numeric_only=True restricts it to the numeric columns: `merged` still carries
# text columns (e.g. 'Country Name', 'Region'), and pandas >= 2.0 raises a
# TypeError instead of silently skipping them.
merged.mean(axis=1, numeric_only=True)

We see a couple of rows have NaN means suggesting they have no numeric data

In [None]:
# Keep only the rows whose row-wise mean is not NaN, i.e. rows with at least one
# numeric value. numeric_only=True skips the text columns, which would otherwise
# make .mean() raise a TypeError on pandas >= 2.0.
merged = merged[pd.notnull(merged.mean(axis=1, numeric_only=True))]

In [None]:
# Display the cleaned table
merged

## Export as CSV

In [None]:
# Write the cleaned table to a CSV file; index=False leaves the row index out of the file.
# See Recent Data for exported csv
merged.to_csv(path_or_buf="102 - CO2emissions - cleaned.csv", index=False)

# Basic Visualisation

In [None]:
# Average every numeric column within each region.
# numeric_only=True is needed because `merged` still has text columns
# (e.g. 'Country Name'); pandas >= 2.0 raises a TypeError when asked to average them.
merged.groupby('Region').mean(numeric_only=True)

In [None]:
# Per-region mean of every numeric column, indexed by region.
# numeric_only=True skips the text columns, which pandas >= 2.0 refuses to average.
merged_region = merged.groupby('Region').mean(numeric_only=True)

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Every column of merged_region except the first one.
# The result is only displayed here; merged_region itself is left unchanged.
merged_region[merged_region.columns[1:]]

In [None]:
# First attempt: plot the region table as it is (one row per region)
plt.plot(merged_region)
# Render the figure
plt.show()

In [None]:
# Swap rows and columns so that each region becomes a column
merged_region.T

In [None]:
# Plot the transposed table, so each region (now a column) is drawn as its own line
plt.plot(merged_region.transpose())
# Render the figure
plt.show()

In [None]:
# Row labels of merged_region: the 'Region' values used as groupby keys
merged_region.index

In [None]:
# Wide figure so the x-axis labels have room
plt.figure(figsize=(15, 5))

# One line per region, followed by a legend built from the region names
plt.plot(merged_region.T)
plt.legend(merged_region.index)

# Axis labels
plt.xlabel('Years')
plt.ylabel('CO2 emissions (metric tons per capita)')

plt.show()