# NumPy

In [1]:
import numpy as np # Canonical import

## Types

In [2]:
# 1 type per array
I = np.array([[1,2,3], [4,5,6]])
display(I)
display(type(I[0,0]))

F = np.array([[1,2,3.0], [4,5,6]]) 
display(F)
display(type(F[0,0]))

S = np.array([[1,2,'3'], [4,5,6]]) 
display(S)
display(type(S[0,0]))

array([[1, 2, 3],
       [4, 5, 6]])

numpy.int64

array([[1., 2., 3.],
       [4., 5., 6.]])

numpy.float64

array([['1', '2', '3'],
       ['4', '5', '6']], dtype='<U21')

numpy.str_

In [3]:
data_list = [
    [ 0,  1,  2,  3,  4],
    [10, 11, 12, 13, 14],
    [20, 21, 22, 23, 24],
    [30, 31, 32, 33, 34],
    [40, 41, 42, 43, 44],
]
data_np = np.array(data_list)
data_np

array([[ 0,  1,  2,  3,  4],
       [10, 11, 12, 13, 14],
       [20, 21, 22, 23, 24],
       [30, 31, 32, 33, 34],
       [40, 41, 42, 43, 44]])

## Slicing and Operations

In [4]:
# Rows - 1 and 2, all columns
display(data_np[1:3,:])

# Columns - 2, all rows
data_np[:,2]

array([[10, 11, 12, 13, 14],
       [20, 21, 22, 23, 24]])

array([ 2, 12, 22, 32, 42])

In [38]:
# sum for each column
display(data_np.sum(axis=0))

# sum for each row
display(data_np.sum(axis=1))

# sum for our whole array
data_np.sum()

array([100, 105, 110, 115, 120])

array([ 10,  60, 110, 160, 210])

550

## Indexing

In [39]:
# Can check for an individual value by calling it on the whole array
data_np == 0

array([[ True, False, False, False, False],
       [False, False, False, False, False],
       [False, False, False, False, False],
       [False, False, False, False, False],
       [False, False, False, False, False]])

In [40]:
# Or something more complex like Odd vs even
data_np % 2 == 0

array([[ True, False,  True, False,  True],
       [ True, False,  True, False,  True],
       [ True, False,  True, False,  True],
       [ True, False,  True, False,  True],
       [ True, False,  True, False,  True]])

In [41]:
# We can assign the comparison to a variable and use that to index
even_mask = data_np % 2 == 0
data_np[even_mask]

array([ 0,  2,  4, 10, 12, 14, 20, 22, 24, 30, 32, 34, 40, 42, 44])

In [42]:
# or wrap the comparison directly in [] to apply on the array
data_np[data_np % 2 == 0]

array([ 0,  2,  4, 10, 12, 14, 20, 22, 24, 30, 32, 34, 40, 42, 44])

# Pandas

In [5]:
import pandas as pd

## Series

In [6]:
# A series is just a 1 dimensional - if you use a dictionary keys will become indexs
s = pd.Series({'id1': 1, 'id2': 2, 'id3': 'three'})
s

id1        1
id2        2
id3    three
dtype: object

In [45]:
# Indexes are not a column - this series has 3 rows and 0 columns
s.shape

(3,)

In [9]:
# They just take the place of the purely numerical index we are used to
type(s['id1'])

int

In [10]:
# If you pass a single list, numerical indexs will be assigned
pd.Series([1,2,3])

0    1
1    2
2    3
dtype: int64

## Dataframes

In [48]:
# We can make dataframes from a dictionary whose values are identical size lists:
d = {
    "apples": [1, 2, 3],
    "oranges": [4, 5, 6],
}
display(d)
pd.DataFrame(d)

{'apples': [1, 2, 3], 'oranges': [4, 5, 6]}

Unnamed: 0,apples,oranges
0,1,4
1,2,5
2,3,6


In [49]:
# or a list full of dictionaries with identical keys
l = [{'apples': i, 'oranges':i+3} for i in range(1,4)]
display(l)
pd.DataFrame(l)

[{'apples': 1, 'oranges': 4},
 {'apples': 2, 'oranges': 5},
 {'apples': 3, 'oranges': 6}]

Unnamed: 0,apples,oranges
0,1,4
1,2,5
2,3,6


## DF Manipulation

In [45]:
file = "countries.csv" # Path relative to your notebook
countries_df = pd.read_csv(file, decimal=',')

In [12]:
countries_df.head()

Unnamed: 0,Country,Region,Population,Area (sq. mi.),Pop. Density (per sq. mi.),Coastline (coast/area ratio),Net migration,Infant mortality (per 1000 births),GDP ($ per capita),Literacy (%),Phones (per 1000),Arable (%),Crops (%),Other (%),Climate,Birthrate,Deathrate,Agriculture,Industry,Service
0,Afghanistan,ASIA (EX. NEAR EAST),31056997,647500,48.0,0.0,23.06,163.07,700.0,36.0,3.2,12.13,0.22,87.65,1.0,46.6,20.34,0.38,0.24,0.38
1,Albania,EASTERN EUROPE,3581655,28748,124.6,1.26,-4.93,21.52,4500.0,86.5,71.2,21.09,4.42,74.49,3.0,15.11,5.22,0.232,0.188,0.579
2,Algeria,NORTHERN AFRICA,32930091,2381740,13.8,0.04,-0.39,31.0,6000.0,70.0,78.1,3.22,0.25,96.53,1.0,17.14,4.61,0.101,0.6,0.298
3,American Samoa,OCEANIA,57794,199,290.4,58.29,-20.71,9.27,8000.0,97.0,259.5,10.0,15.0,75.0,2.0,22.46,3.27,,,
4,Andorra,WESTERN EUROPE,71201,468,152.1,0.0,6.6,4.05,19000.0,100.0,497.2,2.22,0.0,97.78,3.0,8.71,6.25,,,


### Selecting columns

In [13]:
# Using just one square bracket returns a SERIES
s = countries_df['Country']

display(type(s))
display(s.shape)
s.head()

pandas.core.series.Series

(227,)

0       Afghanistan 
1           Albania 
2           Algeria 
3    American Samoa 
4           Andorra 
Name: Country, dtype: object

In [14]:
# Using two square brackets keeps it as a 2D dataframe
d = countries_df[['Country']]

display(type(d))
display(d.shape)
d.head()

pandas.core.frame.DataFrame

(227, 1)

Unnamed: 0,Country
0,Afghanistan
1,Albania
2,Algeria
3,American Samoa
4,Andorra


In [15]:
# Methods / attributes will be different based on what type of object you return
print('# of series options: ', len(dir(s)))
print('# of dataframe options: ', len(dir(d)))

# of series options:  433
# of dataframe options:  442


In [17]:
# BONUS - single "." notation for quickly returning series from column
countries_df.Country

0         Afghanistan 
1             Albania 
2             Algeria 
3      American Samoa 
4             Andorra 
            ...       
222         West Bank 
223    Western Sahara 
224             Yemen 
225            Zambia 
226          Zimbabwe 
Name: Country, Length: 227, dtype: object

### loc vs iloc

Now that we are no longer in standard python lists or numpy arrays we can be very specific with naming our row and column indexes

In [97]:
countries_df.head(2)

Unnamed: 0,Country,Region,Population,Area (sq. mi.),Pop. Density (per sq. mi.),Coastline (coast/area ratio),Net migration,Infant mortality (per 1000 births),GDP ($ per capita),Literacy (%),Phones (per 1000),Arable (%),Crops (%),Other (%),Climate,Birthrate,Deathrate,Agriculture,Industry,Service
0,Afghanistan,ASIA (EX. NEAR EAST),31056997,647500,48.0,0.0,23.06,163.07,700.0,36.0,3.2,12.13,0.22,87.65,1.0,46.6,20.34,0.38,0.24,0.38
1,Albania,EASTERN EUROPE,3581655,28748,124.6,1.26,-4.93,21.52,4500.0,86.5,71.2,21.09,4.42,74.49,3.0,15.11,5.22,0.232,0.188,0.579


Right now our dataframe is using numerical indexes for rows (0,1 above) and string indexes for the columns ("Country", "Region", "Population").

In [18]:
# Indexing specific rows / columns with loc LOCATION, stop is included
# row indexs are still numerics at this point
countries_df.loc[0:1, 'Country':'Population']

Unnamed: 0,Country,Region,Population
0,Afghanistan,ASIA (EX. NEAR EAST),31056997
1,Albania,EASTERN EUROPE,3581655


In [19]:
# Indexing with iloc INDEX LOCATION is exclusive of the stop
# returns first row and first 2 columns
countries_df.iloc[0:2, 0:3]

Unnamed: 0,Country,Region,Population
0,Afghanistan,ASIA (EX. NEAR EAST),31056997
1,Albania,EASTERN EUROPE,3581655


So what's really happening?  First let's update the df with row indexes which aren't numerical

In [20]:
# Notice the column name dropped and the values are bold, these are now the indexes for our row
indexed_countries = countries_df.set_index('Country')
indexed_countries.head(3)

Unnamed: 0_level_0,Region,Population,Area (sq. mi.),Pop. Density (per sq. mi.),Coastline (coast/area ratio),Net migration,Infant mortality (per 1000 births),GDP ($ per capita),Literacy (%),Phones (per 1000),Arable (%),Crops (%),Other (%),Climate,Birthrate,Deathrate,Agriculture,Industry,Service
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
Afghanistan,ASIA (EX. NEAR EAST),31056997,647500,48.0,0.0,23.06,163.07,700.0,36.0,3.2,12.13,0.22,87.65,1.0,46.6,20.34,0.38,0.24,0.38
Albania,EASTERN EUROPE,3581655,28748,124.6,1.26,-4.93,21.52,4500.0,86.5,71.2,21.09,4.42,74.49,3.0,15.11,5.22,0.232,0.188,0.579
Algeria,NORTHERN AFRICA,32930091,2381740,13.8,0.04,-0.39,31.0,6000.0,70.0,78.1,3.22,0.25,96.53,1.0,17.14,4.61,0.101,0.6,0.298


In [21]:
# Notice our .loc will no longer work if we pass an integer for the row - our new indexs are the country names
try:
    display(indexed_countries.loc[0:1, 'Country':'Population'])
except:
    print('Key Error')

Key Error


In [24]:
# We need to pass the string index for our row and remove country from our column selection
try:
    display(indexed_countries.loc['Afghanistan ': 'Albania ', 'Region':'Population'])
except:
    print('Key Error')

Unnamed: 0_level_0,Region,Population
Country,Unnamed: 1_level_1,Unnamed: 2_level_1
Afghanistan,ASIA (EX. NEAR EAST),31056997
Albania,EASTERN EUROPE,3581655


In [25]:
# If we still want to use a numeric location we switch over to iloc and stop becomes exclusive
display(indexed_countries.iloc[0:2, 0:2])

Unnamed: 0_level_0,Region,Population
Country,Unnamed: 1_level_1,Unnamed: 2_level_1
Afghanistan,ASIA (EX. NEAR EAST),31056997
Albania,EASTERN EUROPE,3581655


### Map function vs. calling function

In [26]:
# We can call methods () directly on a series
countries_df['Country'].str.upper()

0         AFGHANISTAN 
1             ALBANIA 
2             ALGERIA 
3      AMERICAN SAMOA 
4             ANDORRA 
            ...       
222         WEST BANK 
223    WESTERN SAHARA 
224             YEMEN 
225            ZAMBIA 
226          ZIMBABWE 
Name: Country, Length: 227, dtype: object

In [27]:
# Or map an uncalled function **notice - no ()** to the series
countries_df['Country'].map(str.upper)

0         AFGHANISTAN 
1             ALBANIA 
2             ALGERIA 
3      AMERICAN SAMOA 
4             ANDORRA 
            ...       
222         WEST BANK 
223    WESTERN SAHARA 
224             YEMEN 
225            ZAMBIA 
226          ZIMBABWE 
Name: Country, Length: 227, dtype: object

In [76]:
import math
countries_df['Population'].map(math.sqrt)

0      5572.880494
1      1892.526090
2      5738.474623
3       240.403827
4       266.835155
          ...     
222    1568.595550
223     522.501675
224    4632.082469
225    3391.461337
226    3498.114492
Name: Population, Length: 227, dtype: float64

### Group By

In [28]:
# Just using groupby returns an object
grouped_object = countries_df.groupby(by='Region')
grouped_object

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x122a47550>

In [175]:
print('Things to do with a dataframe: ',len(dir(countries_df)))
print('Things to do with grouped object: ',len(dir(grouped_object)))

Things to do with a dataframe:  450
Things to do with grouped object:  173


In [182]:
# We need to use an aggregation method to create a final df
print(type(grouped_object.sum()))
grouped_object.sum()

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0_level_0,Population,Area (sq. mi.),Pop. Density (per sq. mi.),Coastline (coast/area ratio),Net migration,Infant mortality (per 1000 births),GDP ($ per capita),Literacy (%),Phones (per 1000),Arable (%),Crops (%),Other (%),Climate,Birthrate,Deathrate,Agriculture,Industry,Service
Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
ASIA (EX. NEAR EAST),3687982236,23096712,35415.1,501.95,38.43,1169.84,225500.0,2227.5,4621.3,444.49,107.77,2247.74,53.0,592.42,213.84,4.974,8.46,14.563
BALTICS,7184974,175015,119.5,9.35,-6.1,24.31,33900.0,299.2,878.6,90.93,1.83,207.24,6.0,28.03,37.89,0.135,0.88,1.985
C.W. OF IND. STATES,280081548,22100843,680.5,1.12,-24.4,532.92,48000.0,1184.7,1966.2,233.45,24.27,942.28,25.5,214.27,124.1,2.304,3.936,5.762
EASTERN EUROPE,119914717,1152222,1210.8,12.42,-7.77,152.24,117700.0,873.8,3369.0,371.79,29.17,799.05,28.0,114.08,113.13,1.106,3.711,7.184
LATIN AMER. & CARIB,561824599,20544084,6129.1,718.75,-67.25,904.17,390700.0,3988.8,11953.1,485.59,221.11,3793.31,91.5,858.65,286.92,3.914,11.013,27.981
NEAR EAST,195068377,4355586,6833.3,57.64,42.4,374.04,167300.0,1113.3,3136.7,168.86,81.69,1349.33,25.0,400.51,76.95,1.021,6.496,8.48
NORTHERN AFRICA,161407133,6018890,233.6,1.91,-2.16,154.58,27300.0,336.2,501.0,44.61,16.83,538.56,6.0,104.07,24.03,0.675,2.131,2.592
NORTHERN AMERICA,331672307,21782471,1304.3,248.2,-1.37,43.14,130500.0,391.0,3433.7,57.13,0.24,442.63,6.0,65.77,38.47,0.042,0.598,2.361
OCEANIA,33131662,8519812,2754.8,2265.6,-39.15,383.87,173200.0,1510.2,3987.6,161.01,309.11,1629.88,42.0,442.16,110.4,2.802,3.444,9.743
SUB-SAHARAN AFRICA,749437000,24341406,4705.5,260.41,-17.09,4082.0,118500.0,3125.5,2167.9,589.29,189.44,4221.29,90.5,1838.24,773.16,13.894,13.077,22.038


In [30]:
# If you know exactly which columns you want, its quicker to select them (by indexing) before aggregating 
grouped_object[['Population']].sum()

Unnamed: 0_level_0,Population
Region,Unnamed: 1_level_1
ASIA (EX. NEAR EAST),3687982236
BALTICS,7184974
C.W. OF IND. STATES,280081548
EASTERN EUROPE,119914717
LATIN AMER. & CARIB,561824599
NEAR EAST,195068377
NORTHERN AFRICA,161407133
NORTHERN AMERICA,331672307
OCEANIA,33131662
SUB-SAHARAN AFRICA,749437000


In [54]:
# Using the agg funciton we can pass a dictionary with the different methods we want to use on different columns
display(countries_df.groupby(by='Region').agg({'Population':'sum', 'Birthrate':'mean'}))


# The long approach would be to run group by and aggs seperately and join them back together
s = countries_df.groupby(by='Region')[['Population']].sum()
m = countries_df.groupby(by='Region')[['Birthrate']].mean()
display(s.join(m))


Unnamed: 0_level_0,Population,Birthrate
Region,Unnamed: 1_level_1,Unnamed: 2_level_1
ASIA (EX. NEAR EAST),3687982236,21.157857
BALTICS,7184974,9.343333
C.W. OF IND. STATES,280081548,17.855833
EASTERN EUROPE,119914717,10.370909
LATIN AMER. & CARIB,561824599,19.081111
NEAR EAST,195068377,25.031875
NORTHERN AFRICA,161407133,20.814
NORTHERN AMERICA,331672307,13.154
OCEANIA,33131662,22.108
SUB-SAHARAN AFRICA,749437000,36.043922


Unnamed: 0_level_0,Population,Birthrate
Region,Unnamed: 1_level_1,Unnamed: 2_level_1
ASIA (EX. NEAR EAST),3687982236,21.157857
BALTICS,7184974,9.343333
C.W. OF IND. STATES,280081548,17.855833
EASTERN EUROPE,119914717,10.370909
LATIN AMER. & CARIB,561824599,19.081111
NEAR EAST,195068377,25.031875
NORTHERN AFRICA,161407133,20.814
NORTHERN AMERICA,331672307,13.154
OCEANIA,33131662,22.108
SUB-SAHARAN AFRICA,749437000,36.043922


In [36]:
countries_df.head()

Unnamed: 0,Country,Region,Population,Area (sq. mi.),Pop. Density (per sq. mi.),Coastline (coast/area ratio),Net migration,Infant mortality (per 1000 births),GDP ($ per capita),Literacy (%),Phones (per 1000),Arable (%),Crops (%),Other (%),Climate,Birthrate,Deathrate,Agriculture,Industry,Service
0,Afghanistan,ASIA (EX. NEAR EAST),31056997,647500,48.0,0.0,23.06,163.07,700.0,36.0,3.2,12.13,0.22,87.65,1.0,46.6,20.34,0.38,0.24,0.38
1,Albania,EASTERN EUROPE,3581655,28748,124.6,1.26,-4.93,21.52,4500.0,86.5,71.2,21.09,4.42,74.49,3.0,15.11,5.22,0.232,0.188,0.579
2,Algeria,NORTHERN AFRICA,32930091,2381740,13.8,0.04,-0.39,31.0,6000.0,70.0,78.1,3.22,0.25,96.53,1.0,17.14,4.61,0.101,0.6,0.298
3,American Samoa,OCEANIA,57794,199,290.4,58.29,-20.71,9.27,8000.0,97.0,259.5,10.0,15.0,75.0,2.0,22.46,3.27,,,
4,Andorra,WESTERN EUROPE,71201,468,152.1,0.0,6.6,4.05,19000.0,100.0,497.2,2.22,0.0,97.78,3.0,8.71,6.25,,,


In [51]:
# This 
countries_df = countries_df.sort_values(by='Country', ascending=False)
# Does the same as this
countries_df.sort_values(by='Country', ascending=False, inplace=True)
# But never do this
# countries_df = countries_df.sort_values(by='Country', ascending=False, inplace=True)