## Numpy Arrays

- NumPy is a Python module for creating and manipulating multidimensional arrays
- All elements of an array must have the same type
- Arrays can be multidimensional and come with built-in functions (e.g. `mean`, `std`, `max`, `sum`)

### 1. Using NumPy to read files

In [1]:
import numpy as np
# Parse the comma-separated file into a NumPy array.
enrollments = np.genfromtxt('enrollments.csv', delimiter=',')
# Last expression of the cell, so the notebook displays the type of the result.
type(enrollments)


numpy.ndarray

### 2. Create the array -- np.array()

In [2]:
import numpy as np

# First 20 countries with employment data
# NOTE(review): the last name is spelled 'Bosn ia and Herzegovina' (stray
# space). It is left untouched here because the recorded outputs further down
# (printed names, dtype width 23) reflect this exact string.
countries = np.array([
    'Afghanistan', 'Albania', 'Algeria', 'Angola', 'Argentina',
    'Armenia', 'Australia', 'Austria', 'Azerbaijan', 'Bahamas',
    'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium',
    'Belize', 'Benin', 'Bhutan', 'Bolivia',
    'Bosn ia and Herzegovina'
])

# Employment data in 2007 for those 20 countries
# The two arrays are aligned by position: employment[i] belongs to countries[i].
employment = np.array([
    55.70000076,  51.40000153,  50.5       ,  75.69999695,
    58.40000153,  40.09999847,  61.5       ,  57.09999847,
    60.90000153,  66.59999847,  60.40000153,  68.09999847,
    66.90000153,  53.40000153,  48.59999847,  56.79999924,
    71.59999847,  58.40000153,  70.40000153,  41.20000076
])

### 3. Array Shape -- how many rows and columns

In [3]:
# .shape is a tuple with one entry per dimension; a 1-D array has a single entry.
print countries.shape

(20,)


### 4. Array Operation

In [4]:
# Accessing elements
print countries[0]

# Slicing
print countries[0:3]
print countries[:3]
print countries[:]

Afghanistan
Angola
['Afghanistan' 'Albania' 'Algeria']
['Afghanistan' 'Albania' 'Algeria']
['Bhutan' 'Bolivia' 'Bosn ia and Herzegovina']
['Afghanistan' 'Albania' 'Algeria' 'Angola' 'Argentina' 'Armenia'
 'Australia' 'Austria' 'Azerbaijan' 'Bahamas' 'Bahrain' 'Bangladesh'
 'Barbados' 'Belarus' 'Belgium' 'Belize' 'Benin' 'Bhutan' 'Bolivia'
 'Bosn ia and Herzegovina']


In [5]:
# Element types: .dtype reports the single type shared by every element
print countries.dtype  # fixed-width string type; the width is the length of the longest element
print employment.dtype
print np.array([0, 1, 2, 3]).dtype  # integer literals give an integer dtype (int64 in the recorded output)
print np.array([1.0, 1.5, 2.0, 2.5]).dtype
print np.array([True, False, True]).dtype
print np.array(['AL', 'AK', 'AZ', 'AR', 'CA']).dtype  # all elements are 2 characters, hence width 2


|S23
float64
int64
float64
bool
|S2


In [6]:
# Converting data type with .astype()
vector = np.array(['1','2','3'])  # starts out as an array of strings
vector = vector.astype(float)  # astype() produces a converted array; the name is rebound to it
print vector

[ 1.  2.  3.]


In [7]:
# Looping: iterating over a 1-D array yields its elements one at a time, in order
for country in countries:
        print 'Examining country {}'.format(country)

Examining country Afghanistan
Examining country Albania
Examining country Algeria
Examining country Angola
Examining country Argentina
Examining country Armenia
Examining country Australia
Examining country Austria
Examining country Azerbaijan
Examining country Bahamas
Examining country Bahrain
Examining country Bangladesh
Examining country Barbados
Examining country Belarus
Examining country Belgium
Examining country Belize
Examining country Benin
Examining country Bhutan
Examining country Bolivia
Examining country Bosn ia and Herzegovina


In [8]:
# Go over pairs of values from two arrays.
# An index loop keeps the arrays in step: position i in each array describes
# the same country.
for i in range(len(countries)):
        country = countries[i]
        country_employment = employment[i]
        print 'Country {} has employment {}'.format(country,country_employment)

Country Afghanistan has employment 55.70000076
Country Albania has employment 51.40000153
Country Algeria has employment 50.5
Country Angola has employment 75.69999695
Country Argentina has employment 58.40000153
Country Armenia has employment 40.09999847
Country Australia has employment 61.5
Country Austria has employment 57.09999847
Country Azerbaijan has employment 60.90000153
Country Bahamas has employment 66.59999847
Country Bahrain has employment 60.40000153
Country Bangladesh has employment 68.09999847
Country Barbados has employment 66.90000153
Country Belarus has employment 53.40000153
Country Belgium has employment 48.59999847
Country Belize has employment 56.79999924
Country Benin has employment 71.59999847
Country Bhutan has employment 58.40000153
Country Bolivia has employment 70.40000153
Country Bosn ia and Herzegovina has employment 41.20000076


### 5. NumPy Functions

In [9]:
# Each method reduces the whole array to a single number.
print employment.mean()  # arithmetic mean
print employment.std()  # standard deviation (NumPy's default is the population form, ddof=0)
print employment.max()  # largest element
print employment.sum()  # total of all elements

58.6850000385
9.33826911369
75.69999695
1173.70000077


In [10]:
# Find the country with the maximum employment -- the traditional (loop) way
def max_employment(countries,employment):
    """Return the country with the highest employment and that employment value.

    Walks both sequences in parallel with an explicit index loop.
    NOTE: a later cell defines a NumPy-based function under this same name,
    which replaces this one once that cell has run.

    Args:
        countries: sequence of country names.
        employment: sequence of employment values, aligned by position with
            `countries`.

    Returns:
        A tuple `(country, value)` for the first occurrence of the largest
        value, or `(None, 0)` when `countries` is empty.
    """
    max_country = None
    max_value = 0  # deliberately not named like the function itself
    for i in range(len(countries)):
        country_employment = employment[i]

        # The first element always seeds the running maximum, so inputs whose
        # values are all <= 0 still report a country instead of None.
        if i == 0 or country_employment > max_value:
            max_country = countries[i]
            max_value = country_employment
    return (max_country, max_value)
            

In [11]:
# Find the country with the maximum employment -- the NumPy way
def max_employment(countries,employment):
    """Return `(country, value)` for the largest entry of `employment`.

    Both arguments are indexed with the same position, so they must be
    aligned; `employment` must provide `.argmax()`.
    """
    # argmax() gives the position of the largest value rather than the value
    # itself, so that one position can be used to index both arrays.
    top_position = employment.argmax()
    top_country = countries[top_position]
    top_value = employment[top_position]
    return (top_country, top_value)
print countries[3]   # position 3 is where the employment array holds its largest value (75.69999695)

Angola


In [12]:
# selecting elements

In [15]:
# 2D Array: select whole rows by a condition on one column
matrix = np.array([[5,10,15],[20,25,30],[35,40,45]])
# matrix[:,1] takes the column at index 1 from every row; comparing it with 25
# gives one boolean per row.
second_column_25 = (matrix[:,1] == 25)
# Using the booleans as the row index keeps only the rows marked True.
print matrix[second_column_25,:]

[[20 25 30]]


In [16]:
# Replacing Values

In [17]:
matrix = np.array([[5,10,15],[20,25,30],[35,40,45]])
# One boolean per row: True where the column at index 1 equals 25.
second_column_25 = matrix[:,1] == 25
# Assigning through the boolean row index changes the array in place, and only
# the column-1 entry of the matching rows.
matrix[second_column_25,1] = 10
print matrix

[[ 5 10 15]
 [20 10 30]
 [35 40 45]]


### 6. Computation with NumPy

In [18]:
# Arithmetic operations between 2 NumPy arrays
# Every operator works element by element: result[i] = a[i] <op> b[i].
a = np.array([1, 2, 3, 4])
b = np.array([1, 2, 1, 2])
    
print a + b
print a - b
print a * b
print a / b  # both arrays hold integers, and the recorded result is an integer array
print a ** b

[2 4 4 6]
[0 0 2 2]
[1 4 3 8]
[1 1 3 2]
[ 1  4  3 16]


In [19]:
# Arithmetic operations between a NumPy array and a single number
# The single number is applied to every element of the array.
a = np.array([1, 2, 3, 4])
b = 2
    
print a + b
print a - b
print a * b
print a / b  # integer division in this (Python 2) notebook: 1 / 2 gives 0 and 3 / 2 gives 1
print a ** b

[3 4 5 6]
[-1  0  1  2]
[2 4 6 8]
[0 1 1 2]
[ 1  4  9 16]


In [20]:
# Logical operations with NumPy arrays (applied element by element)
a = np.array([True, True, False, False])
b = np.array([True, False, True, False])
    
print a & b   # and
print a | b   # or
print ~a      # not
    
# Combining with a single boolean applies it to every element.
print a & True   # unchanged copy of a
print a & False  # all False
    
print a | True   # all True
print a | False  # unchanged copy of a

[ True False False False]
[ True  True  True False]
[False False  True  True]
[ True  True False False]
[False False False False]
[ True  True  True  True]
[ True  True False False]


In [21]:
# Comparison operations between 2 NumPy Arrays
# Each comparison is made element by element and yields a boolean array of
# the same length as the inputs.
a = np.array([1, 2, 3, 4, 5])
b = np.array([5, 4, 3, 2, 1])
    
print a > b
print a >= b
print a < b
print a <= b
print a == b
print a != b

[False False False  True  True]
[False False  True  True  True]
[ True  True False False False]
[ True  True  True False False]
[False False  True False False]
[ True  True False  True  True]


In [22]:
# Comparison operations between a NumPy array and a single number
# Every element is compared with the same number; the result is a boolean
# array with one entry per element.

a = np.array([1, 2, 3, 4])
b = 2
    
print a > b
print a >= b
print a < b
print a <= b
print a == b
print a != b

[False False  True  True]
[False  True  True  True]
[ True False False False]
[ True  True False False]
[False  True False False]
[ True False  True  True]


In [23]:
# First 20 countries with school completion data
# NOTE: this rebinds `countries`; from here on the name no longer refers to
# the 20 countries of the employment example.
countries = np.array([
       'Algeria', 'Argentina', 'Armenia', 'Aruba', 'Austria','Azerbaijan',
       'Bahamas', 'Barbados', 'Belarus', 'Belgium', 'Belize', 'Bolivia',
       'Botswana', 'Brunei', 'Bulgaria', 'Burkina Faso', 'Burundi',
       'Cambodia', 'Cameroon', 'Cape Verde'
])

# Female school completion rate in 2007 for those 20 countries
# (same order as `countries`)
female_completion = np.array([
    97.35583,  104.62379,  103.02998,   95.14321,  103.69019,
    98.49185,  100.88828,   95.43974,   92.11484,   91.54804,
    95.98029,   98.22902,   96.12179,  119.28105,   97.84627,
    29.07386,   38.41644,   90.70509,   51.7478 ,   95.45072
])

# Male school completion rate in 2007 for those 20 countries
# (same order as `countries`)
male_completion = np.array([
     95.47622,  100.66476,   99.7926 ,   91.48936,  103.22096,
     97.80458,  103.81398,   88.11736,   93.55611,   87.76347,
    102.45714,   98.73953,   92.22388,  115.3892 ,   98.70502,
     37.00692,   45.39401,   91.22084,   62.42028,   90.66958
])


In [24]:
def overall_completion_rate(female_completion,male_completion):
    """Return the element-wise average of the female and male completion rates.

    The two inputs are added and the sum is halved, so for NumPy arrays the
    result has one averaged value per position.
    """
    combined = female_completion + male_completion
    return combined / 2

In [25]:
# Standardizing the data: (value - mean) / standard deviation (sigma)

In [26]:
def standardize_date(values):
    """Standardize `values`: how many standard deviations each element lies from the mean.

    The name (probably meant to be "standardize_data") is kept as-is so
    existing callers keep working.

    Args:
        values: NumPy array of numbers.

    Returns:
        Array of the same shape holding `(values - mean) / std`. When all
        elements are equal the standard deviation is 0 and the division
        yields nan.
    """
    # mean() has to be *called*: subtracting the bound method `values.mean`
    # from an array raises a TypeError.
    standardized_values = (values - values.mean()) / values.std()
    return standardized_values


### 7. NumPy Index Arrays

In [27]:
# Using index arrays
# Indexing with a boolean array of the same length keeps the elements whose
# position is marked True.

a = np.array([1, 2, 3, 4])
b = np.array([True, True, False, False])
print a[b]
print a[np.array([True, False, True, False])]  # the index array can also be built inline

[1 2]
[1 3]


In [28]:
# Creating the index array using vectorized operations

a = np.array([1, 2, 3, 2, 1])
b = (a >= 2)  # one boolean per element of a
print a[b]
print a[a >= 2]  # same selection, with the comparison written directly inside the brackets

[2 3 2]
[2 3 2]


In [29]:
# Creating the index array using vectorized operations on another array
# The condition is evaluated on b, and the resulting booleans pick elements
# of a at the same positions (both arrays have the same length).

a = np.array([1, 2, 3, 4, 5])
b = np.array([1, 2, 3, 2, 1])
print b == 2
print a[b == 2]

[False  True False  True False]
[2 4]


In [30]:
# Time spent in the classroom in the first week for 20 students
time_spent = np.array([
       12.89697233,    0.        ,   64.55043217,    0.        ,
       24.2315615 ,   39.991625  ,    0.        ,    0.        ,
      147.20683783,    0.        ,    0.        ,    0.        ,
       45.18261617,  157.60454283,  133.2434615 ,   52.85000767,
        0.        ,   54.9204785 ,   26.78142417,    0.
])

# Days to cancel for 20 students
# presumably in the same student order as time_spent (the two arrays are
# combined by position below) -- TODO confirm
days_to_cancel = np.array([
      4,   5,  37,   3,  12,   4,  35,  38,   5,  37,   3,   3,  68,
     38,  98,   2, 249,   2, 127,  35
])

In [31]:
def mean_time_for_paid_students(time_spent,days_to_cancel,min_days=7):
    """Return the mean time spent by students who stayed at least `min_days` days.

    Args:
        time_spent: NumPy array with the time spent per student.
        days_to_cancel: NumPy array with the days until cancellation, aligned
            by position with `time_spent`.
        min_days: a student is included when `days_to_cancel >= min_days`.
            Defaults to 7, the threshold that used to be hard-coded here.

    Returns:
        Mean of the selected `time_spent` entries (nan when no student is
        selected).
    """
    # Boolean index array: True for every student that meets the threshold.
    stayed_long_enough = days_to_cancel >= min_days
    return time_spent[stayed_long_enough].mean()

In [32]:
mean_time_for_paid_students(time_spent,days_to_cancel)

41.054003485454537

### 8. NumPy Axis

In [36]:
# axis=0 collapses the rows and gives one result per column;
# axis=1 collapses the columns and gives one result per row.
a = np.array([
        [1, 2, 3],
        [4, 5, 6],
        [7, 8, 9]
    ])
print a.sum()  # all numbers' sum
print a.sum(axis=0)  # sum of columns
print a.sum(axis=1)  # sum of rows

45
[12 15 18]
[ 6 15 24]


In [37]:
# Subway ridership for 5 stations on 10 different days
# Layout: one row per day (10 rows), one column per station (5 columns).
ridership = np.array([
    [   0,    0,    2,    5,    0],
    [1478, 3877, 3674, 2328, 2539],
    [1613, 4088, 3991, 6461, 2691],
    [1560, 3392, 3826, 4787, 2613],
    [1608, 4802, 3932, 4477, 2705],
    [1576, 3933, 3909, 4979, 2685],
    [  95,  229,  255,  496,  201],
    [   2,    0,    1,   27,    0],
    [1438, 3785, 3589, 4174, 2215],
    [1342, 4043, 4009, 4665, 3033]
])

In [41]:
def min_and_max_riders_per_day(ridership):
    """Return the largest and smallest per-column mean of `ridership`.

    `ridership` is a 2-D NumPy array; averaging along axis 0 gives one mean
    per column (one per station when rows are days and columns are stations).

    Note the order of the result: despite the function's name, the tuple is
    `(maximum, minimum)`.
    """
    # Mean ridership per day for each station (one value per column).
    mean_by_station = ridership.mean(axis=0)

    # Highest and lowest of those per-station means.
    return (mean_by_station.max(), mean_by_station.min())

In [42]:
min_and_max_riders_per_day(ridership)

(3239.9000000000001, 1071.2)