# NumPy

In [1]:
import numpy as np

In [12]:
# First 20 countries with employment data
countries = np.array([
    'Afghanistan', 'Albania', 'Algeria', 'Angola', 'Argentina',
    'Armenia', 'Australia', 'Austria', 'Azerbaijan', 'Bahamas',
    'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium',
    'Belize', 'Benin', 'Bhutan', 'Bolivia',
    'Bosnia and Herzegovina'
])

# Employment data in 2007 for those 20 countries
employment = np.array([
    55.70000076,  51.40000153,  50.5       ,  75.69999695,
    58.40000153,  40.09999847,  61.5       ,  57.09999847,
    60.90000153,  66.59999847,  60.40000153,  68.09999847,
    66.90000153,  53.40000153,  48.59999847,  56.79999924,
    71.59999847,  58.40000153,  70.40000153,  41.20000076
])

In [13]:
# Change False to True for each block of code to see what it does

# Accessing elements
if True:
    print countries[0]
    print countries[3]

# Slicing
if True:
    print countries[0:3]
    print countries[:3]
    print countries[17:]
    print countries[:]

# Element types
if True:
    print countries.dtype
    print employment.dtype
    print np.array([0, 1, 2, 3]).dtype
    print np.array([1.0, 1.5, 2.0, 2.5]).dtype
    print np.array([True, False, True]).dtype
    print np.array(['AL', 'AK', 'AZ', 'AR', 'CA']).dtype

# Looping
if True:
    for country in countries:
        print 'Examining country {}'.format(country)

    for i in range(len(countries)):
        country = countries[i]
        country_employment = employment[i]
        print 'Country {} has employment {}'.format(country,country_employment)

# Numpy functions
if True:
    print employment.mean()
    print employment.std()
    print employment.max()
    print employment.sum()


Afghanistan
Angola
['Afghanistan' 'Albania' 'Algeria']
['Afghanistan' 'Albania' 'Algeria']
['Bhutan' 'Bolivia' 'Bosnia and Herzegovina']
['Afghanistan' 'Albania' 'Algeria' 'Angola' 'Argentina' 'Armenia'
 'Australia' 'Austria' 'Azerbaijan' 'Bahamas' 'Bahrain' 'Bangladesh'
 'Barbados' 'Belarus' 'Belgium' 'Belize' 'Benin' 'Bhutan' 'Bolivia'
 'Bosnia and Herzegovina']
|S22
float64
int32
float64
bool
|S2
Examining country Afghanistan
Examining country Albania
Examining country Algeria
Examining country Angola
Examining country Argentina
Examining country Armenia
Examining country Australia
Examining country Austria
Examining country Azerbaijan
Examining country Bahamas
Examining country Bahrain
Examining country Bangladesh
Examining country Barbados
Examining country Belarus
Examining country Belgium
Examining country Belize
Examining country Benin
Examining country Bhutan
Examining country Bolivia
Examining country Bosnia and Herzegovina
Country Afghanistan has employment 55.70000076
Count

In [18]:
def max_employment(countries, employment):
    max_value = 0
    max_country = None
    index = 0
    for i in range(len(employment)):
        if employment[i] > max_value:
            max_value = employment[i]
            max_country = countries[i]
        else:
            continue
    return (max_country, max_value)

In [19]:
max_employment(countries, employment)

('Angola', 75.699996949999999)

## There is an easier way to do this !!

In [20]:
def max_employment2(countries,employment):
    i = employment.argmax()
    return(countries[i],employment[i])

In [21]:
max_employment2(countries, employment)

('Angola', 75.699996949999999)

## Standardize the data in Numpy

In [22]:
countries = np.array([
    'Afghanistan', 'Albania', 'Algeria', 'Angola', 'Argentina',
    'Armenia', 'Australia', 'Austria', 'Azerbaijan', 'Bahamas',
    'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium',
    'Belize', 'Benin', 'Bhutan', 'Bolivia',
    'Bosnia and Herzegovina'
])

# Employment data in 2007 for those 20 countries
employment = np.array([
    55.70000076,  51.40000153,  50.5       ,  75.69999695,
    58.40000153,  40.09999847,  61.5       ,  57.09999847,
    60.90000153,  66.59999847,  60.40000153,  68.09999847,
    66.90000153,  53.40000153,  48.59999847,  56.79999924,
    71.59999847,  58.40000153,  70.40000153,  41.20000076
])

# Change this country name to change what country will be printed when you
# "Run". Your function will be called to determine the standardized
# score for this country for each of the given 5 Gapminder variables in 2007.
# The possible country names are available in the Downloadables section.

country_name = 'United States'

def standardize_data(values):
    values_mean = values.mean()
    values_std = values.std()
    std_data = (values - values_mean)/values_std
    return std_data
std_employment = standardize_data(employment)

In [23]:
std_employment

array([-0.31965231, -0.780123  , -0.87650077,  1.82207181, -0.03051941,
       -1.99019768,  0.30144772, -0.16973184,  0.23719615,  0.84758731,
        0.18365304,  1.00821665,  0.87971351, -0.56595055, -1.07996476,
       -0.20185762,  1.38301845, -0.03051941,  1.2545153 , -1.87240259])

In [7]:
# Time spent in the classroom in the first week for 20 students
time_spent = np.array([
       12.89697233,    0.        ,   64.55043217,    0.        ,
       24.2315615 ,   39.991625  ,    0.        ,    0.        ,
      147.20683783,    0.        ,    0.        ,    0.        ,
       45.18261617,  157.60454283,  133.2434615 ,   52.85000767,
        0.        ,   54.9204785 ,   26.78142417,    0.
])

# Days to cancel for 20 students
days_to_cancel = np.array([
      4,   5,  37,   3,  12,   4,  35,  38,   5,  37,   3,   3,  68,
     38,  98,   2, 249,   2, 127,  35
])




In [15]:
def mean_time_for_paid_students(time_spent, days_to_cancel):
    '''
    Fill in this function to calculate the mean time spent in the classroom
    for students who stayed enrolled at least (greater than or equal to) 7 days.
    '''
    days_cancel_bool = 7 <= days_to_cancel ## this will create a boolean index array that can be used below
    mean_time = time_spent[days_cancel_bool].mean() ## subset the time_spent array by using boolen index
    return mean_time

print 'Mean time spent for students that did not cancel in 7 days is : ', mean_time_for_paid_students(time_spent, days_to_cancel)

Mean time spent for students that did not cancel in 7 days is :  60.0


## A simpler way !

In [19]:
mean_time = time_spent[days_to_cancel >= 7].mean()
print mean_time

60.0


## Be careful when working with slicing inside NumPy !!!!

### slicing will only create a 'view' of the data and not a new array like in Python arrays, so any modification will be done of the original array. See example below

In [22]:
### NumPy array slice

a = np.array([1,2,3,4,5])
slice = a[:3]
slice[0] = 100
print a

[100   2   3   4   5]


In [23]:
## Base Pyhton slice

a = [1,2,3,4,5]
slice = a[:3]
slice[0] = 100
print a

[1, 2, 3, 4, 5]


# Pandas

In [24]:
import pandas as pd

countries = ['Albania', 'Algeria', 'Andorra', 'Angola', 'Antigua and Barbuda',
             'Argentina', 'Armenia', 'Australia', 'Austria', 'Azerbaijan',
             'Bahamas', 'Bahrain', 'Bangladesh', 'Barbados', 'Belarus',
             'Belgium', 'Belize', 'Benin', 'Bhutan', 'Bolivia']

life_expectancy_values = [74.7,  75. ,  83.4,  57.6,  74.6,  75.4,  72.3,  81.5,  80.2,
                          70.3,  72.1,  76.4,  68.1,  75.2,  69.8,  79.4,  70.8,  62.7,
                          67.3,  70.6]

gdp_values = [ 1681.61390973,   2155.48523109,  21495.80508273,    562.98768478,
              13495.1274663 ,   9388.68852258,   1424.19056199,  24765.54890176,
              27036.48733192,   1945.63754911,  21721.61840978,  13373.21993972,
                483.97086804,   9783.98417323,   2253.46411147,  25034.66692293,
               3680.91642923,    366.04496652,   1175.92638695,   1132.21387981]

# Life expectancy and gdp data in 2007 for 20 countries
life_expectancy = pd.Series(life_expectancy_values)
gdp = pd.Series(gdp_values)


In [28]:
life_expectancy[:3]
gdp[:3]

0     1681.613910
1     2155.485231
2    21495.805083
dtype: float64

In [44]:
print 'Mean Life expectancy:', life_expectancy.mean(),'\n'
print 'Mean GDP:', gdp.mean()

Mean Life expectancy: 72.87 

Mean GDP: 9147.87991648


In [43]:
## check if both life expectency and gpd move in the same direction ie., both ave mean or both below mean

def variable_correlation(var1,var2):
    both_above = (var1 > var1.mean()) & (var2 > var2.mean())
    both_below = (var1 < var1.mean()) & (var2 < var2.mean())
    is_same_direction = both_above | both_below # using Logical OR "|" function
    
    num_same_direction = is_same_direction.sum()
    num_diff_direction = len(var1) - num_same_direction
    return(num_same_direction,num_diff_direction)

variable_correlation(life_expectancy,gdp)

(17, 3)

In [45]:
life_expectancy.describe()

count    20.000000
mean     72.870000
std       6.213999
min      57.600000
25%      70.175000
50%      73.450000
75%      75.650000
max      83.400000
dtype: float64

In [61]:
countries = [
    'Afghanistan', 'Albania', 'Algeria', 'Angola', 'Argentina',
    'Armenia', 'Australia', 'Austria', 'Azerbaijan', 'Bahamas',
    'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium',
    'Belize', 'Benin', 'Bhutan', 'Bolivia',
    'Bosnia and Herzegovina'
]


employment_values = [
    55.70000076,  51.40000153,  50.5       ,  75.69999695,
    58.40000153,  40.09999847,  61.5       ,  57.09999847,
    60.90000153,  66.59999847,  60.40000153,  68.09999847,
    66.90000153,  53.40000153,  48.59999847,  56.79999924,
    71.59999847,  58.40000153,  70.40000153,  41.20000076
]

# Employment data in 2007 for 20 countries
employment = pd.Series(employment_values, index=countries)
print employment[:3] # print the first 3 of the series

Afghanistan    55.700001
Albania        51.400002
Algeria        50.500000
dtype: float64


In [58]:
def max_employment(employment):
    '''
    Function to return the name of the country with the highest employment in the given employment
    data, and the employment in that country. The input will be a Pandas series where the values
    are employment and the index is country names.
    
    Try using the Pandas argmax() function. Documention is
    here: http://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.argmax.html
    '''
    max_country = employment.argmax() # since country is the index here we get the country with max employment rate
    max_value = employment.max() # this will just output max value
    
    return (max_country, max_value)

In [59]:
max_employment(employment)

('Angola', 75.699996949999999)

## Pandas series operations

In [62]:
# Addition when indexes are the same
s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
s2 = pd.Series([10, 20, 30, 40], index=['a', 'b', 'c', 'd'])
print s1 + s2

a    11
b    22
c    33
d    44
dtype: int64


In [63]:
# Indexes have same elements in a different order
s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
s2 = pd.Series([10, 20, 30, 40], index=['b', 'd', 'a', 'c'])
print s1 + s2

a    31
b    12
c    43
d    24
dtype: int64


In [65]:
# Indexes overlap, but do not have exactly the same elements

s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
s2 = pd.Series([10, 20, 30, 40], index=['c', 'd', 'e', 'f'])
print s1 + s2

a     NaN
b     NaN
c    13.0
d    24.0
e     NaN
f     NaN
dtype: float64


In [66]:
# Indexes do not overlap
s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
s2 = pd.Series([10, 20, 30, 40], index=['e', 'f', 'g', 'h'])
print s1 + s2

a   NaN
b   NaN
c   NaN
d   NaN
e   NaN
f   NaN
g   NaN
h   NaN
dtype: float64


### Filling in missing values with 0 instead of NaN's when adding series with different indices

In [67]:

s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
s2 = pd.Series([10, 20, 30, 40], index=['c', 'd', 'e', 'f'])

sum_series = (s1 + s2)
sum_series.fillna(value=0) # replace NaN with 0

a     0.0
b     0.0
c    13.0
d    24.0
e     0.0
f     0.0
dtype: float64

### But this is not what we want, we are loosing data in series here

#### Use 'add' and fill_value

In [74]:
sum_series = s1.add(s2,fill_value=0) # this will "add" s1 to s2 and while adding if the index is missing fills with 0

print sum_series

a     1.0
b     2.0
c    13.0
d    24.0
e    30.0
f    40.0
dtype: float64


# Using apply() in Pandas

In [1]:
import pandas as pd

In [18]:
names = pd.Series([
    'Andre Agassi',
    'Barry Bonds',
    'Christopher Columbus',
    'Daniel Defoe',
    'Emilio Estevez',
    'Fred Flintstone',
    'Greta Garbo',
    'Humbert Humbert',
    'Ivan Ilych',
    'James Joyce',
    'Keira Knightley',
    'Lois Lane',
    'Mike Myers',
    'Nick Nolte',
    'Ozzy Osbourne',
    'Pablo Picasso',
    'Quirinus Quirrell',
    'Rachael Ray',
    'Susan Sarandon',
    'Tina Turner',
    'Ugueth Urbina',
    'Vince Vaughn',
    'Woodrow Wilson',
    'Yoji Yamada',
    'Zinedine Zidane'
])

In [36]:
def reverse_name(names):
    
    strip_name = names.split(" ") # split() acts like a strip function which strips each part of a string
    first_name = strip_name[0]
    last_name = strip_name[1]
    reverse_name = last_name + ',' + first_name # reverse the order to lastname,firstname
    
    return reverse_name

In [38]:
print names.apply(reverse_name)

0             Agassi,Andre
1              Bonds,Barry
2     Columbus,Christopher
3             Defoe,Daniel
4           Estevez,Emilio
5          Flintstone,Fred
6              Garbo,Greta
7          Humbert,Humbert
8               Ilych,Ivan
9              Joyce,James
10         Knightley,Keira
11               Lane,Lois
12              Myers,Mike
13              Nolte,Nick
14           Osbourne,Ozzy
15           Picasso,Pablo
16       Quirrell,Quirinus
17             Ray,Rachael
18          Sarandon,Susan
19             Turner,Tina
20           Urbina,Ugueth
21            Vaughn,Vince
22          Wilson,Woodrow
23             Yamada,Yoji
24         Zidane,Zinedine
dtype: object


pandas.core.series.Series