# Lecture 15 – Applying

## Data 6, Summer 2022

In [1]:
from datascience import *
import numpy as np

## Motivation

In [2]:
pups = Table.read_table('data/pups.csv')

In [3]:
pups

name,age,size
Junior Smith,11,medium
Rex Rogers,7,big
Flash Heat,3,big
Reese Bo,4,medium
Polo Cash,2,small


In [4]:
pups.with_columns(
    'human years', pups.column('age') * 7
)

name,age,size,human years
Junior Smith,11,medium,77
Rex Rogers,7,big,49
Flash Heat,3,big,21
Reese Bo,4,medium,28
Polo Cash,2,small,14


## Apply

In [5]:
def seven_times(x):
    """Return seven times `x`."""
    factor = 7
    product = factor * x
    return product

In [6]:
pups.apply(seven_times, 'age')

array([77, 49, 21, 28, 14])

Note: in practice we wouldn't use `apply` for the example above, since we could just write `pups.column('age') * 7`.

Here's a more useful example:

In [7]:
def email_from_name(name):
    """Build a lowercase dogschool.edu email address from a 'First Last' name.

    `name` must contain exactly one space separating the first and last
    names; any other shape makes the two-way unpacking raise ValueError.
    """
    first_name, last_name = name.split(' ')
    local_part = '.'.join([first_name, last_name])
    return (local_part + '@dogschool.edu').lower()

In [8]:
# Can use email_from_name on a single argument
email_from_name('Champ Major')

'champ.major@dogschool.edu'

In [9]:
pups.apply(email_from_name, 'name')

array(['junior.smith@dogschool.edu', 'rex.rogers@dogschool.edu',
       'flash.heat@dogschool.edu', 'reese.bo@dogschool.edu',
       'polo.cash@dogschool.edu'],
      dtype='<U26')

In [10]:
pups.with_columns('email', pups.apply(email_from_name, 'name'))

name,age,size,email
Junior Smith,11,medium,junior.smith@dogschool.edu
Rex Rogers,7,big,rex.rogers@dogschool.edu
Flash Heat,3,big,flash.heat@dogschool.edu
Reese Bo,4,medium,reese.bo@dogschool.edu
Polo Cash,2,small,polo.cash@dogschool.edu


In [11]:
# Note, the parameter names don't
# need to be 'age' and 'size'
def human_years_converter(years_old, kind):
    """Convert a dog's age to human years using a size-dependent multiplier.

    A `kind` of 'small' multiplies the age by 6, 'medium' by 7, and any
    other value by 8.
    """
    multiplier = 6 if kind == 'small' else (7 if kind == 'medium' else 8)
    return years_old * multiplier

In [12]:
human_years_converter(11, 'medium')

77

In [13]:
human_years_converter(11, 'small')

66

In [14]:
pups.apply(human_years_converter, 'age', 'size')

array([77, 56, 24, 28, 12])

In [15]:
pups.with_columns('accurate human years', pups.apply(human_years_converter, 'age', 'size'))

name,age,size,accurate human years
Junior Smith,11,medium,77
Rex Rogers,7,big,56
Flash Heat,3,big,24
Reese Bo,4,medium,28
Polo Cash,2,small,12


### Quick Check 1

In [16]:
# Large file – this may take ~10 seconds to load
salary = Table.read_table('https://media.githubusercontent.com/media/dailycal-projects/ucb-faculty-salary/master/data/salary/salary_2015.csv')
salary

year,location,first,last,title,gross,regular,overtime,other
2015,Berkeley,ANNE,AABOE,BUS SYS ANL 4,124454,124454,0,0
2015,Berkeley,DAVID,AAKER,RECALL FACULTY,2500,0,0,2500
2015,Berkeley,ELIZABETH,ABEL,PROF-AY,138775,138775,0,0
2015,Berkeley,NORMAN,ABRAHAMSON,ADJ PROF-AY-1/9-B/E/E,19668,19668,0,0
2015,Berkeley,BARBARA,ABRAMS,PROF-AY,191162,169862,0,21300
2015,Berkeley,JOHN,ACZON,FINANCIAL SVC ANL 3,83510,78510,0,5000
2015,Berkeley,ANTHONY,ADAMS,RES-FY,9587,9587,0,0
2015,Berkeley,PENNY,HINES,ACAD HR ANL 5,126707,126707,0,0
2015,Berkeley,ANINDITA,ADHIKARI,SR LECT SOE-AY,107345,107345,0,0
2015,Berkeley,ILAN,ADLER,PROF-AY-B/E/E,166617,151617,0,15000


In [17]:
profs = salary.select('first', 'last', 'title', 'gross').where('title', are.containing('PROF'))
profs

first,last,title,gross
ELIZABETH,ABEL,PROF-AY,138775
NORMAN,ABRAHAMSON,ADJ PROF-AY-1/9-B/E/E,19668
BARBARA,ABRAMS,PROF-AY,191162
ILAN,ADLER,PROF-AY-B/E/E,166617
VINOD,AGGARWAL,PROF-AY,167525
ALICE,AGOGINO,PROF-AY-B/E/E,243259
DAVID,ALDOUS,PROF-AY,218666
RONELLE,ALEXANDER,PROF-AY,167642
NEZAR,ALSAYYAD,PROF-AY,210389
GENEVIEVE,AMES,ADJ PROF-AY,9783


Now sort by gross income in descending order and look at the very last row of the output below – that gross income doesn't look right.

In [18]:
profs.sort('gross', descending = True)

first,last,title,gross
STEVEN H,APPLEBAUM,HS ASSOC CLIN PROF-HCOMP,999756
JOHN A,GLASPY,PROF-HCOMP,999631
FRANK P.K.,HSU,PROF OF CLIN-HCOMP,998340
JOHN STUART,NELSON,PROF-HCOMP,997975
HANMIN,LEE,PROF OF CLIN-HCOMP,995434
DENNIS J,SLAMON,PROF-HCOMP,991973
BENJAMIN J,ANSELL,HS CLIN PROF-HCOMP,991543
NICHOLAS C,SAENZ,HS CLIN PROF-HCOMP,991463
JOSEPH F,GRECO,HS ASST CLIN PROF-HCOMP,991458
OMRI Y.,MARIAN,ACT PROF-AY-LAW,99997


It's because the entries in the `'gross'` column are strings, not integers.

In [19]:
profs.column('gross').item(0)

'138,775'

Your job is to fix that!

In [None]:
# Exercise skeleton: replace every _____ placeholder before running this cell.
def fix_income(income):
    # `income` is a single entry of the 'gross' column, which is a string
    # such as '138,775'. Return the same amount as an integer.
    return _____

# Use apply to run the fixing function on every entry of the broken column.
fixed_income = profs.apply(_____, _____)

# Store the corrected values in profs under the 'gross' label.
profs = profs.with_columns(
    'gross', _____
)

## Example: 

## Masking

In [23]:
numbers = np.array([15, 14, -2, 1, 9])

In [41]:
numbers[[True, False, True, False, False]]

array([15, -2])

In [24]:
gradebook = Table().with_columns(
    'Name', np.array(['Carrera', 'Panamera', 'Taycan', 'Cayenne', 'Macan', 'Cayman', 'Boxster']),
    'Grading Option', np.array(['GRD', 'PNP', 'PNP', 'GRD', 'GRD', 'GRD', 'PNP']),
    'Score', np.array([98, 86, 67.5, 45, 82, 88, 71])
)

In [25]:
gradebook

Name,Grading Option,Score
Carrera,GRD,98.0
Panamera,PNP,86.0
Taycan,PNP,67.5
Cayenne,GRD,45.0
Macan,GRD,82.0
Cayman,GRD,88.0
Boxster,PNP,71.0


In [34]:
gradebook.where(make_array(True, False, False, True, True, True, False))

Name,Grading Option,Score
Carrera,GRD,98
Cayenne,GRD,45
Macan,GRD,82
Cayman,GRD,88


In [35]:
letter_grade = gradebook.column("Grading Option") == 'GRD'

In [36]:
gradebook.where(letter_grade)

Name,Grading Option,Score
Carrera,GRD,98
Cayenne,GRD,45
Macan,GRD,82
Cayman,GRD,88


### Example: Countries

Run the following cell – ignore the `lambda` parts:

In [None]:
# Load the raw countries table and give three long column labels shorter names.
countries = Table.read_table('data/countries.csv')
countries = countries.relabeled('Country(or dependent territory)', 'Country') \
           .relabeled('% of world', '%') \
           .relabeled('Source(official or UN)', 'Source')
# Clean three columns; each lambda is a one-off function handed to apply.
countries = countries.with_columns(
    # Keep only the text before the first '[' (if there is one), then lowercase.
    # NOTE(review): the '[' presumably starts a footnote marker – confirm against the CSV.
    'Country', countries.apply(lambda s: s[:s.index('[')].lower() if '[' in s else s.lower(), 'Country'),
    # Remove the commas from the string and convert it to an int.
    'Population', countries.apply(lambda i: int(i.replace(',', '')), 'Population'),
    # Remove the '%' sign from the string and convert it to a float.
    '%', countries.apply(lambda f: float(f.replace('%', '')), '%')
)

In [None]:
countries

In [None]:
def starts_or_ends_with_a(name):
    """Return True if `name` starts or ends with the lowercase letter 'a'.

    The check is case-sensitive, so 'Albion' does not match. An empty
    string returns False instead of raising IndexError, so a blank entry
    in a column does not abort an entire `apply` call.
    """
    # startswith/endswith are safe on '' where name[0] / name[-1] are not.
    return name.startswith('a') or name.endswith('a')

In [None]:
countries.apply(starts_or_ends_with_a, 'Country')

In [None]:
countries.where(countries.apply(starts_or_ends_with_a, 'Country'))