Chapter 11 - Strings and Text Data

In [1]:
# two sample strings reused by the indexing and slicing examples that follow
word, sent = 'grail', 'a scratch'

In [2]:
# get first letter: strings are zero-indexed, so position 0 is the first character
print(word[0])

g


In [3]:
# get first 3 characters: the slice end is exclusive, so 0:3 covers indices 0, 1 and 2
print(word[0:3])

gra


In [4]:
# get the last letter from "a scratch": negative indices count back from the end
print(sent[-1])

h


In [5]:
# get the length of the string; the length is one past the last valid index,
# so it can be used as the (exclusive) end of a slice that reaches the last character
s_len = len(sent)
print(s_len)

9


In [6]:
# slice from index 2 through the last character; with s_len as the exclusive end
# this selects the same characters as sent[2:]
print(sent[2:s_len])

scratch


In [9]:
# specify entire string: omitting both the start and the end selects every character
print(sent[:])

a scratch


In [10]:
# slice increments: the third slice value is the step, so ::2 takes every other character
print(sent[::2])

asrth


In [11]:
# join: assemble a degrees / minutes / seconds coordinate string from its parts
# latitude components
d1, m1, s1, u1 = '40°', "46'", '52.837"', 'N'

# longitude components
d2, m2, s2, u2 = '73°', "58'", '26.302"', 'W'

# str.join inserts the separator (a single space) between consecutive parts
coord_parts = (d1, m1, s1, u1, d2, m2, s2, u2)
coords = ' '.join(coord_parts)
print(coords)

40° 46' 52.837" N 73° 58' 26.302" W


In [12]:
# splitlines
# a triple-quoted string keeps its embedded newlines: here one line per speaker turn,
# alternating between the Guard and King Arthur
multi_str = """Guard: What? Ridden on a horse?
King Arthur: Yes!
Guard: You're using coconuts!
King Arthur: What?
Guard: You've got two empty halves of coconut and you're bangin' 'em together."""

print(multi_str)

Guard: What? Ridden on a horse?
King Arthur: Yes!
Guard: You're using coconuts!
King Arthur: What?
Guard: You've got two empty halves of coconut and you're bangin' 'em together.


In [13]:
# splitlines breaks the string at line boundaries and returns a list with one item per line
multi_str_split = multi_str.splitlines()
print(multi_str_split)

['Guard: What? Ridden on a horse?', 'King Arthur: Yes!', "Guard: You're using coconuts!", 'King Arthur: What?', "Guard: You've got two empty halves of coconut and you're bangin' 'em together."]


In [14]:
# the Guard speaks on every other line, starting with the first, so step through the list by 2
guard = multi_str_split[::2]
print(guard)

['Guard: What? Ridden on a horse?', "Guard: You're using coconuts!", "Guard: You've got two empty halves of coconut and you're bangin' 'em together."]


In [15]:
# keep only the Guard's lines (every other line) and strip the speaker label from each one
guard = [line.replace("Guard: ", "") for line in multi_str.splitlines()[::2]]
print(guard)

['What? Ridden on a horse?', "You're using coconuts!", "You've got two empty halves of coconut and you're bangin' 'em together."]


In [16]:
# string formatting
# the f prefix marks a formatted string literal (f-string); with no {} placeholders
# it produces the same text as an ordinary string
s = f"hello"
print(s)

hello


In [17]:
# an expression inside {} is evaluated and its value is inserted into the string
num = 7
s = f"hello {num}"
print(s)

hello 7


In [18]:
# several placeholders can be used in the same f-string
lat = "40.7815° N"
lon = "73.9733° W"
s = f"Hayden Planetarium Coordinates: {lat}, {lon}"
print(s)

Hayden Planetarium Coordinates: 40.7815° N, 73.9733° W


In [19]:
# formatting numbers
# a format specifier follows the colon inside the placeholder;
# "," inserts a thousands separator
digits = 67890
s = f"In 2005, Lu Chao of China recited {digits:,} digits of pi"
print(s)

In 2005, Lu Chao of China recited 67,890 digits of pi


In [20]:
# .4 keeps 4 significant digits; .4% multiplies by 100 and shows
# 4 digits after the decimal point followed by a percent sign
prop = 7 / 67890
s = f"I remember {prop:.4} or {prop:.4%} of what Lu Chao recited."
print(s)

I remember 0.0001031 or 0.0103% of what Lu Chao recited.


In [21]:
# zfill left-pads the string with zeros until it is 5 characters wide
id_zfill = "42".zfill(5)
print(f"My ID number is {id_zfill}")

My ID number is 00042


In [22]:
# regular expressions
import re

In [23]:
tele_num = "1234567890"

# the pattern is written as a raw string so the backslashes reach the regex engine
# untouched; in a plain literal "\d" is an invalid escape sequence that Python warns about
# re.match only looks for the pattern at the beginning of the string
m = re.match(pattern=r'\d\d\d\d\d\d\d\d\d\d', string=tele_num)
print(type(m))

<class 're.Match'>


In [25]:
# printing a match object shows the span of the match and the matched text
print(m)

<re.Match object; span=(0, 10), match='1234567890'>


In [26]:
# a match object is truthy, while a failed match gives None (falsy),
# so the result can be tested directly
print('match' if m else 'no match')

match


In [27]:
# index in the string where the match begins
print(m.start())

0


In [28]:
# (start, end) indices of the match; the end index is exclusive
print(m.span())

(0, 10)


In [29]:
# the text that was matched
print(m.group())

1234567890


In [30]:
# telephone number pattern
tele_num_spaces = "123 456 7890"

# raw string keeps the regex backslashes literal (a plain literal would contain
# the invalid escape sequences "\d" and "\s", which Python warns about)
# \d{n} matches exactly n digits; \s? allows one optional whitespace between groups
p = r'\d{3}\s?\d{3}\s?\d{4}'
m = re.match(pattern=p, string=tele_num_spaces)
print(m)

<re.Match object; span=(0, 12), match='123 456 7890'>


In [31]:
# area code pattern
tele_num_space_paren_dash = '(123) 456-7890'
# raw string keeps the regex backslashes literal and avoids invalid-escape warnings
# \(? and \)? allow optional literal parentheses around the area code;
# -? allows an optional dash before the last four digits
p = r'\(?\d{3}\)?\s?\d{3}\s?-?\d{4}'
m = re.match(pattern=p, string=tele_num_space_paren_dash)
print(m)

<re.Match object; span=(0, 14), match='(123) 456-7890'>


In [32]:
# country code pattern
cnty_tele_num_space_paren_dash = '+1 (123) 456-7890'
# raw string keeps the regex backslashes literal and avoids invalid-escape warnings
# \+? allows an optional literal plus sign before the country code 1
p = r'\+?1\s?\(?\d{3}\)?\s?\d{3}\s?-?\d{4}'
m = re.match(pattern=p, string=cnty_tele_num_space_paren_dash)
print(m)

<re.Match object; span=(0, 17), match='+1 (123) 456-7890'>


In [33]:
p = (
'\+?' # maybe starts with a +
'1' # the number 1
'\s?' # maybe there's a whitespace
'\(?' # maybe there's an open round parenthesis (
'\d{3}' # 3 numbers
'\)?' # maybe there's a closing round parenthesis )
'\s?' # maybe there's a whitespace
'\d{3}' # 3 numbers
'\s?' # maybe there's a whitespace
'-?' # maybe there's a dash character
'\d{4}' # 4 numbers
)
print(p)

\+?1\s?\(?\d{3}\)?\s?\d{3}\s?-?\d{4}


In [34]:
# python will concatenate 2 strings next to each other:
# adjacent string literals are joined into a single string, so a long
# string can be split across several lines inside parentheses
s = (
"14 Ncuti Gatwa, "
"13 Jodie Whittaker, war John Hurt, 12 Peter Capaldi, "
"11 Matt Smith, 10 David Tennant, 9 Christopher Eccleston"
)
print(s)

14 Ncuti Gatwa, 13 Jodie Whittaker, war John Hurt, 12 Peter Capaldi, 11 Matt Smith, 10 David Tennant, 9 Christopher Eccleston


In [35]:
# pattern to match 1 or more digits
# raw string keeps the backslash literal ("\d" in a plain literal is an invalid escape sequence)
p = r"\d+"
# findall returns every non-overlapping match as a list of strings
m = re.findall(pattern=p, string=s)
print(m)

['14', '13', '12', '11', '10', '9']


In [36]:
# substitute a pattern
multi_str = """Guard: What? Ridden on a horse?
King Arthur: Yes!
Guard: You're using coconuts!
King Arthur: What?
Guard: You've got ... coconut[s] and you're bangin' 'em together.
"""
# speaker label: word characters with an optional whitespace inside (covers both
# "Guard" and "King Arthur"), then a colon and an optional trailing whitespace
# raw string keeps the regex backslashes literal and avoids invalid-escape warnings
p = r'\w+\s?\w+:\s?'
# replace every speaker label with the empty string, leaving only the spoken text
s = re.sub(pattern=p, string=multi_str, repl='')
print(s)

What? Ridden on a horse?
Yes!
You're using coconuts!
What?
You've got ... coconut[s] and you're bangin' 'em together.



In [37]:
# split the label-free text once, then deal the lines out to the two speakers
lines = s.splitlines()
guard = lines[::2]   # lines 0, 2, 4, ...
kinga = lines[1::2]  # lines 1, 3, ... (starts after the first element)
print(guard)
print(kinga)

['What? Ridden on a horse?', "You're using coconuts!", "You've got ... coconut[s] and you're bangin' 'em together."]
['Yes!', 'What?']


In [38]:
# compile a pattern
# pattern to match 10 digits
# raw string keeps the backslash literal ("\d" in a plain literal is an invalid escape sequence)
p = re.compile(r'\d{10}')
s = '1234567890'
# note: calling match on the compiled pattern
# not using the re.match function
m = p.match(s)
print(m)

<re.Match object; span=(0, 10), match='1234567890'>


In [39]:
# find all
# raw string keeps the backslash literal ("\d" in a plain literal is an invalid escape sequence)
p = re.compile(r'\d+')
s = (
    "14 Ncuti Gatwa, "
    "13 Jodie Whittaker, war John Hurt, 12 Peter Capaldi, "
    "11 Matt Smith, 10 David Tennant, 9 Christopher Eccleston"
)
# findall on the compiled pattern returns every run of digits as a list of strings
m = p.findall(s)
print(m)

['14', '13', '12', '11', '10', '9']
