***

# Strings

### String Types and Operators

https://docs.python.org/3/tutorial/introduction.html#strings 

https://docs.python.org/3/library/stdtypes.html#string-methods 

In [1]:
# Strings may be delimited with either single or double quotes.

a = 'This is a string with single quotes'
b = "This is a string with double quotes"

# Delimiting with one quote style lets the other style appear inside the text.
c = "You can put a 'quoted bit' inside a string if you use a different type of quote"
print(c, type(c), sep="\n")

You can put a 'quoted bit' inside a string if you use a different type of quote
<class 'str'>


In [2]:
# Unlike many other languages, Python has no separate character type:
# a "character" is simply a string of length one.

# a one-character string
a = "A"

# a multi-character string (this rebinds a to a new value)
a = "This is a test"

In [3]:
# len() reports how many characters a string holds

s = "Example String"
n_chars = len(s)
print(n_chars)

# int() and float() parse numeric text into numbers
a, b = int("4"), float("4")
print(a, b, sep="\n")

14
4
4.0


In [4]:
# operations on strings

# + is overloaded for strings: it concatenates its operands
a, b, c, d = "This", "is", "a", "string"
print(a + b + c + d)

# nothing is inserted between the pieces, so spaces must be added explicitly
space = " "
print(a + space + b + space + c + space + d)

# * with an integer repeats the string that many times
print(a * 3)

Thisisastring
This is a string
ThisThisThis


In [5]:
# illegal string operations: subtraction is not defined for strings

a = "ABC"
b = "A"
try:
    print(a - b)
except TypeError as err:
    # display the error instead of letting it stop "Restart & Run All"
    print(f"TypeError: {err}")

TypeError: unsupported operand type(s) for -: 'str' and 'str'

In [6]:
# adding a number to a string is defined in some languages (like Matlab), not in Python

A = "a"
try:
    print(A + 1)
except TypeError as err:
    # display the error instead of letting it stop "Restart & Run All"
    print(f"TypeError: {err}")

TypeError: can only concatenate str (not "int") to str

In [7]:
# string objects come with many methods; dir() lists every attribute name

s = "THIS is A String"
method_names = dir(s)
print(method_names)

print(s)

['__add__', '__class__', '__contains__', '__delattr__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__getnewargs__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__mod__', '__mul__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__rmod__', '__rmul__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', 'capitalize', 'casefold', 'center', 'count', 'encode', 'endswith', 'expandtabs', 'find', 'format', 'format_map', 'index', 'isalnum', 'isalpha', 'isascii', 'isdecimal', 'isdigit', 'isidentifier', 'islower', 'isnumeric', 'isprintable', 'isspace', 'istitle', 'isupper', 'join', 'ljust', 'lower', 'lstrip', 'maketrans', 'partition', 'replace', 'rfind', 'rindex', 'rjust', 'rpartition', 'rsplit', 'rstrip', 'split', 'splitlines', 'startswith', 'strip', 'swapcase', 'title', 'translate', 'upper', 'zfill']
THIS is A String


See https://docs.python.org/3/library/stdtypes.html#string-methods for a description of each string method.

In [8]:
# each method returns a new object; s itself is not modified
lowered = s.lower()
words = s.split()      # split() with no argument splits on whitespace
uppered = s.upper()
print(lowered)
print(words)
print(uppered)

this is a string
['THIS', 'is', 'A', 'String']
THIS IS A STRING


### Format Strings

https://docs.python.org/3/library/string.html#formatstrings 

In [9]:
a = 1
b = 3.4
c = "Tom"

# write the template once, then fill its {} placeholders in order with format()
template = "an int: {}; a float: {}; a string: {}"
s = template.format(a, b, c)
print(s)

an int: 1; a float: 3.4; a string: Tom


In [10]:
# format strings

# most often passed to print() (screen) or write() (file),
# but str.format() works anywhere a string is needed

a, b, c = 1, 3.4, "Tom"

# empty {} fields are filled left to right
auto_numbered = "an int: {}; a float: {}; a string: {}"
print(auto_numbered.format(a, b, c))

# explicit indices give the same result here
indexed = "an int: {0}; a float: {1}; a string: {2}"
print(indexed.format(a, b, c))

# with indices, an argument can be reused in several fields
reused = "an int: {0}; a float: {1}; a string: {2}; a float again: {1}; an int again: {0}"
print(reused.format(a, b, c))

# literal values work as arguments just like variables
print(reused.format(1, 3.4, "Tom"))

an int: 1; a float: 3.4; a string: Tom
an int: 1; a float: 3.4; a string: Tom
an int: 1; a float: 3.4; a string: Tom; a float again: 3.4; an int again: 1
an int: 1; a float: 3.4; a string: Tom; a float again: 3.4; an int again: 1


In [11]:
# format specs control column width and alignment; \t inserts a tab
# ^8 centers the value in a field 8 characters wide;
# s / d / .1f select string, integer, and one-decimal float presentation
header_fmt = "{0:^8s} \t {1:^8s}\t {2:^8s}"
row_fmt = "{0:^8s} \t {1:^8d}\t {2:^8.1f}"

print("")
print(header_fmt.format("Name", "Age", "Score"))
print(row_fmt.format("Tom", 56, 7.324))
print(row_fmt.format("Amy", 53, 8.516))


  Name   	   Age   	  Score  
  Tom    	    56   	   7.3   
  Amy    	    53   	   8.5   


### Formatted String Literals (f-strings)

https://docs.python.org/3/reference/lexical_analysis.html#literals

https://realpython.com/python-f-strings/

In [23]:
# formatted string literals (f-strings): each {} holds an expression evaluated in place

a, b, c = 1, 3.4523, "Tom"
print(f"an int: {a}; a float: {b}; a string: {c}")

# the same name may appear in as many fields as needed
print(f"an int: {a}; a float: {b}; a string: {c}; a float again: {b}; an int again: {a}")

# a format spec after the colon sets the field width and precision
print()
print(f"an int: {a:5d}; a float: {b:5.2f}; a string: {c:8s}")

# fields accept arbitrary expressions, not just variable names
a, b = 4.3, 23.5
c, d = "ABC", "def"
print(f"first {a*b:6.3f} and then {c+d}")

an int: 1; a float: 3.4523; a string: Tom
an int: 1; a float: 3.4523; a string: Tom; a float again: 3.4523; an int again: 1

an int:     1; a float:  3.45; a string: Tom     
first 101.050 and then ABCdef


### String Comparison

In [24]:
# string comparison

a, b = "Truck", "Truck"
c = "truck"     # differs from a only in case
d = "Truck "    # differs from a only by a trailing space

# == is an exact, character-by-character test: case and whitespace both count
same_as_b = a == b
same_as_c = a == c
same_as_d = a == d
print(same_as_b, same_as_c, same_as_d, sep="\n")

# != is the negation of ==
print(a != c)

True
False
False
True


In [25]:
# normalise both sides to one case first to compare without regard to case
same_ignoring_case = a.lower() == c.lower()
print(same_ignoring_case)

True


In [26]:
# the in operator tests whether one string occurs inside another (case-sensitive)

a = "This is a string"

has_is = "is" in a        # found, e.g. inside "This"
has_this = "this" in a    # not found: the text has a capital T
print(has_is)
print(has_this)

True
False


### String Indices and String Slicing

In [None]:
# square brackets with an integer index pick out a single character

s = "This is a string"
first, second, third = s[0], s[1], s[2]
print(first)
print(second)
print(third)

In [None]:
# Python indices start at 0, not 1,
# so the last valid index of a string is len(s)-1

s = "This is a string"

L = len(s)
last = L - 1
print(s[last])

In [None]:
# indexing one past the end gives an error: valid indices stop at len(s)-1

s = "This is a string"
L = len(s)    # defined here so the cell does not depend on an earlier cell

try:
    print(s[L])
except IndexError as err:
    # display the error instead of letting it stop "Restart & Run All"
    print(f"IndexError: {err}")

In [None]:
# negative indices count back from the end: -1 is the last character

s = "This is a string"

print(s[-1], s[-2], s[-3], sep="\n")

In [None]:
# string slicing (pulling out a section of a string)

s = "This is a string"

# s[start:stop] covers indices start, start+1, ..., stop-1; stop itself is excluded
first_word = s[0:4]
print("s[0:4]  ", first_word)
middle = s[6:12]
print("s[6:12] ", middle)

In [None]:
# if the first integer is omitted, the slice starts at 0 (the start of the string)

s = "This is a string"

implicit_start = s[:6]
explicit_start = s[0:6]
print("s[:6]  ", implicit_start)
print("s[0:6] ", explicit_start)

In [None]:
# if the last integer is omitted, the slice runs to the end of the string

s = "This is a string"

implicit_stop = s[8:]
explicit_stop = s[8:len(s)]
print("s[8:]        ", implicit_stop)
print("s[8:len(s)]  ", explicit_stop)

In [None]:
# if both integers are omitted, the slice covers the entire string

s = "This is a string"

whole = s[:]
print("s[:] ", whole)
print("s    ", s)

In [None]:
# slicing with a step: s[start:stop:step] takes every step-th character

s = "This is a string"

stepped = s[1:11:2]
print("*" + stepped + "*")

# the same characters picked out one index at a time
by_hand = s[1] + s[3] + s[5] + s[7] + s[9]
print("*" + by_hand + "*")

In [None]:
# a step of -1 walks the string from its last character to its first

s = "This is a string"

backwards = s[::-1]
print(backwards)

### Iterating over Strings with For Loops

In [8]:
# introducing for loops

s = "This is a string"
n = len(s)

print("len(s) = ", n)
print()

# The loop body must be indented, and indentation must be consistent throughout
# a program; the convention is spaces (not tabs), which Jupyter Notebooks and
# most IDEs insert automatically.

# range(n) yields the integers 0 .. n-1 on demand, i.e. every valid index of s
for i in range(n):
    ch = s[i]
    print(i, "\t", ch)

len(s) =  16

0 	 T
1 	 h
2 	 i
3 	 s
4 	  
5 	 i
6 	 s
7 	  
8 	 a
9 	  
10 	 s
11 	 t
12 	 r
13 	 i
14 	 n
15 	 g


In [9]:
# range accepts a start, a stop (excluded), and a step

s = "This is a string"

start, stop, step = 0, len(s), 1
for i in range(start, stop, step):
    print(i, "\t", s[i])

0 	 T
1 	 h
2 	 i
3 	 s
4 	  
5 	 i
6 	 s
7 	  
8 	 a
9 	  
10 	 s
11 	 t
12 	 r
13 	 i
14 	 n
15 	 g


In [10]:
# a step of 2 visits every other index

s = "This is a string"

even_indices = range(0, len(s), 2)
for i in even_indices:
    print(i, "\t", s[i])

0 	 T
2 	 i
4 	  
6 	 s
8 	 a
10 	 s
12 	 r
14 	 n


In [11]:
# range (in Python 3) produces its values on demand; it never builds a list

N = 10 ** 15        # a list this long would not fit in any personal computer's memory
limit = 10          # stop after printing this many values

for i in range(N):
    if i >= limit:
        break
    print(i, end=', ')

0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 

In [12]:
# a for loop can also iterate directly over the string itself:
# c takes each character in turn, with no index needed

s = "This is a string"

for c in s:
    print(c)

T
h
i
s
 
i
s
 
a
 
s
t
r
i
n
g


In [13]:
# print() ends each call with a newline by default;
# the end argument replaces that terminator (here with a single space)

s = "This is a string"

for c in s:
    print(c, end=" ")

T h i s   i s   a   s t r i n g 

In [None]:
# with end="" nothing is written after each character,
# so the characters are printed back to back on one line

s = "This is a string"

for c in s:
    print(c, end="")

### Strings are Immutable

In [None]:
# strings are immutable

s = "This is a string"

# assigning to a position of an existing string is not allowed
try:
    s[3] = "X"
except TypeError as err:
    # display the error instead of letting it stop "Restart & Run All"
    print(f"TypeError: {err}")

In [None]:
# a "changed" string is really a new string built from pieces of the old one
# (the name can then be rebound to the new string)

s = "This is a string"

head, tail = s[0:3], s[4:]    # everything before and after index 3
s = head + "X" + tail
print(s)

### Regular Expressions in Python

https://jakevdp.github.io/WhirlwindTourOfPython/14-strings-and-regular-expressions.html 

https://www.activestate.com/wp-content/uploads/2020/03/Python-RegEx-Cheatsheet.pdf 

https://docs.python.org/3/library/re.html 

https://docs.python.org/3/howto/regex.html#regex-howto 

In [None]:
# prefixing a string literal with r makes it a "raw" string:
# backslashes are kept as-is instead of starting escape sequences

# the dot before "dat" is escaped so that it matches a literal "." only
pattern = r"S.*\.dat"
s = r"C:\Project\Data\S102-3.dat"
print(s)

In [None]:
# Without the r prefix, backslashes start escape sequences: \t becomes a tab and
# \n becomes a newline. \d and \S are not recognized escapes, so their
# backslashes are kept (recent Python versions warn about such sequences).

# in a normal string, the backslash that escapes the regex dot must be doubled
pattern = "S.*\\.dat"
s = "C:\tom\newproject\data\S102-3.dat"
print(s)

In [None]:
# ... or the literal is rejected outright: "\U" must be followed by eight hex
# digits, so a non-raw "C:\Users..." path is a SyntaxError. The error is raised
# when the code is compiled, so a cell containing that literal cannot run at all;
# here the offending line is held as text and compiled explicitly to show the error.

pattern = "S.*\\.dat"
source_text = r's = "C:\Users\tom\newproject\data\S102-3.dat"'
print(source_text)

try:
    compile(source_text, "demo_cell", "exec")
except SyntaxError as err:
    print(f"SyntaxError: {err}")

In [None]:
# regular expressions

import re

# "S", then any run of characters, then a literal ".dat";
# the dot is escaped because an unescaped "." matches any character
pattern = r"S.*\.dat"
fname1 = r"C:\Project\Data\S102-3.dat"
fname2 = r"C:\Project\Data\T104-1.dat"

# re.search scans the whole string for the first place the pattern matches
print("fname1 : ", re.search(pattern, fname1))
print("fname2 : ", re.search(pattern, fname2))


In [None]:
# re.search returns a match object on success and None otherwise;
# a match object is truthy, so it can be tested directly

match = re.search(pattern, fname1)
print("Success!" if match else "Failure!")

print()

# index 0 of a match object is the full text that matched
print(match[0])

In [None]:
# search vs. findall

# search stops at the first occurrence of 'a' or 'b' or 'c' anywhere in the string
first_hit = re.search(r"[abc]", r"abacus")
print(first_hit)

# findall returns every occurrence anywhere in the string, as a list
all_hits = re.findall(r"[abc]", r"abacus")
print(all_hits)

### Examples of Regular Expression Matching

https://www.activestate.com/wp-content/uploads/2020/03/Python-RegEx-Cheatsheet.pdf

https://docs.python.org/3/library/re.html 

https://docs.python.org/3/howto/regex.html#regex-howto 

In [None]:
# examples of regular expression matching

# ^      match the expression to its right at the beginning of the string
# .      match any character
# [3aj]  match any one of these characters
# *      match 0 or more occurrences of the previous item
# +      match 1 or more occurrences of the previous item
# $      match the expression to its left at the end of the string
# [a-z]  match any one character in this range
# [0-9]  match any one character in this range

one_leading_char = r"^.[3aj]x*z+Q$"
print("1) ", re.search(one_leading_char, r"XaxxxzzzQ"))   # matches
print("2) ", re.search(one_leading_char, r"axxxzzzQ"))    # None: 2nd character is not 3, a, or j
print("3) ", re.search(one_leading_char, r"XaxxxQ"))      # None: z+ needs at least one z
print("4) ", re.search(one_leading_char, r"XazzzQ"))      # matches: x* allows zero x
print("5) ", re.search(one_leading_char, r"Xaxxxzzz"))    # None: no final Q
print()

# .* lets any number of characters come before the [3aj] character
print("6) ", re.search(r"^.*[3aj]x*z+Q$", r"X12345axxxzzzQ"))
print()

lower_first = r"^[a-z].*[3aj]x*z+Q$"
print("7) ", re.search(lower_first, r"X12345axxxzzzQ"))   # None: first character is upper case
print("8) ", re.search(lower_first, r"x12345axxxzzzQ"))   # matches
print("9) ", re.search(r"^[0-9].*[3aj]x*z+Q$", r"012345axxxzzzQ"))   # matches: starts with a digit

In [None]:
# parentheses define groups, which split a structured string into its components

# imagine parsing a file name into subject number, session number, and condition
fname = r"S-154-2-Control"

# with groups, findall returns one tuple of captured pieces per match
grouped = r"S-([0-9]*)-([0-9]*)-([a-zA-Z]*)"
print(re.findall(grouped, fname))

# without groups (no parentheses), findall returns the whole matched text
ungrouped = r"S-[0-9]*-[0-9]*-[a-zA-Z]*"
print(re.findall(ungrouped, fname))