# String Types

In [1]:
# Bind a name to a str object, then report the three things every Python object has.
new_string = "This is a String"

# identity (the memory address in CPython), class, and the value itself
for label, detail in (
    ('ID:', id(new_string)),
    ('Type:', type(new_string)),
    ('Value:', new_string),
):
    print(label, detail)

ID: 2216980011744
Type: <class 'str'>
Value: This is a String


### Simple String

In [2]:
# '+' joins the two literals; the second one uses double quotes so the
# apostrophe in I'm needs no escaping
simple_string = 'Hello!' + " I'm a simple string"
print(simple_string)

Hello! I'm a simple string


### Multi-line String

In [3]:
# A triple-quoted literal may span several lines; each line break typed inside
# the quotes is stored in the string as a \n (newline) character.
multi_line_string = """Hello I'm
a multi-line
string!"""

# a bare expression shows the repr, where the newlines appear as \n
multi_line_string

"Hello I'm\na multi-line\nstring!"

In [4]:
# print() writes the string itself, so each \n becomes an actual line break
print(multi_line_string)

Hello I'm
a multi-line
string!


### Escape sequences

In [3]:
# Normal string with escape sequences leading to a wrong file path!
# In a regular literal \t, \n and \f are read as tab, newline and form feed,
# so those backslash-letter pairs in this Windows path are silently replaced.
escaped_string = "C:\the_folder\new_dir\file.txt"
print(escaped_string)  # will cause errors if we try to open a file here

C:	he_folder
ew_dirile.txt


In [6]:
# raw string (r prefix): backslashes are kept literally, no escape processing
raw_string = r'C:\the_folder\new_dir\file.txt'
print(raw_string)

C:\the_folder\new_dir\file.txt


### Unicode literals

In [4]:
# unicode string literals: \uXXXX inserts the code point with that hex value
# (U+00E8 is 'è', as the printed output shows)
string_with_unicode = 'H\u00e8llo!'
print(string_with_unicode)

Hèllo!


In [5]:
# non-ASCII characters such as emoji can also be typed directly into a str literal
more_unicode = 'I love Pizza 🍕!  Shall we book a cab 🚕 to get pizza?'
print(more_unicode)

I love Pizza 🍕!  Shall we book a cab 🚕 to get pizza?


## Your Turn: How can we reverse the above string?

In [9]:
# A slice with step -1 walks the string backwards, one code point at a time.
# NOTE(review): fine for this text, but characters built from several code points
# (e.g. flags or letters with combining accents) would come out scrambled.
more_unicode[::-1]  # reverses the string

'?azzip teg ot 🚕 bac a koob ew llahS  !🍕 azziP evol I'

# String Operations


### String Concatenation

In [10]:
# the + operator concatenates str operands into one new string
'Hello 😊' + ' and welcome ' + 'to Python 🐍!'

'Hello 😊 and welcome to Python 🐍!'

In [11]:
# adjacent string literals are merged into a single string; no + is required
'Hello 😊' ' and welcome ' 'to Python 🐍!'

'Hello 😊 and welcome to Python 🐍!'

In [12]:
# implicit concatenation also works across lines when the literals sit inside parentheses
s3 = ('This '
      'is another way '
      'to concatenate '
      'several strings!')
s3

'This is another way to concatenate several strings!'

### Substring check

In [13]:
# membership test: True when the left-hand string occurs anywhere inside s3
'way' in s3

True

In [14]:
# "not in" is the negated substring test
'python' not in s3

True

### String Length

In [15]:
# number of characters (code points) in the string
len(s3)

51

# String Indexing and Slicing

In [16]:
# creating a string
s = 'PYTHON'
# display the value together with its type
s, type(s)

('PYTHON', str)

## String Indexing

In [17]:
# enumerate() pairs each character with its zero-based position
for position, letter in enumerate(s):
    print('Character -> {} has index-> {}'.format(letter, position))

Character -> P has index-> 0
Character -> Y has index-> 1
Character -> T has index-> 2
Character -> H has index-> 3
Character -> O has index-> 4
Character -> N has index-> 5


In [18]:
# indexes start at 0, so this 6-character string is addressed with 0..5
s[0], s[1], s[2], s[3], s[4], s[5]

('P', 'Y', 'T', 'H', 'O', 'N')

In [19]:
# negative indexes count from the end: -1 is the last character, -6 the first
s[-1], s[-2], s[-3], s[-4], s[-5], s[-6]

('N', 'O', 'H', 'T', 'Y', 'P')

## String Slicing

In [20]:
s[:]  # omitted start and stop default to the whole string

'PYTHON'

In [21]:
# the start index is included, the stop index is excluded: characters 1, 2 and 3
s[1:4]

'YTH'

In [22]:
# s[:3] and s[3:] split the string at index 3 without overlap
s[:3], s[3:]

('PYT', 'HON')

## String slicing with offsets (step values)

In [23]:
s[::1]  # step of 1 (the default): every character is kept

'PYTHON'

In [24]:
s[::2]  # step of 2: every 2nd character, starting from index 0

'PTO'

# String Immutability

In [26]:
# strings are immutable, so item assignment raises TypeError;
# catch it and show the message so the notebook still runs top to bottom
try:
    s[0] = 'X'
except TypeError as err:
    print('TypeError:', err)

TypeError: 'str' object does not support item assignment

In [27]:
# s[0] cannot be overwritten, so build a new string and rebind the name to it;
# comparing the two ids shows that s now refers to a different object
id_before = id(s)
s = 'X' + s[1:]
print('Original String id:', id_before)
print(s)
print('New String id:', id(s))

Original String id: 139698623376440
XYTHON
New String id: 139698050130024


# Useful String methods

## Case Conversions

In [0]:
# rebind s to a lower-case sentence used by the examples that follow
s = 'python is great'

In [29]:
# upper-cases the first character and lower-cases the rest
s.capitalize()

'Python is great'

In [30]:
# every cased character converted to upper case
s.upper()

'PYTHON IS GREAT'

In [31]:
# first letter of each word upper-cased
s.title()

'Python Is Great'

## String Replace

In [32]:
# returns a new string with every occurrence of 'python' replaced; s itself is not modified
s.replace('python', 'NLP')

'NLP is great'

## Numeric Checks

In [33]:
# True only if every character is a decimal digit
'12345'.isdecimal()

True

In [34]:
# the letters make this False even though the string ends in digits
'apollo11'.isdecimal()

False

## Alphabet Checks

In [35]:
# True: every character is a letter
'python'.isalpha()

True

In [36]:
# False: the trailing digit is not a letter
'number1'.isalpha()

False

## Alphanumeric Checks

In [37]:
# letters only still counts as alphanumeric
'total'.isalnum()

True

In [38]:
# a mix of letters and digits is alphanumeric
'abc123'.isalnum()

True

In [39]:
# False: '+' is neither a letter nor a digit
'1+1'.isalnum()

False

## String splitting and joining

In [40]:
# rebind s to comma-separated text for the split/join examples
s = 'I,am,a,comma,separated,string'
s

'I,am,a,comma,separated,string'

In [41]:
# split on the given delimiter; returns a list of the pieces
s.split(',')

['I', 'am', 'a', 'comma', 'separated', 'string']

In [42]:
# join() glues the pieces back together, using the string it is called on (a space) as separator
' '.join(s.split(','))

'I am a comma separated string'

In [43]:
# stripping whitespace characters:
# sample text with leading and trailing spaces for the strip() example
s = '   I am surrounded by spaces    '
s

'   I am surrounded by spaces    '

In [44]:
# strip() removes leading and trailing whitespace; the spaces between words remain
s.strip()

'I am surrounded by spaces'

In [45]:
# splitting on '.' keeps the space that followed the first period and
# yields a trailing '' because the text ends with the delimiter
sentences = 'Python is great. NLP is also good.'
sentences.split('.')

['Python is great', ' NLP is also good', '']

# String formatting

## Formatting expressions with different data types - old style

In [46]:
# %d -> integer, %s -> string, %.2f -> float with two decimals;
# values are taken from the tuple in order
'We have %d %s containing %.2f gallons of %s' %(2, 'bottles', 2.5, 'milk')

'We have 2 bottles containing 2.50 gallons of milk'

In [47]:
# %d drops the fractional part of 5.21, while %.2f rounds 10.86763 to two decimals
'We have %d %s containing %.2f gallons of %s' %(5.21, 'jugs', 10.86763, 'juice')

'We have 5 jugs containing 10.87 gallons of juice'

## Formatting strings using the format method - new style

In [48]:
# each {} is filled by the next positional argument; non-string values such as 5 are converted automatically
'Hello {} {}, it is a great {} to meet you at {}'.format('Mr.', 'Jones', 'pleasure', 5)

'Hello Mr. Jones, it is a great pleasure to meet you at 5'

In [49]:
# \' escapes the apostrophe inside a single-quoted literal
'Hello {} {}, it is a great {} to meet you at {} o\' clock'.format('Sir', 'Arthur', 'honor', 9)

"Hello Sir Arthur, it is a great honor to meet you at 9 o' clock"

## Alternative ways of using string format

In [50]:
# named placeholders are matched by keyword, so the argument order does not matter
'I have a {food_item} and a {drink_item} with me'.format(drink_item='soda', food_item='sandwich')

'I have a sandwich and a soda with me'

In [51]:
# any object can be passed; the list is rendered in its usual printed form
'The {animal} has the following attributes: {attributes}'.format(animal='dog', attributes=['lazy', 'loyal'])

"The dog has the following attributes: ['lazy', 'loyal']"

# Regular Expressions

In [0]:
# sample sentences for the regex examples: 'Python' starts s1,
# but appears mid-sentence (twice) in s2
s1 = 'Python is an excellent language'
s2 = 'I love the Python language. I also use Python to build applications at work!'

In [0]:
import re

pattern = 'python'
# match only returns a match if regex match is found at the beginning of the string
# (no output below: the lower-case pattern does not match 'Python', so the result is None)
re.match(pattern, s1)

In [54]:
# the pattern is lower case, so the re.IGNORECASE flag is needed
# for it to match the capitalised 'Python' at the start of s1
re.match(pattern, s1, flags=re.IGNORECASE)

<_sre.SRE_Match object; span=(0, 6), match='Python'>

In [55]:
# report the matched text together with its start/end offsets in s1
m = re.match(pattern, s1, flags=re.IGNORECASE)
report = 'Found match {} ranging from index {} - {} in the string "{}"'
print(report.format(m.group(0), m.start(), m.end(), s1))

Found match Python ranging from index 0 - 6 in the string "Python is an excellent language"


In [0]:
# 'Python' is not at position 0 of s2, so match() finds nothing and returns None
re.match(pattern, s2, flags=re.IGNORECASE)

In [57]:
# search() scans the whole string and returns the first match, wherever it starts
re.search(pattern, s2, flags=re.IGNORECASE)

<_sre.SRE_Match object; span=(11, 17), match='Python'>

In [58]:
# findall() returns every non-overlapping match as a list of strings
re.findall(pattern, s2, flags=re.IGNORECASE)

['Python', 'Python']

In [59]:
# finditer() yields match objects lazily; the iterator can be consumed only once
match_objs = re.finditer(pattern, s2, flags=re.IGNORECASE)
match_objs

<callable_iterator at 0x7f0dfca76c88>

In [60]:
# walk the match iterator and report each hit with its (start, end) span
print("String:", s2)
line = 'Found match "{}" ranging from index {} - {}'
for m in match_objs:
    print(line.format(m.group(0), *m.span()))

String: I love the Python language. I also use Python to build applications at work!
Found match "Python" ranging from index 11 - 17
Found match "Python" ranging from index 39 - 45


In [61]:
# illustrating pattern substitution using sub and subn methods
# flags must be passed by keyword here: the 4th positional parameter of re.sub is count
re.sub(pattern, 'Java', s2, flags=re.IGNORECASE)

'I love the Java language. I also use Java to build applications at work!'

In [62]:
# subn returns a tuple: (new string, number of substitutions made)
re.subn(pattern, 'Java', s2, flags=re.IGNORECASE)

('I love the Java language. I also use Java to build applications at work!', 2)

In [63]:
# sample text mixing an accented letter (via \u escape) and an emoji, for the regex examples;
# every str literal is already Unicode in Python 3, so no u prefix is needed
s = 'H\u00e8llo! this is Python 🐍'
s

'Hèllo! this is Python 🐍'

In [64]:
# for str patterns \w is Unicode-aware, so 'è' counts as a word character; the emoji does not
re.findall(r'\w+', s)

['Hèllo', 'this', 'is', 'Python']

In [65]:
# an ASCII capital letter followed by one or more word characters
re.findall(r"[A-Z]\w+", s)

['Hèllo', 'Python']

In [66]:
# Character class covering common emoji/symbol blocks. The ranges are listed back to
# back: inside [...] quotes and | are ordinary characters, so including them would
# make the pattern match ' and | as well.
#   U+1F300-U+1F5FF  Miscellaneous Symbols and Pictographs
#   U+1F600-U+1F64F  Emoticons
#   U+1F680-U+1F6FF  Transport and Map Symbols
#   U+2600-U+26FF    Miscellaneous Symbols
#   U+2700-U+27BF    Dingbats
# The string is raw, so the \U / \u escapes are interpreted by the re module itself.
emoji_pattern = r"[\U0001F300-\U0001F5FF\U0001F600-\U0001F64F\U0001F680-\U0001F6FF\u2600-\u26FF\u2700-\u27BF]"
# re.UNICODE is already the default for str patterns in Python 3, so the flag is redundant but harmless
re.findall(emoji_pattern, s, re.UNICODE)

['🐍']