In [1]:
# Normal strings vs. Raw strings
path = "C:\Desktop\baochung"
print("String: ", path)

String:  C:\Desktopaochung


In [2]:
path = r"C:\Desktop\baoChung"
print("Raw string: ", path)

Raw string:  C:\Desktop\baoChung


In [3]:
import re

# COMMONLY USED METHODS

# Match a word at the beginning of a string
result = re.match('Hi', r'Hi, my name is Chung.')
print(result)

<re.Match object; span=(0, 2), match='Hi'>


In [4]:
print(result.group())

Hi


In [5]:
# Search for the pattern "my" in a given string.
result = re.search('Hi', r'Hi, my name is Chung.')
print(result.group())

Hi


In [6]:
result = re.findall('is', r'Hi, my name is Chung. My major is Computer Science.')
print(result)

['is', 'is']


In [7]:
# SPECIAL SEQUENCES

# \A: The pattern is at the beginning of the sentence
str = r'Hi, my name is Chung.'
x = re.findall('\AHi', str) 
print(x)

['Hi']


In [8]:
# \b: The pattern is at the beginning or the end of a word.
x = re.findall(r'me\b', str)
print(x)

['me']


In [9]:
# \B: The pattern is present, but NOT at the beginning or the end of a word.
x = re.findall(r'i\B', str)
print(x)

['i']


In [10]:
# \d: Digits
str = r'I am 24 years old.'
x = re.findall('\d', str)
print(x)

['2', '4']


In [11]:
x = re.findall('\d+', str)
print(x)

['24']


In [12]:
# \D: Not digits
x = re.findall('\D', str)
print(x)

['I', ' ', 'a', 'm', ' ', ' ', 'y', 'e', 'a', 'r', 's', ' ', 'o', 'l', 'd', '.']


In [13]:
x = re.findall('\D+', str)
print(x)

['I am ', ' years old.']


In [14]:
# \w: a-Z, 0-9 and _
x = re.findall('\w', str)
print(x)

['I', 'a', 'm', '2', '4', 'y', 'e', 'a', 'r', 's', 'o', 'l', 'd']


In [15]:
x = re.findall('\w+', str)
print(x)

['I', 'am', '24', 'years', 'old']


In [16]:
# \w: Non a-Z, 0-9 and _
x = re.findall('\W', str)
print(x)

[' ', ' ', ' ', ' ', '.']


In [17]:
# METACHARACTERS

# .: Any characters
str = r'Hi, my name is Chung.'
x = re.findall('.m', str)
print(x)

[' m', 'am']


In [18]:
# ^: Start with
x = re.findall('^my', str)
print(x)

[]


In [19]:
# $: End with
x = re.findall('.$', str)
print(x)

['.']


In [20]:
# *: Zero or more characters
x = re.findall('m*y', str)
print(x)

['my']


In [21]:
# *: One or more characters
x = re.findall('m+y', str)
print(x)

['my']


In [22]:
# ?: Zero or one character
x = re.findall('m?y', str)
print(x)

['my']


In [23]:
# :: Either ... or ...
x = re.findall('my|is', str)
print(x)

['my', 'is']


In [24]:
# SETS

# []: Has characters in this set
x = re.findall('[chung]', str) # a, b or c
print(x)

['n', 'h', 'u', 'n', 'g']


In [25]:
x = re.findall('[a-g]', str)
print(x)

['a', 'e', 'g']


In [26]:
# Extract the numbers starting with 0 to 4 in the string below
str = "Mars' average distance from the Sun is roughly 230 million km and its orbital period is 687 (Earth) days."
x = re.findall(r'\b[0-4]+\d+', str)
print(x)

['230']


In [27]:
# [^]: Has no characters in this set
str = r'Hi, my name is Chung.'
x = re.findall('[^baochung]', str)
print(x)

['H', 'i', ',', ' ', 'm', 'y', ' ', 'm', 'e', ' ', 'i', 's', ' ', 'C', '.']


In [28]:
# [a-zA-Z0-9]
str = r'Hi, my name is @Chung.'
x = re.findall("[^a-zA-Z0-9 ]\w+", str)
print(x)

['@Chung']


In [29]:
# COMPLEX QUERIES

# Extract email IDs 
str = 'Send a mail to rohan.1997@gmail.com, smith_david34@yahoo.com and priya@yahoo.com about the meeting @2PM'
x = re.findall('[a-zA-Z0-9._-]+@\w+\.com', str)
print(x)

['rohan.1997@gmail.com', 'smith_david34@yahoo.com', 'priya@yahoo.com']


In [30]:
# Extract Dates
text = "London Olympic 2012 was held from 2012-07-27 to 2012/08/12."
x = re.findall(r'\d{4}.\d{2}.\d{2}', text)
print(x)

['2012-07-27', '2012/08/12']


In [31]:
# Extract Dates with Varying Length
text = "London Olympic 2012 was held from 27 July 2012 to 12 August 2012."
match = re.findall('\d{2}.\w{3,10}.\d{4}', text)
print(match)

['27 July 2012', '12 August 2012']


In [32]:
# Extract Title from Names - Titanic Dataset
import pandas as pd

# Load dataset
data = pd.read_csv('titanic.csv')
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [33]:
data['Name'].head(10)

0                              Braund, Mr. Owen Harris
1    Cumings, Mrs. John Bradley (Florence Briggs Th...
2                               Heikkinen, Miss. Laina
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)
4                             Allen, Mr. William Henry
5                                     Moran, Mr. James
6                              McCarthy, Mr. Timothy J
7                       Palsson, Master. Gosta Leonard
8    Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)
9                  Nasser, Mrs. Nicholas (Adele Achem)
Name: Name, dtype: object

In [34]:
# Method 1
str = "Braund, Mr. Owen Harris"
name = str.split('.')[0].split(',')[1]

In [35]:
title = data['Name'].apply(lambda x: x.split('.')[0].split(',')[1])
title.value_counts()

 Mr              517
 Miss            182
 Mrs             125
 Master           40
 Dr                7
 Rev               6
 Major             2
 Col               2
 Mlle              2
 Jonkheer          1
 Sir               1
 Lady              1
 Mme               1
 the Countess      1
 Ms                1
 Don               1
 Capt              1
Name: Name, dtype: int64

In [36]:
# Method 2
def split_title(name):
    return re.findall('\w+\.', name)

title = data['Name'].apply(lambda x: split_title(x))
title.value_counts()

[Mr.]          517
[Miss.]        182
[Mrs.]         124
[Master.]       40
[Dr.]            7
[Rev.]           6
[Col.]           2
[Mlle.]          2
[Major.]         2
[Capt.]          1
[Sir.]           1
[Lady.]          1
[Mrs., L.]       1
[Jonkheer.]      1
[Ms.]            1
[Mme.]           1
[Don.]           1
[Countess.]      1
Name: Name, dtype: int64