In [1]:
import pandas as pd 
import re
import string

# Phone number 

In [2]:
phone = pd.DataFrame({"(541) 471 3918",
              "(603)281-0308",
               "(814)-462-8074",
               "9704443106"}, columns = ["original_number"])
phone

Unnamed: 0,original_number
0,9704443106
1,(541) 471 3918
2,(814)-462-8074
3,(603)281-0308


In [3]:
phone_pattern = '.?(\d{3}).*(\d{3}).*(\d{4})'
phone['area_code'] = phone.original_number.apply(lambda x: re.match(phone_pattern, x).group(1))
phone['exchange'] = phone.original_number.apply(lambda x: re.match(phone_pattern, x).group(2))
phone['line_number'] = phone.original_number.apply(lambda x: re.match(phone_pattern, x).group(3))
phone

Unnamed: 0,original_number,area_code,exchange,line_number
0,9704443106,970,444,3106
1,(541) 471 3918,541,471,3918
2,(814)-462-8074,814,462,8074
3,(603)281-0308,603,281,308


In [4]:
original_number = "(541) 471 3918"
res = re.match(phone_pattern, original_number)
res.groups()

('541', '471', '3918')

# Date

In [5]:
date = pd.DataFrame({'20-02-2019', '15-07-2020', '14-09-2021'}, columns = ['original_date'])
date

Unnamed: 0,original_date
0,20-02-2019
1,14-09-2021
2,15-07-2020


In [6]:
date[['day', 'month', 'year']] = date.original_date.str.split("-", expand = True)
date

Unnamed: 0,original_date,day,month,year
0,20-02-2019,20,2,2019
1,14-09-2021,14,9,2021
2,15-07-2020,15,7,2020


In [7]:
date = pd.DataFrame({'20-02-2019', '15/07/2020', '14.09.2021'}, columns = ['original_date'])
date

Unnamed: 0,original_date
0,20-02-2019
1,14.09.2021
2,15/07/2020


In [8]:
date_pattern = '(\d{2}).(\d{2}).(\d{4})'
date['day'] = date.original_date.apply(lambda x: re.match(date_pattern, x).group(1))
date['month'] = date.original_date.apply(lambda x: re.match(date_pattern, x).group(2))
date['year'] = date.original_date.apply(lambda x: re.match(date_pattern, x).group(3))
date

Unnamed: 0,original_date,day,month,year
0,20-02-2019,20,2,2019
1,14.09.2021,14,9,2021
2,15/07/2020,15,7,2020


# Names

In [9]:
names = pd.DataFrame({"Smith, Mr. John", "Davis, Ms Nicole", "Robinson, Mrs. Rebecca", "Armstrong, Dr Sam", "Downey, Mr. Robert"},
                    columns = ['full_name'])
names

Unnamed: 0,full_name
0,"Smith, Mr. John"
1,"Davis, Ms Nicole"
2,"Robinson, Mrs. Rebecca"
3,"Armstrong, Dr Sam"
4,"Downey, Mr. Robert"


In [10]:
name_pattern = '(\w+),\s(Mr|Ms|Mrs|Dr).?\s(\w+)'
names['family_name'] = names.full_name.apply(lambda x: re.match(name_pattern, x).group(1))
names['title'] = names.full_name.apply(lambda x: re.match(name_pattern, x).group(2))
names['given_name'] = names.full_name.apply(lambda x: re.match(name_pattern, x).group(3))
names

Unnamed: 0,full_name,family_name,title,given_name
0,"Smith, Mr. John",Smith,Mr,John
1,"Davis, Ms Nicole",Davis,Ms,Nicole
2,"Robinson, Mrs. Rebecca",Robinson,Mrs,Rebecca
3,"Armstrong, Dr Sam",Armstrong,Dr,Sam
4,"Downey, Mr. Robert",Downey,Mr,Robert


In [11]:
names[names.given_name.str.contains('^R')]

Unnamed: 0,full_name,family_name,title,given_name
2,"Robinson, Mrs. Rebecca",Robinson,Mrs,Rebecca
4,"Downey, Mr. Robert",Downey,Mr,Robert


In [12]:
names[names.given_name.str.startswith('R')]

Unnamed: 0,full_name,family_name,title,given_name
2,"Robinson, Mrs. Rebecca",Robinson,Mrs,Rebecca
4,"Downey, Mr. Robert",Downey,Mr,Robert


# URL

In [13]:
url = pd.DataFrame({"https://www.google.com/gmail", "http://www.medium.com", "https://twitter.com/home"},
                  columns = ['full_url'])
url

Unnamed: 0,full_url
0,https://twitter.com/home
1,https://www.google.com/gmail
2,http://www.medium.com


In [14]:
url_pattern = "(https?)://(www)?.?(\w+).(\w+)/?(\w+)?"
url['schema'] = url.full_url.apply(lambda x: re.match(url_pattern, x).group(1))
url['subdomain'] = url.full_url.apply(lambda x: re.match(url_pattern, x).group(2))
url['second_level_domain'] = url.full_url.apply(lambda x: re.match(url_pattern, x).group(3))
url['top_level_domain'] = url.full_url.apply(lambda x: re.match(url_pattern, x).group(4))
url['subdirectory'] = url.full_url.apply(lambda x: re.match(url_pattern, x).group(5))
url

Unnamed: 0,full_url,schema,subdomain,second_level_domain,top_level_domain,subdirectory
0,https://twitter.com/home,https,,witter,com,home
1,https://www.google.com/gmail,https,www,google,com,gmail
2,http://www.medium.com,http,www,medium,com,


# Email address

In [15]:
email = pd.DataFrame({"jasonchong_98@hotmail.com", 
              "jason-chong-14@unimelb.edu.au",
              "Jason.Chong@quantium.com.au"}, columns = ['full_email'])
email

Unnamed: 0,full_email
0,Jason.Chong@quantium.com.au
1,jason-chong-14@unimelb.edu.au
2,jasonchong_98@hotmail.com


In [16]:
email_pattern = "([a-zA-Z0-9\_\-\.]+)@([a-zA-Z]+).(.+)"
email['username'] = email.full_email.apply(lambda x: re.match(email_pattern, x).group(1))
email['domain_name'] = email.full_email.apply(lambda x: re.match(email_pattern, x).group(2))
email['domain'] = email.full_email.apply(lambda x: re.match(email_pattern, x).group(3))
email

Unnamed: 0,full_email,username,domain_name,domain
0,Jason.Chong@quantium.com.au,Jason.Chong,quantium,com.au
1,jason-chong-14@unimelb.edu.au,jason-chong-14,unimelb,edu.au
2,jasonchong_98@hotmail.com,jasonchong_98,hotmail,com


# Address

In [17]:
address = pd.DataFrame({"21 Bungana Drive, Kybunga SA 5453", 
                "Thomas Lane, Fitzroy North VIC 3068",
                "107 Quayside Vista, Kingston ACT 2604",
                "94 Prince Street, Lower Coldstream NSW 2460", 
                "George Street, Brisbane QLD 4000"}, columns = ['full_address'])
address

Unnamed: 0,full_address
0,"107 Quayside Vista, Kingston ACT 2604"
1,"21 Bungana Drive, Kybunga SA 5453"
2,"George Street, Brisbane QLD 4000"
3,"Thomas Lane, Fitzroy North VIC 3068"
4,"94 Prince Street, Lower Coldstream NSW 2460"


In [18]:
address_pattern = "(\d*)\s?(.+),\s(.+)\s([A-Z]{2,3})\s(\d{4})"
address['house_number'] = address.full_address.apply(lambda x: re.match(address_pattern, x).group(1))
address['street_name'] = address.full_address.apply(lambda x: re.match(address_pattern, x).group(2))
address['suburb'] = address.full_address.apply(lambda x: re.match(address_pattern, x).group(3))
address['state'] = address.full_address.apply(lambda x: re.match(address_pattern, x).group(4))
address['postcode'] = address.full_address.apply(lambda x: re.match(address_pattern, x).group(5))
address

Unnamed: 0,full_address,house_number,street_name,suburb,state,postcode
0,"107 Quayside Vista, Kingston ACT 2604",107.0,Quayside Vista,Kingston,ACT,2604
1,"21 Bungana Drive, Kybunga SA 5453",21.0,Bungana Drive,Kybunga,SA,5453
2,"George Street, Brisbane QLD 4000",,George Street,Brisbane,QLD,4000
3,"Thomas Lane, Fitzroy North VIC 3068",,Thomas Lane,Fitzroy North,VIC,3068
4,"94 Prince Street, Lower Coldstream NSW 2460",94.0,Prince Street,Lower Coldstream,NSW,2460
