In [94]:
import pandas as pd
import regex as re

In [95]:
# Sample schedule notes: one sentence per weekday, each mentioning at least one clock time.
time_sentences = [
    "Monday: The doctor's appointment is at 2:45pm.",
    "Tuesday: The dentist's appointment is at 11:30am.",
    "Wednesday: At 7:00pm, there is a basketball game!",
    "Thursday: Be back home by 11:15pm at the latest.",
    "Friday: Take the train at 08:10am, arrive at 09:00am.",
]

# Single-column frame holding the raw sentences under 'text'.
df = pd.DataFrame({'text': time_sentences})
df

Unnamed: 0,text
0,Monday: The doctor's appointment is at 2:45pm.
1,Tuesday: The dentist's appointment is at 11:30am.
2,"Wednesday: At 7:00pm, there is a basketball game!"
3,Thursday: Be back home by 11:15pm at the latest.
4,"Friday: Take the train at 08:10am, arrive at 0..."


In [96]:
# Character count of every sentence in the 'text' column.
char_counts = df['text'].str.len()
char_counts

0    46
1    49
2    49
3    48
4    53
Name: text, dtype: int64

In [97]:
# Token count per sentence: split on single spaces, then measure each resulting list.
tokens_per_row = df['text'].str.split(' ')
tokens_per_row.str.len()

0    7
1    7
2    8
3    9
4    9
Name: text, dtype: int64

In [98]:
# Keep only the rows whose sentence mentions "appointment".
mentions_appointment = df['text'].str.contains('appointment')
df.loc[mentions_appointment]

Unnamed: 0,text
0,Monday: The doctor's appointment is at 2:45pm.
1,Tuesday: The dentist's appointment is at 11:30am.


In [99]:
# Count how many digit characters appear in each row's sentence.
digit_counts = df['text'].str.count(r'\d')
digit_counts

0    3
1    4
2    3
3    4
4    8
Name: text, dtype: int64

In [100]:
# Select the rows whose sentence contains at least one digit.
# This filters rows; it does not list the digits themselves.
# Left as the cell's last expression so the frame renders through the notebook's
# rich display, consistent with the other cells, instead of a plain-text print().
df[df['text'].str.contains(r'\d')]

                                                text
0     Monday: The doctor's appointment is at 2:45pm.
1  Tuesday: The dentist's appointment is at 11:30am.
2  Wednesday: At 7:00pm, there is a basketball game!
3   Thursday: Be back home by 11:15pm at the latest.
4  Friday: Take the train at 08:10am, arrive at 0...


In [110]:
# Capture an (hour, minute) pair for every clock time mentioned in each sentence.
hour_minute_pattern = r'(\d?\d):(\d\d)'
df['text'].str.findall(hour_minute_pattern)

0               [(2, 45)]
1              [(11, 30)]
2               [(7, 00)]
3              [(11, 15)]
4    [(08, 10), (09, 00)]
Name: text, dtype: object

In [80]:
# Replace every weekday name with '???'.
# regex=True is required: since pandas 2.0, Series.str.replace treats the pattern
# as a literal string by default, so r'\w+day' would otherwise match nothing and
# the sentences would come back unchanged.
df['text'].str.replace(r'\w+day', '???', regex=True)

0          ???: The doctor's appointment is at 2:45pm.
1        ???: The dentist's appointment is at 11:30am.
2          ???: At 7:00pm, there is a basketball game!
3          ???: Be back home by 11:15pm at the latest.
4    ???: Take the train at 08:10am, arrive at 09:0...
Name: text, dtype: object

In [81]:
# Replace weekday names with their three-letter abbreviations (e.g. "Monday" -> "Mon").
# A callable replacement is only accepted when regex=True; pandas >= 2.0 defaults to
# regex=False and raises ValueError without it.
# NOTE: this overwrites df['text'], so every later cell sees the abbreviated sentences.
# Re-running the cell changes nothing further, because the abbreviated names no longer
# end in "day" and therefore no longer match the pattern.
df['text'] = df['text'].str.replace(
    r'(\w+day\b)',
    lambda match: match.group(1)[:3],  # group 1 is the whole weekday name
    regex=True,
)

In [86]:
# Split the first clock time found in each sentence into separate columns,
# one per capture group (hour, then minute).
df['text'].str.extract(pat=r'(\d?\d):(\d\d)', expand=True)

Unnamed: 0,0,1
0,2,45
1,11,30
2,7,0
3,11,15
4,8,10


In [114]:
# Pull out every clock time in each sentence, one result row per match:
# the full time, the hour, the minute, and the am/pm period.
clock_time_pattern = r'((\d?\d):(\d\d) ?([ap]m))'
df['text'].str.extractall(clock_time_pattern)

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2,3
Unnamed: 0_level_1,match,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0,2:45pm,2,45,pm
1,0,11:30am,11,30,am
2,0,7:00pm,7,0,pm
3,0,11:15pm,11,15,pm
4,0,08:10am,8,10,am
4,1,09:00am,9,0,am


In [88]:
# Same extraction as an unnamed-group pattern would give, but with named groups so the
# resulting columns are labelled time / hour / minute / period instead of 0..3.
df['text'].str.extractall(
    pat=r'(?P<time>(?P<hour>\d?\d):(?P<minute>\d\d) ?(?P<period>[ap]m))'
)

Unnamed: 0_level_0,Unnamed: 1_level_0,time,hour,minute,period
Unnamed: 0_level_1,match,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0,2:45pm,2,45,pm
1,0,11:30am,11,30,am
2,0,7:00pm,7,0,pm
3,0,11:15pm,11,15,pm
4,0,08:10am,8,10,am
4,1,09:00am,9,0,am


In [118]:
# Small Series of strings in which 'xyz' is absent, at the end, and in the middle.
s = pd.Series(['abv', 'abcxyz', 'xyza'])

In [123]:
# '$' anchors the match to the end of the string, so only values ending in 'xyz' match.
s.str.findall(pat=r'xyz$')

0       []
1    [xyz]
2       []
dtype: object