# Data Formatting

![gif](imgs/DF005.gif)

## Import

In [1]:
import pandas as pd
import numpy as np
import re

## Regex

In [2]:
# Sample string whose words are separated by irregular runs of spaces and tabs.
text = 'foo    bar\t bax \tqux'
text

'foo    bar\t bax \tqux'

In [3]:
# Split on runs of whitespace. The pattern is a raw string so that `\s` reaches
# the regex engine verbatim instead of being parsed as an (invalid) Python
# string escape sequence.
re.split(r'\s+', text)

['foo', 'bar', 'bax', 'qux']

### compile

In [4]:
# Compile the whitespace pattern once so the pattern object can be reused.
# Raw string: `\s` is a regex escape, not a valid Python string escape.
regex = re.compile(r'\s+')
regex

re.compile(r'\s+', re.UNICODE)

In [5]:
# Same split as the module-level re.split call, using the precompiled pattern object.
regex.split(text)

['foo', 'bar', 'bax', 'qux']

### findall

In [6]:
# findall returns every non-overlapping match; here, the whitespace separators themselves.
regex.findall(text)

['    ', '\t ', ' \t']

### search

In [7]:
# search returns a match object for the first occurrence only (None when nothing matches).
regex.search(text)

<_sre.SRE_Match object; span=(3, 7), match='    '>

### e-mail example

In [8]:
# Simplified e-mail pattern: local part, '@', domain, then a dot and a 2-4 letter suffix.
# The character classes list only A-Z, so lower-case input matches solely because
# the pattern is compiled with re.IGNORECASE below.
pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}'

# Rebinds `regex`, replacing the whitespace pattern compiled earlier.
regex = re.compile(pattern=pattern, flags=re.IGNORECASE)
regex

re.compile(r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}', re.IGNORECASE|re.UNICODE)

#### findall

In [9]:
# Rebinds `text` to a two-line sample that contains one address per line.
text = '''Andy beardness.andy@gmail.com — actual mail
Andy2 andy.beardness@gmail.com — old second version
'''

# All addresses found in the text, returned as plain strings (the pattern has no groups).
regex.findall(text)

['beardness.andy@gmail.com', 'andy.beardness@gmail.com']

#### search

In [10]:
# Keep the match object for the first address so its position can be inspected.
m = regex.search(text)
m

<_sre.SRE_Match object; span=(5, 29), match='beardness.andy@gmail.com'>

In [11]:
# Slice the original text with the match boundaries to recover the matched address.
text[slice(*m.span())]

'beardness.andy@gmail.com'

#### match

In [12]:
# match only succeeds when the pattern matches at the very beginning of the string;
# the text starts with "Andy ", so the result is None. print() is used because a
# bare None is not displayed as cell output.
print(regex.match(text))

None


#### sub

In [13]:
# Replace every address with a placeholder; print() renders the newlines in the result.
print(regex.sub('THERE_WAS_MAIL', text))

Andy THERE_WAS_MAIL — actual mail
Andy2 THERE_WAS_MAIL — old second version



#### mail components splitting

In [14]:
# E-mail pattern with three capture groups: (user)@(domain).(suffix).
# NOTE(review): the domain class here is [A-Z0-9._], whereas the ungrouped pattern
# defined earlier used [A-Z0-9.-]; hyphenated domains will not match this version.
# Confirm whether the difference is intended.
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9._]+)\.([A-Z]{2,4})'

# Rebinds `regex` to the grouped pattern, again case-insensitive.
regex = re.compile(pattern=pattern, flags=re.IGNORECASE)
regex

re.compile(r'([A-Z0-9._%+-]+)@([A-Z0-9._]+)\.([A-Z]{2,4})',
re.IGNORECASE|re.UNICODE)

In [15]:
# With capture groups, findall yields one (user, domain, suffix) tuple per match.
regex.findall(text)

[('beardness.andy', 'gmail', 'com'), ('andy.beardness', 'gmail', 'com')]

In [16]:
# \1, \2 and \3 in the replacement refer to the three capture groups of the pattern.
print(regex.sub(r'User: \1, Domain: \2, Suffix: \3', text))

Andy User: beardness.andy, Domain: gmail, Suffix: com — actual mail
Andy2 User: andy.beardness, Domain: gmail, Suffix: com — old second version



## Vector string functions

In [17]:
# One e-mail address per person, with a missing value for 'Bunas'.
# Despite its name, `df` is a pandas Series (name 0, object dtype); the name is
# kept because the following cells refer to it.
# The Series is built directly rather than via a one-row DataFrame that is
# transposed and then reduced to its only column, so `df` is bound exactly once.
df = pd.Series(
    {
        'Bash': 'joelc@google.com',
        'Talan': 'talan@gmail.com',
        'Rob': 'fast@gmail.com',
        'Bunas': np.nan,
    },
    name=0,
    dtype=object,  # keep plain object dtype so the missing entry stays NaN
)
df

Bash     joelc@google.com
Talan     talan@gmail.com
Rob        fast@gmail.com
Bunas                 NaN
Name: 0, dtype: object

In [18]:
# Boolean mask that is True where the address is missing.
df.isnull()

Bash     False
Talan    False
Rob      False
Bunas     True
Name: 0, dtype: bool

### str

In [19]:
# Element-wise substring test via the .str accessor; the missing entry stays NaN
# in the result instead of becoming False.
df.str.contains('gmail')

Bash     False
Talan     True
Rob       True
Bunas      NaN
Name: 0, dtype: object

In [20]:
# Show the grouped e-mail pattern that is still bound to `pattern`.
pattern

'([A-Z0-9._%+-]+)@([A-Z0-9._]+)\\.([A-Z]{2,4})'

In [21]:
# Apply the grouped pattern to every element; each non-missing entry becomes a
# list of (user, domain, suffix) tuples, and the missing entry stays NaN.
matches = df.str.findall(pattern, flags=re.IGNORECASE)
matches

Bash     [(joelc, google, com)]
Talan     [(talan, gmail, com)]
Rob        [(fast, gmail, com)]
Bunas                       NaN
Name: 0, dtype: object

In [22]:
# .str[0] indexes into each element, taking the first tuple of every list.
matches.str[0]

Bash     (joelc, google, com)
Talan     (talan, gmail, com)
Rob        (fast, gmail, com)
Bunas                     NaN
Name: 0, dtype: object