In [1]:
import pandas as pd
import numpy as np

In [2]:
# Sample input for the name-cleaning steps: full names with inconsistent
# casing and stray leading/trailing spaces, plus one missing entry.
data = {
    'Full Name': [
        '   alice johnson   ',
        'BOB SMITH',
        '  charlie brown ',
        None  # missing entry
    ]
}
names = pd.DataFrame(data)

In [3]:
print(names)

             Full Name
0     alice johnson   
1            BOB SMITH
2       charlie brown 
3                 None


In [4]:
# Normalise each name: trim the surrounding whitespace, then title-case it.
names['Clean Name'] = (
    names['Full Name']
    .str.strip()
    .str.title()
)

In [5]:
print(names)

             Full Name     Clean Name
0     alice johnson     Alice Johnson
1            BOB SMITH      Bob Smith
2       charlie brown   Charlie Brown
3                 None           None


In [6]:
# Tokenise the cleaned name on whitespace: the first token becomes the first
# name and the final token becomes the last name.
names['First Name'] = names['Clean Name'].str.split().str.get(0)
names['Last Name'] = names['Clean Name'].str.split().str.get(-1)

In [7]:
names

Unnamed: 0,Full Name,Clean Name,First Name,Last Name
0,alice johnson,Alice Johnson,Alice,Johnson
1,BOB SMITH,Bob Smith,Bob,Smith
2,charlie brown,Charlie Brown,Charlie,Brown
3,,,,


In [8]:
# Sample contacts: ten names, each paired by position with an email address.
data = {
    'Name': [
        'Alice Smith', 'Bob Johnson', 'Charlie Lee', 'Diana Brown',
        'Evan Davis', 'Fiona Miller', 'George Wilson', 'Hannah Taylor',
        'Ian Clark', 'Julia Roberts'
    ],
    'Email': [
        'alice.smith@example.com',
        'bob.johnson@gmail.com',
        'charlie.lee@yahoo.com',
        'diana.brown@outlook.com',
        'evan.davis@domain.com',
        'fiona.miller@service.org',
        'george.wilson@mail.com',
        'hannah.taylor@provider.net',
        'ian.clark@webmail.co',
        'julia.roberts@internet.com'
    ]
}

df = pd.DataFrame(data)

In [9]:
df

Unnamed: 0,Name,Email
0,Alice Smith,alice.smith@example.com
1,Bob Johnson,bob.johnson@gmail.com
2,Charlie Lee,charlie.lee@yahoo.com
3,Diana Brown,diana.brown@outlook.com
4,Evan Davis,evan.davis@domain.com
5,Fiona Miller,fiona.miller@service.org
6,George Wilson,george.wilson@mail.com
7,Hannah Taylor,hannah.taylor@provider.net
8,Ian Clark,ian.clark@webmail.co
9,Julia Roberts,julia.roberts@internet.com


In [10]:
# Capture the run of word characters, dots and hyphens that follows the "@".
# expand=False returns a Series for the single capture group.
df['Domain'] = df['Email'].str.extract(r"@([\w.-]+)", expand=False)

In [11]:
df

Unnamed: 0,Name,Email,Domain
0,Alice Smith,alice.smith@example.com,example.com
1,Bob Johnson,bob.johnson@gmail.com,gmail.com
2,Charlie Lee,charlie.lee@yahoo.com,yahoo.com
3,Diana Brown,diana.brown@outlook.com,outlook.com
4,Evan Davis,evan.davis@domain.com,domain.com
5,Fiona Miller,fiona.miller@service.org,service.org
6,George Wilson,george.wilson@mail.com,mail.com
7,Hannah Taylor,hannah.taylor@provider.net,provider.net
8,Ian Clark,ian.clark@webmail.co,webmail.co
9,Julia Roberts,julia.roberts@internet.com,internet.com


In [12]:
# Sample product descriptions: three contain a number and punctuation,
# one contains no digits at all.
data = {
    'Description': [
        "Product 123: The best gadget!",
        "Model 456 is now available.",
        "No numbers here, just text.",
        "Version 789 update released."
    ]
}
desc = pd.DataFrame(data)

In [13]:
print(desc)

                     Description
0  Product 123: The best gadget!
1    Model 456 is now available.
2    No numbers here, just text.
3   Version 789 update released.


In [14]:
# Remove the punctuation marks . ! , : from every description.
# Note: this overwrites the 'Description' column with the stripped text.
desc['Description'] = desc['Description'].str.replace(
    pat=r"[.!,:]",
    repl='',
    regex=True,
)

In [15]:
desc

Unnamed: 0,Description
0,Product 123 The best gadget
1,Model 456 is now available
2,No numbers here just text
3,Version 789 update released


In [16]:
# Pull the first run of digits out of each description; rows without any
# digits get a missing value. expand=False returns a Series for the single
# capture group.
desc['Numbers'] = desc['Description'].str.extract(r"(\d+)", expand=False)

In [17]:
desc

Unnamed: 0,Description,Numbers
0,Product 123 The best gadget,123.0
1,Model 456 is now available,456.0
2,No numbers here just text,
3,Version 789 update released,789.0


In [18]:
# Sample addresses written as "street, city, state" with comma separators.
data = {
    'Address': [
        "123 Main St, Springfield, IL",
        "456 Oak Ave, Metropolis, NY",
        "789 Pine Rd, Gotham, NJ",
        "101 Maple Dr, Star City, CA",
        "202 Birch Ln, Smallville, KS"
    ]
}

df = pd.DataFrame(data)

In [19]:
df

Unnamed: 0,Address
0,"123 Main St, Springfield, IL"
1,"456 Oak Ave, Metropolis, NY"
2,"789 Pine Rd, Gotham, NJ"
3,"101 Maple Dr, Star City, CA"
4,"202 Birch Ln, Smallville, KS"


In [20]:
# Split each address into street / city / state on commas. Splitting from the
# right with n=2 caps the result at three parts, so an extra comma inside the
# street portion (e.g. "12 Elm St, Apt 4, Springfield, IL") stays in 'Street'
# instead of producing a fourth column and failing the three-column assignment.
# Whitespace around the pieces is not removed by this step.
df[['Street', 'City', 'State']] = df['Address'].str.rsplit(',', n=2, expand=True)

In [21]:
# Trim the whitespace surrounding the street, city and state values.
df[['Street', 'City', 'State']] = df[['Street', 'City', 'State']].apply(
    lambda column: column.str.strip()
)

In [22]:
df

Unnamed: 0,Address,Street,City,State
0,"123 Main St, Springfield, IL",123 Main St,Springfield,IL
1,"456 Oak Ave, Metropolis, NY",456 Oak Ave,Metropolis,NY
2,"789 Pine Rd, Gotham, NJ",789 Pine Rd,Gotham,NJ
3,"101 Maple Dr, Star City, CA",101 Maple Dr,Star City,CA
4,"202 Birch Ln, Smallville, KS",202 Birch Ln,Smallville,KS


In [23]:
# Join street, city and state back into one colon-separated string.
# Series.str.cat is the vectorised string concatenation, replacing a row-wise
# apply with a Python lambda. If any of the three parts is missing, the joined
# value for that row is missing as well (the lambda version raised TypeError).
df['Re-joined'] = df['Street'].str.cat([df['City'], df['State']], sep=':')

In [24]:
df

Unnamed: 0,Address,Street,City,State,Re-joined
0,"123 Main St, Springfield, IL",123 Main St,Springfield,IL,123 Main St:Springfield:IL
1,"456 Oak Ave, Metropolis, NY",456 Oak Ave,Metropolis,NY,456 Oak Ave:Metropolis:NY
2,"789 Pine Rd, Gotham, NJ",789 Pine Rd,Gotham,NJ,789 Pine Rd:Gotham:NJ
3,"101 Maple Dr, Star City, CA",101 Maple Dr,Star City,CA,101 Maple Dr:Star City:CA
4,"202 Birch Ln, Smallville, KS",202 Birch Ln,Smallville,KS,202 Birch Ln:Smallville:KS


In [25]:
# Sample phone numbers: the same ten digits written with different
# punctuation and spacing, one with a "+1" prefix, and one missing entry.
data = {
    'Phone': [
        "(123) 456-7890",
        "123-456-7890",
        "123.456.7890",
        "123 456 7890",
        "+1 (123) 456-7890",
        "1234567890",
        None
    ]
}
df = pd.DataFrame(data)

In [26]:
df

Unnamed: 0,Phone
0,(123) 456-7890
1,123-456-7890
2,123.456.7890
3,123 456 7890
4,+1 (123) 456-7890
5,1234567890
6,


In [27]:
# Keep only the digits of each phone number by deleting every non-digit
# character. Note: this overwrites the 'Phone' column.
df['Phone'] = df['Phone'].str.replace(
    pat=r"[^\d]",
    repl='',
    regex=True,
)

In [28]:
df

Unnamed: 0,Phone
0,1234567890.0
1,1234567890.0
2,1234567890.0
3,1234567890.0
4,11234567890.0
5,1234567890.0
6,


In [29]:
def std(n):
    """Format a phone number as ``(AAA) BBB-CCCC``.

    Non-digit characters are discarded before formatting, so both bare digit
    strings and already punctuated numbers are accepted. This also makes the
    function safe to re-apply to its own output.

    Parameters
    ----------
    n : str or missing
        Phone number text, or a missing value (``None`` / ``NaN``).

    Returns
    -------
    str or None
        The formatted number when ``n`` contains exactly 10 digits, or 11
        digits with a leading ``1`` (the leading ``1`` is dropped). A missing
        input is returned unchanged. Any other input yields ``None``.
    """
    if pd.isna(n):
        return n

    # Keep decimal digits only; this is the same character class as regex \d.
    digits = ''.join(ch for ch in n if ch.isdecimal())

    # An 11-digit number starting with 1 is reduced to its last ten digits
    # (presumably the "+1" prefix seen in the sample data).
    if len(digits) == 11 and digits.startswith('1'):
        digits = digits[1:]

    if len(digits) == 10:
        return f'({digits[:3]}) {digits[3:6]}-{digits[6:]}'

    # Not a recognisable 10/11-digit number: report it as missing explicitly
    # rather than by falling off the end of the function.
    return None
        

In [30]:
# Run the formatter over every phone value, one element at a time.
df['Standardize'] = df['Phone'].map(std)

In [31]:
df

Unnamed: 0,Phone,Standardize
0,1234567890.0,(123) 456-7890
1,1234567890.0,(123) 456-7890
2,1234567890.0,(123) 456-7890
3,1234567890.0,(123) 456-7890
4,11234567890.0,(123) 456-7890
5,1234567890.0,(123) 456-7890
6,,


In [32]:
# Sample posts: some with one or more "#" tags, one with none, and one
# missing entry.
data = {
    'Post': [
        "Loving the new features in #Python and #Pandas!",
        "Just attended a great workshop on #DataScience.",
        "No hashtags here.",
        "Mixing #fun and #learning every day! #coding",
        None
    ]
}
df = pd.DataFrame(data)

In [33]:
print(df)

                                              Post
0  Loving the new features in #Python and #Pandas!
1  Just attended a great workshop on #DataScience.
2                                No hashtags here.
3     Mixing #fun and #learning every day! #coding
4                                             None


In [34]:
# Collect every "#" followed by word characters into a per-post list.
# Posts without a match get an empty list; a missing post stays missing.
df['Hashtag'] = df['Post'].str.findall(pat=r"(#\w+)")

In [35]:
df

Unnamed: 0,Post,Hashtag
0,Loving the new features in #Python and #Pandas!,"[#Python, #Pandas]"
1,Just attended a great workshop on #DataScience.,[#DataScience]
2,No hashtags here.,[]
3,Mixing #fun and #learning every day! #coding,"[#fun, #learning, #coding]"
4,,
