#### Strings (text)

What a string is : Immutable sequence of Unicode code points.

##### Essential creation & inspection

In [None]:
s = "Hello, TechConvos!"

# length
len(s)
print("variable length is: >", len(s))

# indexing
print("\nFirst Char of variable :>", s[0])
print("nLast char of variable : >", s[-1])

# slicing, reverse
# syntax of slicing Start:End:interval
print(s[2:7])

print("Reverse char of variable: >",s[::-1])

variable length is: > 18

First Char of variable :> H
nLast char of variable : > !
llo, 
Reverse char of variable: > !sovnoChceT ,olleH


#### Trimming & normalizing whitespace

In [82]:
# STRING MANIPULATION & WHITESPACE HANDLING IN PYTHON
# ----------------------------------------------
# Define three strings with different spacing issues

name = "Dhiraj     Mishra"        # This string has multiple spaces **between** the words.
leftstrip = " Tech Convos"        # This string has one leading space at the beginning.
right_strip = "Tech Convos "      # This string has one trailing space at the end.

# -----------------------------------------------------
# STEP 1: CALCULATE LEADING AND TRAILING SPACES

# Count how many spaces are on the **left side** (leading) of 'name'
# len(name) gives total length, len(name.lstrip()) removes leading spaces
left_space_name =  len(name) - len(name.lstrip())

# Count how many spaces are on the **right side** (trailing) of 'name'
right_space_name = len(name) - len(name.rstrip())

# Repeat the same logic for 'leftstrip'
lspace_leftstrip = len(leftstrip) - len(leftstrip.lstrip())

# Repeat the same logic for 'righttstrip'

rspace_rstrip = len(right_strip) - len(right_strip.rstrip())

# PRINT ORIGINAL STRINGS (with repr to show hidden spaces)
print("=== ORIGINAL STRINGS ===\n")
print("name :> ", repr(name))           # repr() shows the string with quotes and visible spaces
print("leftstrip", repr(leftstrip))     # Useful for debugging leading/trailing whitespace
print("right_tstrip", repr(right_strip))

# DISPLAY COUNT OF SPACES

print("\n===Space Count====")
print("Left spaces in 'name':", left_space_name)
print("Right spaces in 'name':", right_space_name)

print("Left spaces in 'leftstrip':", lspace_leftstrip)
print("Right spaces in 'right_strip':", rspace_rstrip )


# NORMALIZE 'name' (Collapse multiple spaces to a single space)
# name.split() splits string into words (splits on ANY whitespace)
# " ".join(...) joins the words with **a single space**

clean_name = " ".join(name.split())

print("\n=== NORMALIZED NAME ===")

print("Original length: >", len(name))
print("Cleaned Length: >", len(clean_name))
print("Cleaned output: >",clean_name)

#  STRIP SPACES FROM STRINGS
# .strip() removes spaces from both ends
trimed_name = name.strip()


# .strip() works the same for the other strings
trimed_leftstrip = leftstrip.strip()
trimed_rightstrip = right_strip.strip()


print("\n=== TRIMMED STRINGS ===")
print("Trimmed 'name':", repr(trimed_name))               # No leading/trailing spaces
print("Trimmed 'leftstrip':", repr(trimed_leftstrip))     # Should remove 1 leading space
print("Trimmed 'right_strip':", repr(trimed_rightstrip))  # Should remove 1 trailing space

# RECONSTRUCT STRINGS WITH PRESERVED PADDING

# Recreate the left-padded version of 'name' by prepending the counted spaces

left_padded_name = " " * left_space_name + name


# Do the same for 'leftstrip' and 'right_strip'
left_padded_leftstrip = " " * lspace_leftstrip + leftstrip
right_padded_rightstrip = right_strip + " " * rspace_rstrip

print("\n=== STRINGS WITH PRESERVED PADDING ===")
print("Left-padded 'name':", repr(left_padded_name))
print("Right-padded 'name':", repr(right_padded_name))
print("Left-padded 'leftstrip':", repr(left_padded_leftstrip))
print("Right-padded 'right_strip':", repr(right_padded_rightstrip))

=== ORIGINAL STRINGS ===

name :>  'Dhiraj     Mishra'
leftstrip ' Tech Convos'
right_tstrip 'Tech Convos '

===Space Count====
Left spaces in 'name': 0
Right spaces in 'name': 0
Left spaces in 'leftstrip': 1
Right spaces in 'right_strip': 1

=== NORMALIZED NAME ===
Original length: > 17
Cleaned Length: > 13
Cleaned output: > Dhiraj Mishra

=== TRIMMED STRINGS ===
Trimmed 'name': 'Dhiraj     Mishra'
Trimmed 'leftstrip': 'Tech Convos'
Trimmed 'right_strip': 'Tech Convos'

=== STRINGS WITH PRESERVED PADDING ===
Left-padded 'name': 'Dhiraj     Mishra'
Right-padded 'name': 'Dhiraj     Mishra'
Left-padded 'leftstrip': '  Tech Convos'
Right-padded 'right_strip': 'Tech Convos  '


#### Splitting (your key request) & joining

In [11]:
text = "Python   for   Data   &   BI"
token = text.split()  # ['Python', 'for', 'Data', '&', 'BI']

print(token)


['Python', 'for', 'Data', '&', 'BI']


#### Split on a specific delimiter

In [18]:
# Step 1: Define the full file path as a string
path = "C:/Data/Raw/olist_orders.csv"

# Step 2: Split the path string at each "/" to separate the components
# This will create a list of parts: drive letter, folders, and the filename
paths = path.split("/")   # Output: ['C:', 'Data', 'Raw', 'olist_orders.csv']

# Step 3: Use unpacking to extract the first and last elements from the list
# root      -> 'C:'        (the drive letter or root folder)
# *_        -> ['Data', 'Raw'] (the middle folders, ignored here with _)
# filename  -> 'olist_orders.csv' (the actual file name at the end of the path)
root, *_, filename = paths

# Step 4: Define a string that simulates a row of data, with fields separated by "|"
row = "Dhiraj|Mishra|Team Lead"

# Step 5: Split the row at the "|" symbol to get a list of individual fields
# Output: ['Dhiraj', 'Mishra', 'Team Lead']
# (Although you’re not storing the result here, this line splits the row)
print(row.split("|"))

# Step 6: Print the extracted root (e.g., 'C:')
print(root)

# Step 7: Print the extracted filename (e.g., 'olist_orders.csv')
print(filename)

# Step 8: Print the full list of path parts
# Output: ['C:', 'Data', 'Raw', 'olist_orders.csv']
print(paths)


['Dhiraj', 'Mishra', 'Team Lead']
C:
olist_orders.csv
['C:', 'Data', 'Raw', 'olist_orders.csv']


#### Limit splits (performance & control)

In [5]:
record = "id=42;name=Dhiraj;role=Team Lead;dept=BI"

first, rest = record.split(";",1)  # split only once (left to right)


# first: 'id=42', rest: 'name=Dhiraj;role=Team Lead;dept=BI'


print(first)
print(rest)

id=42
name=Dhiraj;role=Team Lead;dept=BI


#### Right split (useful for filenames, trailing fields)

In [11]:
# Assign the filename as a string
fname = "report.2025.08.26.final.csv"

# Split the filename from the right at the last '.' to separate the extension
# 'rsplit(".", 1)' means: split from the right, only once
# This will return ['report.2025.08.26.final', 'csv']
base, ext = fname.rsplit(".", 1)

# Print the base part of the filename (everything before the last dot)
print(base)
print(ext)
# Print the extension (everything after the last dot

report.2025.08.26.final
csv


#### Lines handling (robust across OS)

In [15]:
# Define a string with mixed newline characters (\r\n and \n)
doc = "line1\r\nline2\nline3"

# Split the string into lines, handling all newline types
lines = doc.splitlines()

# Print the original string (will display as 3 lines)
print(doc)

line1
line2
line3


#### Re‑join tokens — the fast way to build strings

In [22]:
words = ["Python", "Data", "Fabric", "Snowflake"]
" | ".join(words)              # 'Python | Data | Fabric | Snowflake'
# Best approach: accumulate pieces in a list, ''.join(pieces) at the end.

'Python | Data | Fabric | Snowflake'

#### Searching & replacing

In [63]:
# Assign a string to the variable 's'
s = "Power BI with Python and Snowflake"

# Find the index of the first occurrence of the character 'i'
# Returns the index (0-based), or -1 if not found
search_output = s.find('i')

# Print the index where 'i' first appears in the string
print(search_output)

# Replace the substring "Power BI" with "Fabric" in 's'
# This returns a new string; 's' itself is not modified
replace_output = s.replace("Power BI", "Fabric")

# Print the modified string
print(replace_output)

10
Fabric with Python and Snowflake


#### Case handling (for normalization)

In [83]:
# Original email string in uppercase
email = "USER@EXAMPLE.COM"

# Convert the email to lowercase using .lower()
# This is useful for case-insensitive comparisons or deduplication (e.g. treating 'User@Example.com' and 'user@example.com' as the same)
email_lower = email.lower()

# Print the normalized lowercase email
print("original str : > ", email)
print("Lowercase email:", email_lower)  # Output: user@example.com

# --- International Case Handling Example ---

# Suppose we are working with a string containing international characters
# For example, the German letter "ß" (sharp S) which should be treated like "ss" in comparisons
german_word = "straße"

# Using .lower() — basic lowercase conversion (won't change ß)
lowered = german_word.lower()

# Using .casefold() — more aggressive and Unicode-aware
# It converts ß → ss, which is more appropriate for case-insensitive comparisons in some languages
folded = german_word.casefold()

# Print results to show the difference
print("Lowered:", lowered)    # Output: straße
print("Casefolded:", folded)  # Output: strasse


original str : >  USER@EXAMPLE.COM
Lowercase email: user@example.com
Lowered: straße
Casefolded: strasse


#### Formatting (f‑strings: best approach)

In [99]:
# Assign the user's name to a variable
user = 'dhiraj'

# Assign the user's role to a variable
role = 'team lead'

# Assign the year to a variable
year = 2025

# Convert 'Mastek' to uppercase using .upper()
msg = f"{user.upper()} - {role.upper()} @ {'Mastek'.upper()} {year}"

# Print the final message
print(msg)


DHIRAJ - TEAM LEAD @ MASTEK 2025


#### Validation & parsing (digits, alpha, etc.)

In [None]:
# --- isdigit() ---
# Only digits → returns True
print("12345".isdigit())    # True

# Contains a letter → returns False
print("123a5".isdigit())    # False


# --- isalpha() ---
# Only letters → returns True
print("abcXYZ".isalpha())   # True

# Contains a number → returns False
print("abc123".isalpha())   # False

# --- isalnum() ---
# Letters and numbers only → returns True
print("abc123".isalnum())   # True

# Contains a space → returns False
print("abc 123".isalnum())  # False


# --- isspace() ---
# Only spaces → returns True
print("   ".isspace())      # True

# Contains a tab and a letter → returns False
print(" \ta".isspace())     # False


True
False
True
False
True
False
True
False


#### Cleaning data (ETL‑style string scenarios)

_A) Normalize names (spaces, case, punctuation)_

In [115]:
def normalize_name(raw: str) -> str:
    """
    Normalize user-entered names:
    - Remove leading/trailing/multiple spaces
    - Fix inconsistent capitalization (e.g., dHiRaJ -> Dhiraj)
    Steps:
    1. raw.split() breaks the string into words, removing extra spaces
    2. " ".join(...) joins words back with single spaces
    3. .title() capitalizes the first letter of each word
    """
    core = " ".join(raw.split())     # Remove extra/multiple spaces
    return core.title()              # Capitalize each word correctly


# Example inputs (messy names)
name1 = "   dHiRaJ   mIsHrA   "
name2 = "JOHN    doe"
name3 = "  alice   o'connor  "
name4 = "eLon    MUsK"
name5 = "  mAry     AnnE  "

# Normalize each name using the function
print(normalize_name(name1))  # Output: Dhiraj Mishra
print(normalize_name(name2))  # Output: John Doe
print(normalize_name(name3))  # Output: Alice O'Connor
print(normalize_name(name4))  # Output: Elon Musk
print(normalize_name(name5))  # Output: Mary Anne


Dhiraj Mishra
John Doe
Alice O'Connor
Elon Musk
Mary Anne


_B) Extract domain from email_

In [117]:
def email_domain(email: str) -> str | None:
    email = email.strip()                      # Remove leading/trailing spaces
    if '@' not in email or email.count("@") != 1:  # Check for exactly one '@'
        return None                            # Return None if invalid email
    _, domain = email.rsplit("@", 1)           # Split from right at '@' to get domain
    return domain.lower()                       # Return domain in lowercase

# Test cases to check the function behavior
print(email_domain(" User@Example.COM "))   # Output: example.com (valid email)
print(email_domain("no-at-symbol"))          # Output: None (missing '@')
print(email_domain("wrong@@example.com"))    # Output: None (more than one '@')
print(email_domain("admin@Sub.Domain.COM"))  # Output: sub.domain.com (valid email)
print(email_domain("   alice@Example.co.uk "))  # Output: example.co.uk (valid email)

example.com
None
None
sub.domain.com
example.co.uk


_C) Robust CSV line split (when fields may contain commas)_

In [123]:
import csv
from io import StringIO

line = "name,age,city\nAlice,30,New York\nBob,25,Los Angeles"  # CSV data in a string

reader = csv.reader(StringIO(line))  # Convert string to file-like object, then parse CSV

for row in reader:
    print(row)  # Prints each row as a list of values

['name', 'age', 'city']
['Alice', '30', 'New York']
['Bob', '25', 'Los Angeles']


_D) File path utilities (use pathlib, best approach)_

In [None]:
from pathlib import Path  # Import Path class from pathlib module

p = Path(r"C:\Data\Raw\olist_orders.csv")  # Create a Path object representing the file path

print(p.name)    # 'olist_orders.csv'   --> The file name with extension
print(p.stem)    # 'olist_orders'       --> The file name without the extension
print(p.suffix)  # '.csv'               --> The file extension (including the dot)
print(p.parent)  # WindowsPath('C:/Data/Raw') --> The directory path containing the file

olist_orders.csv
olist_orders
.csv
C:\Data\Raw


#### Encoding / decoding (text ↔ bytes)

In [131]:
data = "नमस्ते, TechConvos!"       # Original string containing Unicode characters (Hindi + English)

b = data.encode("utf-8")           # Convert the string into bytes using UTF-8 encoding
                                  # Bytes are necessary for writing data to disk or sending over a network

txt = b.decode("utf-8")            # Convert the bytes back to a string using UTF-8 decoding
                                  # This restores the original text from the bytes

# Note:
# Always specify the encoding (like 'utf-8') when reading/writing files or streams,
# especially when sharing data between different systems,
# to avoid corruption or misinterpretation of characters.


print(data)    # Prints: नमस्ते, TechConvos!
print(b)       # Prints: b'\xe0\xa4\xa8\xe0\xa\
         #             4\xae\xe0\xa4\xb8\xe0\xa4\xa4\xe0\xa5\x87, TechConvos!'
print(txt)     # Prints: नमस्ते, TechConvos!


नमस्ते, TechConvos!
b'\xe0\xa4\xa8\xe0\xa4\xae\xe0\xa4\xb8\xe0\xa5\x8d\xe0\xa4\xa4\xe0\xa5\x87, TechConvos!'
नमस्ते, TechConvos!


#### Regex (for advanced splitting/parsing)

Use when simple split is not enough (multiple delimiters, patterns).

In [17]:
import re

text = "id=42; name = Dhiraj   Mishra ; role=Team Lead"

# Clean and split
clean_text = re.sub(r"\s+", " ", text).strip()
parts = re.split(r"\s*;\s*", clean_text)

# Optional: turn into dict
data = dict(part.strip().split("=", 1) for part in parts)

# Print the final dictionary
print(data)

{'id': '42', 'name ': ' Dhiraj Mishra', 'role': 'Team Lead'}


### **_Practice tasks (strings_**)

Splitter clinic:

In [64]:
# Task: Normalize to "Dhiraj Mishra|Team Lead|Mastek" and then get ["Dhiraj Mishra","Team Lead","Mastek"]


import re

raw_txt = " Dhiraj Mishra | Team Lead | Mastek "

# Step 1: Trim leading/trailing spaces
trim_space = raw_txt.strip()

# Step 2: Replace all ' | ' (with optional spaces) with '|'
replace_txt = re.sub(r'\s*\|\s*', '|', trim_space)

# Step 3: Split into parts and strip individual items (optional but good)
split_txt = [item.strip() for item in replace_txt.split('|')]

# Output
print("Normalized_txt : >", replace_txt)
print("Split Text:>", split_txt)


Normalized_txt : > Dhiraj Mishra|Team Lead|Mastek
Split Text:> ['Dhiraj Mishra', 'Team Lead', 'Mastek']


In [9]:
# Email validator: build is_valid_email(s) (basic rules) and email_domain(s) returning domain in lowercase.
import re

def is_valid_email(s):
    pattern = r'^[A-Za-z0-9._-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$'
    return bool(re.match(pattern,s))

def email_domain(s):
    if not is_valid_email(s):
        raise ValueError("Invalid Email Address")
    return s.split('@')[1].lower()

# output
print(is_valid_email("Test.User@example.com"))     # True
print(is_valid_email("bademail@@example.com"))     # False
print(email_domain("Test.User@Example.COM"))        # "example.com"

True
False
example.com


In [19]:
# Filename parts: from "sales.2025-08-26.final.csv" 
# return: name=sales, date=2025-08-26, tag=final, ext=csv. Prefer Path + rsplit. 
 
from pathlib import Path  # Import Path from pathlib for easy filename handling

file_name = "sales.2025-08-26.final.csv"   # ✅ Global variable — declared before use

def parse_filename(filename):   # ✅ Function defined before calling
    path = Path(filename)       # Convert the filename to a Path object
    stem = path.stem            # Get the filename without the extension (e.g., "sales.2025-08-26.final")
    ext = path.suffix[1:]       # Get the extension and remove the leading dot (e.g., "csv")

    # Split the stem from the right into exactly 3 parts: name, date, tag
    parts = stem.rsplit('.', 2)

    # If the filename doesn't match the expected format, raise an error
    if len(parts) != 3:
        raise ValueError("Filename format must be: name.date.tag.ext")

    # Unpack the parts into name, date, and tag
    name, date, tag = parts

    # Return the extracted parts as a dictionary
    return {
        "name": name,
        "date": date,
        "tag": tag,
        "ext": ext
    }

# Call the function and store the result
result = parse_filename(file_name)  # ✅ Called after variable and function are both defined

# Print the result
print(result)                        # ✅ Uses result correctly

{'name': 'sales', 'date': '2025-08-26', 'tag': 'final', 'ext': 'csv'}


In [43]:
import re
from collections import Counter

text = """
In the age of information, data is everything. The more data you have,
the better decisions you can make. But with more data comes more responsibility.
"""

def top_10_words(paragraph):
    # 1. Case-fold the paragraph (Unicode-aware lowercase)
    paragraph = paragraph.casefold()

    # 2. Remove punctuation (keep only letters, numbers, underscores, and spaces)
    paragraph = re.sub(r'[^\w\s]', '', paragraph)

    # 3. Split paragraph into words on whitespace
    words = paragraph.split()

    # 4. Count frequencies of each word
    word_counts = Counter(words)

    # 5. Return top 10 most common words as (word, count) tuples
    return word_counts.most_common(10)

# Call the function with the input text
result = top_10_words(text)

# Nicely print the results
print("Top 10 words by frequency:\n")
for word, count in result:
    print(f"{word.ljust(15)} : {count}")

Top 10 words by frequency:

the             : 3
data            : 3
more            : 3
you             : 2
in              : 1
age             : 1
of              : 1
information     : 1
is              : 1
everything      : 1


In [54]:
# Smart title case: title‑case but keep small words (“and”, “of”, “the”) lowercase unless at start/end.


# ✅ What is "Smart Title Case"?

# It follows these rules:
# Capitalize the first and last words — no matter what they are.
# Capitalize all major words — nouns, verbs, adjectives, etc.
# Do NOT capitalize short/common words (called "small words") like: and, or, but, the, a, an, of, in, on, at, to, for, etc.

import re

def smart_title_case(title):
    small_words = {
        'a', 'an', 'and', 'as', 'at', 'but', 'by', 'for', 'if', 'in', 'nor',
        'of', 'on', 'or', 'per', 'the', 'to', 'vs', 'via', 'with'
    }

    # Correct regex to keep words and punctuation
    words = re.findall(r'\b\w+\b|\W+', title.lower())

    word_indices = [i for i, w in enumerate(words) if w.isalnum()]
    first = word_indices[0]
    last = word_indices[-1]

    result = []

    for i, word in enumerate(words):
        if not word.isalnum():
            result.append(word)
        elif i == first or i == last or word not in small_words:
            result.append(word.capitalize())
        else:
            result.append(word)
    
    return ''.join(result)


# Test cases
print(smart_title_case("the quick brown fox jumps over the lazy dog"))
print(smart_title_case("a tale of two cities and a revolution"))
print(smart_title_case("war and peace: the story of us"))


The Quick Brown Fox Jumps Over the Lazy Dog
A Tale of Two Cities and a Revolution
War and Peace: the Story of Us


In [78]:
# Masking: mask middle digits of phone "9876543210" → "98*****210" (handle variable lengths).

def mask_phone_number(phone):
    # Convert input to string in case it's given as an integer or other type
    phone = str(phone)
    
    # If the phone number is too short (5 digits or fewer), we skip masking
    # because there's not enough middle space to hide — just return as-is

    if len(phone) <= 5:
        return phone

    # Extract the first 2 digits (to be shown)
    first = phone[:2]

    # Extract the last 3 digits (to be shown)
    last = phone[-3:]
    

    # Calculate the number of middle digits to mask with '*'
    # Total masked = total length - (2 at start + 3 at end) = len - 5

    middle = '*' * (len(phone) -5)

    # Concatenate first part + masked part + last part
    return first + middle + last

print(mask_phone_number("9876543210"))
# Output: "98*****210"

98*****210


In [83]:
# Template fill: with f‑strings or .format, create "Hello {name}, your order {order_id} ships on {date}." safely using defaults when missing.

from collections import defaultdict

def fill_template(template, data):
    """
    Safely fills the template string with values from 'data'.
    Missing keys default to an empty string to avoid errors.

    Args:
        template (str): Template string with placeholders, e.g. "Hello {name}, your order {order_id} ships on {date}."
        data (dict): Dictionary with keys and values to fill the template.

    Returns:
        str: Formatted string with placeholders replaced or defaulted.
    """

    # Create a defaultdict that returns '' for missing keys
    safe_data = defaultdict(str, data)

    # Use format_map which accepts a mapping and substitutes missing keys with ''
    return template.format_map(safe_data)


# Example usage:
template = "Hello {name}, your order {order_id} ships on {date}."
data = {"name": "Alice", "order_id": 12345}  # 'date' key is missing

print(fill_template(template, data))
# Output: Hello Alice, your order 12345 ships on .


Hello Alice, your order 12345 ships on .


#### Translation & Mapping (str.translate, str.maketrans)

In [91]:
# Create a translation table that maps certain characters to their replacements
# str.maketrans() takes a dictionary where:
#   - keys are characters to find in the original string
#   - values are characters to replace those keys with
table = str.maketrans({
    "a": "@",  # replace 'a' with '@'
    "e": "3",  # replace 'e' with '3'
    "i": "1",  # replace 'i' with '1'
    "o": "0"   # replace 'o' with '0'
})

# Original string to transform
text = "Hellow, world"

# Use the translate() method of the string to replace characters based on the table
# It goes through each character in 'text', and if the character is in 'table',
# it replaces it with the mapped character. Otherwise, it keeps the character as is.
result = text.translate(table)

# result will be: "H3ll0w, w0rld"
print(result)

H3ll0w, w0rld


#### Partitioning (partition, rpartition)

In [97]:
url = "https://openai.com/blog"  # Define a URL string

# Use the partition() method to split the URL at the first occurrence of "://"
# partition() returns a tuple with three parts:
#   1) The part before "://"
#   2) The separator itself ("://")
#   3) The part after "://"
schema, _, rest = url.partition("://")

# After this:
# schema will hold "https"      -> the URL protocol (or scheme)
# _ will hold "://"             -> the separator itself (ignored by using _)
# rest will hold "openai.com/blog"  -> the remaining part of the URL (domain + path)

# Print the extracted parts to verify
print("Protocol (schema):", schema)
print("Separator:", _)  # Usually ignored, but printed here for clarity
print("Rest of URL:", rest)


Protocol (schema): https
Separator: ://
Rest of URL: openai.com/blog


#### Justifying & Padding (ljust, rjust, center, zfill)

In [102]:
# ljust(width) - Left-justify the string (pads right with spaces)
print("Hi".ljust(5))   # Output: 'Hi   '

# rjust(width) - Right-justify the string (pads left with spaces)
print("Hi".rjust(5))   # Output: '   Hi'

# center(width) - Center the string (pads both sides with spaces)
print("Hi".center(5))  # Output: ' Hi  '

# zfill(width) - Pad left with zeros (for numeric strings)
print("42".zfill(5))   # Output: '00042'


Hi   
   Hi
  Hi 
00042


#### Advanced Formatting (format specifiers)

In [103]:
pi = 3.14159265

# Format pi to 2 decimal places:
# {:.2f} means:
#   - ':' starts format spec
#   - '.2' limits to 2 decimal places
#   - 'f' formats as a fixed-point number (float)
formatted_pi = f"{pi:.2f}"   # Output: '3.14'

# Format 1000 with comma as thousands separator and no decimals:
# {:,.0f} means:
#   - ',' adds commas for thousands
#   - '.0' means zero decimal places
#   - 'f' fixed-point number formatting
formatted_number = f"{1000:,.0f}"  # Output: '1,000'

# Format 0.85 as a percentage with no decimals:
# {:.0%} means:
#   - '.' starts precision specifier
#   - '0' means zero decimal places
#   - '%' multiplies number by 100 and adds '%'
formatted_percentage = f"{0.85:.0%}"  # Output: '85%'

print(formatted_pi)        # 3.14
print(formatted_number)    # 1,000
print(formatted_percentage) # 85%


3.14
1,000
85%


#### Unicode awareness

In [104]:
import unicodedata

# Unicode characters can have multiple representations.
# For example, "é" can be:
# 1. A single composed character (U+00E9, 'é')
# 2. A combination of 'e' (U+0065) + an accent (U+0301)

# Unicode normalization helps standardize these forms.

# 'NFC' (Normalization Form C) composes characters to their combined form
normalized_text = unicodedata.normalize("NFC", "café")

# Here, "café" with an 'é' that might be decomposed (e + accent)
# is normalized into a single composed character for 'é'.
print(normalized_text)  # Output: 'café'


café


#### String constants (string module)

In [105]:
import string

# Get all lowercase ASCII letters
print(string.ascii_lowercase)  
# Output: 'abcdefghijklmnopqrstuvwxyz'

# Get all digits
print(string.digits)           
# Output: '0123456789'

# Check if a character is punctuation
char = '!'
if char in string.punctuation:
    print(f"'{char}' is a punctuation mark.")


abcdefghijklmnopqrstuvwxyz
0123456789
'!' is a punctuation mark.


In [108]:
# Check if a character is a letter, digit, or punctuation

import string

char = 'A'

if char in string.ascii_letters:
    print(f"'{char}' is a letter.")
elif char in string.digits:
    print(f"'{char}' is a digit.")
elif char in string.punctuation:
    print(f"'{char}' is punctuation.")
else:
    print(f"'{char}' is something else.")


'A' is a letter.


#### Performance hacks (big data ETL)

In [109]:
import re
import io

# 1. re.compile() - Precompile regex for repeated use
# Compiling the regex pattern once saves time when matching many strings,
# because Python won't recompile the pattern each time.
pattern = re.compile(r'\d{3}-\d{2}-\d{4}')  # Pattern for SSN format: 123-45-6789

data = ['123-45-6789', '987-65-4321', 'invalid']
for item in data:
    if pattern.match(item):
        print(f"{item} matches")
    else:
        print(f"{item} does NOT match")

# 2. str.join() - Faster string concatenation than looping with '+'
# When combining many strings, collect them in a list and join once,
# instead of concatenating in a loop which creates many intermediate strings.
words = ['Extract', 'Transform', 'Load']
sentence = ' '.join(words)  # Efficient and fast
print(sentence)  # Output: Extract Transform Load

# 3. io.StringIO - Efficiently build very large strings
# StringIO acts like an in-memory file buffer, allowing multiple writes
# without creating new string objects each time.
buffer = io.StringIO()
buffer.write("This ")
buffer.write("is ")
buffer.write("a ")
buffer.write("large string built efficiently.")
result = buffer.getvalue()
print(result)
buffer.close()  # Close the buffer when done to free resources


123-45-6789 matches
987-65-4321 matches
invalid does NOT match
Extract Transform Load
This is a large string built efficiently.


#### Suggested Final Practice Task (Capstone for Strings)

In [157]:
# Importing the re(regular expressions) module:
import re    

def clean_str(data):
    # Step 1: Extract key-value pairs using a regular expression:
    pattern = r'(\w+)\s*=\s*([^;]+)'
    extracted_data = re.findall(pattern, data)

    # Step 2: Clean and organize the data:
    result = {}  # initialize an empty dictionary to store the cleaned and structured data

    # Looping through the extracted key-value pairs:
    for key, value in extracted_data:
        # remove any leading or trailing spaces from both the key and the value
        key = key.strip()
        value = value.strip()

        # Handling different types of values (conditional logic):
        if key == "OrderID":                    # For "OrderID", convert it to integer
            result[key] = int(value)

        # Handling the "Name" key:
        elif key == "Name":                     # For "Name", keep the string as it is
            result[key] = " ".join(value.split())
        
        # For "Email", convert to lowercase
        elif key == 'Email':
            result[key] = value.strip().lower()

        # For "File", process the file string
        elif key == "file":
            # Use '.' as the delimiter to split file parts
            file_parts = value.strip().split('.')
            
            # Check if we have the expected number of parts (6 parts)
            if len(file_parts) == 6:
                file_info = {
                    "name": file_parts[0],
                    "date": file_parts[1] + '-' + file_parts[2] + '-' + file_parts[3],
                    "tag": file_parts[4],
                    "ext": file_parts[5]
                }
                result["File"] = file_info
            else:
                print(f"Error: Unexpected file format in '{value}'")
                # Optional: Set a default or error value for "File" in case of unexpected format
                result["File"] = None

    return result

# Input data 
data = "   OrderID=  42;  Name= Dhiraj   Mishra ;Email= DHIRAJ@Example.COM; file= report.2025.08.26.final.csv   "

# Process the data
output = clean_str(data)

# Print the result
print(output)

{'OrderID': 42, 'Name': 'Dhiraj Mishra', 'Email': 'dhiraj@example.com', 'File': {'name': 'report', 'date': '2025-08-26', 'tag': 'final', 'ext': 'csv'}}


#### Process a Batch of Inputs (test function)

In [None]:
test_data = [
    "   OrderID=  42;  Name= Dhiraj   Mishra ;Email= DHIRAJ@Example.COM; file= report.2025.08.26.final.csv   ",
    "   OrderID=  101;  Name= Alice   Wonderland ;Email= ALICE@Example.com; file= data.2024.11.01.report.csv   ",
    "   OrderID=  999;  Name= John Doe ;Email= JOHN@EXAMPLE.COM; file= sales.2023.12.12.final.csv   "
]

for data in test_data:
    print(clean_str(data))
    print()  # To add a blank line between outputs

{'OrderID': 42, 'Name': 'Dhiraj Mishra', 'Email': 'dhiraj@example.com', 'File': {'name': 'report', 'date': '2025-08-26', 'tag': 'final', 'ext': 'csv'}}

{'OrderID': 101, 'Name': 'Alice Wonderland', 'Email': 'alice@example.com', 'File': {'name': 'data', 'date': '2024-11-01', 'tag': 'report', 'ext': 'csv'}}

{'OrderID': 999, 'Name': 'John Doe', 'Email': 'john@example.com', 'File': {'name': 'sales', 'date': '2023-12-12', 'tag': 'final', 'ext': 'csv'}}

