#### Strings (text)

What a string is : Immutable sequence of Unicode code points.

##### Essential creation & inspection

In [None]:
s = "Hello, TechConvos!"

# length
len(s)
print("variable length is: >", len(s))

# indexing
print("\nFirst Char of variable :>", s[0])
print("nLast char of variable : >", s[-1])

# slicing, reverse
# syntax of slicing Start:End:interval
print(s[2:7])

print("Reverse char of variable: >",s[::-1])

variable length is: > 18

First Char of variable :> H
nLast char of variable : > !
llo, 
Reverse char of variable: > !sovnoChceT ,olleH


#### Trimming & normalizing whitespace

In [82]:
# STRING MANIPULATION & WHITESPACE HANDLING IN PYTHON
# ----------------------------------------------
# Define three strings with different spacing issues

name = "Dhiraj     Mishra"        # This string has multiple spaces **between** the words.
leftstrip = " Tech Convos"        # This string has one leading space at the beginning.
right_strip = "Tech Convos "      # This string has one trailing space at the end.

# -----------------------------------------------------
# STEP 1: CALCULATE LEADING AND TRAILING SPACES

# Count how many spaces are on the **left side** (leading) of 'name'
# len(name) gives total length, len(name.lstrip()) removes leading spaces
left_space_name =  len(name) - len(name.lstrip())

# Count how many spaces are on the **right side** (trailing) of 'name'
right_space_name = len(name) - len(name.rstrip())

# Repeat the same logic for 'leftstrip'
lspace_leftstrip = len(leftstrip) - len(leftstrip.lstrip())

# Repeat the same logic for 'righttstrip'

rspace_rstrip = len(right_strip) - len(right_strip.rstrip())

# PRINT ORIGINAL STRINGS (with repr to show hidden spaces)
print("=== ORIGINAL STRINGS ===\n")
print("name :> ", repr(name))           # repr() shows the string with quotes and visible spaces
print("leftstrip", repr(leftstrip))     # Useful for debugging leading/trailing whitespace
print("right_tstrip", repr(right_strip))

# DISPLAY COUNT OF SPACES

print("\n===Space Count====")
print("Left spaces in 'name':", left_space_name)
print("Right spaces in 'name':", right_space_name)

print("Left spaces in 'leftstrip':", lspace_leftstrip)
print("Right spaces in 'right_strip':", rspace_rstrip )


# NORMALIZE 'name' (Collapse multiple spaces to a single space)
# name.split() splits string into words (splits on ANY whitespace)
# " ".join(...) joins the words with **a single space**

clean_name = " ".join(name.split())

print("\n=== NORMALIZED NAME ===")

print("Original length: >", len(name))
print("Cleaned Length: >", len(clean_name))
print("Cleaned output: >",clean_name)

#  STRIP SPACES FROM STRINGS
# .strip() removes spaces from both ends
trimed_name = name.strip()


# .strip() works the same for the other strings
trimed_leftstrip = leftstrip.strip()
trimed_rightstrip = right_strip.strip()


print("\n=== TRIMMED STRINGS ===")
print("Trimmed 'name':", repr(trimed_name))               # No leading/trailing spaces
print("Trimmed 'leftstrip':", repr(trimed_leftstrip))     # Should remove 1 leading space
print("Trimmed 'right_strip':", repr(trimed_rightstrip))  # Should remove 1 trailing space

# RECONSTRUCT STRINGS WITH PRESERVED PADDING

# Recreate the left-padded version of 'name' by prepending the counted spaces

left_padded_name = " " * left_space_name + name


# Do the same for 'leftstrip' and 'right_strip'
left_padded_leftstrip = " " * lspace_leftstrip + leftstrip
right_padded_rightstrip = right_strip + " " * rspace_rstrip

print("\n=== STRINGS WITH PRESERVED PADDING ===")
print("Left-padded 'name':", repr(left_padded_name))
print("Right-padded 'name':", repr(right_padded_name))
print("Left-padded 'leftstrip':", repr(left_padded_leftstrip))
print("Right-padded 'right_strip':", repr(right_padded_rightstrip))

=== ORIGINAL STRINGS ===

name :>  'Dhiraj     Mishra'
leftstrip ' Tech Convos'
right_tstrip 'Tech Convos '

===Space Count====
Left spaces in 'name': 0
Right spaces in 'name': 0
Left spaces in 'leftstrip': 1
Right spaces in 'right_strip': 1

=== NORMALIZED NAME ===
Original length: > 17
Cleaned Length: > 13
Cleaned output: > Dhiraj Mishra

=== TRIMMED STRINGS ===
Trimmed 'name': 'Dhiraj     Mishra'
Trimmed 'leftstrip': 'Tech Convos'
Trimmed 'right_strip': 'Tech Convos'

=== STRINGS WITH PRESERVED PADDING ===
Left-padded 'name': 'Dhiraj     Mishra'
Right-padded 'name': 'Dhiraj     Mishra'
Left-padded 'leftstrip': '  Tech Convos'
Right-padded 'right_strip': 'Tech Convos  '


#### Splitting (your key request) & joining

In [11]:
text = "Python   for   Data   &   BI"
token = text.split()  # ['Python', 'for', 'Data', '&', 'BI']

print(token)


['Python', 'for', 'Data', '&', 'BI']


#### Split on a specific delimiter

In [18]:
# Step 1: Define the full file path as a string
path = "C:/Data/Raw/olist_orders.csv"

# Step 2: Split the path string at each "/" to separate the components
# This will create a list of parts: drive letter, folders, and the filename
paths = path.split("/")   # Output: ['C:', 'Data', 'Raw', 'olist_orders.csv']

# Step 3: Use unpacking to extract the first and last elements from the list
# root      -> 'C:'        (the drive letter or root folder)
# *_        -> ['Data', 'Raw'] (the middle folders, ignored here with _)
# filename  -> 'olist_orders.csv' (the actual file name at the end of the path)
root, *_, filename = paths

# Step 4: Define a string that simulates a row of data, with fields separated by "|"
row = "Dhiraj|Mishra|Team Lead"

# Step 5: Split the row at the "|" symbol to get a list of individual fields
# Output: ['Dhiraj', 'Mishra', 'Team Lead']
# (Although you’re not storing the result here, this line splits the row)
print(row.split("|"))

# Step 6: Print the extracted root (e.g., 'C:')
print(root)

# Step 7: Print the extracted filename (e.g., 'olist_orders.csv')
print(filename)

# Step 8: Print the full list of path parts
# Output: ['C:', 'Data', 'Raw', 'olist_orders.csv']
print(paths)


['Dhiraj', 'Mishra', 'Team Lead']
C:
olist_orders.csv
['C:', 'Data', 'Raw', 'olist_orders.csv']


#### Limit splits (performance & control)

In [5]:
record = "id=42;name=Dhiraj;role=Team Lead;dept=BI"

first, rest = record.split(";",1)  # split only once (left to right)


# first: 'id=42', rest: 'name=Dhiraj;role=Team Lead;dept=BI'


print(first)
print(rest)

id=42
name=Dhiraj;role=Team Lead;dept=BI


#### Right split (useful for filenames, trailing fields)

In [11]:
# Assign the filename as a string
fname = "report.2025.08.26.final.csv"

# Split the filename from the right at the last '.' to separate the extension
# 'rsplit(".", 1)' means: split from the right, only once
# This will return ['report.2025.08.26.final', 'csv']
base, ext = fname.rsplit(".", 1)

# Print the base part of the filename (everything before the last dot)
print(base)
print(ext)
# Print the extension (everything after the last dot

report.2025.08.26.final
csv


#### Lines handling (robust across OS)

In [15]:
# Define a string with mixed newline characters (\r\n and \n)
doc = "line1\r\nline2\nline3"

# Split the string into lines, handling all newline types
lines = doc.splitlines()

# Print the original string (will display as 3 lines)
print(doc)

line1
line2
line3


#### Re‑join tokens — the fast way to build strings

In [22]:
words = ["Python", "Data", "Fabric", "Snowflake"]
" | ".join(words)              # 'Python | Data | Fabric | Snowflake'
# Best approach: accumulate pieces in a list, ''.join(pieces) at the end.

'Python | Data | Fabric | Snowflake'

#### Searching & replacing

In [63]:
# Assign a string to the variable 's'
s = "Power BI with Python and Snowflake"

# Find the index of the first occurrence of the character 'i'
# Returns the index (0-based), or -1 if not found
search_output = s.find('i')

# Print the index where 'i' first appears in the string
print(search_output)

# Replace the substring "Power BI" with "Fabric" in 's'
# This returns a new string; 's' itself is not modified
replace_output = s.replace("Power BI", "Fabric")

# Print the modified string
print(replace_output)

10
Fabric with Python and Snowflake


#### Case handling (for normalization)

In [83]:
# Original email string in uppercase
email = "USER@EXAMPLE.COM"

# Convert the email to lowercase using .lower()
# This is useful for case-insensitive comparisons or deduplication (e.g. treating 'User@Example.com' and 'user@example.com' as the same)
email_lower = email.lower()

# Print the normalized lowercase email
print("original str : > ", email)
print("Lowercase email:", email_lower)  # Output: user@example.com

# --- International Case Handling Example ---

# Suppose we are working with a string containing international characters
# For example, the German letter "ß" (sharp S) which should be treated like "ss" in comparisons
german_word = "straße"

# Using .lower() — basic lowercase conversion (won't change ß)
lowered = german_word.lower()

# Using .casefold() — more aggressive and Unicode-aware
# It converts ß → ss, which is more appropriate for case-insensitive comparisons in some languages
folded = german_word.casefold()

# Print results to show the difference
print("Lowered:", lowered)    # Output: straße
print("Casefolded:", folded)  # Output: strasse


original str : >  USER@EXAMPLE.COM
Lowercase email: user@example.com
Lowered: straße
Casefolded: strasse


#### Formatting (f‑strings: best approach)

In [99]:
# Assign the user's name to a variable
user = 'dhiraj'

# Assign the user's role to a variable
role = 'team lead'

# Assign the year to a variable
year = 2025

# Convert 'Mastek' to uppercase using .upper()
msg = f"{user.upper()} - {role.upper()} @ {'Mastek'.upper()} {year}"

# Print the final message
print(msg)


DHIRAJ - TEAM LEAD @ MASTEK 2025


#### Validation & parsing (digits, alpha, etc.)

In [None]:
# --- isdigit() ---
# Only digits → returns True
print("12345".isdigit())    # True

# Contains a letter → returns False
print("123a5".isdigit())    # False


# --- isalpha() ---
# Only letters → returns True
print("abcXYZ".isalpha())   # True

# Contains a number → returns False
print("abc123".isalpha())   # False

# --- isalnum() ---
# Letters and numbers only → returns True
print("abc123".isalnum())   # True

# Contains a space → returns False
print("abc 123".isalnum())  # False


# --- isspace() ---
# Only spaces → returns True
print("   ".isspace())      # True

# Contains a tab and a letter → returns False
print(" \ta".isspace())     # False


True
False
True
False
True
False
True
False


#### Cleaning data (ETL‑style string scenarios)

_A) Normalize names (spaces, case, punctuation)_

In [115]:
def normalize_name(raw: str) -> str:
    """
    Normalize user-entered names:
    - Remove leading/trailing/multiple spaces
    - Fix inconsistent capitalization (e.g., dHiRaJ -> Dhiraj)
    Steps:
    1. raw.split() breaks the string into words, removing extra spaces
    2. " ".join(...) joins words back with single spaces
    3. .title() capitalizes the first letter of each word
    """
    core = " ".join(raw.split())     # Remove extra/multiple spaces
    return core.title()              # Capitalize each word correctly


# Example inputs (messy names)
name1 = "   dHiRaJ   mIsHrA   "
name2 = "JOHN    doe"
name3 = "  alice   o'connor  "
name4 = "eLon    MUsK"
name5 = "  mAry     AnnE  "

# Normalize each name using the function
print(normalize_name(name1))  # Output: Dhiraj Mishra
print(normalize_name(name2))  # Output: John Doe
print(normalize_name(name3))  # Output: Alice O'Connor
print(normalize_name(name4))  # Output: Elon Musk
print(normalize_name(name5))  # Output: Mary Anne


Dhiraj Mishra
John Doe
Alice O'Connor
Elon Musk
Mary Anne


_B) Extract domain from email_

In [117]:
def email_domain(email: str) -> str | None:
    email = email.strip()                      # Remove leading/trailing spaces
    if '@' not in email or email.count("@") != 1:  # Check for exactly one '@'
        return None                            # Return None if invalid email
    _, domain = email.rsplit("@", 1)           # Split from right at '@' to get domain
    return domain.lower()                       # Return domain in lowercase

# Test cases to check the function behavior
print(email_domain(" User@Example.COM "))   # Output: example.com (valid email)
print(email_domain("no-at-symbol"))          # Output: None (missing '@')
print(email_domain("wrong@@example.com"))    # Output: None (more than one '@')
print(email_domain("admin@Sub.Domain.COM"))  # Output: sub.domain.com (valid email)
print(email_domain("   alice@Example.co.uk "))  # Output: example.co.uk (valid email)

example.com
None
None
sub.domain.com
example.co.uk


_C) Robust CSV line split (when fields may contain commas)_

In [123]:
import csv
from io import StringIO

line = "name,age,city\nAlice,30,New York\nBob,25,Los Angeles"  # CSV data in a string

reader = csv.reader(StringIO(line))  # Convert string to file-like object, then parse CSV

for row in reader:
    print(row)  # Prints each row as a list of values

['name', 'age', 'city']
['Alice', '30', 'New York']
['Bob', '25', 'Los Angeles']


_D) File path utilities (use pathlib, best approach)_

In [None]:
from pathlib import Path  # Import Path class from pathlib module

p = Path(r"C:\Data\Raw\olist_orders.csv")  # Create a Path object representing the file path

print(p.name)    # 'olist_orders.csv'   --> The file name with extension
print(p.stem)    # 'olist_orders'       --> The file name without the extension
print(p.suffix)  # '.csv'               --> The file extension (including the dot)
print(p.parent)  # WindowsPath('C:/Data/Raw') --> The directory path containing the file

olist_orders.csv
olist_orders
.csv
C:\Data\Raw


#### Encoding / decoding (text ↔ bytes)

In [131]:
data = "नमस्ते, TechConvos!"       # Original string containing Unicode characters (Hindi + English)

b = data.encode("utf-8")           # Convert the string into bytes using UTF-8 encoding
                                  # Bytes are necessary for writing data to disk or sending over a network

txt = b.decode("utf-8")            # Convert the bytes back to a string using UTF-8 decoding
                                  # This restores the original text from the bytes

# Note:
# Always specify the encoding (like 'utf-8') when reading/writing files or streams,
# especially when sharing data between different systems,
# to avoid corruption or misinterpretation of characters.


print(data)    # Prints: नमस्ते, TechConvos!
print(b)       # Prints: b'\xe0\xa4\xa8\xe0\xa\
         #             4\xae\xe0\xa4\xb8\xe0\xa4\xa4\xe0\xa5\x87, TechConvos!'
print(txt)     # Prints: नमस्ते, TechConvos!


नमस्ते, TechConvos!
b'\xe0\xa4\xa8\xe0\xa4\xae\xe0\xa4\xb8\xe0\xa5\x8d\xe0\xa4\xa4\xe0\xa5\x87, TechConvos!'
नमस्ते, TechConvos!


#### Regex (for advanced splitting/parsing)