### Python Data Mungin Functions
### ----------------------------------------

### Formatting function

In [5]:
# Remove whitespaces from left and right 
"\tABC\t".strip()

'ABC'

In [6]:
#Removing whitespace from the left
' ABC\t'.lstrip()

'ABC\t'

In [8]:
#Removing whitespace from the right
" ABC\t".rstrip()

' ABC'

In [9]:
#Removing character 'C' from the start and end of string
"ABC".strip("C")

'AB'

In [10]:
s = "abc\xFF"
print (s) # String with ascii characters as well as non-ascii characters
s.encode().decode("ascii", "ignore") # Getting the ascii characters and ignoring non ascii characters

abcÿ


'abc'

### Date Util Functions

In [22]:
import dateutil.parser as p
p.parse("August 13, 1985") #Parsing string to date and time. 0, 0 is the default when no time is specified

datetime.datetime(1985, 8, 13, 0, 0)

In [23]:
p.parse("2013-8-13") # Parsing differently formatted data string

datetime.datetime(2013, 8, 13, 0, 0)

In [24]:
p.parse("2013-8-13 4:15am") # Parsing string containing date and time

datetime.datetime(2013, 8, 13, 4, 15)

### Formatting Scripts 

In [1]:
def get_first_last_name(s):
    INVALID_NAME_PARTS = ["mr", "ms", "mrs", "dr", "jr", "sir"] # list of invalid name parts
    parts = s.lower().replace(".", "").strip().split() #formatting the name
    parts = [p for p in parts
             if p not in INVALID_NAME_PARTS] # getting parts of names which are not in the INVALID_NAMES_PART list
    if len(parts) == 0: # No parts found
        raise ValueError("Name %s is formatted wrong" % s) # Generate error
    first, last = parts[0], parts[-1] # Getting first name (index 0) and last name (-1 indicates the first index starting from last) 
    first = first[0].upper() + first[1:] # Converting to sentence case
    last = last[0].upper() + last[1:] 
    return first, last


def format_age(s):
    chars = list(s)  # list of characters
    digit_chars = [c for c in chars if c.isdigit()] # getting the characters which are digit
    return int("".join(digit_chars)) # merging these charcters


def format_date(s):
    MONTH_MAP = {"jan": "01", "feb": "02", "may": "03"}
    s = s.strip().lower().replace(",", "") # formatting the string
    m, d, y = s.split() # month, day and year
    if len(y) == 2: y = "19" + y
    if len(d) == 1: d = "0" + d
    return y + "-" + MONTH_MAP[m[:3]] + "-" + d # return the date in proper format
    
import pandas as pd
df = pd.read_csv("data_files/file.csv", sep="|") # reading the file
df["First Name"] = df["Name"].apply(  # creating the First Name column of the data frame
    lambda s: get_first_last_name(s)[0])
df["Last Name"] = df["Name"].apply( # creating the Last Name column of the data frame
    lambda s: get_first_last_name(s)[1])
df["Age"] = df["Age"].apply(format_age) # creating age column in the data frame
df["Birthdate"] = df["Birthdate"].apply( # creating the birthdate column in the data frame
    format_date).astype(pd.datetime)
print(df) # print the data frame

                Name  Age   Birthdate First Name Last Name
0  Ms. Janice Joplin   65  1943-01-19     Janice    Joplin
1         Bob Dylan    74  1941-03-24        Bob     Dylan
2     Billy Ray Joel   66  1941-02-09      Billy      Joel


### Regular Expressions

In [3]:
import re
# Pattern for the Street name as House number Street Name(without space) and then Ave, Avenue, St. etc
street_pattern = r"^[0-9]*\s[A-Z][a-z]*"+r"\s"+r"(?:Street|St|Rd|Road|Ave|Avenue|Blvd|Way|Wy)\.?"

# Pattern for City name as City name, State Name(Without space), followed by 5 digit pin code
city_pattern = r"[A-Z][a-z]*,\s[A-Z]{2},[0-9]{5}"

# Pattern which matches street followed by new line followed by city 
address_pattern = street_pattern + r"\n" + city_pattern
# Compile the string into a regular expression object

address_re = re.compile(address_pattern)
text = open("data_files/some_file.txt", "r").read()
matches = re.findall(address_re, text)
open("data_files/addresses_w_space_between.txt","w").write("\n\n".join(matches))

35