In [1]:
import os
import pandas as pd
import jupyter_black

jupyter_black.load()

current_dir = os.path.dirname(os.path.abspath("__file__"))

### Set file path to datasets

In [2]:
chicago_path = os.path.join(current_dir, "data", "chicago.csv")

In [3]:
chicago = pd.read_csv(chicago_path).dropna(how="all")
chicago["Department"] = chicago["Department"].astype("category")
chicago["Position Title"] = chicago["Position Title"].astype("category")
chicago.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32062 entries, 0 to 32061
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype   
---  ------                  --------------  -----   
 0   Name                    32062 non-null  object  
 1   Position Title          32062 non-null  category
 2   Department              32062 non-null  category
 3   Employee Annual Salary  32062 non-null  object  
dtypes: category(2), object(2)
memory usage: 887.5+ KB


### Common String-methods - `lower`, `upper` and `len`

In [4]:
"Hello World".lower()
"Hello World".upper()
len("Hello World")
"hello world".capitalize()
"hello world".title()

'Hello World'

To apply a string-method on a column of a dataframe, one needs to start with `.str`:

In [5]:
display(chicago.head(3))
chicago["Name"] = chicago["Name"].str.title()
chicago["Position Title"] = chicago["Position Title"].str.title()
chicago["Department"] = chicago["Department"].str.title()

chicago.head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00


Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"Aaron, Elvia J",Water Rate Taker,Water Mgmnt,$90744.00
1,"Aaron, Jeffery M",Police Officer,Police,$84450.00
2,"Aaron, Karina",Police Officer,Police,$84450.00


### The `.str.replace()` method

In [6]:
chicago["Employee Annual Salary"] = (
    chicago["Employee Annual Salary"].str.replace("$", "", regex=False).astype("float")
)
chicago = chicago.rename(
    columns={"Employee Annual Salary": "Employee Annual Salary ($)"}
)

chicago["Department"] = chicago["Department"].str.replace("MGMNT", "MANAGEMENT")

chicago.head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary ($)
0,"Aaron, Elvia J",Water Rate Taker,Water Mgmnt,90744.0
1,"Aaron, Jeffery M",Police Officer,Police,84450.0
2,"Aaron, Karina",Police Officer,Police,84450.0


### Filtering with String Methods

In [7]:
mask = chicago["Position Title"].str.lower().str.contains("water")
chicago[mask].head(3)

mask = chicago["Position Title"].str.lower().str.startswith("water")
chicago[mask].head(3)

mask = chicago["Position Title"].str.lower().str.endswith("ist")
chicago[mask].head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary ($)
184,"Afroz, Nayyar",Psychiatrist,Health,99840.0
308,"Alarcon, Luis J",Loan Processing Specialist,Community Development,81948.0
422,"Allain, Carolyn",Senior Telecommunications Specialist,Doit,89880.0


### More string methods: `strip()`, `lstrip()` and `rstrip()`

In [8]:
chicago["Name"] = chicago["Name"].str.strip()
chicago["Position Title"] = chicago["Position Title"].str.strip()
chicago["Department"] = chicago["Department"].str.strip()

### String methods on Index and Columns

In [9]:
chicago = chicago.set_index("Name")
chicago.index = chicago.index.str.strip().str.title()

### Split Strings by Characters with the `.split` Method

In [10]:
chicago = chicago.reset_index()
chicago2 = chicago.copy()

The first way to do it:

In [11]:
names_splitted = chicago2["Name"].str.split(",")

chicago2["First Name"] = [x[1].strip() for x in names_splitted]
chicago2["Last Name"] = [x[0].strip() for x in names_splitted]

chicago2 = chicago2[
    [
        "Name",
        "First Name",
        "Last Name",
        "Position Title",
        "Department",
        "Employee Annual Salary ($)",
    ]
]

chicago2.head(3)

Unnamed: 0,Name,First Name,Last Name,Position Title,Department,Employee Annual Salary ($)
0,"Aaron, Elvia J",Elvia J,Aaron,Water Rate Taker,Water Mgmnt,90744.0
1,"Aaron, Jeffery M",Jeffery M,Aaron,Police Officer,Police,84450.0
2,"Aaron, Karina",Karina,Aaron,Police Officer,Police,84450.0


The second way to do it, using the `get()` method

In [12]:
chicago["Last Name"] = chicago["Name"].str.split(",").str.get(0).str.strip()
chicago["First Name"] = chicago["Name"].str.split(",").str.get(1).str.strip()

chicago = chicago[
    [
        "Name",
        "First Name",
        "Last Name",
        "Position Title",
        "Department",
        "Employee Annual Salary ($)",
    ]
]

chicago.head(3)

Unnamed: 0,Name,First Name,Last Name,Position Title,Department,Employee Annual Salary ($)
0,"Aaron, Elvia J",Elvia J,Aaron,Water Rate Taker,Water Mgmnt,90744.0
1,"Aaron, Jeffery M",Jeffery M,Aaron,Police Officer,Police,84450.0
2,"Aaron, Karina",Karina,Aaron,Police Officer,Police,84450.0


In [13]:
chicago["Position Title"].str.split(" ").str.get(0).value_counts().head(3)

Police             10856
Firefighter-Emt     1509
Sergeant            1186
Name: Position Title, dtype: int64

In [14]:
chicago["First Name"].str.split(" ").str.get(0).value_counts().head(3)

Michael    1153
John        899
James       676
Name: First Name, dtype: int64

### Exploring the `expand` and `n` Parameters of the `str.split` Method

In [15]:
# Another way to split Name into first and last name is by expand = True
chicago["Name"].str.split(",", expand=True).head(3)

Unnamed: 0,0,1
0,Aaron,Elvia J
1,Aaron,Jeffery M
2,Aaron,Karina


In [16]:
chicago["Position Title"].str.split(" ", expand=True).head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,Water,Rate,Taker,,,,,,
1,Police,Officer,,,,,,,
2,Police,Officer,,,,,,,


In [17]:
chicago["Position Title"].str.split(" ", expand=True, n=2).head(3)

Unnamed: 0,0,1,2
0,Water,Rate,Taker
1,Police,Officer,
2,Police,Officer,
