In [1]:
import pandas as pd

In [2]:
chicago = pd.read_csv("chicago.csv").dropna(how="all")

chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00


In [3]:
# In our current state, the chicago DF currently has values of all type strings
chicago.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32062 entries, 0 to 32061
Data columns (total 4 columns):
Name                      32062 non-null object
Position Title            32062 non-null object
Department                32062 non-null object
Employee Annual Salary    32062 non-null object
dtypes: object(4)
memory usage: 1.2+ MB


In [4]:
# Unique departments
# Since there are only 35 out of 32K departments, we should convert this to a category in order to lower memory usage
chicago['Department'].nunique()
chicago['Department'] = chicago['Department'].astype('category')
chicago.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32062 entries, 0 to 32061
Data columns (total 4 columns):
Name                      32062 non-null object
Position Title            32062 non-null object
Department                32062 non-null category
Employee Annual Salary    32062 non-null object
dtypes: category(1), object(3)
memory usage: 1.0+ MB


# Common python string methods

In [5]:
# .title() makes the first letter of each word, upper case 
"hello world".title()

'Hello World'

In [6]:
len("Hello World")

11

In [7]:
# now here we apply the string methods to each of the columns 
# We need to add '.str' to the specific column we want to apply a string method to 
chicago["Name"] = chicago["Name"].str.title()
chicago['Position Title'] = chicago['Position Title'].str.title()
chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"Aaron, Elvia J",Water Rate Taker,WATER MGMNT,$90744.00
1,"Aaron, Jeffery M",Police Officer,POLICE,$84450.00
2,"Aaron, Karina",Police Officer,POLICE,$84450.00
3,"Aaron, Kimberlei R",Chief Contract Expediter,GENERAL SERVICES,$89880.00
4,"Abad Jr, Vicente M",Civil Engineer Iv,WATER MGMNT,$106836.00


In [8]:
# This allows you to get the length of each string in a given column
chicago['Department'].str.len().head()

0    11
1     6
2     6
3    16
4    11
Name: Department, dtype: int64

# .str.replace()

In [9]:
# Standard replace, first argument is substring to search, second argument is what you want to replace it with
"Hello World".replace("World", "Gabby")

'Hello Gabby'

In [10]:
# Example of string replacement for an entire column within a df 
chicago['Department'] = chicago['Department'].str.replace('MGMNT', 'MANAGEMENT')
chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"Aaron, Elvia J",Water Rate Taker,WATER MANAGEMENT,$90744.00
1,"Aaron, Jeffery M",Police Officer,POLICE,$84450.00
2,"Aaron, Karina",Police Officer,POLICE,$84450.00
3,"Aaron, Kimberlei R",Chief Contract Expediter,GENERAL SERVICES,$89880.00
4,"Abad Jr, Vicente M",Civil Engineer Iv,WATER MANAGEMENT,$106836.00


In [11]:
# Now let's remove the $ in the annual salary column
chicago['Employee Annual Salary'] = chicago['Employee Annual Salary'].str.replace('$','').astype(float)

In [12]:
chicago['Employee Annual Salary'].sum()

2571506375.3600001

In [13]:
chicago['Employee Annual Salary'].mean()

80204.178633896823

In [14]:
chicago['Employee Annual Salary'].nlargest()

8184     300000.0
7954     216210.0
25532    202728.0
8924     197736.0
8042     197724.0
Name: Employee Annual Salary, dtype: float64

# Filtering with String Methods

In [18]:
# As a reminder, filtering requires you to create a boolean series 
# Here we are filtering out any position type with the string 'Water'
# When filtering through strings, it makes sense to first normalize the data before applying a comparison
chicago[chicago['Position Title'].str.lower().str.contains('water')].head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"Aaron, Elvia J",Water Rate Taker,WATER MANAGEMENT,90744.0
554,"Aluise, Vincent G",Foreman Of Water Pipe Construction,WATER MANAGEMENT,102440.0
671,"Ander, Perry A",Water Chemist Ii,WATER MANAGEMENT,82044.0
685,"Anderson, Andrew J",District Superintendent Of Water Distribution,WATER MANAGEMENT,109272.0
702,"Anderson, Donald",Foreman Of Water Pipe Construction,WATER MANAGEMENT,102440.0


In [21]:
# Startswith and endswith 
chicago[chicago['Position Title'].str.lower().str.startswith("water")].head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"Aaron, Elvia J",Water Rate Taker,WATER MANAGEMENT,90744.0
671,"Ander, Perry A",Water Chemist Ii,WATER MANAGEMENT,82044.0
1054,"Ashley, Karma T",Water Chemist Ii,WATER MANAGEMENT,82044.0
1079,"Atkins, Joanna M",Water Chemist Ii,WATER MANAGEMENT,82044.0
1181,"Azeem, Mohammed A",Water Chemist Ii,WATER MANAGEMENT,53172.0


In [22]:
chicago[chicago['Position Title'].str.lower().str.endswith("ist")].head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
184,"Afroz, Nayyar",Psychiatrist,HEALTH,99840.0
308,"Alarcon, Luis J",Loan Processing Specialist,COMMUNITY DEVELOPMENT,81948.0
422,"Allain, Carolyn",Senior Telecommunications Specialist,DoIT,89880.0
472,"Allen, Robert",Machinist,WATER MANAGEMENT,94328.0
705,"Anderson, Edward M",Sr Procurement Specialist,PROCUREMENT,91476.0


# strip, rstrip, lstrip methods

In [24]:
# Strip methods are used to eliminate whitespace within a given text 
"   hello world   ".lstrip()

'hello world   '

In [25]:
"   hello world   ".rstrip()

'   hello world'

In [26]:
"   hello world   ".strip()

'hello world'

In [30]:
chicago['Name'] = chicago['Name'].str.rstrip().str.lstrip()

In [33]:
chicago['Position Title'] = chicago['Position Title'].str.strip()

# String methods on index and columns

In [34]:
chicago = pd.read_csv("chicago.csv", index_col="Name").dropna(how="all")
chicago['Department'] = chicago['Department'].astype('category')
chicago.head()

Unnamed: 0_level_0,Position Title,Department,Employee Annual Salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00
"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00


In [38]:
# We can apply the same string method to indices and columns 
chicago.index = chicago.index.str.strip().str.title()

In [39]:
chicago.head()

Unnamed: 0_level_0,Position Title,Department,Employee Annual Salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Aaron, Elvia J",WATER RATE TAKER,WATER MGMNT,$90744.00
"Aaron, Jeffery M",POLICE OFFICER,POLICE,$84450.00
"Aaron, Karina",POLICE OFFICER,POLICE,$84450.00
"Aaron, Kimberlei R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
"Abad Jr, Vicente M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00


In [43]:
chicago.columns = chicago.columns.str.upper()

In [44]:
chicago.head()

Unnamed: 0_level_0,POSITION TITLE,DEPARTMENT,EMPLOYEE ANNUAL SALARY
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Aaron, Elvia J",WATER RATE TAKER,WATER MGMNT,$90744.00
"Aaron, Jeffery M",POLICE OFFICER,POLICE,$84450.00
"Aaron, Karina",POLICE OFFICER,POLICE,$84450.00
"Aaron, Kimberlei R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
"Abad Jr, Vicente M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00


# Split() strings by characters

In [54]:
chicago = pd.read_csv("chicago.csv").dropna(how="all")
chicago['Department'] = chicago['Department'].astype('category')
chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00


In [55]:
# split will return a list of all the separate components of a given string 
"Hello my name is Gabby".split(" ")

['Hello', 'my', 'name', 'is', 'Gabby']

In [60]:
# here we want to get a count of the number of last names
# str.get() takes an index position and will get that value 
chicago['Name'].str.split(",").str.get(0).str.title().value_counts()

Williams              293
Johnson               244
Smith                 241
Brown                 185
Jones                 183
Rodriguez             171
Jackson               136
Garcia                130
Davis                 127
Hernandez             110
Martinez              108
Lopez                 106
Gonzalez              104
Perez                 100
Wilson                 94
Rivera                 90
Thomas                 89
Anderson               82
Torres                 81
Murphy                 80
Robinson               79
Moore                  78
Sanchez                76
Harris                 76
Miller                 75
Lewis                  74
Taylor                 73
Martin                 72
White                  66
Clark                  66
                     ... 
Patmon                  1
Wojciechowski           1
Bergendahl              1
Hoss                    1
Cromwell                1
Workman                 1
Zaller                  1
Schwertfeger

In [64]:
# Getting the most common word for the position title 
chicago['Position Title'].str.split(" ").str.get(0).value_counts()

POLICE                   10856
FIREFIGHTER-EMT           1509
SERGEANT                  1186
POOL                       918
FIREFIGHTER                810
CROSSING                   775
MOTOR                      721
SANITATION                 715
PARAMEDIC                  641
ASST                       606
TRAFFIC                    512
FIRE                       512
SENIOR                     470
CONSTRUCTION               452
LIEUTENANT-EMT             394
ADMINISTRATIVE             375
LIBRARY                    365
LIBRARIAN                  335
LIEUTENANT                 332
OPERATING                  324
ELECTRICAL                 313
AVIATION                   309
FIREFIGHTER/PARAMEDIC      259
GENERAL                    257
STAFF                      250
CLERK                      242
FOREMAN                    237
HOISTING                   214
DEPUTY                     213
LABORER                    210
                         ...  
INTAKE                       1
MOBILE  

# More Practice with Splits

In [65]:
chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00


In [71]:
# Here we will get the most common first names which appear in the DF
chicago['Name'].str.split(",").str.get(1).str.strip().str.split(" ").str.get(0).value_counts().head()

MICHAEL    1153
JOHN        899
JAMES       676
ROBERT      622
JOSEPH      537
Name: Name, dtype: int64

# The expand and n Parameters of the .str.split() method 

In [72]:
chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00


In [75]:
# Here we introduce the expand parameter within split - this will return the data as a DF\
# Here we pass a list of columns names back to the original df
chicago[['First Name', 'Last Name']] = chicago['Name'].str.split(",", expand=True)

In [76]:
chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary,First Name,Last Name
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00,AARON,ELVIA J
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00,AARON,JEFFERY M
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00,AARON,KARINA
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00,AARON,KIMBERLEI R
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00,ABAD JR,VICENTE M


In [83]:
# In this example, n represents the max number of splits you'd like to do 
chicago[["First Title Word", "Remaining Words"]]=chicago['Position Title'].str.split(" ", expand=True, n = 1).head(10)

In [84]:
chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary,First Name,Last Name,First Title Word,Remaining Words
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00,AARON,ELVIA J,WATER,RATE TAKER
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00,AARON,JEFFERY M,POLICE,OFFICER
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00,AARON,KARINA,POLICE,OFFICER
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00,AARON,KIMBERLEI R,CHIEF,CONTRACT EXPEDITER
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00,ABAD JR,VICENTE M,CIVIL,ENGINEER IV
