# Working with Text Data

In [None]:
import pandas as pd

In [None]:
chicago = pd.read_csv("chicago.csv")
chicago.Department.astype("category")
chicago.head(3)

In [None]:
chicago.info()

In [None]:
chicago.Department.nunique() # Strong candidate to be converted to a category

In [None]:
chicago.nunique()

In [None]:
chicago.Department.astype("category")

Decreased memory usage by converting departament to category (due to the low quantity of different data)

In [None]:
chicago.info()

## Common String Methods - .lower(), .upper(), .title() and .len()

In [None]:
chicago = pd.read_csv("chicago.csv")
chicago.Department.astype("category")
chicago.head(3)

In [None]:
"HELLO WORLD".lower()
"hello world".upper()

In [None]:
"hello world".title()

In [None]:
len("hello world")

Convert everything inside the column to the determined str funcion we're using

In [None]:
chicago.Name.str.lower()

In [None]:
chicago.Name.str.upper()

In [None]:
chicago.Name.str.title()

In [None]:
chicago["Position title"] = chicago["Position Title"].str.title()

In [None]:
chicago.head(3)

In [None]:
chicago["Department"].str.len()

## The .str.replace() method

In [None]:
chicago = pd.read_csv("chicago.csv").dropna(how = "all")
chicago.Department.astype("category")
chicago.tail(3)

In [None]:
"Hello world".replace("l", "!")

In [None]:
chicago.Department.str.replace("MGMNT", "MANAGEMENT")
chicago.head()

In [None]:
chicago["Employee Annual Salary"] = chicago["Employee Annual Salary"].str.replace("$","").astype(float)

In [None]:
chicago.info()

In [None]:
chicago["Employee Annual Salary"].mean()
chicago["Employee Annual Salary"].nlargest()

In [None]:
## Filtering with String Methods

In [None]:
chicago = pd.read_csv("chicago.csv").dropna(how = "all")
chicago.Department.astype("category")
chicago.tail(3)

Generally, if we filter some data, we should pass the string to lower case (avoiding different cases)

In [None]:
mask = chicago["Position Title"].str.lower().str.contains("water")
chicago[mask]

In [None]:
chicago[chicago["Position Title"].str.lower().str.startswith("water")]

In [None]:
mask = chicago["Position Title"].str.lower().str.endswith("ist")
chicago[mask]

## More String Methods - .strip(), .lstrip() and .rstrip()

In [67]:
chicago = pd.read_csv("chicago.csv").dropna(how = "all")
chicago.Department.astype("category")
chicago.tail(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
32059,"ZYMANTAS, MARK E",POLICE OFFICER,POLICE,$84450.00
32060,"ZYRKOWSKI, CARLO E",POLICE OFFICER,POLICE,$87384.00
32061,"ZYSKOWSKI, DARIUSZ",CHIEF DATA BASE ANALYST,DoIT,$113664.00


In [48]:
"       Hello World     ".lstrip()

'Hello World     '

In [50]:
"       Hello World     ".rstrip()

'       Hello World'

In [51]:
"       Hello World     ".strip()

'Hello World'

In [60]:
chicago.Name

0            AARON,  ELVIA J
1          AARON,  JEFFERY M
2             AARON,  KARINA
3        AARON,  KIMBERLEI R
4        ABAD JR,  VICENTE M
                ...         
32057    ZYGADLO,  MICHAEL J
32058     ZYGOWICZ,  PETER J
32059      ZYMANTAS,  MARK E
32060    ZYRKOWSKI,  CARLO E
32061    ZYSKOWSKI,  DARIUSZ
Name: Name, Length: 32062, dtype: object

In [65]:
chicago.Name.str.lstrip()
chicago.Name.str.rstrip()
chicago.Name = chicago.Name.str.lstrip()

In [66]:
chicago.Name

0            AARON,  ELVIA J
1          AARON,  JEFFERY M
2             AARON,  KARINA
3        AARON,  KIMBERLEI R
4        ABAD JR,  VICENTE M
                ...         
32057    ZYGADLO,  MICHAEL J
32058     ZYGOWICZ,  PETER J
32059      ZYMANTAS,  MARK E
32060    ZYRKOWSKI,  CARLO E
32061    ZYSKOWSKI,  DARIUSZ
Name: Name, Length: 32062, dtype: object

In [59]:
chicago["Position Title"].str.strip()

0                      WATER RATE TAKER
1                        POLICE OFFICER
2                        POLICE OFFICER
3              CHIEF CONTRACT EXPEDITER
4                     CIVIL ENGINEER IV
                      ...              
32057    FRM OF MACHINISTS - AUTOMOTIVE
32058                    POLICE OFFICER
32059                    POLICE OFFICER
32060                    POLICE OFFICER
32061           CHIEF DATA BASE ANALYST
Name: Position Title, Length: 32062, dtype: object

## String Methods on Index and Columns

In [76]:
chicago = pd.read_csv("chicago.csv", index_col="Name").dropna(how = "all")
chicago.Department.astype("category")
chicago.tail(3)

Unnamed: 0_level_0,Position Title,Department,Employee Annual Salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"ZYMANTAS, MARK E",POLICE OFFICER,POLICE,$84450.00
"ZYRKOWSKI, CARLO E",POLICE OFFICER,POLICE,$87384.00
"ZYSKOWSKI, DARIUSZ",CHIEF DATA BASE ANALYST,DoIT,$113664.00


In [78]:
chicago.index = chicago.index.str.strip().str.title()

In [80]:
chicago.index
chicago.head(3)

Unnamed: 0_level_0,Position Title,Department,Employee Annual Salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Aaron, Elvia J",WATER RATE TAKER,WATER MGMNT,$90744.00
"Aaron, Jeffery M",POLICE OFFICER,POLICE,$84450.00
"Aaron, Karina",POLICE OFFICER,POLICE,$84450.00


In [83]:
chicago.columns = chicago.columns.str.upper()

In [84]:
chicago.head(2)

Unnamed: 0_level_0,POSITION TITLE,DEPARTMENT,EMPLOYEE ANNUAL SALARY
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Aaron, Elvia J",WATER RATE TAKER,WATER MGMNT,$90744.00
"Aaron, Jeffery M",POLICE OFFICER,POLICE,$84450.00


##  Split strings by Characters

Return a list with all the splitted values

In [88]:
chicago = pd.read_csv("chicago.csv").dropna(how = "all")
chicago.Department.astype("category")
chicago.tail(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
32059,"ZYMANTAS, MARK E",POLICE OFFICER,POLICE,$84450.00
32060,"ZYRKOWSKI, CARLO E",POLICE OFFICER,POLICE,$87384.00
32061,"ZYSKOWSKI, DARIUSZ",CHIEF DATA BASE ANALYST,DoIT,$113664.00


In [86]:
"Hello my name is Fabio".split(" ")

['Hello', 'my', 'name', 'is', 'Fabio']

We should retrieve a value in the determined index position we want to access using get. str.get(index)

In [94]:
chicago.Name.str.split(",").str.get(1).str.title().value_counts()

  Michael J    270
  Michael      165
  Michael A    158
  David        128
  Thomas J     120
              ... 
  Sharlyn M      1
  Rickey S       1
  Renard         1
  Ernika L       1
  Carlo E        1
Name: Name, Length: 11794, dtype: int64

In [97]:
chicago["Position Title"].str.split(" ").str.get(0).str.title().value_counts()

Police             10856
Firefighter-Emt     1509
Sergeant            1186
Pool                 918
Firefighter          810
                   ...  
Dentist                1
Assoc                  1
Telephone              1
Mayor                  1
Prepress               1
Name: Position Title, Length: 320, dtype: int64

### More practice with Splits

In [100]:
chicago = pd.read_csv("chicago.csv").dropna(how = "all")
chicago.Department = chicago.Department.astype("category")
chicago.tail(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
32059,"ZYMANTAS, MARK E",POLICE OFFICER,POLICE,$84450.00
32060,"ZYRKOWSKI, CARLO E",POLICE OFFICER,POLICE,$87384.00
32061,"ZYSKOWSKI, DARIUSZ",CHIEF DATA BASE ANALYST,DoIT,$113664.00


In [104]:
chicago.Name.str.split(",").str.get(0).value_counts().head(3)

WILLIAMS    293
JOHNSON     244
SMITH       241
Name: Name, dtype: int64

In [109]:
chicago.Name.str.split(",").str.get(1).str.split(" ")

0            [, , ELVIA, J]
1          [, , JEFFERY, M]
2              [, , KARINA]
3        [, , KIMBERLEI, R]
4          [, , VICENTE, M]
                ...        
32057      [, , MICHAEL, J]
32058        [, , PETER, J]
32059         [, , MARK, E]
32060        [, , CARLO, E]
32061         [, , DARIUSZ]
Name: Name, Length: 32062, dtype: object

In [113]:
chicago.Name.str.split(",").str.get(1).str.strip().str.split(" ").str.get(0).value_counts().head(3)

MICHAEL    1153
JOHN        899
JAMES       676
Name: Name, dtype: int64

## The expand and n Parameters of the str.split() Method

In [None]:
chicago = pd.read_csv("chicago.csv").dropna(how = "all")
chicago.Department = chicago.Department.astype("category")
chicago.tail(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
32059,"ZYMANTAS, MARK E",POLICE OFFICER,POLICE,$84450.00
32060,"ZYRKOWSKI, CARLO E",POLICE OFFICER,POLICE,$87384.00
32061,"ZYSKOWSKI, DARIUSZ",CHIEF DATA BASE ANALYST,DoIT,$113664.00


Expand parameter -> will return a dataframe instead of a list

Creating two new columns and assigning values of another dataframe:

In [117]:
chicago[["First Name", "Last Name"]] = chicago.Name.str.split(",", expand=True)

In [119]:
chicago.head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary,First Name,Last Name
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00,AARON,ELVIA J
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00,AARON,JEFFERY M
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00,AARON,KARINA


Different number of spaces on each cell:

In [120]:
chicago["Position Title"].str.split(" ", expand= True)

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,WATER,RATE,TAKER,,,,,,
1,POLICE,OFFICER,,,,,,,
2,POLICE,OFFICER,,,,,,,
3,CHIEF,CONTRACT,EXPEDITER,,,,,,
4,CIVIL,ENGINEER,IV,,,,,,
...,...,...,...,...,...,...,...,...,...
32057,FRM,OF,MACHINISTS,-,AUTOMOTIVE,,,,
32058,POLICE,OFFICER,,,,,,,
32059,POLICE,OFFICER,,,,,,,
32060,POLICE,OFFICER,,,,,,,


N parameters -> maximum number of the character we want to split that should be splitted

In [124]:
chicago[["First Title Word", "Remaining Words"]] = chicago["Position Title"].str.split(" ", expand= True, n = 1)

In [125]:
chicago.head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary,First Name,Last Name,First Title Word,Remaining Words
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00,AARON,ELVIA J,WATER,RATE TAKER
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00,AARON,JEFFERY M,POLICE,OFFICER
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00,AARON,KARINA,POLICE,OFFICER
