### Working with text data

In [1]:
# importing library

import pandas as pd

In [82]:
# importing dataset

employees_chicago = pd.read_csv('datasets/chicago.csv')

In [6]:
# checking shape of dataset

employees_chicago.shape

(32063, 4)

In [7]:
# checking head of dataset

employees_chicago.head(10)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00
5,"ABARCA, ANABEL",ASST TO THE ALDERMAN,CITY COUNCIL,$70764.00
6,"ABARCA, EMMANUEL",GENERAL LABORER - DSS,STREETS & SAN,$41849.60
7,"ABASCAL, REECE E",TRAFFIC CONTROL AIDE-HOURLY,OEMC,$20051.20
8,"ABBASI, CHRISTOPHER",STAFF ASST TO THE ALDERMAN,CITY COUNCIL,$49452.00
9,"ABBATACOLA, ROBERT J",ELECTRICAL MECHANIC,AVIATION,$93600.00


In [8]:
# checking info of dataset

employees_chicago.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32063 entries, 0 to 32062
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Name                    32062 non-null  object
 1   Position Title          32062 non-null  object
 2   Department              32062 non-null  object
 3   Employee Annual Salary  32062 non-null  object
dtypes: object(4)
memory usage: 1002.1+ KB


In [10]:
# Notice in the above info our file is taking around 1003+ KB of size now let's how can we optimize it.

# let's check how many unique values inDepartment column

employees_chicago['Department'].nunique()

35

In [15]:
# we can call nunique method on whole dataset 

employees_chicago.nunique()

Name                      31776
Position Title             1093
Department                   35
Employee Annual Salary     1156
dtype: int64

In [12]:
# now out of 32062 rows we have only 35 unique values inside department column 
# let's convert this into column

employees_chicago['Department'] = employees_chicago['Department'].astype("category")

In [14]:
# now after converting Department column into categorical let's check info of file

employees_chicago.info()

# Notice that size is reduced to 784.2+ KB from 1002+ KB almost by  222+ KB

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32063 entries, 0 to 32062
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype   
---  ------                  --------------  -----   
 0   Name                    32062 non-null  object  
 1   Position Title          32062 non-null  object  
 2   Department              32062 non-null  category
 3   Employee Annual Salary  32062 non-null  object  
dtypes: category(1), object(3)
memory usage: 784.2+ KB


In [16]:
# let's check the tail of dataset

employees_chicago.tail(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
32060,"ZYRKOWSKI, CARLO E",POLICE OFFICER,POLICE,$87384.00
32061,"ZYSKOWSKI, DARIUSZ",CHIEF DATA BASE ANALYST,DoIT,$113664.00
32062,,,,


In [34]:
# let's drop NaN from the dataframe
# let's import the dataframe with droping NA at the same time

employees_chicago = pd.read_csv('pandas datasets/chicago.csv').dropna(how = 'all')

In [35]:
# let's recheck tail of the dataset

employees_chicago.tail()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
32057,"ZYGADLO, MICHAEL J",FRM OF MACHINISTS - AUTOMOTIVE,GENERAL SERVICES,$99528.00
32058,"ZYGOWICZ, PETER J",POLICE OFFICER,POLICE,$87384.00
32059,"ZYMANTAS, MARK E",POLICE OFFICER,POLICE,$84450.00
32060,"ZYRKOWSKI, CARLO E",POLICE OFFICER,POLICE,$87384.00
32061,"ZYSKOWSKI, DARIUSZ",CHIEF DATA BASE ANALYST,DoIT,$113664.00


#### common strings method `.lower()`,`.upper()`,`.title()` and `.len()`

In [18]:
print("-------Convert all letters to Upper-------")
print("Hello World".upper())
print("------- Convert all letters to lowercase --------")
print("Hello World".lower())
print("------- Convert 1st letter to capital and make it title ---------")
print("hello world".title())
print("------- Return length of character including blank sapce --------")
print(len("Hello World"))

-------Convert all letters to Upper-------
HELLO WORLD
------- Convert all letters to lowercase --------
hello world
------- Convert 1st letter to capital and make it title ---------
Hello World
------- Return length of character including blank sapce --------
11


In [21]:
# Now let's check this implementation using pandas
# lower() method

employees_chicago['Name'].str.lower()

0            aaron,  elvia j
1          aaron,  jeffery m
2             aaron,  karina
3        aaron,  kimberlei r
4        abad jr,  vicente m
                ...         
32058     zygowicz,  peter j
32059      zymantas,  mark e
32060    zyrkowski,  carlo e
32061    zyskowski,  dariusz
32062                    NaN
Name: Name, Length: 32063, dtype: object

In [22]:
# upper() method

employees_chicago['Name'].str.lower().str.upper()

0            AARON,  ELVIA J
1          AARON,  JEFFERY M
2             AARON,  KARINA
3        AARON,  KIMBERLEI R
4        ABAD JR,  VICENTE M
                ...         
32058     ZYGOWICZ,  PETER J
32059      ZYMANTAS,  MARK E
32060    ZYRKOWSKI,  CARLO E
32061    ZYSKOWSKI,  DARIUSZ
32062                    NaN
Name: Name, Length: 32063, dtype: object

In [23]:
# let's explore title method

# title() method

employees_chicago['Name'].str.lower().str.title()

0            Aaron,  Elvia J
1          Aaron,  Jeffery M
2             Aaron,  Karina
3        Aaron,  Kimberlei R
4        Abad Jr,  Vicente M
                ...         
32058     Zygowicz,  Peter J
32059      Zymantas,  Mark E
32060    Zyrkowski,  Carlo E
32061    Zyskowski,  Dariusz
32062                    NaN
Name: Name, Length: 32063, dtype: object

In [24]:
# Let's change position column to title

employees_chicago['Position Title'] = employees_chicago['Position Title'].str.title()

In [25]:
employees_chicago.head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",Water Rate Taker,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",Police Officer,POLICE,$84450.00
2,"AARON, KARINA",Police Officer,POLICE,$84450.00


In [26]:
# let's calculate the length of the department

employees_chicago['Department'].str.len()

0        11.0
1         6.0
2         6.0
3        16.0
4        11.0
         ... 
32058     6.0
32059     6.0
32060     6.0
32061     4.0
32062     NaN
Name: Department, Length: 32063, dtype: float64

### The `.str.replace()` method

In [27]:
# importing dataset

employees_chicago = pd.read_csv('pandas datasets/chicago.csv')

In [31]:
# The .replace method takes two argument 1st the letter to replace and 2nd the letter to replaced with

("Hello World").replace("l","!")

'He!!o Wor!d'

In [37]:
# let's replace MGMNT with complete MANAGEMENT into our dataframe

employees_chicago['Department'] = employees_chicago['Department'].str.replace("MGMNT","MANAGEMENT")

0        WATER MANAGEMENT
1                  POLICE
2                  POLICE
3        GENERAL SERVICES
4        WATER MANAGEMENT
               ...       
32057    GENERAL SERVICES
32058              POLICE
32059              POLICE
32060              POLICE
32061                DoIT
Name: Department, Length: 32062, dtype: object

In [41]:
# let's remove the dollar dign from Employee Annual Salary as on amount we are going to we perform mathematical operations

employees_chicago['Employee Annual Salary'] = employees_chicago['Employee Annual Salary'].str.replace("$","").astype(float)

  employees_chicago['Employee Annual Salary'] = employees_chicago['Employee Annual Salary'].str.replace("$","").astype(float)


In [42]:
employees_chicago['Employee Annual Salary']

0         90744.0
1         84450.0
2         84450.0
3         89880.0
4        106836.0
           ...   
32057     99528.0
32058     87384.0
32059     84450.0
32060     87384.0
32061    113664.0
Name: Employee Annual Salary, Length: 32062, dtype: float64

In [44]:
# let's perform any mathematical operations

print(employees_chicago['Employee Annual Salary'].sum())
print(employees_chicago['Employee Annual Salary'].mean())

2571506375.36
80204.17863389682


### Filtering with string method

In [49]:
employees_chicago = pd.read_csv('datasets/chicago.csv').dropna(how = 'all')
employees_chicago['Department'] = employees_chicago['Department'].astype("category")
employees_chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00


In [50]:
employees_chicago.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32062 entries, 0 to 32061
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype   
---  ------                  --------------  -----   
 0   Name                    32062 non-null  object  
 1   Position Title          32062 non-null  object  
 2   Department              32062 non-null  category
 3   Employee Annual Salary  32062 non-null  object  
dtypes: category(1), object(3)
memory usage: 1.0+ MB


In [55]:
# let's filter the rows only where we have Position title column contains water anywhere in the name

mask = employees_chicago['Position Title'].str.lower().str.contains("water")
employees_chicago[mask]

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
554,"ALUISE, VINCENT G",FOREMAN OF WATER PIPE CONSTRUCTION,WATER MGMNT,$102440.00
671,"ANDER, PERRY A",WATER CHEMIST II,WATER MGMNT,$82044.00
685,"ANDERSON, ANDREW J",DISTRICT SUPERINTENDENT OF WATER DISTRIBUTION,WATER MGMNT,$109272.00
702,"ANDERSON, DONALD",FOREMAN OF WATER PIPE CONSTRUCTION,WATER MGMNT,$102440.00
...,...,...,...,...
29669,"VERMA, ANUPAM",MANAGING ENGINEER - WATER MANAGEMENT,WATER MGMNT,$111192.00
30239,"WASHINGTON, JOSEPH",WATER CHEMIST III,WATER MGMNT,$89676.00
30544,"WEST, THOMAS R",GEN SUPT OF WATER MANAGEMENT,WATER MGMNT,$115704.00
30991,"WILLIAMS, MATTHEW",FOREMAN OF WATER PIPE CONSTRUCTION,WATER MGMNT,$102440.00


In [57]:
# let's filter out the column Position Title where the name contains water at the beginning.
# Notice the output contains water at the starting only

mask = employees_chicago['Position Title'].str.lower().str.startswith('water')
employees_chicago[mask]

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
671,"ANDER, PERRY A",WATER CHEMIST II,WATER MGMNT,$82044.00
1054,"ASHLEY, KARMA T",WATER CHEMIST II,WATER MGMNT,$82044.00
1079,"ATKINS, JOANNA M",WATER CHEMIST II,WATER MGMNT,$82044.00
1181,"AZEEM, MOHAMMED A",WATER CHEMIST II,WATER MGMNT,$53172.00
...,...,...,...,...
28574,"THREATT, DENISE R",WATER QUALITY INSPECTOR,WATER MGMNT,$62004.00
28602,"TIGNOR, DARRYL B",WATER RATE TAKER,WATER MGMNT,$78948.00
28955,"TRAVIS COOK, LESLIE R",WATER RATE TAKER,WATER MGMNT,$78948.00
29584,"VELAZQUEZ, JOHN",WATER RATE TAKER,WATER MGMNT,$78948.00


In [59]:
# let's do this for end

mask = employees_chicago['Position Title'].str.lower().str.endswith("ist")
employees_chicago[mask]

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
184,"AFROZ, NAYYAR",PSYCHIATRIST,HEALTH,$99840.00
308,"ALARCON, LUIS J",LOAN PROCESSING SPECIALIST,COMMUNITY DEVELOPMENT,$81948.00
422,"ALLAIN, CAROLYN",SENIOR TELECOMMUNICATIONS SPECIALIST,DoIT,$89880.00
472,"ALLEN, ROBERT",MACHINIST,WATER MGMNT,$94328.00
705,"ANDERSON, EDWARD M",SR PROCUREMENT SPECIALIST,PROCUREMENT,$91476.00
...,...,...,...,...
31667,"YODER, TERESA G",ARCHIVAL SPECIALIST,PUBLIC LIBRARY,$74304.00
31688,"YOUNGBLOOM, LAURENCE G",CRIMES SURVEILLANCE SPECIALIST,OEMC,$19676.80
31717,"YOUNG, KIMBERLY M",SR PROCUREMENT SPECIALIST,PROCUREMENT,$68556.00
31837,"ZAPATA, HUGO",SR PROCUREMENT SPECIALIST,PROCUREMENT,$87324.00


### The `.strip()` `.lstrip()` `.rstrip()`

In [61]:
# All the method above is used to remove the blank spaces

print("--------- remove blank space from both sides ---------")
print("        Hello World     ".strip())

print("--------- remove blank space from left side ---------")
print("        Hello World     ".lstrip())

print("--------- remove blank space from right side ---------")
print("        Hello World     ".rstrip())

--------- remove blank space from both sides ---------
Hello World
--------- remove blank space from left side ---------
Hello World     
--------- remove blank space from right side ---------
        Hello World


In [63]:
# let's remove the blank spaces from Name column

employees_chicago['Name'].str.strip()

0            AARON,  ELVIA J
1          AARON,  JEFFERY M
2             AARON,  KARINA
3        AARON,  KIMBERLEI R
4        ABAD JR,  VICENTE M
                ...         
32057    ZYGADLO,  MICHAEL J
32058     ZYGOWICZ,  PETER J
32059      ZYMANTAS,  MARK E
32060    ZYRKOWSKI,  CARLO E
32061    ZYSKOWSKI,  DARIUSZ
Name: Name, Length: 32062, dtype: object

In [64]:
# let's remove the blank spaces from Name column

employees_chicago['Name'].str.lstrip()

0            AARON,  ELVIA J
1          AARON,  JEFFERY M
2             AARON,  KARINA
3        AARON,  KIMBERLEI R
4        ABAD JR,  VICENTE M
                ...         
32057    ZYGADLO,  MICHAEL J
32058     ZYGOWICZ,  PETER J
32059      ZYMANTAS,  MARK E
32060    ZYRKOWSKI,  CARLO E
32061    ZYSKOWSKI,  DARIUSZ
Name: Name, Length: 32062, dtype: object

In [67]:
# we can method chain this

employees_chicago['Name'] = employees_chicago['Name'].str.strip().str.rstrip().str.lstrip()
employees_chicago['Name']

0            AARON,  ELVIA J
1          AARON,  JEFFERY M
2             AARON,  KARINA
3        AARON,  KIMBERLEI R
4        ABAD JR,  VICENTE M
                ...         
32057    ZYGADLO,  MICHAEL J
32058     ZYGOWICZ,  PETER J
32059      ZYMANTAS,  MARK E
32060    ZYRKOWSKI,  CARLO E
32061    ZYSKOWSKI,  DARIUSZ
Name: Name, Length: 32062, dtype: object

### String method on index and column

In [69]:
# let's import the dataset again by using Name column as index

employees_chicago = pd.read_csv('datasets/chicago.csv',index_col="Name").dropna(how = 'all')
employees_chicago['Department'] = employees_chicago['Department'].astype("category")
employees_chicago.tail()

Unnamed: 0_level_0,Position Title,Department,Employee Annual Salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"ZYGADLO, MICHAEL J",FRM OF MACHINISTS - AUTOMOTIVE,GENERAL SERVICES,$99528.00
"ZYGOWICZ, PETER J",POLICE OFFICER,POLICE,$87384.00
"ZYMANTAS, MARK E",POLICE OFFICER,POLICE,$84450.00
"ZYRKOWSKI, CARLO E",POLICE OFFICER,POLICE,$87384.00
"ZYSKOWSKI, DARIUSZ",CHIEF DATA BASE ANALYST,DoIT,$113664.00


In [75]:
# let's do for the rows

employees_chicago.index = employees_chicago.index.str.strip().str.title()
employees_chicago.index

Index(['Aaron,  Elvia J', 'Aaron,  Jeffery M', 'Aaron,  Karina',
       'Aaron,  Kimberlei R', 'Abad Jr,  Vicente M', 'Abarca,  Anabel',
       'Abarca,  Emmanuel', 'Abascal,  Reece E', 'Abbasi,  Christopher',
       'Abbatacola,  Robert J',
       ...
       'Zwit,  Jeffrey J', 'Zwolfer,  Matthew W', 'Zych,  Mateusz',
       'Zydek,  Bryan', 'Zygadlo,  John P', 'Zygadlo,  Michael J',
       'Zygowicz,  Peter J', 'Zymantas,  Mark E', 'Zyrkowski,  Carlo E',
       'Zyskowski,  Dariusz'],
      dtype='object', name='Name', length=32062)

In [79]:
# let's do for column

employees_chicago.columns =  employees_chicago.columns.str.upper()
employees_chicago.columns

Index(['POSITION TITLE', 'DEPARTMENT', 'EMPLOYEE ANNUAL SALARY'], dtype='object')

### The `split()` method

In [80]:
"Hello this is for test".split()

['Hello', 'this', 'is', 'for', 'test']

In [83]:
# let's drop NaN from the dataframe
# let's import the dataframe with droping NA at the same time

employees_chicago = pd.read_csv('datasets/chicago.csv').dropna(how = 'all')

In [92]:
# Let's seperate first and lastname based on comma

employees_chicago['Name'].str.split(",").str.get(0).str.title().value_counts()

Williams     293
Johnson      244
Smith        241
Brown        185
Jones        183
            ... 
Ogasawara      1
De Franco      1
Zunich         1
Trakes         1
Vaca           1
Name: Name, Length: 13829, dtype: int64

### The `expand` and `n` parameters of .split method()

In [93]:
# let's import the dataset again by using Name column as index

employees_chicago = pd.read_csv('datasets/chicago.csv').dropna(how = 'all')
employees_chicago['Department'] = employees_chicago['Department'].astype("category")
employees_chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00


In [97]:
employees_chicago[['firstname','lastname']] =  employees_chicago['Name'].str.split(",",expand = True)

In [98]:
employees_chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary,firstname,lastname
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00,AARON,ELVIA J
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00,AARON,JEFFERY M
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00,AARON,KARINA
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00,AARON,KIMBERLEI R
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00,ABAD JR,VICENTE M
