# Filter Data

In [1]:
import os
import pandas as pd
import jupyter_black

jupyter_black.load()

current_dir = os.path.dirname(os.path.abspath("__file__"))

### Set file path to datasets

In [2]:
employees_path = os.path.join(current_dir, "data", "employees.csv")

In [3]:
df = pd.read_csv(employees_path)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   First Name         933 non-null    object 
 1   Gender             855 non-null    object 
 2   Start Date         1000 non-null   object 
 3   Last Login Time    1000 non-null   object 
 4   Salary             1000 non-null   int64  
 5   Bonus %            1000 non-null   float64
 6   Senior Management  933 non-null    object 
 7   Team               957 non-null    object 
dtypes: float64(1), int64(1), object(6)
memory usage: 62.6+ KB


In [4]:
# Convert "Start Date" to datetime
df["Last Login Time"] = pd.to_datetime(df["Start Date"] + " " + df["Last Login Time"])
df["Start Date"] = pd.to_datetime(df["Start Date"])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   First Name         933 non-null    object        
 1   Gender             855 non-null    object        
 2   Start Date         1000 non-null   datetime64[ns]
 3   Last Login Time    1000 non-null   datetime64[ns]
 4   Salary             1000 non-null   int64         
 5   Bonus %            1000 non-null   float64       
 6   Senior Management  933 non-null    object        
 7   Team               957 non-null    object        
dtypes: datetime64[ns](2), float64(1), int64(1), object(4)
memory usage: 62.6+ KB


In [5]:
# Convert data types to datetime objects when reading in
df = pd.read_csv(employees_path, parse_dates=["Start Date", "Last Login Time"])
# Convert other stuff
df["Senior Management"] = df["Senior Management"].astype("bool")
df["Gender"] = df["Gender"].astype("category")
df["Team"] = df["Team"].astype("category")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   First Name         933 non-null    object        
 1   Gender             855 non-null    category      
 2   Start Date         1000 non-null   datetime64[ns]
 3   Last Login Time    1000 non-null   datetime64[ns]
 4   Salary             1000 non-null   int64         
 5   Bonus %            1000 non-null   float64       
 6   Senior Management  1000 non-null   bool          
 7   Team               957 non-null    category      
dtypes: bool(1), category(2), datetime64[ns](2), float64(1), int64(1), object(1)
memory usage: 42.6+ KB


### Filter a DataFrame based on a condition

In [6]:
df[df["Gender"] == "Male"]
df[df["Team"] == "Finance"]
mask = df["Team"] == "Finance"
df[mask]
df[df["Senior Management"]]
df[df["Team"] != "Marketing"]
df[df["Salary"] > 110000]
df[df["Bonus %"] < 1.5]
df[df["Start Date"] <= "1985-01-01"].head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
10,Louise,Female,1980-08-12,2023-03-15 09:01:00,63241,15.132,True,
12,Brandon,Male,1980-12-01,2023-03-15 01:08:00,112807,17.492,True,Human Resources
18,Diana,Female,1981-10-23,2023-03-15 10:27:00,132940,19.082,False,Client Services


### Filter based on multiple conditions

In [7]:
mask1 = df["Gender"] == "Male"
mask2 = df["Team"] == "Marketing"
df[mask1 & mask2].head(3)

mask1 = df["Senior Management"]
mask2 = df["Start Date"] < "1990-01-01"
df[mask1 | mask2].head(3)

mask1 = df["First Name"] == "Robert"
mask2 = df["Team"] == "Client Services"
mask3 = df["Start Date"] > "2016-06-01"

df[(mask1 & mask2) | mask3]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
15,Lillian,Female,2016-06-05,2023-03-15 06:09:00,59414,1.256,False,Product
98,Tina,Female,2016-06-16,2023-03-15 19:47:00,100705,16.961,True,Marketing
387,Robert,Male,1994-10-29,2023-03-15 04:26:00,123294,19.894,False,Client Services
451,Terry,,2016-07-15,2023-03-15 00:29:00,140002,19.49,True,Marketing


### The `isin()` method

In [8]:
mask1 = df["Team"] == "Legal"
mask2 = df["Team"] == "Sales"
mask3 = df["Team"] == "Product"
df[mask1 | mask2 | mask3].head(3)

# Do this instead:
mask = df["Team"].isin(["Legal", "Sales", "Product"])
df[mask].head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
5,Dennis,Male,1987-04-18,2023-03-15 01:35:00,115163,10.125,False,Legal
6,Ruby,Female,1987-08-17,2023-03-15 16:20:00,65476,10.012,True,Product
11,Julie,Female,1997-10-26,2023-03-15 15:19:00,102508,12.637,True,Legal


### The `isnull()` and `notnull()` Methods

In [9]:
mask = df["Team"].isnull()
df[mask].head(3)

mask = df["Gender"].notnull()
df[mask].head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2023-03-15 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2023-03-15 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2023-03-15 11:17:00,130590,11.858,False,Finance


### The `between()` method

In [10]:
mask = df["Salary"].between(60000, 70000)
df[mask]
df[df["Bonus %"].between(2, 5)]

df[df["Start Date"].between("1990-01-01", "1992-01-01")].head(3)
df[df["Last Login Time"].between("08:30AM", "12:00PM")].head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
2,Maria,Female,1993-04-23,2023-03-15 11:17:00,130590,11.858,False,Finance
7,,Female,2015-07-20,2023-03-15 10:43:00,45906,11.598,True,Finance
10,Louise,Female,1980-08-12,2023-03-15 09:01:00,63241,15.132,True,


### The `dulicated()` method

In [11]:
df["First Name"].duplicated(keep="first")
df["First Name"].duplicated(keep="last")
df["First Name"].duplicated(keep=False)

df[~df["First Name"].duplicated()].head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2023-03-15 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2023-03-15 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2023-03-15 11:17:00,130590,11.858,False,Finance


### The `drop_duplicates()` method

In [12]:
df.drop_duplicates(subset=["First Name", "Team"], keep=False).head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
1,Thomas,Male,1996-03-31,2023-03-15 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2023-03-15 11:17:00,130590,11.858,False,Finance
4,Larry,Male,1998-01-24,2023-03-15 16:47:00,101004,1.389,True,Client Services


### The `unique()` and `nunique()` methods

In [13]:
df["Gender"].unique()
df["Gender"].nunique()

2