# Filter Data

In [7]:
%load_ext lab_black
import pandas as pd

file_path = "/Users/fredrikjohannessen/Desktop/repositories/pandas-payground/data/"

The lab_black extension is already loaded. To reload it, use:
  %reload_ext lab_black


In [8]:
df = pd.read_csv(file_path + "employees.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   First Name         933 non-null    object 
 1   Gender             855 non-null    object 
 2   Start Date         1000 non-null   object 
 3   Last Login Time    1000 non-null   object 
 4   Salary             1000 non-null   int64  
 5   Bonus %            1000 non-null   float64
 6   Senior Management  933 non-null    object 
 7   Team               957 non-null    object 
dtypes: float64(1), int64(1), object(6)
memory usage: 62.6+ KB


In [20]:
# Convert "Start Date" to datetime
df["Last Login Time"] = pd.to_datetime(df["Start Date"] + " " + df["Last Login Time"])
df["Start Date"] = pd.to_datetime(df["Start Date"])
df.info()

In [41]:
# Convert data types to datetime objects when reading in
df = pd.read_csv(
    file_path + "employees.csv", parse_dates=["Start Date", "Last Login Time"]
)
# Convert other stuff
df["Senior Management"] = df["Senior Management"].astype("bool")
df["Gender"] = df["Gender"].astype("category")
df["Team"] = df["Team"].astype("category")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   First Name         933 non-null    object        
 1   Gender             855 non-null    category      
 2   Start Date         1000 non-null   datetime64[ns]
 3   Last Login Time    1000 non-null   datetime64[ns]
 4   Salary             1000 non-null   int64         
 5   Bonus %            1000 non-null   float64       
 6   Senior Management  1000 non-null   bool          
 7   Team               957 non-null    category      
dtypes: bool(1), category(2), datetime64[ns](2), float64(1), int64(1), object(1)
memory usage: 42.6+ KB


### Filter a DataFrame based on a condition

In [54]:
df[df["Gender"] == "Male"]
df[df["Team"] == "Finance"]
mask = df["Team"] == "Finance"
df[mask]
df[df["Senior Management"]]
df[df["Team"] != "Marketing"]
df[df["Salary"] > 110000]
df[df["Bonus %"] < 1.5]
df[df["Start Date"] <= "1985-01-01"]


Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
10,Louise,Female,1980-08-12,2023-02-28 09:01:00,63241,15.132,True,
12,Brandon,Male,1980-12-01,2023-02-28 01:08:00,112807,17.492,True,Human Resources
18,Diana,Female,1981-10-23,2023-02-28 10:27:00,132940,19.082,False,Client Services
28,Terry,Male,1981-11-27,2023-02-28 18:30:00,124008,13.464,True,Client Services
37,Linda,Female,1981-10-19,2023-02-28 20:49:00,57427,9.557,True,Client Services
...,...,...,...,...,...,...,...,...
982,Rose,Female,1982-04-06,2023-02-28 10:43:00,91411,8.639,True,Human Resources
983,John,Male,1982-12-23,2023-02-28 22:35:00,146907,11.738,False,Engineering
985,Stephen,,1983-07-10,2023-02-28 20:10:00,85668,1.909,False,Legal
986,Donna,Female,1982-11-26,2023-02-28 07:04:00,82871,17.999,False,Marketing


### Filter based on multiple conditions

In [66]:
mask1 = df["Gender"] == "Male"
mask2 = df["Team"] == "Marketing"
df[mask1 & mask2].head(3)

mask1 = df["Senior Management"]
mask2 = df["Start Date"] < "1990-01-01"
df[mask1 | mask2].head(3)

mask1 = df["First Name"] == "Robert"
mask2 = df["Team"] == "Client Services"
mask3 = df["Start Date"] > "2016-06-01"

df[(mask1 & mask2) | mask3]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
15,Lillian,Female,2016-06-05,2023-02-28 06:09:00,59414,1.256,False,Product
98,Tina,Female,2016-06-16,2023-02-28 19:47:00,100705,16.961,True,Marketing
387,Robert,Male,1994-10-29,2023-02-28 04:26:00,123294,19.894,False,Client Services
451,Terry,,2016-07-15,2023-02-28 00:29:00,140002,19.49,True,Marketing


### The `isin()` method

In [71]:
mask1 = df["Team"] == "Legal"
mask2 = df["Team"] == "Sales"
mask3 = df["Team"] == "Product"
df[mask1 | mask2 | mask3].head(3)

# Do this instead:
mask = df["Team"].isin(["Legal", "Sales", "Product"])
df[mask].head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
5,Dennis,Male,1987-04-18,2023-02-28 01:35:00,115163,10.125,False,Legal
6,Ruby,Female,1987-08-17,2023-02-28 16:20:00,65476,10.012,True,Product
11,Julie,Female,1997-10-26,2023-02-28 15:19:00,102508,12.637,True,Legal


### The `isnull()` and `notnull()` Methods

In [77]:
mask = df["Team"].isnull()
df[mask].head(3)

mask = df["Gender"].notnull()
df[mask].head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2023-02-28 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2023-02-28 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2023-02-28 11:17:00,130590,11.858,False,Finance


### The `between()` method

In [94]:
mask = df["Salary"].between(60000, 70000)
df[mask]
df[df["Bonus %"].between(2, 5)]

df[df["Start Date"].between("1990-01-01", "1992-01-01")].head(3)
df[df["Last Login Time"].between("08:30AM", "12:00PM")]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
2,Maria,Female,1993-04-23,2023-02-28 11:17:00,130590,11.858,False,Finance
7,,Female,2015-07-20,2023-02-28 10:43:00,45906,11.598,True,Finance
10,Louise,Female,1980-08-12,2023-02-28 09:01:00,63241,15.132,True,
18,Diana,Female,1981-10-23,2023-02-28 10:27:00,132940,19.082,False,Client Services
33,Jean,Female,1993-12-18,2023-02-28 09:07:00,119082,16.180,False,Business Development
...,...,...,...,...,...,...,...,...
963,Ann,Female,1994-09-23,2023-02-28 11:15:00,89443,17.940,True,Sales
977,Sarah,Female,1995-12-04,2023-02-28 09:16:00,124566,5.949,False,Product
982,Rose,Female,1982-04-06,2023-02-28 10:43:00,91411,8.639,True,Human Resources
988,Alice,Female,2004-10-05,2023-02-28 09:34:00,47638,11.209,False,Human Resources


### The `dulicated()` method

In [107]:
df["First Name"].duplicated(keep="first")
df["First Name"].duplicated(keep="last")
df["First Name"].duplicated(keep=False)

df[~df["First Name"].duplicated()]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2023-02-28 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2023-02-28 06:53:00,61933,4.170,True,
2,Maria,Female,1993-04-23,2023-02-28 11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,2023-02-28 13:00:00,138705,9.340,True,Finance
4,Larry,Male,1998-01-24,2023-02-28 16:47:00,101004,1.389,True,Client Services
...,...,...,...,...,...,...,...,...
712,Martin,,2001-02-06,2023-02-28 04:17:00,123963,15.745,True,Engineering
749,Janet,,1986-01-25,2023-02-28 05:48:00,85789,9.712,False,Legal
832,Keith,Male,2003-02-12,2023-02-28 15:02:00,120672,19.467,False,Legal
855,Phillip,,2003-10-20,2023-02-28 11:09:00,89700,2.277,True,


### The `drop_duplicates()` method

In [113]:
df.drop_duplicates(subset=["First Name", "Team"], keep=False)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
1,Thomas,Male,1996-03-31,2023-02-28 06:53:00,61933,4.170,True,
2,Maria,Female,1993-04-23,2023-02-28 11:17:00,130590,11.858,False,Finance
4,Larry,Male,1998-01-24,2023-02-28 16:47:00,101004,1.389,True,Client Services
5,Dennis,Male,1987-04-18,2023-02-28 01:35:00,115163,10.125,False,Legal
6,Ruby,Female,1987-08-17,2023-02-28 16:20:00,65476,10.012,True,Product
...,...,...,...,...,...,...,...,...
994,George,Male,2013-06-21,2023-02-28 17:47:00,98874,4.479,True,Marketing
995,Henry,,2014-11-23,2023-02-28 06:09:00,132483,16.655,False,Distribution
996,Phillip,Male,1984-01-31,2023-02-28 06:30:00,42392,19.675,False,Finance
998,Larry,Male,2013-04-20,2023-02-28 16:45:00,60500,11.985,False,Business Development


### The `unique()` and `nunique()` methods

In [116]:
df["Gender"].unique()
df["Gender"].nunique()

2