In [1]:
import pandas as pd

# Memory Optimization

In [20]:
# Load the CSV and let read_csv convert the two date-like columns to datetime
# objects through its parse_dates parameter. This is the one-step equivalent of
# loading the strings first and calling pd.to_datetime() on each column.
# "Last Login Time" is an incomplete date time, so its date part defaults to the
# day the file is read.
#
# The remaining conversions are applied in a single astype() call:
#   * "Senior Management": true/false values -> bool
#     NOTE(review): astype("bool") cannot hold missing values - a NaN would
#     become True. This assumes the column is fully populated; TODO confirm.
#   * "Gender": male, female and null -> category
df = pd.read_csv(
    "employees.csv",
    parse_dates=["Start Date", "Last Login Time"],
).astype({"Senior Management": "bool", "Gender": "category"})

df.head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2019-04-03 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2019-04-03 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2019-04-03 11:17:00,130590,11.858,False,Finance


In [21]:
# As you can see, the dtype conversions cleaned up the data set in order to
# reduce its memory usage (reported on the last line of the summary)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
First Name           933 non-null object
Gender               855 non-null category
Start Date           1000 non-null datetime64[ns]
Last Login Time      1000 non-null datetime64[ns]
Salary               1000 non-null int64
Bonus %              1000 non-null float64
Senior Management    1000 non-null bool
Team                 957 non-null object
dtypes: bool(1), category(1), datetime64[ns](2), float64(1), int64(1), object(2)
memory usage: 49.0+ KB


# Filter a DataFrame Based on a Condition

In [22]:
# Here we will extract certain rows which meet a given condition
# First, a reminder of what the data looks like
df.head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2019-04-03 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2019-04-03 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2019-04-03 11:17:00,130590,11.858,False,Finance


In [28]:
# Filter on a single column: the comparison builds a True/False Series and
# .loc keeps only the rows where it is True - in our example, gender == Male
df.loc[df["Gender"] == "Male"].head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2019-04-03 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2019-04-03 06:53:00,61933,4.17,True,
3,Jerry,Male,2005-03-04,2019-04-03 13:00:00,138705,9.34,True,Finance
4,Larry,Male,1998-01-24,2019-04-03 16:47:00,101004,1.389,True,Client Services
5,Dennis,Male,1987-04-18,2019-04-03 01:35:00,115163,10.125,False,Legal


In [29]:
# Same pattern on another column: keep only the members of the Finance team
df.loc[df["Team"] == "Finance"].head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
2,Maria,Female,1993-04-23,2019-04-03 11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,2019-04-03 13:00:00,138705,9.34,True,Finance
7,,Female,2015-07-20,2019-04-03 10:43:00,45906,11.598,True,Finance
14,Kimberly,Female,1999-01-14,2019-04-03 07:13:00,41426,14.543,True,Finance
46,Bruce,Male,2009-11-28,2019-04-03 22:47:00,114796,6.796,False,Finance


In [31]:
# A more readable way to filter is to store the boolean condition in its own
# variable and then pass that variable to df[]
Marketing = df["Team"] == "Marketing"
df[Marketing].head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2019-04-03 12:42:00,97308,6.945,True,Marketing
21,Matthew,Male,1995-09-05,2019-04-03 02:12:00,100612,13.645,False,Marketing
26,Craig,Male,2000-02-27,2019-04-03 07:45:00,37598,7.757,True,Marketing
43,Marilyn,Female,1980-12-07,2019-04-03 03:16:00,73524,5.207,True,Marketing
62,,Female,2007-06-12,2019-04-03 17:25:00,58112,19.414,True,Marketing


In [33]:
# A boolean column is a special case: it can be passed to df[] directly,
# and only the rows where the value is True are returned
df[df["Senior Management"]].head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2019-04-03 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2019-04-03 06:53:00,61933,4.17,True,
3,Jerry,Male,2005-03-04,2019-04-03 13:00:00,138705,9.34,True,Finance
4,Larry,Male,1998-01-24,2019-04-03 16:47:00,101004,1.389,True,Client Services
6,Ruby,Female,1987-08-17,2019-04-03 16:20:00,65476,10.012,True,Product


In [37]:
# Return the rows where the team is not Marketing
# Note: rows with a missing Team also pass this test (see row 1 in the output)
df[df["Team"] != "Marketing"].head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
1,Thomas,Male,1996-03-31,2019-04-03 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2019-04-03 11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,2019-04-03 13:00:00,138705,9.34,True,Finance
4,Larry,Male,1998-01-24,2019-04-03 16:47:00,101004,1.389,True,Client Services
5,Dennis,Male,1987-04-18,2019-04-03 01:35:00,115163,10.125,False,Legal


In [40]:
# Salary of at least 100K (>= also includes a salary of exactly 100,000)
df[df["Salary"] >= 100000].head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
2,Maria,Female,1993-04-23,2019-04-03 11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,2019-04-03 13:00:00,138705,9.34,True,Finance
4,Larry,Male,1998-01-24,2019-04-03 16:47:00,101004,1.389,True,Client Services
5,Dennis,Male,1987-04-18,2019-04-03 01:35:00,115163,10.125,False,Legal
9,Frances,Female,2002-08-08,2019-04-03 06:51:00,139852,7.524,True,Business Development


In [43]:
# Employees with a bonus of less than 2%
df[df["Bonus %"] < 2].head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
4,Larry,Male,1998-01-24,2019-04-03 16:47:00,101004,1.389,True,Client Services
15,Lillian,Female,2016-06-05,2019-04-03 06:09:00,59414,1.256,False,Product
19,Donna,Female,2010-07-22,2019-04-03 03:48:00,81014,1.894,False,Product
52,Todd,Male,1990-02-18,2019-04-03 02:41:00,49339,1.695,True,Human Resources
58,Theresa,Female,2010-04-11,2019-04-03 07:18:00,72670,1.481,True,Engineering


In [46]:
# Start date on or before a given date (<= is inclusive) - enter the date as a string
old = df["Start Date"] <= "1985-01-01"
df[old].head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
10,Louise,Female,1980-08-12,2019-04-03 09:01:00,63241,15.132,True,
12,Brandon,Male,1980-12-01,2019-04-03 01:08:00,112807,17.492,True,Human Resources
18,Diana,Female,1981-10-23,2019-04-03 10:27:00,132940,19.082,False,Client Services
28,Terry,Male,1981-11-27,2019-04-03 18:30:00,124008,13.464,True,Client Services
37,Linda,Female,1981-10-19,2019-04-03 20:49:00,57427,9.557,True,Client Services


# Filter with More than One Condition (AND - &)

In [47]:
# We can combine conditions with the AND operator (&) or the OR operator (|)

In [56]:
# One boolean condition per variable
males = df["Gender"] == "Male"
finance = df["Team"] == "Finance"

# Here we combine the conditions with & so that only the rows meeting both are returned
df[males & finance].head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
3,Jerry,Male,2005-03-04,2019-04-03 13:00:00,138705,9.34,True,Finance
46,Bruce,Male,2009-11-28,2019-04-03 22:47:00,114796,6.796,False,Finance
56,Carl,Male,2006-05-03,2019-04-03 17:55:00,130276,16.084,True,Finance
68,Jose,Male,2004-10-30,2019-04-03 13:39:00,84834,14.33,True,Finance
83,Shawn,Male,2005-09-23,2019-04-03 02:55:00,148115,6.539,True,Finance


# Filter with More than One Condition (OR)

In [61]:
# Filter for senior management OR a start date on or before 1985-01-01
# `old` is the start-date condition created in an earlier cell
# The OR operator is '|'
management = df["Senior Management"]
df[old | management].head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2019-04-03 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2019-04-03 06:53:00,61933,4.17,True,
3,Jerry,Male,2005-03-04,2019-04-03 13:00:00,138705,9.34,True,Finance
4,Larry,Male,1998-01-24,2019-04-03 16:47:00,101004,1.389,True,Client Services
6,Ruby,Female,1987-08-17,2019-04-03 16:20:00,65476,10.012,True,Product


In [None]:
# Either first name Robert AND team Client Services, OR a start date on or after June 1, 2016

In [65]:
# One condition per variable; >= makes the start-date cutoff inclusive
robert = df["First Name"] == "Robert"
client_service = df["Team"] == "Client Services"
start = df["Start Date"] >= "2016-06-01"
# The parentheses group the AND pair before it is OR-ed with the start-date condition
df[(robert & client_service) | start]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
15,Lillian,Female,2016-06-05,2019-04-03 06:09:00,59414,1.256,False,Product
98,Tina,Female,2016-06-16,2019-04-03 19:47:00,100705,16.961,True,Marketing
387,Robert,Male,1994-10-29,2019-04-03 04:26:00,123294,19.894,False,Client Services
451,Terry,,2016-07-15,2019-04-03 00:29:00,140002,19.49,True,Marketing


# the .isin() Method

In [68]:
# .isin() tests every Team value against a list of accepted values, which
# replaces a chain of == comparisons joined with |
df.loc[df["Team"].isin(["Legal", "Sales", "Product"])].head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
5,Dennis,Male,1987-04-18,2019-04-03 01:35:00,115163,10.125,False,Legal
6,Ruby,Female,1987-08-17,2019-04-03 16:20:00,65476,10.012,True,Product
11,Julie,Female,1997-10-26,2019-04-03 15:19:00,102508,12.637,True,Legal
13,Gary,Male,2008-01-27,2019-04-03 23:40:00,109831,5.831,False,Sales
15,Lillian,Female,2016-06-05,2019-04-03 06:09:00,59414,1.256,False,Product


# The .isnull() and .notnull() Methods

In [71]:
# .isnull() checks every value of a given column for missing data
# It returns True where the value is null
mask = df["Team"].isnull()
df[mask].head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
1,Thomas,Male,1996-03-31,2019-04-03 06:53:00,61933,4.17,True,
10,Louise,Female,1980-08-12,2019-04-03 09:01:00,63241,15.132,True,
23,,Male,2012-06-14,2019-04-03 16:19:00,125792,5.042,True,
32,,Male,1998-08-21,2019-04-03 14:27:00,122340,6.417,True,
91,James,,2005-01-26,2019-04-03 23:00:00,128771,8.309,False,


In [74]:
# .notnull() returns True only where the value in the column is NOT null
df[df["Gender"].notnull()].head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2019-04-03 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2019-04-03 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2019-04-03 11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,2019-04-03 13:00:00,138705,9.34,True,Finance
4,Larry,Male,1998-01-24,2019-04-03 16:47:00,101004,1.389,True,Client Services


# .between() Method

In [77]:
# Keep the rows whose salary lies in the 60k - 80k range
# .between() includes both endpoints
df.loc[df["Salary"].between(left=60000, right=80000)].head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
1,Thomas,Male,1996-03-31,2019-04-03 06:53:00,61933,4.17,True,
6,Ruby,Female,1987-08-17,2019-04-03 16:20:00,65476,10.012,True,Product
10,Louise,Female,1980-08-12,2019-04-03 09:01:00,63241,15.132,True,
20,Lois,,1995-04-22,2019-04-03 19:18:00,64714,4.934,True,Legal
29,Benjamin,Male,2005-01-26,2019-04-03 22:06:00,79529,7.008,True,Legal


In [80]:
# Bonus between 2% and 5%, both endpoints included
df[df["Bonus %"].between(2.0, 5.0)].head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
1,Thomas,Male,1996-03-31,2019-04-03 06:53:00,61933,4.17,True,
20,Lois,,1995-04-22,2019-04-03 19:18:00,64714,4.934,True,Legal
40,Michael,Male,2008-10-10,2019-04-03 11:25:00,99283,2.665,True,Distribution
49,Chris,,1980-01-24,2019-04-03 12:13:00,113590,3.055,False,Sales
60,Paula,,2005-11-23,2019-04-03 14:01:00,48866,4.271,False,Distribution


In [81]:
# .between() also works on a datetime column; the bounds are given as date strings
df.loc[df["Start Date"].between(left="1991-01-01", right="1992-01-01")]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
27,Scott,,1991-07-11,2019-04-03 18:58:00,122367,5.218,False,Legal
75,Bonnie,Female,1991-07-02,2019-04-03 01:27:00,104897,5.118,True,Human Resources
88,Donna,Female,1991-11-27,2019-04-03 13:59:00,64088,6.155,True,Legal
116,,Male,1991-06-22,2019-04-03 20:58:00,76189,18.988,True,Legal
148,Patrick,,1991-07-14,2019-04-03 02:24:00,124488,14.837,True,Sales
166,,Female,1991-07-09,2019-04-03 18:52:00,42341,7.014,True,Sales
172,Sara,Female,1991-09-23,2019-04-03 18:17:00,97058,9.402,False,Finance
220,,Female,1991-06-17,2019-04-03 12:49:00,71945,5.56,True,Marketing
245,Victor,Male,1991-04-11,2019-04-03 07:44:00,70817,17.138,False,Engineering
277,Brenda,,1991-05-29,2019-04-03 06:32:00,82439,19.062,False,Sales


In [84]:
# Here we will filter for a login time between 08:30 and 12:00 (both included)
#
# "Last Login Time" is a full timestamp whose date part was filled in when the
# file was loaded. Comparing it with the bare strings "08:30AM" / "12:00PM" only
# works if pandas resolves those strings to that very same calendar day, which
# depends on when the cell is run (and, presumably, on the pandas version -
# TODO confirm). To filter on the time of day alone, strip the date and compare
# the time elapsed since midnight instead.
login_clock = df["Last Login Time"] - df["Last Login Time"].dt.normalize()
df[login_clock.between(pd.Timedelta(hours=8, minutes=30), pd.Timedelta(hours=12))].head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
2,Maria,Female,1993-04-23,2019-04-03 11:17:00,130590,11.858,False,Finance
7,,Female,2015-07-20,2019-04-03 10:43:00,45906,11.598,True,Finance
10,Louise,Female,1980-08-12,2019-04-03 09:01:00,63241,15.132,True,
18,Diana,Female,1981-10-23,2019-04-03 10:27:00,132940,19.082,False,Client Services
33,Jean,Female,1993-12-18,2019-04-03 09:07:00,119082,16.18,False,Business Development


# The .duplicated() Method

In [85]:
# Quick look at the data before checking it for duplicates
df.head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2019-04-03 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2019-04-03 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2019-04-03 11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,2019-04-03 13:00:00,138705,9.34,True,Finance
4,Larry,Male,1998-01-24,2019-04-03 16:47:00,101004,1.389,True,Client Services


In [88]:
# Sort by first name so that repeated names sit next to each other in the
# cells that follow. Reassign instead of using inplace=True: the result is the
# same, but the statement reads as a plain "new value from old value" step.
df = df.sort_values("First Name")

In [94]:
# By default (keep="first"), duplicated will leave the first instance unmarked and mark the rest as duplicates
# keep=False, used here, marks every occurrence of a repeated name, including the first
df[df["First Name"].duplicated(keep=False)].head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
101,Aaron,Male,2012-02-17,2019-04-03 10:20:00,61602,11.849,True,Marketing
327,Aaron,Male,1994-01-29,2019-04-03 18:48:00,58755,5.097,True,Marketing
440,Aaron,Male,1990-07-22,2019-04-03 14:53:00,52119,11.343,True,Client Services
937,Aaron,,1986-01-22,2019-04-03 19:39:00,63126,18.424,False,Client Services
137,Adam,Male,2011-05-21,2019-04-03 01:45:00,95327,15.12,False,Distribution


In [96]:
# Here we use the tilde symbol (~) to invert every boolean in the Series
# duplicated(keep=False) is True for every repeated name, so its inverse is
# True only for the employees whose first name occurs exactly once
unique = ~df["First Name"].duplicated(keep=False)
df[unique]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
8,Angela,Female,2005-11-22,2019-04-03 06:29:00,95570,18.523,True,Engineering
688,Brian,Male,2007-04-07,2019-04-03 22:47:00,93901,17.821,True,Legal
190,Carol,Female,1996-03-19,2019-04-03 03:39:00,57783,9.129,False,Finance
887,David,Male,2009-12-05,2019-04-03 08:48:00,92242,15.407,False,Legal
5,Dennis,Male,1987-04-18,2019-04-03 01:35:00,115163,10.125,False,Legal
495,Eugene,Male,1984-05-24,2019-04-03 10:54:00,81077,2.117,False,Sales
33,Jean,Female,1993-12-18,2019-04-03 09:07:00,119082,16.18,False,Business Development
832,Keith,Male,2003-02-12,2019-04-03 15:02:00,120672,19.467,False,Legal
291,Tammy,Female,1984-11-11,2019-04-03 10:30:00,132839,17.463,True,Client Services


# .drop_duplicates() Method

In [98]:
# Total number of rows before any duplicates are dropped
len(df)

1000

In [101]:
# subset lists the columns that are compared when looking for duplicates
# keep="first" retains the first row of each name and drops the later ones
df.drop_duplicates(subset=["First Name"], keep="first").head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
101,Aaron,Male,2012-02-17,2019-04-03 10:20:00,61602,11.849,True,Marketing
137,Adam,Male,2011-05-21,2019-04-03 01:45:00,95327,15.12,False,Distribution
300,Alan,Male,1988-06-26,2019-04-03 03:54:00,111786,3.592,True,Engineering
372,Albert,Male,1997-02-01,2019-04-03 16:20:00,67827,19.717,True,Engineering
988,Alice,Female,2004-10-05,2019-04-03 09:34:00,47638,11.209,False,Human Resources


In [102]:
# With keep=False every row whose name is repeated is dropped,
# so we only get the names that occur exactly once
df.drop_duplicates(subset=["First Name"], keep=False).head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
8,Angela,Female,2005-11-22,2019-04-03 06:29:00,95570,18.523,True,Engineering
688,Brian,Male,2007-04-07,2019-04-03 22:47:00,93901,17.821,True,Legal
190,Carol,Female,1996-03-19,2019-04-03 03:39:00,57783,9.129,False,Finance
887,David,Male,2009-12-05,2019-04-03 08:48:00,92242,15.407,False,Legal
5,Dennis,Male,1987-04-18,2019-04-03 01:35:00,115163,10.125,False,Legal


In [105]:
df.drop_duplicates(subset=["First Name", "Team"]).head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
101,Aaron,Male,2012-02-17,2019-04-03 10:20:00,61602,11.849,True,Marketing
440,Aaron,Male,1990-07-22,2019-04-03 14:53:00,52119,11.343,True,Client Services
137,Adam,Male,2011-05-21,2019-04-03 01:45:00,95327,15.12,False,Distribution
141,Adam,Male,1990-12-24,2019-04-03 20:57:00,110194,14.727,True,Product
302,Adam,Male,2007-07-05,2019-04-03 11:59:00,71276,5.027,True,Human Resources


# The .unique() and .nunique() Methods

In [106]:
# This will return all the unique values in the column
# A missing value (NaN) is listed as one of them
df["Gender"].unique()

[Male, NaN, Female]
Categories (2, object): [Male, Female]

In [108]:
# The same for the Team column; nan appears among the unique values
df["Team"].unique()

array(['Marketing', 'Client Services', 'Distribution', 'Product',
       'Human Resources', 'Engineering', 'Finance', 'Business Development',
       'Sales', nan, 'Legal'], dtype=object)

In [109]:
# Count the unique items while keeping the null value in the count: 11 in total
df["Team"].nunique(dropna=False)

11

In [110]:
df["Team"].nunique() # Here the number is 10 because nunique() excludes null values by default

10