In [1]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
# Load the employee records, parsing both date-like columns as datetimes,
# then cast the flag column to bool and the low-cardinality Gender column
# to a categorical in one astype() call.
# NOTE(review): a cast to bool turns any missing value into True -- confirm
# that the raw 'Senior Management' column has no gaps.
df = pd.read_csv(
    'udemy_material/employees.csv',
    parse_dates=['Start Date', 'Last Login Time'],
).astype({'Senior Management': 'bool', 'Gender': 'category'})
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
First Name           933 non-null object
Gender               855 non-null category
Start Date           1000 non-null datetime64[ns]
Last Login Time      1000 non-null datetime64[ns]
Salary               1000 non-null int64
Bonus %              1000 non-null float64
Senior Management    1000 non-null bool
Team                 957 non-null object
dtypes: bool(1), category(1), datetime64[ns](2), float64(1), int64(1), object(2)
memory usage: 49.0+ KB


In [2]:
# Boolean filtering, step 1: build a True/False Series (one entry per row)
# flagging the rows whose Gender equals 'Male', and keep it in a variable.
males = df['Gender'].eq('Male')
males.head(3)

0     True
1     True
2    False
Name: Gender, dtype: bool

In [3]:
# Step 2: hand the Boolean Series to the indexer; only the rows flagged True are kept.
df.loc[males].head(2)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2018-02-05 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2018-02-05 06:53:00,61933,4.17,True,


In [4]:
# The condition can also be written inline inside the indexer,
# without storing the mask in a variable first.
df.loc[df['Team'].eq('Finance')].head(2)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
2,Maria,Female,1993-04-23,2018-02-05 11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,2018-02-05 13:00:00,138705,9.34,True,Finance


In [5]:
# A column that already holds Booleans can serve as the mask directly:
# no comparison is needed, just select that column inside the indexer.
df.loc[df['Senior Management']].head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2018-02-05 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2018-02-05 06:53:00,61933,4.17,True,
3,Jerry,Male,2005-03-04,2018-02-05 13:00:00,138705,9.34,True,Finance


In [6]:
# Inequality filter: keep every row whose Team is anything other than 'Marketing'.
# Rows with a missing Team pass as well, since a missing value is not equal to 'Marketing'.
df.loc[df['Team'].ne('Marketing')].head(2)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
1,Thomas,Male,1996-03-31,2018-02-05 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2018-02-05 11:17:00,130590,11.858,False,Finance


In [7]:
# Numeric comparison: employees earning strictly more than 110000.
df.loc[df['Salary'].gt(110000)].head(2)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
2,Maria,Female,1993-04-23,2018-02-05 11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,2018-02-05 13:00:00,138705,9.34,True,Finance


In [8]:
# Ordering comparisons also work on datetime columns; the date string on the
# right-hand side is interpreted as a date for the comparison.
# Here: everyone who started on or before 1995-01-31.
df.loc[df['Start Date'].le('1995-01-31')].head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2018-02-05 12:42:00,97308,6.945,True,Marketing
2,Maria,Female,1993-04-23,2018-02-05 11:17:00,130590,11.858,False,Finance
5,Dennis,Male,1987-04-18,2018-02-05 01:35:00,115163,10.125,False,Legal
6,Ruby,Female,1987-08-17,2018-02-05 16:20:00,65476,10.012,True,Product
10,Louise,Female,1980-08-12,2018-02-05 09:01:00,63241,15.132,True,


In [9]:
# Combine several masks: & is element-wise AND, | is element-wise OR.
# A row is selected when it is (male AND in Marketing) OR started after 2016-06-01.
males = df['Gender'].eq('Male')
marketing = df['Team'].eq('Marketing')
dates = df['Start Date'].gt('2016-06-01')
df.loc[dates | (males & marketing)].head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2018-02-05 12:42:00,97308,6.945,True,Marketing
15,Lillian,Female,2016-06-05,2018-02-05 06:09:00,59414,1.256,False,Product
21,Matthew,Male,1995-09-05,2018-02-05 02:12:00,100612,13.645,False,Marketing
26,Craig,Male,2000-02-27,2018-02-05 07:45:00,37598,7.757,True,Marketing
74,Thomas,Male,1995-06-04,2018-02-05 14:24:00,62096,17.029,False,Marketing


In [11]:
# isin(): membership test against a list of candidate values -- a compact
# alternative to chaining several equality checks together with |.
team_bool = df['Team'].isin(
    ['Sales', 'Marketing', 'Product']
)
df.loc[team_bool].head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2018-02-05 12:42:00,97308,6.945,True,Marketing
6,Ruby,Female,1987-08-17,2018-02-05 16:20:00,65476,10.012,True,Product
13,Gary,Male,2008-01-27,2018-02-05 23:40:00,109831,5.831,False,Sales
15,Lillian,Female,2016-06-05,2018-02-05 06:09:00,59414,1.256,False,Product
17,Shawn,Male,1986-12-07,2018-02-05 19:45:00,111737,6.414,False,Product


In [18]:
# between(): range filter on Salary; with the default settings both
# end points (60000 and 70000) are included in the range.
sixty_seventy = df.loc[df['Salary'].between(60000, 70000)]
sixty_seventy.head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
1,Thomas,Male,1996-03-31,2018-02-05 06:53:00,61933,4.17,True,
6,Ruby,Female,1987-08-17,2018-02-05 16:20:00,65476,10.012,True,Product
10,Louise,Female,1980-08-12,2018-02-05 09:01:00,63241,15.132,True,
20,Lois,,1995-04-22,2018-02-05 19:18:00,64714,4.934,True,Legal
41,Christine,,2015-06-28,2018-02-05 01:08:00,66582,11.308,True,Business Development


In [22]:
# Filter on time of day: logins from 08:30 to 12:00, both ends included.
# The column holds full timestamps, so comparing it against bare time strings
# such as "08:30AM" only works while pandas happens to stamp those strings
# with the same calendar date the column carries. Subtracting each
# timestamp's own midnight leaves the elapsed time since midnight, which can
# be range-checked no matter which date is attached.
login = df["Last Login Time"]
time_of_day = login - login.dt.normalize()
timings = time_of_day.between(pd.Timedelta(hours=8, minutes=30), pd.Timedelta(hours=12))
df[timings].head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
2,Maria,Female,1993-04-23,2018-02-05 11:17:00,130590,11.858,False,Finance
7,,Female,2015-07-20,2018-02-05 10:43:00,45906,11.598,True,Finance
10,Louise,Female,1980-08-12,2018-02-05 09:01:00,63241,15.132,True,
18,Diana,Female,1981-10-23,2018-02-05 10:27:00,132940,19.082,False,Client Services
33,Jean,Female,1993-12-18,2018-02-05 09:07:00,119082,16.18,False,Business Development


In [37]:
# Order the rows alphabetically by first name (ascending). The sorted frame is
# bound back to `df` instead of sorting with inplace=True, so the step reads as
# a plain assignment; every later cell still sees the sorted order, and the
# original index labels travel with their rows.
df = df.sort_values('First Name')
df.head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
101,Aaron,Male,2012-02-17,2018-02-05 10:20:00,61602,11.849,True,Marketing
327,Aaron,Male,1994-01-29,2018-02-05 18:48:00,58755,5.097,True,Marketing
440,Aaron,Male,1990-07-22,2018-02-05 14:53:00,52119,11.343,True,Client Services
937,Aaron,,1986-01-22,2018-02-05 19:39:00,63126,18.424,False,Client Services
137,Adam,Male,2011-05-21,2018-02-05 01:45:00,95327,15.12,False,Distribution


In [44]:
# Series.duplicated() flags repeats: by default the first occurrence of a value
# is False and every later occurrence is True. With keep=False *every* member
# of a repeated group is True, so negating the result keeps only the first
# names that occur exactly once.
mask = ~df['First Name'].duplicated(keep=False)
df.loc[mask]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
8,Angela,Female,2005-11-22,2018-02-05 06:29:00,95570,18.523,True,Engineering
688,Brian,Male,2007-04-07,2018-02-05 22:47:00,93901,17.821,True,Legal
190,Carol,Female,1996-03-19,2018-02-05 03:39:00,57783,9.129,False,Finance
887,David,Male,2009-12-05,2018-02-05 08:48:00,92242,15.407,False,Legal
5,Dennis,Male,1987-04-18,2018-02-05 01:35:00,115163,10.125,False,Legal
495,Eugene,Male,1984-05-24,2018-02-05 10:54:00,81077,2.117,False,Sales
33,Jean,Female,1993-12-18,2018-02-05 09:07:00,119082,16.18,False,Business Development
832,Keith,Male,2003-02-12,2018-02-05 15:02:00,120672,19.467,False,Legal
291,Tammy,Female,1984-11-11,2018-02-05 10:30:00,132839,17.463,True,Client Services


In [45]:
# Total number of rows in the frame.
df.shape[0]

1000

In [46]:
# Number of rows left when only the first occurrence of each first name is kept.
df.drop_duplicates(subset=["First Name"], keep="first").shape[0]

201

In [47]:
# Number of rows left when every first name that occurs more than once is
# discarded entirely (keep=False drops all members of a repeated group).
df.drop_duplicates(subset=["First Name"], keep=False).shape[0]

9

In [52]:
# Distinct entries of the Gender column; the missing-value marker is listed too.
df.loc[:, 'Gender'].unique()

[Male, NaN, Female]
Categories (2, object): [Male, Female]

In [53]:
# Count of distinct first names; missing values are left out of the count
# (dropna=True is the default, spelled out here for clarity).
df['First Name'].nunique(dropna=True)

200

In [60]:
# Gotcha: `in` applied to a Series looks the key up among the index labels,
# not among the values. The index here holds integer row labels, so the result
# is False even though 'Aaron' does occur in the column. To test the values,
# use e.g. `(df['First Name'] == 'Aaron').any()` or
# `'Aaron' in df['First Name'].values`.
'Aaron' in df['First Name']

False

In [62]:
# For comparison: on a plain Python list, `in` tests the elements themselves.
l = [1,3]
3 in l

True