## 5.1 Optimizing a Dataset for Memory Usage

In [None]:
import pandas as pd

In [None]:
# Preview the raw CSV with pandas' default parsing.
pd.read_csv("employees.csv")

In [None]:
# Read the file again, asking pandas to parse "Start Date" as datetimes,
# and show only the first rows.
pd.read_csv("employees.csv", parse_dates = ["Start Date"]).head()

In [None]:
# Keep the date-parsed DataFrame; the cells below operate on `employees`.
employees = pd.read_csv(
    "employees.csv", parse_dates = ["Start Date"]
)

### 5.1.1 Converting Data Types with the astype Method

In [None]:
# Summarize column dtypes, non-null counts and memory usage before converting anything.
employees.info()

In [None]:
# Preview "Mgmt" cast to bool. This returns a new Series; `employees` is unchanged.
# NOTE(review): astype(bool) maps missing values (NaN) to True — confirm that is
# acceptable if this column has gaps.
employees["Mgmt"].astype(bool)

In [None]:
# Persist the conversion by overwriting the column.
employees["Mgmt"] = employees["Mgmt"].astype(bool)

In [None]:
employees.tail()

In [None]:
# Re-check dtypes and memory usage after the bool conversion.
employees.info()

In [None]:
# Left commented out on purpose — presumably it raises because "Salary" holds
# missing values that cannot be cast to int (the next cells fill them first).
# employees["Salary"].astype(int)

In [None]:
# Replace missing salaries with 0 (preview only, nothing is assigned).
employees["Salary"].fillna(0).tail()

In [None]:
# With the gaps filled, the integer cast can be chained on.
employees["Salary"].fillna(0).astype(int).head()

In [None]:
# Persist the filled, integer-typed salaries.
employees["Salary"] = employees["Salary"].fillna(0).astype(int)

In [None]:
# Count distinct values per column; columns with few distinct values are the
# ones converted to "category" below.
employees.nunique()

In [None]:
# Preview "Gender" as a categorical Series.
employees["Gender"].astype("category")

In [None]:
# Persist the categorical conversion.
employees["Gender"] = employees["Gender"].astype("category")

In [None]:
# Inspect dtypes and memory usage after converting "Gender".
employees.info()

In [None]:
# Same conversion for "Team".
employees["Team"] = employees["Team"].astype("category")

In [None]:
# Final dtype / memory summary for this section.
employees.info()

## 5.2 Filtering by a Single Condition

In [None]:
# Plain Python equality: evaluates to True.
"Maria" == "Maria"

In [None]:
# Evaluates to False.
"Maria" == "Taylor"

In [None]:
# The same comparison applied to a Series runs element by element and
# yields a Boolean Series.
employees["First Name"] == "Maria"

In [None]:
# Passing that Boolean Series in square brackets keeps only the True rows.
employees[employees["First Name"] == "Maria"]

In [None]:
# Same filter, with the mask stored in a named variable first.
marias = employees["First Name"] == "Maria"
employees[marias]

In [None]:
# Plain Python inequality: evaluates to True.
"Engineering" != "HR"

In [None]:
# Element-wise inequality against the "Team" column.
# NOTE(review): rows with a missing Team presumably also compare as "not HR"
# and pass this mask — confirm if that matters.
employees["Team"] != "HR"

In [None]:
employees[employees["Team"] != "HR"]

In [None]:
# "Mgmt" was cast to bool earlier, so the column itself can serve as the mask.
employees[employees["Mgmt"]].head()

In [None]:
# Mask of rows whose salary is strictly greater than 100000.
high_earners = employees["Salary"] > 100000
high_earners.head()

In [None]:
employees[high_earners].head()

## 5.3 Filtering by Multiple Conditions

### 5.3.1 The AND Condition

In [None]:
# One Boolean mask per condition; they are combined in the cells below.
is_female = employees["Gender"] == "Female"

In [None]:
in_biz_dev = employees["Team"] == "Business Dev"

In [None]:
# & combines masks element-wise: a row is kept only if both masks are True.
employees[is_female & in_biz_dev].head()

In [None]:
# A third mask can be chained the same way; "Mgmt" is already Boolean.
is_manager = employees["Mgmt"]
employees[is_female & in_biz_dev & is_manager].head()

### 5.3.2 The OR Condition

In [None]:
# Two independent masks: salary strictly below 40000, and start date strictly
# after 2015-01-01 (the date string is compared against the datetime column).
earning_below_40k = employees["Salary"] < 40000
started_after_2015 = employees["Start Date"] > "2015-01-01"

In [None]:
# | combines masks element-wise: a row is kept if either mask is True.
employees[earning_below_40k | started_after_2015].tail()

### 5.3.3 Inversion with ~

In [None]:
# Three-element Boolean Series used to demonstrate inversion with ~.
my_series = pd.Series(data = [True, False, True])
my_series

In [None]:
# ~ flips every Boolean in the Series.
~my_series

In [None]:
# Rows whose salary is strictly below 100000.
employees[employees["Salary"] < 100000].head()

In [None]:
# Same rows expressed as "not (salary >= 100000)". The two forms agree only
# when "Salary" has no missing values, which holds here because the column
# was filled with 0 and cast to int earlier.
employees[~(employees["Salary"] >= 100000)].head()

### 5.3.4 Methods for Booleans

## 5.4 Filtering by Condition

### 5.4.1 The isin Method

In [None]:
# Verbose approach: one equality mask per team, combined with |.
sales = employees["Team"] == "Sales"
legal = employees["Team"] == "Legal"
mktg  = employees["Team"] == "Marketing"
employees[sales | legal | mktg].head()

In [None]:
# isin builds the equivalent mask from a list of accepted values.
# This cell only previews the mask; it does not filter `employees`.
all_star_teams = ["Sales", "Legal", "Marketing"]
employees["Team"].isin(all_star_teams).head()

### 5.4.2 The between Method

In [None]:
# Manual range filter: 80000 inclusive up to, but not including, 90000.
higher_than_80 = employees["Salary"] >= 80000
lower_than_90  = employees["Salary"] < 90000
employees[higher_than_80 & lower_than_90].head()

In [None]:
# `between` includes BOTH endpoints by default. inclusive = "left" keeps the
# lower bound and excludes the upper one, so this mask is 80000 <= salary < 90000,
# matching the manual `>= 80000` / `< 90000` filter it replaces.
# (The string form of `inclusive` requires pandas >= 1.3.)
between_80k_and_90k = employees["Salary"].between(
    80000, 90000, inclusive = "left"
)
employees[between_80k_and_90k].head()

In [None]:
# Employees who started during the 1980s. `between` includes both endpoints by
# default, which would let a start date of exactly 1990-01-01 through, so the
# upper bound is made exclusive.
# (The string form of `inclusive` requires pandas >= 1.3.)
eighties_folk = employees["Start Date"].between(
    left = "1980-01-01",
    right = "1990-01-01",
    inclusive = "left"
)

employees[eighties_folk].head()

In [None]:
# `between` on strings compares them in sort order, so values from "R" up to
# "S" are the names beginning with "R". Both bounds are inclusive by default,
# so a name that is exactly "S" would also match.
name_starts_with_r = employees["First Name"].between("R", "S")
employees[name_starts_with_r].head()

### 5.4.3 The isnull and notnull Methods

In [None]:
employees.head()

In [None]:
# True where "Team" is missing.
employees["Team"].isnull().head()

In [None]:
# The same check on the datetime column.
employees["Start Date"].isnull().head()

In [None]:
# notnull is the opposite mask: True where a value is present.
employees["Team"].notnull().head()

In [None]:
# Inverting isnull with ~ gives the same mask as notnull.
(~employees["Team"].isnull()).head()

In [None]:
# Rows with no team assigned.
no_team = employees["Team"].isnull()
employees[no_team].head()

In [None]:
# Rows that have a first name.
has_name = employees["First Name"].notnull()
employees[has_name].tail()

### 5.4.4 Dealing with Null Values

In [None]:
# Rebind `employees` to a fresh read of the CSV, so the fills and dtype
# conversions applied in earlier cells no longer apply.
employees = pd.read_csv(
    "employees.csv", parse_dates = ["Start Date"]
)

In [None]:
employees

In [None]:
# Default behaviour: drop every row that has at least one missing value.
# dropna returns a new DataFrame; `employees` itself is not modified.
employees.dropna()

In [None]:
# how = "all": drop a row only when every one of its values is missing.
employees.dropna(how = "all").tail()

In [None]:
# how = "any" is the default, spelled out explicitly.
employees.dropna(how = "any").tail()

In [None]:
# subset limits the check to the listed columns: drop rows with no "Gender".
employees.dropna(subset = ["Gender"]).tail()

In [None]:
# With several columns in subset, a row is dropped if either one is missing.
employees.dropna(subset = ["Start Date", "Salary"]).head()

In [None]:
# Keep rows that have at least 4 non-missing values. `thresh` cannot be
# combined with `how` (recent pandas raises TypeError when both are passed),
# so only `thresh` is given.
employees.dropna(thresh = 4).head()

## 5.5 Dealing with Duplicates

### 5.5.1 The duplicated Method

In [None]:
employees["Team"].head()

In [None]:
# True marks a value that already appeared earlier in the Series.
employees["Team"].duplicated().head()

In [None]:
# keep = "first" is the default, spelled out: first occurrences are not flagged.
employees["Team"].duplicated(keep = "first").head()

In [None]:
# keep = "last": the final occurrence of each value is the one left unflagged.
employees["Team"].duplicated(keep = "last")

In [None]:
# Inverting the mask marks the first occurrence of each value instead.
(~employees["Team"].duplicated()).head()

In [None]:
# One row per distinct "Team" value: the first row in which each appears.
first_one_in_team = ~employees["Team"].duplicated()
employees[first_one_in_team]

### 5.5.2 The drop_duplicates Method

In [None]:
# With no arguments, a row is dropped only if it repeats an earlier row in
# every column.
employees.drop_duplicates()

In [None]:
# Judge duplicates by "Team" alone, keeping the first row for each team.
employees.drop_duplicates(subset = ["Team"])

In [None]:
# Same, but keep the last row for each team.
employees.drop_duplicates(subset = ["Team"], keep = "last")

In [None]:
# keep = False drops every row whose "First Name" occurs more than once,
# leaving only the names that are unique in the data.
employees.drop_duplicates(subset = ["First Name"], keep = False)

In [None]:
# Look up rows named "Douglas" with gender "Male".
name_is_douglas = employees["First Name"] == "Douglas"
is_male = employees["Gender"] == "Male"
employees[name_is_douglas & is_male]

In [None]:
# Judge duplicates by the combination of "Gender" and "Team": the first row
# for each distinct pair is kept.
employees.drop_duplicates(subset = ["Gender", "Team"]).head()

## 5.6 Coding Challenge

### 5.6.1 The Problem

### 5.6.2 Solutions

In [None]:
# Preview the raw CSV with pandas' default parsing.
pd.read_csv("netflix.csv")

In [None]:
# Load the dataset with "date_added" parsed as datetimes.
netflix = pd.read_csv("netflix.csv", parse_dates = ["date_added"])

In [None]:
# Column dtypes, non-null counts and memory usage.
netflix.info()

In [None]:
# Distinct values per column.
netflix.nunique()

In [None]:
# Store "type" as a categorical column.
netflix["type"] = netflix["type"].astype("category")

In [None]:
# Re-check dtypes and memory usage after the conversion.
netflix.info()

In [None]:
# Rows whose title is exactly "Limitless".
netflix[netflix["title"] == "Limitless"]

In [None]:
# Rows that satisfy both conditions: director is "Robert Rodriguez" and
# type is "Movie".
directed_by_robert_rodriguez = netflix["director"] == "Robert Rodriguez"
is_movie = netflix["type"] == "Movie"
netflix[directed_by_robert_rodriguez & is_movie]

In [None]:
# Rows that satisfy either condition: added on 2019-07-31, or director is
# "Robert Altman".
added_on_july_31 = netflix["date_added"] == "2019-07-31"
directed_by_altman = netflix["director"] == "Robert Altman"
netflix[added_on_july_31 | directed_by_altman]

In [None]:
# Rows whose director is any of the three listed names.
directors = ["Orson Welles", "Aditya Kripalani", "Sam Raimi"]
target_directors = netflix["director"].isin(directors)
netflix[target_directors]

In [None]:
# Titles added during May 2019. `between` includes both endpoints by default,
# which would also pull in titles added on June 1, so the upper bound is made
# exclusive.
# (The string form of `inclusive` requires pandas >= 1.3.)
may_movies = netflix["date_added"].between(
    "2019-05-01", "2019-06-01", inclusive = "left"
)

netflix[may_movies].head()

In [None]:
# Drop rows that have no director (returns a new DataFrame).
netflix.dropna(subset = ["director"]).head()

In [None]:
# keep = False drops every row whose "date_added" value occurs more than once,
# leaving only the rows whose "date_added" is unique in the data.
netflix.drop_duplicates(subset = ["date_added"], keep = False)