In [None]:
# Import required packages

import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Load the data set
# The CSV is looked up in the process's current working directory.
cwd = os.getcwd()

# os.path.join() builds the path with the platform's separator and avoids a doubled
# separator when cwd is a filesystem root; read_csv() then reads the CSV into a DataFrame.
data_set = pd.read_csv(os.path.join(cwd, "netflix_titles.csv"))

In [None]:
# Preview the DataFrame: head(n=5) returns its first 5 rows, which are shown as the cell output
data_set.head(n=5)

#### `head()` output
|   | show_id | type | title | director | cast | country | date_added | release_year | rating | duration | listed_in | description |
|---|---------|------|-------|----------|------|---------|------------|--------------|--------|----------|-----------|-------------|
| 0       | s1   | Movie   | Dick Johnson Is Dead  | Kirsten Johnson | NaN                                               | United States | September 25, 2021 | 2020   | PG-13    | 90 min    | Documentaries                                     | As her father nears the end of his life, filmm... |
| 1       | s2   | TV Show | Blood & Water         | NaN             | Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban... | South Africa  | September 24, 2021 | 2021   | TV-MA    | 2 Seasons | International TV Shows, TV Dramas, TV Mysteries   | After crossing paths at a party, a Cape Town t... |
| 2       | s3   | TV Show | Ganglands             | Julien Leclercq | Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi... | NaN           | September 24, 2021 | 2021   | TV-MA    | 1 Season  | Crime TV Shows, International TV Shows, TV Act... | To protect his family from a powerful drug lor... |
| 3       | s4   | TV Show | Jailbirds New Orleans | NaN             | NaN                                               | NaN           | September 24, 2021 | 2021   | TV-MA    | 1 Season  | Docuseries, Reality TV                            | Feuds, flirtations and toilet talk go down amo... |
| 4       | s5   | TV Show | Kota Factory          | NaN             | Mayur More, Jitendra Kumar, Ranjan Raj, Alam K... | India         | September 24, 2021 | 2021   | TV-MA    | 2 Seasons | International TV Shows, Romantic TV Shows, TV ... | In a city of coaching centers known to train I... | 

In [None]:
# info() prints a concise summary of the DataFrame: the index range, each column's name,
# non-null count and dtype, and the memory usage.
data_set.info()

#### `info()` output
```
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB
```

In [None]:
# Count the missing values in every column:
#   isna() marks each missing cell as True,
#   sum(axis="index") adds up those marks down each column.
data_set.isna().sum(axis="index")

In [None]:
# Count fully duplicated rows: duplicated(keep="first") flags every repeat of an
# earlier row as True, and sum() counts those flags.
data_set.duplicated(keep="first").sum()

### Data Notes
* There are `6` columns which contain missing values
* There are no duplicates
* `show_id` is not a relevant column

### Data Cleaning
* Drop the `show_id` column, and drop any rows that have NA values

In [None]:
# Clean the data: dropna() removes every row that has at least one missing value, and
# drop() removes the irrelevant show_id column.
# errors="ignore" keeps this cell re-runnable: data_set is reassigned here, so on a second
# run show_id is already gone and a plain drop() would raise KeyError.
data_set = data_set.dropna().drop(columns=["show_id"], errors="ignore")

In [None]:
# Print the DataFrame summary again after cleaning to validate it: every remaining column
# should now report the same non-null count, equal to the number of entries in the index.
data_set.info()

In [None]:
# Plot the number of movies vs the number of TV shows.
# value_counts() tallies the rows of data_set for each distinct value in the `type` column.
counts = data_set['type'].value_counts()

# Use an explicit Figure/Axes pair and label the chart so it can be read on its own.
fig, ax = plt.subplots()
ax.bar(counts.index, counts.values)
ax.set_title("Number of titles by type")
ax.set_xlabel("Type")
ax.set_ylabel("Number of titles")
plt.show()
