# Working with Pandas on a Netflix dataset - Solutions

In [1]:
# Import pandas library
import pandas as pd
import numpy as np  # We'll use numpy for some operations

## Exercise 1: Loading Data

In [None]:
# Read the Netflix titles CSV into a DataFrame.
# NOTE: the absolute path below is machine-specific; replace it with the
# location of your own copy, e.g. 'path/to/your/folder/netflix_titles.csv'.
netflix_df = pd.read_csv(
    filepath_or_buffer='/Users/stefanodegiorgis/tmp/comp-data/exercises/03/netflix_titles.csv'
)

# Preview the first 5 rows (5 is head()'s default, written out explicitly here)
netflix_df.head(n=5)

## Exercise 2: Basic Information

**Objective**: Get basic information about the DataFrame.

- Display the shape (number of rows and columns) of the DataFrame.
- Use the `info()` method to get a summary of the DataFrame, including the data types of each column and the number of non-null entries.

In [None]:
# Display the shape of the DataFrame as a (rows, columns) tuple
print("Shape of DataFrame (rows, columns):", netflix_df.shape)

# Print a summary of the DataFrame: the data type and the number of non-null
# entries of each column. info() writes its report to the output itself, so it
# is not wrapped in print().
print("\nDataFrame Info:")
netflix_df.info()

## Exercise 3: Indexing and Selection

**Objective**: Select specific columns and rows from the DataFrame.

- Select the `title`, `country`, and `release_year` columns and display the first 10 rows.
- Select and display the row for the movie "Dick Johnson Is Dead" using boolean indexing.

In [None]:
# Build a new, smaller DataFrame from the original one: keep only the 'title',
# 'country' and 'release_year' columns, then only the first 10 rows of them.
selected_columns = netflix_df.loc[:, ['title', 'country', 'release_year']].iloc[:10]

# This print is just a label that makes the output easier to read;
# it has no effect on the data.
print("First 10 rows of selected columns:")

# A notebook displays the last expression of a cell automatically.
# In a plain Python script (e.g. in VS Code) wrap it in print() instead:
#     print(selected_columns)
selected_columns

In [None]:
# Boolean indexing: comparing every title with "Dick Johnson Is Dead" gives a
# True/False Series, and only the rows marked True are kept in the new object.
dick_johnson = netflix_df.loc[netflix_df['title'].eq('Dick Johnson Is Dead')]
print("\nRow for 'Dick Johnson Is Dead':")
dick_johnson

## Exercise 4: Slicing

**Objective**: Use slicing to create a sub-DataFrame.

- Create a sub-DataFrame containing only the first 20 entries of the Netflix dataset.
- From this sub-DataFrame, select only the `title` and `rating` columns.

In [None]:
# Take the first 20 rows of the dataset as a sub-DataFrame
sub_df = netflix_df.head(20)
print("Sub-DataFrame with first 20 entries:")
sub_df

In [None]:
# Narrow the sub-DataFrame down to its 'title' and 'rating' columns
title_rating_df = sub_df.loc[:, ['title', 'rating']]
print("\nTitle and rating columns from sub-DataFrame:")
title_rating_df

## Exercise 5: Basic Data Manipulation

**Objective**: Add a new column named `duration_minutes` to the DataFrame, containing the duration of each title in minutes.

For movies, extract the number of minutes directly. For TV shows, assume 10 episodes per season with each episode lasting 45 minutes.

In [None]:
# Peek at some raw 'duration' values next to their 'type' before converting them
print("Sample duration values:")
print(netflix_df.loc[:, ['type', 'duration']].iloc[:10])

In [None]:
# Function to convert duration to minutes
def convert_to_minutes(row, *, episodes_per_season=10, minutes_per_episode=45):
    """Convert one title's ``duration`` text into a number of minutes.

    Args:
        row: A DataFrame row (or any mapping) providing the ``'type'`` and
            ``'duration'`` fields.
        episodes_per_season: Episodes assumed per TV-show season (default 10).
        minutes_per_episode: Minutes assumed per TV-show episode (default 45).

    Returns:
        For a ``'Movie'`` whose duration mentions ``'min'``: the leading
        number, e.g. ``'90 min'`` -> ``90``.
        For a ``'TV Show'`` whose duration mentions ``'Season'``: the leading
        number of seasons * episodes_per_season * minutes_per_episode,
        e.g. ``'2 Seasons'`` -> ``900`` with the defaults.
        ``None`` when the duration is missing or has any other format.
    """
    duration_str = row['duration']

    # pd.isna() is the pandas way to check whether a value is missing (NaN/None)
    if pd.isna(duration_str):
        return None

    # Anything that is not text cannot follow the "<number> <unit>" format
    if not isinstance(duration_str, str):
        return None

    # The number is expected to be the first whitespace-separated token.
    # Splitting on any whitespace tolerates stray leading or repeated spaces;
    # a first token that is not an integer means "unexpected format" -> None
    # instead of an exception that would abort the whole DataFrame.apply().
    tokens = duration_str.split()
    try:
        amount = int(tokens[0])
    except (IndexError, ValueError):
        return None

    content_type = row['type']

    # Movies: the number already is the length in minutes
    if content_type == 'Movie' and 'min' in duration_str:
        return amount

    # TV shows: 'Season' also matches 'Seasons', so one check covers both
    if content_type == 'TV Show' and 'Season' in duration_str:
        return amount * episodes_per_season * minutes_per_episode

    # Default case if the type/format combination is unexpected
    return None

# Build the new column by calling convert_to_minutes once per row
# (axis='columns' means the same as axis=1: the function receives each row)
netflix_df['duration_minutes'] = netflix_df.apply(convert_to_minutes, axis='columns')

# Show the original and the converted durations side by side
print("DataFrame with new duration_minutes column:")
netflix_df.loc[:, ['type', 'duration', 'duration_minutes']].iloc[:10]

## Exercise 6: Querying

**Objective**: Use query methods to filter the DataFrame.

- Filter the DataFrame to show only entries that are movies released in the year 2020 and later.
- Further refine this query to show only movies with a rating of `PG-13`.

In [None]:
# Keep the rows that are movies AND were released in 2020 or later
recent_movies = netflix_df.loc[
    netflix_df['type'].eq('Movie') & netflix_df['release_year'].ge(2020)
]
print(f"Number of movies released in 2020 or later: {len(recent_movies)}")
recent_movies.head(n=5)

In [None]:
# Refine the previous result: keep only the movies rated PG-13
pg13_recent_movies = recent_movies.loc[recent_movies['rating'].eq('PG-13')]
print(f"Number of PG-13 rated movies released in 2020 or later: {len(pg13_recent_movies)}")
pg13_recent_movies.head(n=5)

## Exercise 7: Grouping and Aggregating

**Objective**: Analyze the Netflix dataset by performing grouping and aggregation operations.

### Part 1: Count Content by Country
Calculate the total number of Netflix titles for each country. Display the top 5 countries by total title count.

In [None]:
# Goal: group by country and count titles.
# Some entries may list several countries at once (as happens in this dataset),
# so the simplified approach used here keeps only the first country listed.

# Start by looking at a few country values to understand their format
print("Sample country values:")
print(netflix_df['country'].iloc[:10])

In [None]:
# Function to get the first country if multiple are listed
def get_first_country(country_str):
    """Return the first country named in a comma-separated ``country`` value.

    Args:
        country_str: The raw ``country`` value: a single country, several
            countries separated by commas, or a missing value (NaN/None).

    Returns:
        str: The first non-empty entry with surrounding whitespace removed,
        or ``'Unknown'`` when the value is missing or names no country.
    """
    # Check if the value is missing with isna()
    if pd.isna(country_str):
        return 'Unknown'

    # Walk through the comma-separated entries and skip empty ones, so that a
    # value such as ', France, Algeria' yields 'France' rather than ''.
    for entry in country_str.split(','):
        name = entry.strip()
        if name:
            return name

    # Only empty entries (e.g. '' or ' , '): treat it like a missing value
    return 'Unknown'

# Add a column holding only the first country listed for each title
netflix_df['primary_country'] = netflix_df['country'].apply(get_first_country)

# Count the titles of each primary country.
# Several operations can be chained in one expression; they are performed in
# the order they are written: group the rows by primary_country, count the
# rows of each group with size(), then sort the counts in descending order.
country_counts = (
    netflix_df
    .groupby('primary_country')
    .size()
    .sort_values(ascending=False)
)

# Show the 5 countries with the most titles
print("Top 5 countries by title count:")
country_counts.iloc[:5]

### Part 2: Average Duration of Movies
Find the average duration of movies across all available movies in the dataset.

In [None]:
# Restrict the dataset to movies
movies_only = netflix_df.loc[netflix_df['type'].eq('Movie')]

# Average the values of the duration_minutes column with mean()
avg_movie_duration = movies_only.loc[:, 'duration_minutes'].mean()

print(f"Average movie duration: {avg_movie_duration:.2f} minutes")

## Exercise 8: Handling Missing Values

**Objective**: Handle missing values in the DataFrame.

- Identify columns with missing values and the count of missing values in each.
- For the `country` column, replace missing values with a default value (e.g., "Unknown").
- Drop any rows where the `title` or `release_year` is missing.

In [None]:
# isna() gives a DataFrame of booleans in which True marks a missing value;
# summing those booleans column by column counts the missing values per column
missing_values = netflix_df.isna().sum()
print("Missing values by column:")
print(missing_values.loc[missing_values.gt(0)])  # only the columns that have missing values

In [None]:
# Fill the gaps in the country column with the placeholder "Unknown"
netflix_df['country'] = netflix_df['country'].fillna(value="Unknown")

# Check the result: count the missing country values that remain
print("\nMissing values in country column after filling:", netflix_df['country'].isna().sum())

In [None]:
# Before dropping anything, report how many rows lack a title or a release year
print("Rows with missing title:", netflix_df['title'].isna().sum())
print("Rows with missing release_year:", netflix_df['release_year'].isna().sum())

# Drop the rows (axis=0) that have a missing value in either of the two columns;
# the result is a new DataFrame, netflix_df itself is left untouched
netflix_df_clean = netflix_df.dropna(axis=0, subset=['title', 'release_year'])

# Compare the shapes before and after dropping
print("\nOriginal DataFrame shape:", netflix_df.shape)
print("Cleaned DataFrame shape:", netflix_df_clean.shape)

## Exercise 9: Exporting Data

**Objective**: Export a modified DataFrame to a new CSV file.
- Ensure that the index is not included in the exported file.

In [None]:
# Write the cleaned DataFrame to a new CSV file; index=False leaves the row index out
netflix_df_clean.to_csv(path_or_buf='netflix_cleaned.csv', index=False)

print("Exported cleaned DataFrame to 'netflix_cleaned.csv'")