In [None]:
import pandas as pd
import numpy as np

# Load the dataset
# The CSV is fetched over HTTP from the TidyTuesday GitHub repository each time this cell runs.
url = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-09-17/national_parks.csv"
parks_df = pd.read_csv(filepath_or_buffer=url)

## Task 1: Data Exploration and Cleaning

In [None]:
# 1 How many rows and columns does the DataFrame have?
# .shape is a (row_count, column_count) tuple.
frame_shape = parks_df.shape
print(f"This DataFrame has shape {frame_shape}")

In [None]:
# 2 What are the column names?
# Materialise the column Index as a plain list so it prints compactly.
column_names = list(parks_df.columns)
print(f"The column names are {column_names}")

In [None]:
# 3 What data types are used in each column?
# .dtypes is a Series mapping each column name to its dtype.
column_dtypes = parks_df.dtypes
print(f"The data types in each column are {column_dtypes}")

In [None]:
# 4 Are there any missing values in the DataFrame?
# info is a method: without the parentheses, print() only shows the bound-method
# repr instead of the per-column non-null summary. info() writes its report
# itself, so it is not wrapped in print().
parks_df.info()
# Direct answer to the question: number of missing values per column (0 means none).
parks_df.isna().sum()

In [None]:
# 5 Remove the rows where year is Total (these are summary rows we don’t need for our analysis).
# Build the keep-mask first, then select the matching rows by label.
is_regular_row = parks_df['year'] != 'Total'
parks_df = parks_df.loc[is_regular_row]

In [None]:
# 6. Convert the year column to numeric type.
pd.to_numeric(parks_df['year'])

## Task 2: Basic Filtering and Analysis

In [None]:
# 1 Create a new DataFrame containing only data for the years 2000-2015 and only data for National Parks (unit_type is National Park)

In [None]:
correct_years = (parks_df['year'] >= '2000') & (parks_df['year'] <= '2015')
nat_parks_df = parks_df['unit_type'] == 'National Park'

sub_parks_df = parks_df[correct_years & nat_parks_df]
sub_parks_df.head()

In [None]:
# 2. Find the total number of visitors across all National Parks for each year from 2000-2015.
# Group by year before summing: summing the whole column collapses every year
# into one grand total, which is not the per-year breakdown the task asks for.
# total_visitors is a Series indexed by year.
total_visitors = sub_parks_df.groupby('year')['visitors'].sum()
print(total_visitors)

In [None]:
# 3. Calculate the average yearly visitors for each National Park during the 2000-2015 period.
# The task asks for an average, so take the mean of visitors within each park.
# Presumably each row is one park-year observation — TODO confirm.
# NOTE(review): groupby drops rows whose parkname is missing by default; verify
# parkname is populated for every National Park row.
avg_yearly_visitors = sub_parks_df.groupby('parkname')['visitors'].mean()

# Per-park totals, kept under the original name because the top-5 cell below
# reads grouped_parks['visitors']. The first positional parameter of
# GroupBy.sum is numeric_only, so the previous .sum('year') did not select the
# year column; it only passed a truthy value. Spell the intent out explicitly.
grouped_parks = sub_parks_df.groupby('parkname').sum(numeric_only=True)
avg_yearly_visitors.head()

In [None]:
# 4. Identify the top 5 most visited National Parks (based on total visitors) during the 2000-2015 period.
# Rank the per-park visitor totals from highest to lowest and keep the first five.
(
    grouped_parks
    .loc[:, 'visitors']
    .sort_values(ascending=False)
    .head(5)
)