# More on Pandas Basics
## More Pandas Methods

In [1]:
# import library
import pandas as pd

In [2]:
# create a dataframe by loading the CSV file
# (the relative path is resolved against the current working directory)
tiktok = pd.read_csv("tiktok_dataset.csv")

In [None]:
# check the dataframe: head() displays its first rows (5 by default)
tiktok.head()

In [None]:
# info() prints a summary of the dataframe: the list of columns,
# their non-null counts, and their data types
tiktok.info()

In [None]:
# describe() returns descriptive statistics of the dataframe
# (by default only the numeric columns are summarized)
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.describe.html
tiktok.describe()

In [None]:
# you can specify a single column; describe() then summarizes that series
tiktok['video_duration_sec'].describe()

In [None]:
# you can also use dot notation to access a series inside a dataframe
# (this only works when the column name is a valid Python identifier
# that does not clash with an existing dataframe attribute or method)
tiktok.video_duration_sec.describe()

In [None]:
# if you just want a specific statistic, you can call its method directly
tiktok['video_duration_sec'].mean()

In [None]:
# min() returns the smallest value in the column
tiktok['video_duration_sec'].min()

In [None]:
# you can show descriptives for object (e.g. string) columns
# by passing include=[object]
tiktok.describe(include=[object])

In [None]:
# you can change the data type of a column using astype();
# astype() returns a new series, so assign it back to the column.
# Here video_id is converted to str, then the object-column summary
# is displayed again.
tiktok['video_id'] = tiktok['video_id'].astype(str)
tiktok.describe(include=[object])

In [None]:
# value_counts() called on a single column (a series) returns a series
# containing the count of each unique value in that column
# https://pandas.pydata.org/docs/reference/api/pandas.Series.value_counts.html
# (the dataframe version counts unique rows instead)
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.value_counts.html
tiktok['claim_status'].value_counts()

In [None]:
# normalize=True returns proportions rather than frequencies
tiktok['claim_status'].value_counts(normalize=True)

In [None]:
# sort_values() sorts by the values along either axis
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.sort_values.html
# ascending=False puts the largest view counts first
tiktok_sort = tiktok.sort_values(by='video_view_count', ascending=False)
tiktok_sort.head(10)

In [None]:
# if you want to see only the column 'video_view_count' as a series,
# select it with a single column label
tiktok_sort['video_view_count']

In [None]:
# if you want to see the column 'video_view_count' as a dataframe,
# pass the column name inside a list
tiktok_sort[['video_view_count']]

In [None]:
# show only the view counts of the first 3 rows of tiktok_sort
# (the 3 most viewed videos when tiktok_sort is sorted by view count, descending)
tiktok_sort[['video_view_count']].head(n=3)

## Boolean Masking in Pandas

In [None]:
# build a small example dataframe: one row per planet, with columns for
# the planet name, its radius in km, and its number of moons
data = {
    'planet': [
        'Mercury', 'Venus', 'Earth', 'Mars',
        'Jupiter', 'Saturn', 'Uranus', 'Neptune',
    ],
    'radius_km': [2440, 6052, 6371, 3390, 69911, 58232, 25362, 24622],
    'moons': [0, 0, 1, 2, 80, 83, 27, 14],
}
planet = pd.DataFrame(data)
planet

In [None]:
# comparing a column to a value returns a series object of dtype: bool
# (True for the rows where moons < 20, False otherwise)
print(planet['moons'] < 20)

In [None]:
# indexing the dataframe with that boolean series keeps only the rows
# where the series is True; the result is a dataframe
print(planet[planet['moons'] < 20])

In [None]:
# You can also assign the Boolean mask to a named variable
# and then apply it to your dataframe.
mask = planet['moons'] < 20
planet[mask]

In [None]:
# the filtered result was only displayed, not assigned back,
# so the planet dataframe was not changed.
planet

In [None]:
# to keep the filtered rows, assign the result to a new variable
mask = planet['moons'] < 20
planet2 = planet[mask]
planet2

In [None]:
# if you want to select just the planet column as a Series object,
# you can use regular selection tools like loc[]:
# the mask selects the rows, and 'planet' selects the column
mask = planet['moons'] < 20
planet.loc[mask, 'planet']

In [None]:
# Logical operators for combining Boolean masks:
# & (and)
# | (or)
# ~ (not)
# Wrap each comparison in parentheses: &, | and ~ bind more tightly
# than comparison operators such as < and >.

# select planets with fewer than 10 moons or more than 50 moons
mask = (planet['moons'] < 10) | (planet['moons'] > 50)
planet[mask]

In [None]:
# select all planets that have more than 20 moons, but not planets
# with 80 moons and not planets with a radius less than 50,000 km
# (~ negates the condition in the parentheses that follow it)
mask = (planet['moons'] > 20) & ~(planet['moons'] == 80) & ~(planet['radius_km'] < 50000)
planet[mask]

In [None]:
# the same selection written without ~:
# != 80 replaces ~(== 80) and >= 50000 replaces ~(< 50000),
# so this returns the same result as the ~ version.
mask = (planet['moons'] > 20) & (planet['moons'] != 80) & (planet['radius_km'] >= 50000)
planet[mask]