# More on Pandas Basics
## More Pandas Methods

In [2]:
# import library
import pandas as pd

In [3]:
# load the TikTok dataset from a CSV file (path is relative to the
# notebook's working directory) into a dataframe
tiktok = pd.read_csv("tiktok_dataset.csv")

In [4]:
# check the dataframe by previewing its first rows (head() shows 5 by default)
tiktok.head()

Unnamed: 0,#,claim_status,video_id,video_duration_sec,video_transcription_text,verified_status,author_ban_status,video_view_count,video_like_count,video_share_count,video_download_count,video_comment_count
0,1,claim,7017666017,59,someone shared with me that drone deliveries a...,not verified,under review,343296.0,19425.0,241.0,1.0,0.0
1,2,claim,4014381136,32,someone shared with me that there are more mic...,not verified,active,140877.0,77355.0,19034.0,1161.0,684.0
2,3,claim,9859838091,31,someone shared with me that american industria...,not verified,active,902185.0,97690.0,2858.0,833.0,329.0
3,4,claim,1866847991,25,someone shared with me that the metro of st. p...,not verified,active,437506.0,239954.0,34812.0,1234.0,584.0
4,5,claim,7105231098,19,someone shared with me that the number of busi...,not verified,active,56167.0,34987.0,4110.0,547.0,152.0


In [5]:
# info() prints a summary of the dataframe: the index range, every column
# with its non-null count and data type, and the memory usage.
# A column whose non-null count is below the number of entries has missing values.
tiktok.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19382 entries, 0 to 19381
Data columns (total 12 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   #                         19382 non-null  int64  
 1   claim_status              19084 non-null  object 
 2   video_id                  19382 non-null  int64  
 3   video_duration_sec        19382 non-null  int64  
 4   video_transcription_text  19084 non-null  object 
 5   verified_status           19382 non-null  object 
 6   author_ban_status         19382 non-null  object 
 7   video_view_count          19084 non-null  float64
 8   video_like_count          19084 non-null  float64
 9   video_share_count         19084 non-null  float64
 10  video_download_count      19084 non-null  float64
 11  video_comment_count       19084 non-null  float64
dtypes: float64(5), int64(3), object(4)
memory usage: 1.8+ MB


In [6]:
# describe() returns descriptive statistics of the dataframe
# (count, mean, std, min, quartiles, max). By default only the numeric
# columns are summarised, and count is the number of non-null values.
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.describe.html
tiktok.describe()

Unnamed: 0,#,video_id,video_duration_sec,video_view_count,video_like_count,video_share_count,video_download_count,video_comment_count
count,19382.0,19382.0,19382.0,19084.0,19084.0,19084.0,19084.0,19084.0
mean,9691.5,5627454000.0,32.421732,254708.558688,84304.63603,16735.248323,1049.429627,349.312146
std,5595.245794,2536440000.0,16.229967,322893.280814,133420.546814,32036.17435,2004.299894,799.638865
min,1.0,1234959000.0,5.0,20.0,0.0,0.0,0.0,0.0
25%,4846.25,3430417000.0,18.0,4942.5,810.75,115.0,7.0,1.0
50%,9691.5,5618664000.0,32.0,9954.5,3403.5,717.0,46.0,9.0
75%,14536.75,7843960000.0,47.0,504327.0,125020.0,18222.0,1156.25,292.0
max,19382.0,9999873000.0,60.0,999817.0,657830.0,256130.0,14994.0,9599.0


In [7]:
# you can specify a single column; describe() then returns a Series
tiktok['video_duration_sec'].describe()

count    19382.000000
mean        32.421732
std         16.229967
min          5.000000
25%         18.000000
50%         32.000000
75%         47.000000
max         60.000000
Name: video_duration_sec, dtype: float64

In [8]:
# you can also use dot (attribute) access to get a Series from the dataframe;
# this only works when the column name is a valid Python identifier,
# so it cannot be used for the '#' column of this dataset
tiktok.video_duration_sec.describe()

count    19382.000000
mean        32.421732
std         16.229967
min          5.000000
25%         18.000000
50%         32.000000
75%         47.000000
max         60.000000
Name: video_duration_sec, dtype: float64

In [9]:
# if you just want one specific statistic, call its method directly, e.g. mean()
tiktok['video_duration_sec'].mean()

32.42173150345682

In [10]:
# min() returns the smallest value in the column
tiktok['video_duration_sec'].min()

5

In [11]:
# you can show descriptives for object (text) columns with include=[object]:
# count = non-null values, unique = number of distinct values,
# top = most frequent value, freq = how often the top value occurs
tiktok.describe(include=[object])

Unnamed: 0,claim_status,video_transcription_text,verified_status,author_ban_status
count,19084,19084,19382,19382
unique,2,19012,2,3
top,claim,a friend read in the media a claim that badmi...,not verified,active
freq,9608,2,18142,15663


In [12]:
# you can change a column's datatype using astype(); here video_id is
# converted from int64 to str, so it is now summarised with the object columns.
# NOTE: this overwrites the column in `tiktok` itself, so re-running the earlier
# tiktok.describe() cell after this one will no longer list video_id.
tiktok['video_id'] = tiktok['video_id'].astype(str)
tiktok.describe(include=[object])

Unnamed: 0,claim_status,video_id,video_transcription_text,verified_status,author_ban_status
count,19084,19382,19084,19382,19382
unique,2,19382,19012,2,3
top,claim,7017666017,a friend read in the media a claim that badmi...,not verified,active
freq,9608,1,2,18142,15663


In [14]:
# Series.value_counts() returns a Series with the count of each unique value
# in the column, most frequent first; missing values (NaN) are left out by default
# https://pandas.pydata.org/docs/reference/api/pandas.Series.value_counts.html
tiktok['claim_status'].value_counts()

claim_status
claim      9608
opinion    9476
Name: count, dtype: int64

In [15]:
# normalize=True returns proportions rather than frequencies
# (each count divided by the total number of non-missing values)
tiktok['claim_status'].value_counts(normalize=True)

claim_status
claim      0.503458
opinion    0.496542
Name: proportion, dtype: float64

In [16]:
# sort_values() returns a new dataframe ordered by the given column(s).
# ascending=False puts the largest view counts first; rows whose view
# count is missing (NaN) end up at the bottom.
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.sort_values.html
tiktok_sort = tiktok.sort_values(by='video_view_count', ascending=False)
tiktok_sort.head(n=10)

Unnamed: 0,#,claim_status,video_id,video_duration_sec,video_transcription_text,verified_status,author_ban_status,video_view_count,video_like_count,video_share_count,video_download_count,video_comment_count
9273,9274,claim,5491664721,16,i discovered on the radio a claim that 1920 wa...,not verified,active,999817.0,385415.0,63849.0,5330.0,2475.0
8372,8373,claim,8614898760,26,a colleague located an article mentioning that...,not verified,active,999673.0,525305.0,9296.0,10049.0,2873.0
1263,1264,claim,8119826638,52,a colleague told them the media discovered the...,not verified,under review,999655.0,235087.0,38702.0,2245.0,666.0
274,275,claim,6662549277,5,a friend mentioned someone revealed that the f...,not verified,active,999653.0,45323.0,12923.0,101.0,50.0
8532,8533,claim,7712371483,16,a friend discovered on the news that 1 billion...,not verified,active,999446.0,253776.0,100579.0,1845.0,1091.0
6651,6652,claim,4585936744,23,a colleague read online that sputnik was the ...,not verified,active,999346.0,66574.0,26338.0,508.0,73.0
2746,2747,claim,2315352343,15,a friend read in a discussion board a claim t...,not verified,under review,999132.0,361418.0,72489.0,8046.0,1183.0
1684,1685,claim,9389133842,37,a friend learned on an internet forum a claim...,not verified,under review,999127.0,438885.0,10456.0,9888.0,1724.0
4635,4636,claim,2446925197,57,i learned from the media that baby spiders ar...,not verified,active,999082.0,83324.0,17272.0,263.0,78.0
1648,1649,claim,4417029469,59,a friend learned on an internet forum that th...,not verified,active,998911.0,584177.0,129714.0,8037.0,185.0


In [17]:
# if you want to see only the column 'video_view_count' as a Series,
# select it with a single pair of brackets
tiktok_sort['video_view_count']

9273     999817.0
8372     999673.0
1263     999655.0
274      999653.0
8532     999446.0
           ...   
19377         NaN
19378         NaN
19379         NaN
19380         NaN
19381         NaN
Name: video_view_count, Length: 19382, dtype: float64

In [18]:
# if you want to see the column 'video_view_count' as a one-column dataframe,
# pass the column name inside a list (double brackets)
tiktok_sort[['video_view_count']]

Unnamed: 0,video_view_count
9273,999817.0
8372,999673.0
1263,999655.0
274,999653.0
8532,999446.0
...,...
19377,
19378,
19379,
19380,


In [19]:
# show the view counts of only the 3 most viewed videos
# (tiktok_sort is sorted by view count, largest first, so these are its first 3 rows)
tiktok_sort[['video_view_count']].head(n=3)

Unnamed: 0,video_view_count
9273,999817.0
8372,999673.0
1263,999655.0


## Boolean Masking in Pandas

In [20]:
# Small hand-built example table: one row per planet, one list per column.
data = dict(
    planet=['Mercury', 'Venus', 'Earth', 'Mars', 'Jupiter', 'Saturn', 'Uranus', 'Neptune'],
    radius_km=[2440, 6052, 6371, 3390, 69911, 58232, 25362, 24622],
    moons=[0, 0, 1, 2, 80, 83, 27, 14],
)
planet = pd.DataFrame(data)
planet

Unnamed: 0,planet,radius_km,moons
0,Mercury,2440,0
1,Venus,6052,0
2,Earth,6371,1
3,Mars,3390,2
4,Jupiter,69911,80
5,Saturn,58232,83
6,Uranus,25362,27
7,Neptune,24622,14


In [21]:
# comparing a column with a scalar returns a Series of dtype bool
# (a Boolean mask): True where the planet has fewer than 20 moons.
# The bare last expression is displayed by the notebook; no print() needed.
planet['moons'] < 20

0     True
1     True
2     True
3     True
4    False
5    False
6    False
7     True
Name: moons, dtype: bool


In [22]:
# indexing the dataframe with the Boolean mask keeps only the rows where
# the mask is True; the result is a new, filtered dataframe.
# Leaving it as the last expression (instead of print()) lets the notebook
# render it as a table.
planet[planet['moons'] < 20]

    planet  radius_km  moons
0  Mercury       2440      0
1    Venus       6052      0
2    Earth       6371      1
3     Mars       3390      2
7  Neptune      24622     14


In [23]:
# You can also assign the Boolean mask to a named variable and then apply it to your dataframe.
mask = planet['moons'] < 20
planet[mask]

Unnamed: 0,planet,radius_km,moons
0,Mercury,2440,0
1,Venus,6052,0
2,Earth,6371,1
3,Mars,3390,2
7,Neptune,24622,14


In [24]:
# filtering with a mask returns a new dataframe; the planet dataframe itself was not changed.
planet

Unnamed: 0,planet,radius_km,moons
0,Mercury,2440,0
1,Venus,6052,0
2,Earth,6371,1
3,Mars,3390,2
4,Jupiter,69911,80
5,Saturn,58232,83
6,Uranus,25362,27
7,Neptune,24622,14


In [25]:
# to keep the filtered rows, assign the result of the masking to a new variable
mask = planet['moons'] < 20
planet2 = planet[mask]
planet2

Unnamed: 0,planet,radius_km,moons
0,Mercury,2440,0
1,Venus,6052,0
2,Earth,6371,1
3,Mars,3390,2
7,Neptune,24622,14


In [26]:
# if you want just the planet column (as a Series) for the rows picked by the mask,
# use loc[row_selector, column_label] with the mask as the row selector
mask = planet['moons'] < 20
planet.loc[mask, 'planet']

0    Mercury
1      Venus
2      Earth
3       Mars
7    Neptune
Name: planet, dtype: object

In [27]:
# Logical operators for combining Boolean masks element-wise:
# & (and)
# | (or)
# ~ (not)
# Wrap each comparison in parentheses: & and | bind more tightly than < and >.

# planets with fewer than 10 moons OR more than 50 moons
mask = (planet['moons'] < 10) | (planet['moons'] > 50)
planet[mask]

Unnamed: 0,planet,radius_km,moons
0,Mercury,2440,0
1,Venus,6052,0
2,Earth,6371,1
3,Mars,3390,2
4,Jupiter,69911,80
5,Saturn,58232,83


In [28]:
# select all planets that have more than 20 moons, excluding planets
# with exactly 80 moons and planets with a radius less than 50,000 km
# (~ negates the condition that follows it)
mask = (planet['moons'] > 20) & ~(planet['moons'] == 80) & ~(planet['radius_km'] < 50000)
planet[mask]

Unnamed: 0,planet,radius_km,moons
5,Saturn,58232,83


In [29]:
# this returns the same result as above: ~(moons == 80) is written as moons != 80
# and ~(radius_km < 50000) as radius_km >= 50000
# (the two forms agree here because neither column has missing values)
mask = (planet['moons'] > 20) & (planet['moons'] != 80) & (planet['radius_km'] >= 50000)
planet[mask]

Unnamed: 0,planet,radius_km,moons
5,Saturn,58232,83
