In [104]:
import matplotlib.pyplot as plt
import numpy as np
import csv
import pandas as pd
import seaborn as sns

In [105]:
# Read the raw TikTok dataset from the CSV file in the working directory.
data = pd.read_csv(filepath_or_buffer='tiktok_dataset.csv')
# Print a per-column summary: names, non-null counts, dtypes and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19382 entries, 0 to 19381
Data columns (total 12 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   #                         19382 non-null  int64  
 1   claim_status              19084 non-null  object 
 2   video_id                  19382 non-null  int64  
 3   video_duration_sec        19382 non-null  int64  
 4   video_transcription_text  19084 non-null  object 
 5   verified_status           19382 non-null  object 
 6   author_ban_status         19382 non-null  object 
 7   video_view_count          19084 non-null  float64
 8   video_like_count          19084 non-null  float64
 9   video_share_count         19084 non-null  float64
 10  video_download_count      19084 non-null  float64
 11  video_comment_count       19084 non-null  float64
dtypes: float64(5), int64(3), object(4)
memory usage: 1.8+ MB


In [106]:
# Show the raw table exactly as loaded, before any renaming or cleaning.
# The rendered output shows only the first and last rows, with the middle elided.
data

Unnamed: 0,#,claim_status,video_id,video_duration_sec,video_transcription_text,verified_status,author_ban_status,video_view_count,video_like_count,video_share_count,video_download_count,video_comment_count
0,1,claim,7017666017,59,someone shared with me that drone deliveries a...,not verified,under review,343296.0,19425.0,241.0,1.0,0.0
1,2,claim,4014381136,32,someone shared with me that there are more mic...,not verified,active,140877.0,77355.0,19034.0,1161.0,684.0
2,3,claim,9859838091,31,someone shared with me that american industria...,not verified,active,902185.0,97690.0,2858.0,833.0,329.0
3,4,claim,1866847991,25,someone shared with me that the metro of st. p...,not verified,active,437506.0,239954.0,34812.0,1234.0,584.0
4,5,claim,7105231098,19,someone shared with me that the number of busi...,not verified,active,56167.0,34987.0,4110.0,547.0,152.0
...,...,...,...,...,...,...,...,...,...,...,...,...
19377,19378,,7578226840,21,,not verified,active,,,,,
19378,19379,,6079236179,53,,not verified,active,,,,,
19379,19380,,2565539685,10,,verified,under review,,,,,
19380,19381,,2969178540,24,,not verified,active,,,,,


In [107]:
# Mapping from the verbose source column names to shorter display names.
new_columns = {
    'video_duration_sec': 'video_duration',
    'video_id': 'ID',
    'video_view_count': 'Views',
    'video_like_count': 'Likes',
    'video_share_count': 'Shares',
    'video_download_count': 'Downloads',
    'video_comment_count': 'Comments',
}

# Rename the columns and remove the '#' column.
# The result is re-bound to `data` rather than mutated in place, and
# errors='ignore' makes drop() skip '#' when it has already been removed,
# so re-running this cell no longer raises KeyError.
data = data.rename(columns=new_columns).drop(columns='#', errors='ignore')
# Preview the first rows to confirm the new column layout.
data.head()

Unnamed: 0,claim_status,ID,video_duration,video_transcription_text,verified_status,author_ban_status,Views,Likes,Shares,Downloads,Comments
0,claim,7017666017,59,someone shared with me that drone deliveries a...,not verified,under review,343296.0,19425.0,241.0,1.0,0.0
1,claim,4014381136,32,someone shared with me that there are more mic...,not verified,active,140877.0,77355.0,19034.0,1161.0,684.0
2,claim,9859838091,31,someone shared with me that american industria...,not verified,active,902185.0,97690.0,2858.0,833.0,329.0
3,claim,1866847991,25,someone shared with me that the metro of st. p...,not verified,active,437506.0,239954.0,34812.0,1234.0,584.0
4,claim,7105231098,19,someone shared with me that the number of busi...,not verified,active,56167.0,34987.0,4110.0,547.0,152.0


In [108]:
# Count how many rows carry each claim_status label.
data['claim_status'].value_counts()

claim_status
claim      9608
opinion    9476
Name: count, dtype: int64

In [109]:
# Keep only the rows whose claim_status equals 'claim'.
data_claim = data[data['claim_status'].eq('claim')]
data_claim


Unnamed: 0,claim_status,ID,video_duration,video_transcription_text,verified_status,author_ban_status,Views,Likes,Shares,Downloads,Comments
0,claim,7017666017,59,someone shared with me that drone deliveries a...,not verified,under review,343296.0,19425.0,241.0,1.0,0.0
1,claim,4014381136,32,someone shared with me that there are more mic...,not verified,active,140877.0,77355.0,19034.0,1161.0,684.0
2,claim,9859838091,31,someone shared with me that american industria...,not verified,active,902185.0,97690.0,2858.0,833.0,329.0
3,claim,1866847991,25,someone shared with me that the metro of st. p...,not verified,active,437506.0,239954.0,34812.0,1234.0,584.0
4,claim,7105231098,19,someone shared with me that the number of busi...,not verified,active,56167.0,34987.0,4110.0,547.0,152.0
...,...,...,...,...,...,...,...,...,...,...,...
9603,claim,3883493316,49,a colleague discovered on the radio a claim th...,not verified,active,737177.0,460743.0,54550.0,8119.0,3372.0
9604,claim,4765029942,9,a colleague discovered on the radio a claim th...,verified,active,546987.0,360080.0,79346.0,4537.0,2432.0
9605,claim,3513102998,27,a colleague discovered on the radio a claim th...,not verified,under review,885521.0,209475.0,44286.0,1210.0,794.0
9606,claim,9461481859,27,a colleague discovered on the radio a claim th...,not verified,active,356747.0,99394.0,21016.0,1163.0,497.0


In [110]:
# Preview the Likes column with missing (NaN) values left out.
# NOTE: dropna() removes missing values, not rows with zero likes, and the
# result is not assigned back, so `data` itself is not changed by this cell.
data['Likes'].dropna()

0         19425.0
1         77355.0
2         97690.0
3        239954.0
4         34987.0
           ...   
19079       423.0
19080       820.0
19081       102.0
19082       655.0
19083       815.0
Name: Likes, Length: 19084, dtype: float64

In [115]:
# Keep only fully populated rows: a row is discarded if any of its values is missing.
data = data.dropna(axis=0, how='any')

In [116]:
# Show the table after the rows with missing values were dropped.
data

Unnamed: 0,claim_status,ID,video_duration,video_transcription_text,verified_status,author_ban_status,Views,Likes,Shares,Downloads,Comments
0,claim,7017666017,59,someone shared with me that drone deliveries a...,not verified,under review,343296.0,19425.0,241.0,1.0,0.0
1,claim,4014381136,32,someone shared with me that there are more mic...,not verified,active,140877.0,77355.0,19034.0,1161.0,684.0
2,claim,9859838091,31,someone shared with me that american industria...,not verified,active,902185.0,97690.0,2858.0,833.0,329.0
3,claim,1866847991,25,someone shared with me that the metro of st. p...,not verified,active,437506.0,239954.0,34812.0,1234.0,584.0
4,claim,7105231098,19,someone shared with me that the number of busi...,not verified,active,56167.0,34987.0,4110.0,547.0,152.0
...,...,...,...,...,...,...,...,...,...,...,...
19079,opinion,1492320297,49,in our opinion the earth holds about 11 quinti...,not verified,active,6067.0,423.0,81.0,8.0,2.0
19080,opinion,9841347807,23,in our opinion the queens in ant colonies live...,not verified,active,2973.0,820.0,70.0,3.0,0.0
19081,opinion,8024379946,50,in our opinion the moon is moving away from th...,not verified,active,734.0,102.0,7.0,2.0,1.0
19082,opinion,7425795014,8,in our opinion lightning strikes somewhere on ...,not verified,active,3394.0,655.0,123.0,11.0,4.0


In [117]:
# Save the cleaned table as an Excel workbook with a single sheet named 'Data'.
# index=False keeps the DataFrame's row index out of the file; by default it
# would be written as an extra unnamed first column, bringing back the kind of
# row-counter column ('#') that was deliberately dropped earlier.
data.to_excel('tiktok_dataset_final.xlsx', sheet_name='Data', index=False)