In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the dataset. The path is relative, so "tiktok_dataset.csv" must sit in
# the kernel's current working directory.
data = pd.read_csv("tiktok_dataset.csv")
# Preview the first rows (head() defaults to 5); being the cell's last
# expression, the frame is rendered as the cell output.
data.head()

Unnamed: 0,#,claim_status,video_id,video_duration_sec,video_transcription_text,verified_status,author_ban_status,video_view_count,video_like_count,video_share_count,video_download_count,video_comment_count
0,1,claim,7017666017,59,someone shared with me that drone deliveries a...,not verified,under review,343296.0,19425.0,241.0,1.0,0.0
1,2,claim,4014381136,32,someone shared with me that there are more mic...,not verified,active,140877.0,77355.0,19034.0,1161.0,684.0
2,3,claim,9859838091,31,someone shared with me that american industria...,not verified,active,902185.0,97690.0,2858.0,833.0,329.0
3,4,claim,1866847991,25,someone shared with me that the metro of st. p...,not verified,active,437506.0,239954.0,34812.0,1234.0,584.0
4,5,claim,7105231098,19,someone shared with me that the number of busi...,not verified,active,56167.0,34987.0,4110.0,547.0,152.0


In [None]:
# Print a structural summary of the frame: column names, non-null counts and
# dtypes. Useful for spotting columns with missing values before cleaning.
data.info()

In [None]:
# Count the missing entries in every column (isna() is the canonical spelling
# of isnull(); both return the same boolean mask).
data.isna().sum()

In [4]:
# Discard every row that has a missing value in any column.
# `data` is rebound here, so the unfiltered frame is no longer reachable
# under this name once the cell has run.
data = data.dropna(axis=0, how="any")

In [5]:
# Take an independent (deep) copy of the cleaned frame so that columns added
# to `data_cp` later do not show up in `data`.
data_cp = data.copy(deep=True)

In [6]:
# Feature engineering: add 'text_length', the number of characters in each
# video's transcription text.
# The vectorised .str.len() accessor replaces .apply(len): it yields the same
# counts for strings, but propagates a missing value as NaN instead of raising
# TypeError, so this cell no longer depends on the dropna cell having run.
data_cp['text_length'] = data_cp['video_transcription_text'].str.len()

In [None]:
# Switch seaborn to the "whitegrid" style; the setting is global, so it also
# applies to the figures drawn in later cells.
sns.set(style="whitegrid")

# Plot 1: how many rows fall into each claim_status category.
# NOTE(review): seaborn >= 0.13 emits a FutureWarning when `palette` is given
# without `hue`; once the seaborn version is confirmed, consider
# hue='claim_status', legend=False.
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=data_cp, x='claim_status', palette='viridis', ax=ax)
ax.set_title('Distribution of Claim Status')
ax.set_xlabel('Claim Status')
ax.set_ylabel('Count')
plt.show()


In [None]:
# Plot 2: row counts per author_ban_status, with one bar per claim_status
# inside each ban-status group (hue splits the bars).
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(
    data=data_cp,
    x='author_ban_status',
    hue='claim_status',
    palette='coolwarm',
    ax=ax,
)
ax.set_title('Author Ban Status by Claim Status')
plt.show()

In [None]:
# Plot 3: distribution of video_view_count for each claim_status, as box plots.
# NOTE(review): seaborn >= 0.13 emits a FutureWarning when `palette` is given
# without `hue`; once the seaborn version is confirmed, consider
# hue='claim_status', legend=False.
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(
    data=data_cp,
    x='claim_status',
    y='video_view_count',
    palette='Set2',
    ax=ax,
)
ax.set_title('Video View Count by Claim Status')
plt.show()

In [None]:
# Plot 4: pairwise correlation between the numeric video metrics and the
# engineered text_length feature.
numeric_cols = [
    'video_duration_sec',
    'video_view_count',
    'video_like_count',
    'video_share_count',
    'video_download_count',
    'video_comment_count',
    'text_length',
]
# DataFrame.corr() defaults to the Pearson coefficient.
corr_matrix = data_cp[numeric_cols].corr()

fig, ax = plt.subplots(figsize=(10, 8))
# annot + fmt print each coefficient in its cell, rounded to two decimals.
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()