# CS 4641 Project - TikTok Trends

### Shrey Amin, Sam Garvis, Abhinav Joshi, Zoe Maisel, Dimitrios Kosmakos

In [312]:
# Names of the raw-dataset columns that are dropped during cleanup.
columns_to_drop = [
    'secretID',
    'authorMeta.secUid',
    'authorMeta.name',
    'mentions',
    'authorMeta.nickName',
    'authorMeta.signature',
    'authorMeta.avatar',
    'musicMeta.musicName',
    'musicMeta.musicAuthor',
    'musicMeta.musicAlbum',
    'musicMeta.playUrl',
    'musicMeta.coverThumb',
    'musicMeta.coverMedium',
    'musicMeta.coverLarge',
    'covers.default',
    'covers.origin',
    'covers.dynamic',
    'webVideoUrl',
    'videoUrl',
    'videoUrlNoWaterMark',
    'videoApiUrlNoWaterMark',
    'videoMeta.height',
    'videoMeta.width',
    'downloaded',
]
print(columns_to_drop)

['secretID', 'authorMeta.secUid', 'authorMeta.name', 'mentions', 'authorMeta.nickName', 'authorMeta.signature', 'authorMeta.avatar', 'musicMeta.musicName', 'musicMeta.musicAuthor', 'musicMeta.musicAlbum', 'musicMeta.playUrl', 'musicMeta.coverThumb', 'musicMeta.coverMedium', 'musicMeta.coverLarge', 'covers.default', 'covers.origin', 'covers.dynamic', 'webVideoUrl', 'videoUrl', 'videoUrlNoWaterMark', 'videoApiUrlNoWaterMark', 'videoMeta.height', 'videoMeta.width', 'downloaded']


In [278]:
# imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import plotly.figure_factory as ff
import plotly.express as px
import plotly.graph_objects as go
from plotly.offline import iplot
from ast import literal_eval
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.pipeline import Pipeline

In [279]:
# make dataset dataframe
def make_dataset_dataframe(dataset):
    """Load the raw CSV at `dataset` into a DataFrame.

    The 'hashtags' column is stored as the text of a Python literal, so it
    is parsed with `ast.literal_eval` while reading; every other column
    uses pandas' default type inference.

    Example path: ../data/trend_full.csv
    """
    column_parsers = {'hashtags': literal_eval}
    return pd.read_csv(dataset, converters=column_parsers)

### Data Cleaning

In [280]:
def drop_columns(df, columns_to_drop=None):
    """Return a copy of `df` without the columns named in `columns_to_drop`.

    Args:
        df: DataFrame to trim; it is not modified.
        columns_to_drop: a column label or list of labels to remove. Labels
            that are not present in `df` are silently skipped
            (`errors='ignore'`). `None` (the default) drops nothing.

    Returns:
        A new DataFrame with the requested columns removed.
    """
    # A None sentinel replaces the former shared mutable `[]` default.
    if columns_to_drop is None:
        columns_to_drop = []
    return df.drop(columns=columns_to_drop, errors='ignore')

In [281]:
# Convert timestamp into day of week and hour of day
def convert_timestamp(df):
    """Replace 'createTime' with 'hour' and 'dayofweek' columns.

    'createTime' is interpreted as seconds since the Unix epoch
    (`unit='s'`); no timezone conversion is applied. 'dayofweek' uses the
    pandas convention (Monday=0 ... Sunday=6) and 'hour' is 0-23.

    Args:
        df: DataFrame containing a 'createTime' column. It is left
            untouched, so the function can safely be re-run on the same
            frame.

    Returns:
        A new DataFrame with 'hour' at column position 2 and 'dayofweek' at
        position 3 (positions counted before 'createTime' is removed), and
        without the 'createTime' column.
    """
    created = pd.to_datetime(df['createTime'], unit='s')
    # Work on a copy: DataFrame.insert mutates in place, which previously
    # left the caller's frame half-converted and made a second call fail
    # because the columns already existed.
    result = df.copy()
    # Inserting 'dayofweek' first and then 'hour' at the same position
    # yields the final order [..., 'hour', 'dayofweek', ...].
    result.insert(2, "dayofweek", created.dt.dayofweek)
    result.insert(2, "hour", created.dt.hour)
    return result.drop(columns=['createTime'])

In [282]:
def convert_data_types(df):
    """Convert the text and flag columns of `df` to numbers, in place.

    - 'text' is replaced by the character length of each entry.
    - 'authorMeta.verified' and 'musicMeta.musicOriginal' are cast to int
      (0 or 1 for boolean input).

    The same DataFrame object that was passed in is modified and returned.
    """
    text_lengths = df['text'].str.len()
    df['text'] = text_lengths

    for flag_column in ("authorMeta.verified", "musicMeta.musicOriginal"):
        df[flag_column] = df[flag_column].astype(int)

    return df

In [283]:
def separate_hashtags(df, min_count=500):
    """Expand the 'hashtags' column into a count plus popular-tag indicators.

    Each entry of 'hashtags' is expected to be an iterable of dicts with a
    'name' key (as produced by parsing the raw CSV). Empty names are
    discarded.

    Args:
        df: DataFrame with a 'hashtags' column. It is not modified.
        min_count: a hashtag gets its own 0/1 indicator column only when it
            occurs in strictly more than this many rows. Defaults to 500.

    Returns:
        A new DataFrame in which 'hashtags' is replaced (same position) by
        'numHashtags', the number of non-empty hashtags per row, followed by
        one indicator column per popular hashtag, ordered by descending
        frequency.
    """
    # Reduce every row to a tuple of its non-empty hashtag names.
    names_per_row = df['hashtags'].apply(
        lambda hashtags: tuple(
            hashtag['name'] for hashtag in hashtags if len(hashtag['name']) > 0
        )
    )

    # One 0/1 column per distinct hashtag. The indicator frame reuses
    # df's index so the axis=1 concat below lines rows up even when df
    # does not carry a default RangeIndex.
    mlb = MultiLabelBinarizer()
    indicators = pd.DataFrame(
        mlb.fit_transform(names_per_row),
        columns=mlb.classes_,
        index=df.index,
    )

    # Keep only hashtags occurring more than `min_count` times.
    occurrences = indicators.sum(axis=0).sort_values(ascending=False)
    popular_names = occurrences[occurrences > min_count].index
    popular_hashtags = indicators[popular_names]

    # Replace the original hashtag column with the number of hashtags,
    # working on a copy so the caller's frame is left as it was.
    result = df.copy()
    result['hashtags'] = names_per_row.apply(len)
    result = result.rename(columns={'hashtags': 'numHashtags'})

    # NOTE(review): a popular hashtag whose name equals an existing column
    # label would yield a duplicate column name here — confirm that cannot
    # happen with the real data.
    return pd.concat([result, popular_hashtags], axis=1)

In [284]:
# Remove null values
def remove_null_values(df):
    """Return a copy of `df` in which every missing value is replaced by 0."""
    return df.fillna(value=0)

In [285]:
# Remove duplicate values if there are any
def remove_duplicates(df):
    """Return `df` without fully duplicated rows.

    The first occurrence of each row is kept and the surviving rows retain
    their original index labels. The input frame is not modified.
    """
    unique_rows = df.drop_duplicates(keep='first', inplace=False)
    return unique_rows

In [286]:
# Create cleaned CSV
def save_cleaned_df_to_csv(df, csv_name):
    """Write `df` to `csv_name` as CSV, omitting the index column.

    Example path: '../data/trend_cleaned.csv'
    """
    df.to_csv(path_or_buf=csv_name, index=False)

In [287]:
def run_cleanup_pipeline(dataset, columns_to_drop, csv_output_name=''):
    """Load `dataset`, run every cleaning step in order, and return the result.

    Args:
        dataset: path of the raw CSV to load.
        columns_to_drop: column labels removed right after loading.
        csv_output_name: when non-empty, the cleaned frame is also written
            to this CSV path.

    Returns:
        The cleaned DataFrame.
    """
    cleaned = make_dataset_dataframe(dataset)
    cleaned = drop_columns(cleaned, columns_to_drop=columns_to_drop)

    # The remaining steps each take the frame and return the next version
    # of it; they are applied in this exact order.
    remaining_steps = (
        convert_timestamp,
        convert_data_types,
        separate_hashtags,
        remove_null_values,
        remove_duplicates,
    )
    for step in remaining_steps:
        cleaned = step(cleaned)

    if not csv_output_name:
        return cleaned

    save_cleaned_df_to_csv(cleaned, csv_output_name)
    return cleaned

#### Clean up popular videos

In [315]:
# Columns removed from the raw data before cleaning.
columns_to_drop = [
    'secretID',
    'authorMeta.secUid',
    'authorMeta.name',
    'mentions',
    'authorMeta.nickName',
    'authorMeta.signature',
    'authorMeta.avatar',
    'musicMeta.musicName',
    'musicMeta.musicAuthor',
    'musicMeta.musicAlbum',
    'musicMeta.playUrl',
    'musicMeta.coverThumb',
    'musicMeta.coverMedium',
    'musicMeta.coverLarge',
    'covers.default',
    'covers.origin',
    'covers.dynamic',
    'webVideoUrl',
    'videoUrl',
    'videoUrlNoWaterMark',
    'videoApiUrlNoWaterMark',
    'videoMeta.height',
    'videoMeta.width',
    'downloaded',
]
dataset = '../data/trend_full.csv'
csv_output_name = '../data/trend_cleaned.csv'

# Clean the dataset, write the cleaned CSV, and make the result the
# working frame `df` used by the cells that follow.
popular_df = run_cleanup_pipeline(dataset, columns_to_drop, csv_output_name)
df = popular_df
df.columns

Index(['id', 'text', 'hour', 'dayofweek', 'authorMeta.id',
       'authorMeta.verified', 'authorMeta.following', 'authorMeta.fans',
       'authorMeta.heart', 'authorMeta.video', 'authorMeta.digg',
       'musicMeta.musicId', 'musicMeta.musicOriginal', 'musicMeta.duration',
       'videoMeta.duration', 'diggCount', 'shareCount', 'playCount',
       'commentCount', 'numHashtags', 'fyp', 'foryou', 'foryoupage', 'viral',
       'funny', 'comedy'],
      dtype='object')

#### Clean up unpopular videos

In [289]:
# columns_to_drop=['secretID', 'authorMeta.secUid', 'authorMeta.name', 'mentions','authorMeta.nickName', 'authorMeta.signature', 'authorMeta.avatar', 'musicMeta.musicName', 'musicMeta.musicAuthor', 'musicMeta.musicAlbum', 'musicMeta.playUrl', 'musicMeta.coverThumb', 'musicMeta.coverMedium', 'musicMeta.coverLarge', 'covers.default', 'covers.origin', 'covers.dynamic', 'webVideoUrl', 'videoUrl', 'videoUrlNoWaterMark', 'videoApiUrlNoWaterMark', 'videoMeta.height', 'videoMeta.width', 'downloaded']
# dataset='../data/unpopular_tiktoks.csv'
# csv_output_name='../data/unpopular_tiktoks_cleaned.csv'
# unpopular_df = run_cleanup_pipeline(dataset, columns_to_drop, csv_output_name)
# df = unpopular_df

#### Clean up unpopular and popular videos after combining them

In [290]:
# columns_to_drop=['secretID', 'authorMeta.secUid', 'authorMeta.name', 'mentions','authorMeta.nickName', 'authorMeta.signature', 'authorMeta.avatar', 'musicMeta.musicName', 'musicMeta.musicAuthor', 'musicMeta.musicAlbum', 'musicMeta.playUrl', 'musicMeta.coverThumb', 'musicMeta.coverMedium', 'musicMeta.coverLarge', 'covers.default', 'covers.origin', 'covers.dynamic', 'webVideoUrl', 'videoUrl', 'videoUrlNoWaterMark', 'videoApiUrlNoWaterMark', 'videoMeta.height', 'videoMeta.width', 'downloaded']

# popular_df = make_dataset_dataframe('../data/trend_full.csv')
# popular_df = drop_columns(popular_df, columns_to_drop)
# unpopular_df = make_dataset_dataframe('../data/unpopular_tiktoks.csv')
# unpopular_df = drop_columns(unpopular_df, columns_to_drop)

# df = pd.concat([popular_df, unpopular_df], ignore_index=True)
# df = convert_timestamp(df)
# df = convert_data_types(df)
# df = separate_hashtags(df)
# df = remove_null_values(df)
# df = remove_duplicates(df)

# df.insert(20, "popular", np.where(df['playCount'] >= 500000, 1, 0))
# df.to_csv('../data/all_tiktoks_cleaned.csv', index=False)

### Exploratory Data Analysis

In [291]:
# Min-max scale the first 20 columns of df, excluding the three ID columns.
scaler = MinMaxScaler()
df_scaled = df.iloc[:, :20].drop(
    columns=['id', 'authorMeta.id', 'musicMeta.musicId'],
)
# Rebuild a DataFrame from the scaler output, restoring the column names.
df_scaled = pd.DataFrame(
    data=scaler.fit_transform(df_scaled),
    columns=df_scaled.columns,
)

In [292]:
# Annotated heat map of the pairwise correlations between scaled features
corrs = df_scaled.corr()
fig = ff.create_annotated_heatmap(
    z=corrs.values,
    x=corrs.columns.tolist(),
    y=corrs.index.tolist(),
    annotation_text=corrs.round(2).values,  # cell labels rounded to 2 decimals
    showscale=True,
)
# Extra left/top margin plus a fixed figure size.
fig.layout.margin = dict(l=200, t=200)
fig.layout.height = 800
fig.layout.width = 1000
# fig.write_image("../plots/heatMap/heatMap.png")
iplot(fig)

In [293]:
# Scatterplots between each feature and playCount
# One figure is shown per column of df_scaled (iterating a DataFrame yields
# its column labels). The plotted values come from the unscaled `df`, and
# only its first 1000 rows are drawn.
feats = df_scaled
# NOTE(review): `colors`, `idx`, `y` and `x` are assigned but never used in
# this cell — confirm no later cell depends on them before removing.
colors = ['b', 'g', 'r', 'c', 'm', 'y']
idx = 0
y = df['playCount']
for feat in feats:
    x = df[feat]
    figure = px.scatter(df[:1000], x=feat, y="playCount")
#     figure.write_image("../plots/scatterPlots/{}.png".format(feat))
    figure.show()