# CS 4641 Project - TikTok Trends

### Shrey Amin, Sam Garvis, Abhinav Joshi, Zoe Maisel, Dimitrios Kosmakos

In [405]:
# imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import plotly.figure_factory as ff
import plotly.express as px
import plotly.graph_objects as go
from plotly.offline import iplot
from ast import literal_eval
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.pipeline import Pipeline

In [406]:
# make dataset dataframe
def make_dataset_dataframe(dataset):
    # ../data/trend_full.csv
    df = pd.read_csv(dataset, converters={'hashtags': literal_eval})
    return df

### Data Cleaning

In [407]:
def drop_columns(df, columns_to_drop=[]):
    # Drop Columns
    return df.drop(columns=columns_to_drop, errors='ignore')

In [408]:
# Convert timestamp into day of week and hour of day
def convert_timestamp(df):
    date = pd.to_datetime(df['createTime'], unit='s')
    dayofweek = date.dt.dayofweek
    hourofday = date.dt.hour
    df.insert(2, "dayofweek", dayofweek)
    df.insert(2, "hour", hourofday)
    df = df.drop(columns=['createTime'])
    return df

In [409]:
def convert_data_types(df):
    # Convert data types/handle special cases
    # Replace text column with length of characters
    df['text'] = df['text'].str.len()
    # Replace verified and musicOriginal column with 0 or 1
    df["authorMeta.verified"] = df["authorMeta.verified"].astype(int)
    df["musicMeta.musicOriginal"] = df["musicMeta.musicOriginal"].astype(int)
    return df

In [410]:
def separate_hashtags(df):
     # Keep hashtags with at least N occurences 
    df['hashtags'] = df['hashtags'].apply(lambda hashtags: tuple(hashtag['name'] for hashtag in hashtags if len(hashtag['name']) > 0))
    mlb = MultiLabelBinarizer()
    hashtags = mlb.fit_transform(df['hashtags'])
    hashtags = pd.DataFrame(hashtags, columns=mlb.classes_)
    n = 1000
    n_hashtags = hashtags.sum(axis=0).sort_values(ascending=False)
    n_hashtags = n_hashtags[n_hashtags > n].index
    popular_hashtags = hashtags[n_hashtags]
    df['hashtags'] = df['hashtags'].apply(lambda hashtags: len(hashtags))  # replace the original hashtag column with number of hashtags
    df.rename(columns={'hashtags': 'numHashtags'}, inplace=True)
    df = pd.concat([df, popular_hashtags], axis=1) # concatenate the selected hashtags
    return df

In [411]:
# Remove null values
def remove_null_values(df):
    df = df.fillna(0)
    return df

In [412]:
# Remove duplicate values if there are any
def remove_duplicates(df):
    return df.drop_duplicates()

In [413]:
# Create cleaned CSV
def save_cleaned_df_to_csv(df, csv_name):
    #'../data/trend_cleaned.csv'
    df.to_csv(csv_name, index=False)

In [414]:
def run_cleanup_pipeline(dataset, columns_to_drop, csv_output_name=''):
    df = make_dataset_dataframe(dataset)
    df = drop_columns(df, columns_to_drop=columns_to_drop)
    df = convert_timestamp(df)
    df = convert_data_types(df)
    df = separate_hashtags(df)
    df = remove_null_values(df)
    df = remove_duplicates(df)
    if csv_output_name:
        save_cleaned_df_to_csv(df, csv_output_name)
    return df

#### Clean up popular videos

In [415]:
# columns_to_drop=['secretID', 'authorMeta.secUid', 'authorMeta.name', 'mentions','authorMeta.nickName', 'authorMeta.signature', 'authorMeta.avatar', 'musicMeta.musicName', 'musicMeta.musicAuthor', 'musicMeta.musicAlbum', 'musicMeta.playUrl', 'musicMeta.coverThumb', 'musicMeta.coverMedium', 'musicMeta.coverLarge', 'covers.default', 'covers.origin', 'covers.dynamic', 'webVideoUrl', 'videoUrl', 'videoUrlNoWaterMark', 'videoApiUrlNoWaterMark', 'videoMeta.height', 'videoMeta.width', 'downloaded']
# dataset='../data/trend_full.csv'
# csv_output_name='../data/trend_cleaned.csv'
# popular_df = run_cleanup_pipeline(dataset, columns_to_drop, csv_output_name)
# df = popular_df

#### Clean up unpopular videos

In [416]:
# columns_to_drop=['secretID', 'authorMeta.secUid', 'authorMeta.name', 'mentions','authorMeta.nickName', 'authorMeta.signature', 'authorMeta.avatar', 'musicMeta.musicName', 'musicMeta.musicAuthor', 'musicMeta.musicAlbum', 'musicMeta.playUrl', 'musicMeta.coverThumb', 'musicMeta.coverMedium', 'musicMeta.coverLarge', 'covers.default', 'covers.origin', 'covers.dynamic', 'webVideoUrl', 'videoUrl', 'videoUrlNoWaterMark', 'videoApiUrlNoWaterMark', 'videoMeta.height', 'videoMeta.width', 'downloaded']
# dataset='../data/unpopular_tiktoks.csv'
# csv_output_name='../data/unpopular_tiktoks_cleaned.csv'
# unpopular_df = run_cleanup_pipeline(dataset, columns_to_drop, csv_output_name)
# df = unpopular_df

#### Clean up unpopular and popular videos after combining them

In [417]:
columns_to_drop=['secretID', 'authorMeta.secUid', 'authorMeta.name', 'mentions','authorMeta.nickName', 'authorMeta.signature', 'authorMeta.avatar', 'musicMeta.musicName', 'musicMeta.musicAuthor', 'musicMeta.musicAlbum', 'musicMeta.playUrl', 'musicMeta.coverThumb', 'musicMeta.coverMedium', 'musicMeta.coverLarge', 'covers.default', 'covers.origin', 'covers.dynamic', 'webVideoUrl', 'videoUrl', 'videoUrlNoWaterMark', 'videoApiUrlNoWaterMark', 'videoMeta.height', 'videoMeta.width', 'downloaded']

popular_df = make_dataset_dataframe('../data/trend_full.csv')
popular_df = drop_columns(popular_df, columns_to_drop)
unpopular_df = make_dataset_dataframe('../data/unpopular_tiktoks.csv')
unpopular_df = drop_columns(unpopular_df, columns_to_drop)

df = pd.concat([popular_df, unpopular_df], ignore_index=True)
df = convert_timestamp(df)
df = convert_data_types(df)
df = separate_hashtags(df)
df = remove_null_values(df)
df = remove_duplicates(df)
df.to_csv('../data/all_tiktoks_cleaned.csv', index=False)

### Exploratory Data Analysis

In [418]:
# Scale the data
df_scaled = df.iloc[:, :20].drop(columns=['id', 'authorMeta.id', 'musicMeta.musicId'])
scaler = MinMaxScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(df_scaled), columns=df_scaled.columns)

In [419]:
# Heat Map to show correlation between features
corrs = df_scaled.corr()
fig = ff.create_annotated_heatmap(z = corrs.values, x = list(corrs.columns),
    y = list(corrs.index),
    annotation_text = corrs.round(2).values,
    showscale = True)
fig.layout.margin = dict(l = 200, t = 200)
fig.layout.height = 800
fig.layout.width = 1000
# fig.write_image("../plots/heatMap/heatMap.png")
iplot(fig)

In [420]:
# Scatterplots between each feature and playCount
feats = df_scaled
colors = ['b', 'g', 'r', 'c', 'm', 'y']
idx = 0
y = df['playCount']
for feat in feats:
    x = df[feat]
    figure = px.scatter(df[:1000], x=feat, y="playCount")
#     figure.write_image("../plots/scatterPlots/{}.png".format(feat))
    figure.show()

### Clustering

In [421]:
# Unsupervised learning - drop ID columns
df_u = df.iloc[:, :20].drop(columns=['id', 'authorMeta.id', 'musicMeta.musicId'])
df_u.head(10)

Unnamed: 0,text,hour,dayofweek,authorMeta.verified,authorMeta.following,authorMeta.fans,authorMeta.heart,authorMeta.video,authorMeta.digg,musicMeta.musicOriginal,musicMeta.duration,videoMeta.duration,diggCount,shareCount,playCount,commentCount,numHashtags
0,115.0,13,4,0,1579.0,26600.0,173900.0,330.0,5813.0,1,27.0,26.0,88.0,0.0,232.0,1.0,2
1,60.0,5,3,0,1579.0,26600.0,173900.0,330.0,5813.0,1,27.0,55.0,33.0,0.0,199.0,0.0,2
2,24.0,10,2,0,1579.0,26600.0,173900.0,330.0,5813.0,1,17.0,17.0,12.0,0.0,104.0,2.0,2
3,65.0,16,1,0,1579.0,26600.0,173900.0,330.0,5813.0,1,17.0,17.0,11.0,1.0,85.0,0.0,4
4,82.0,11,1,0,1579.0,26600.0,173900.0,330.0,5813.0,1,19.0,19.0,23.0,0.0,128.0,0.0,4
5,109.0,16,4,0,1579.0,26600.0,173900.0,330.0,5813.0,1,25.0,25.0,18.0,0.0,96.0,6.0,4
6,113.0,4,4,0,1579.0,26600.0,173900.0,330.0,5813.0,1,39.0,48.0,56.0,0.0,398.0,3.0,5
7,134.0,6,2,0,1579.0,26600.0,173900.0,330.0,5813.0,1,20.0,23.0,78.0,2.0,649.0,13.0,3
8,139.0,2,2,0,1579.0,26600.0,173900.0,330.0,5813.0,1,23.0,23.0,12.0,13.0,79.0,4.0,3
9,76.0,12,1,0,1579.0,26600.0,173900.0,330.0,5813.0,1,17.0,17.0,42.0,0.0,428.0,4.0,3


In [422]:
X=df_u
scaler = MinMaxScaler()
scaler.fit(X)
X=scaler.transform(X)
inertia = []
for i in range(1,11):
    kmeans = KMeans(
        n_clusters=i, init="k-means++",
        n_init=10,
        tol=1e-04, random_state=42
    )
    kmeans.fit(X)
    inertia.append(kmeans.inertia_)
fig = go.Figure(data=go.Scatter(x=np.arange(1,11),y=inertia))
fig.update_layout(title="Inertia vs Cluster Number",xaxis=dict(range=[0,11],title="Cluster Number"),
                  yaxis={'title':'Inertia'},
                 annotations=[
        dict(
            x=3,
            y=inertia[2],
            xref="x",
            yref="y",
            text="Elbow!",
            showarrow=True,
            arrowhead=7,
            ax=20,
            ay=-40
        )
    ])

In [429]:
kmeans = KMeans(
        n_clusters=3, init="k-means++",
        n_init=10,
        tol=1e-04, random_state=42
    )
kmeans.fit(X)
clusters=pd.DataFrame(X, columns=df_u.columns)
clusters['label']=kmeans.labels_
polar=clusters.groupby("label").mean().reset_index()
polar=pd.melt(polar,id_vars=["label"])
fig4 = px.line_polar(polar, r="value", theta="variable", color="label", line_close=True,height=800,width=1400)
fig4.show()

ValueError: Shape of passed values is (13337, 3), indices imply (13337, 17)

In [424]:
pie=clusters.groupby('label').size().reset_index()
pie.columns=['label','value']
px.pie(pie,values='value',names='label',color=['blue','red','green'])

### Clustering only with hashtags

In [425]:
# Only keep the hashtags
df_h = df.iloc[:, 20:]
df_h.head(10)

Unnamed: 0,fyp,foryou,foryoupage
0,0,0,0
1,1,0,0
2,1,0,0
3,1,0,0
4,1,0,0
5,1,0,0
6,1,0,0
7,1,0,0
8,1,0,0
9,1,0,0


In [426]:
X=df_h
scaler = MinMaxScaler()
scaler.fit(X)
X=scaler.transform(X)
inertia = []
for i in range(1,11):
    kmeans = KMeans(
        n_clusters=i, init="k-means++",
        n_init=10,
        tol=1e-04, random_state=42
    )
    kmeans.fit(X)
    inertia.append(kmeans.inertia_)
fig = go.Figure(data=go.Scatter(x=np.arange(1,11),y=inertia))
fig.update_layout(title="Inertia vs Cluster Number",xaxis=dict(range=[0,11],title="Cluster Number"),
                  yaxis={'title':'Inertia'},
                 annotations=[
        dict(
            x=3,
            y=inertia[2],
            xref="x",
            yref="y",
            text="Elbow!",
            showarrow=True,
            arrowhead=7,
            ax=20,
            ay=-40
        )
    ])


Number of distinct clusters (8) found smaller than n_clusters (9). Possibly due to duplicate points in X.


Number of distinct clusters (8) found smaller than n_clusters (10). Possibly due to duplicate points in X.



In [427]:
kmeans = KMeans(
        n_clusters=3, init="k-means++",
        n_init=10,
        tol=1e-04, random_state=42
    )
kmeans.fit(X)
clusters=pd.DataFrame(X, columns=df_h.columns)
clusters['label']=kmeans.labels_
polar=clusters.groupby("label").mean().reset_index()
polar=pd.melt(polar,id_vars=["label"])
fig4 = px.line_polar(polar, r="value", theta="variable", color="label", line_close=True,height=800,width=1400)
fig4.show()

In [428]:
pie=clusters.groupby('label').size().reset_index()
pie.columns=['label','value']
px.pie(pie,values='value',names='label',color=['blue','red','green'])