TikTok Trends

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

Importing Dataset

In [2]:
dataset = pd.read_csv('tiktokData.csv')
X = dataset.iloc[:,0:-2].values
Y = dataset.iloc[:,-2].values

In [3]:
print(X)

[[13200000 28 150400000 True 'meme']
 [1300000 182 35100000 True 'meme']
 [696800 42 15900000 True 'meme']
 ...
 [44000 1336 2900000 True 'tiktokdance']
 [21400000 39 641900000 False 'tiktokdance']
 [6100000 422 51500000 False 'tiktokdance']]


Cleaning Data

In [4]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from scipy import stats

# Encode original music category
for i in range(len(X)):
    if 'T' in str(X[i, 3]):
        X[i,3] = 1
    else:
        X[i,3] = 0

# Encode hashtag category
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(sparse = False), [4])], remainder='passthrough')
X = ct.fit_transform(X)

# Remove outliers
z_scores = np.abs((X - X.mean()) / X.std())
outlier_rows = np.where(z_scores > 3)[0]

X = np.delete(X, outlier_rows, axis = 0)
Y = np.delete(Y, outlier_rows, axis = 0)

Splitting data to Training and Testing Set

In [5]:
from sklearn.model_selection import train_test_split

# 85% training, 15% test
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.15, random_state = 0)

In [6]:
print(X_train)

[[0.0 0.0 0.0 ... 171 12200000 0]
 [0.0 0.0 0.0 ... 49 1300000 1]
 [0.0 0.0 0.0 ... 33 14900000 1]
 ...
 [0.0 0.0 0.0 ... 104 44700000 0]
 [0.0 0.0 0.0 ... 259 112400000 0]
 [0.0 0.0 0.0 ... 336 122400000 0]]


In [7]:
print(Y_train)

[ 52600000  15500000   3000000  19000000 110900000 127700000  49000000
  15400000  69600000    622800   2600000  69400000  26300000  27100000
  11200000  17400000  22500000  42100000  40000000  20300000    745300
   1500000   3400000  18400000  30200000  40100000  73500000  25400000
  83500000    754300  89800000   7500000  97000000  72600000 144400000
 106100000  11200000  14800000   6100000   4300000   2000000  11500000
 187700000 144400000   1300000 118400000  38700000   4300000  12100000
  62200000 300600000   3500000  27300000  91500000  46500000  16900000
  21500000  37700000  37500000  27800000  72500000  18300000  30700000
 167400000     32500  72600000  32100000  67900000   1400000  30200000
   5100000   8800000   6600000  66400000 106100000  58100000  18800000
   3800000  52400000     40200  73300000  60300000  82200000 252100000
   4000000  59000000  42300000  72400000   3700000  31600000  36300000
  66100000  47500000  76900000  36700000   4200000   3800000   3600000
 20440

Training Multiple Linear Regression Model

In [8]:
from sklearn.linear_model import LinearRegression

regressor = LinearRegression()
regressor.fit(X_train, Y_train)

# from sklearn.preprocessing import PolynomialFeatures
# poly_regressor = PolynomialFeatures(degree = 2)
# X_poly_train = poly_regressor.fit_transform(X_train)
# X_poly_test = poly_regressor.transform(X_test)

# regressor = LinearRegression()
# regressor.fit(X_poly_train, Y_train)

LinearRegression()

Predicting X_test and Comparing it to Y_Test

In [9]:
Y_pred = regressor.predict(X_test)

np.set_printoptions(suppress = True, formatter = {'float_kind': lambda x: f"{x:.0f}"})
print(np.concatenate((Y_pred.reshape(len(Y_pred),1), Y_test.reshape(len(Y_test), 1)), 1))

[[16064075 6900000]
 [26154160 33800000]
 [149623669 300600000]
 [-256982 20700]
 [124654711 57600000]
 [10117435 21500000]
 [47925236 49800000]
 [180957734 145400000]
 [55413522 141200000]
 [38487118 18100000]
 [52731284 41900000]
 [53168273 34000000]
 [64493811 84600000]
 [18625932 5800000]
 [44276987 84300000]
 [70992935 40800000]
 [105810913 63200000]
 [177364189 299500000]
 [51172406 27000000]
 [15701129 5100000]
 [185187819 354800000]
 [3370552 3100000]
 [56536211 54100000]
 [18649005 9800000]
 [17980094 10000000]
 [11787099 8300000]
 [66959746 77200000]
 [75397912 69600000]
 [60927255 35800000]
 [7477276 4700000]
 [-2526828 51800]
 [18442981 35300000]
 [1515926 1800000]
 [44234371 81000000]
 [21713346 24400000]
 [15561307 8800000]
 [43212881 52600000]
 [19418194 25000000]
 [11664630 28000000]
 [15633023 21900000]
 [47814421 9600000]
 [12787350 18100000]
 [17497330 17300000]
 [57504767 38900000]
 [40638281 35900000]
 [30911272 11100000]
 [1352643 6100000]
 [53168273 30900000]
 [1

In [10]:
from sklearn.metrics import r2_score

r2 = r2_score(Y_test, Y_pred)

print(r2)

0.6581356382249013
