In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor

import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# import the data
df = pd.read_csv('data/clean_data.csv', index_col=0)


In [4]:
df.drop(['user_login', 'title'], axis=1, inplace=True)
df = pd.get_dummies(df)
df.is_mature = df.is_mature.astype(int)

In [5]:
scalerx = StandardScaler()
scalery = StandardScaler()

df[['user_id']] = scalerx.fit_transform(df[['user_id']])
df[['viewer_count']] = scalery.fit_transform(df[['viewer_count']])

Unnamed: 0,user_id,viewer_count,is_mature,game_name_#COMPASS,game_name_7 Days to Die,game_name_A Monster's Expedition,game_name_A Way Out,game_name_A3: Still Alive,game_name_APB Reloaded,game_name_ASMR,...,language_ru,language_sk,language_sv,language_th,language_tl,language_tr,language_uk,language_vi,language_zh,language_zh-hk
0,-0.873201,54.313821,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,-0.961807,25.760399,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,-1.186338,24.288591,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,-0.202855,21.978958,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,-0.969659,20.850917,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10064,1.872309,-0.191446,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10065,-0.761666,-0.191446,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10066,-0.411443,-0.192310,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10067,1.021500,-0.193174,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Split the data into training and testing sets
y = df.viewer_count
df.drop(['viewer_count'], axis=1, inplace=True)
X = df

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=2)

In [613]:
# multiple linear regression
# import statsmodels.api as sm

# X_sm = sm.add_constant(X)
# model = sm.OLS(y, X_sm)
# model.fit().summary()
y.describe()


Unnamed: 0,viewer_count
count,9864.0
mean,236.649027
std,1157.818362
min,0.0
25%,25.0
50%,46.0
75%,115.0
max,63119.0


In [8]:
# Basic regression
lm = LinearRegression()

cv_lm = cross_val_score(lm, X, y, scoring='neg_mean_absolute_error', cv=3)
cv_lm = list(map(lambda x: -x, cv_lm))
scalery.inverse_transform([cv_lm])

array([[4.22929657e+11, 1.97495501e+14, 5.05151101e+13]])

In [9]:
# This is the main baseline
rf = RandomForestRegressor()

cv_rf = cross_val_score(rf, X, y, scoring='neg_mean_absolute_error', cv=3)

In [10]:
print(f"CV_rf: {cv_rf}")
cv_rf_neg = list(map(lambda x: -x, cv_rf))
print(f"negated CV_rf: {cv_rf_neg}")
scalery.inverse_transform([cv_rf_neg])


CV_rf: [-0.522283   -0.20028157 -0.21588515]
negated CV_rf: [0.5222829983505595, 0.20028156867848193, 0.21588515275918907]


array([[841.32721922, 468.52694987, 486.59215024]])

In [169]:
# Now we will use tensorflow to build a model
df = pd.read_csv('data/clean_data.csv', index_col=0)
df.drop(['user_login', 'title'], axis=1, inplace=True)
df = pd.get_dummies(df)
df.is_mature = df.is_mature.astype(int)
df.describe()

Unnamed: 0,user_id,viewer_count,is_mature,game_name_#COMPASS,game_name_7 Days to Die,game_name_A Monster's Expedition,game_name_A Way Out,game_name_A3: Still Alive,game_name_APB Reloaded,game_name_ASMR,...,language_ru,language_sk,language_sv,language_th,language_tl,language_tr,language_uk,language_vi,language_zh,language_zh-hk
count,9864.0,9864.0,9864.0,9864.0,9864.0,9864.0,9864.0,9864.0,9864.0,9864.0,...,9864.0,9864.0,9864.0,9864.0,9864.0,9864.0,9864.0,9864.0,9864.0,9864.0
mean,269218500.0,236.649027,0.342863,0.000101,0.001217,0.000101,0.000203,0.000406,0.000101,0.005272,...,0.089416,0.001419,0.004258,0.016423,0.000811,0.017133,0.000203,0.000203,0.039132,0.002839
std,226907100.0,1157.818362,0.47469,0.010069,0.03486,0.010069,0.014239,0.020134,0.010069,0.072418,...,0.285358,0.037649,0.065117,0.127103,0.028468,0.129774,0.014239,0.014239,0.193919,0.053206
min,11842.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,86805560.0,25.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,165225500.0,46.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,470916400.0,115.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,778267700.0,63119.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [170]:
y = pd.DataFrame()
y['viewer_count'] = df.viewer_count
df.drop(['viewer_count'], axis=1, inplace=True)
X = df

scalery = StandardScaler()
y[['viewer_count']] = scalery.fit_transform(y[['viewer_count']])

train_features, test_features, train_labels, test_labels = train_test_split(X, y, test_size=0.20, random_state=2)


In [171]:
normalizerx = tf.keras.layers.Normalization(axis=-1)
normalizerx.adapt(np.array(X))

In [241]:
model = keras.Sequential([
    normalizerx,
    layers.Dense(256, activation='relu'),
    layers.Dense(32, activation='relu'),
    layers.Dense(1)
])

model.summary()

model.compile(loss='huber_loss',
              optimizer=tf.keras.optimizers.Adam(0.00001), metrics=['mae'])


Model: "sequential_30"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 normalization_6 (Normalizat  (None, 887)              1775      
 ion)                                                            
                                                                 
 dense_103 (Dense)           (None, 256)               227328    
                                                                 
 dense_104 (Dense)           (None, 32)                8224      
                                                                 
 dense_105 (Dense)           (None, 1)                 33        
                                                                 
Total params: 237,360
Trainable params: 235,585
Non-trainable params: 1,775
_________________________________________________________________


In [250]:
%%time
history = model.fit(
    train_features,
    train_labels,
    verbose=1, epochs=30)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
CPU times: user 40.3 s, sys: 6.92 s, total: 47.2 s
Wall time: 21.7 s


In [251]:
scalery.inverse_transform([model.evaluate(test_features, test_labels,batch_size=1,verbose=1)])



array([[442.81212965, 606.86022233]])

In [252]:
n = 0
i = 0
while n < 100:
    if (scalery.inverse_transform([[test_labels.viewer_count.iloc[i]]])[0][0] < 500):
        n+=1
        print(f'i={i} {scalery.inverse_transform([[test_labels.viewer_count.iloc[i]]])}', end="")
        print(scalery.inverse_transform(model.predict(test_features.iloc[i].values.reshape(1, -1))))
    i+=1

i=0 [[55.]][[-15.629248]]
i=1 [[72.]][[39.25477]]
i=2 [[21.]][[237.7965]]
i=3 [[206.]][[107.18319]]
i=4 [[43.]][[109.27211]]
i=5 [[374.]][[-2597.9365]]
i=6 [[46.]][[357.28052]]
i=7 [[18.]][[92.92741]]
i=8 [[26.]][[247.63014]]
i=9 [[15.]][[149.86255]]
i=10 [[31.]][[90.81109]]
i=12 [[60.]][[102.88767]]
i=13 [[29.]][[427.25644]]
i=14 [[27.]][[439.2869]]
i=15 [[39.]][[51.540016]]
i=16 [[16.]][[162.24051]]
i=17 [[21.]][[163.36322]]
i=18 [[76.]][[66.68264]]
i=19 [[60.]][[192.76788]]
i=20 [[30.]][[111.232574]]
i=21 [[48.]][[96.340385]]
i=22 [[21.]][[262.48785]]
i=23 [[31.]][[483.12997]]
i=25 [[35.]][[-30.84545]]
i=26 [[115.]][[83.83787]]
i=27 [[20.]][[279.49612]]
i=28 [[29.]][[58.018562]]
i=29 [[20.]][[94.74334]]
i=30 [[29.]][[22.58616]]
i=31 [[23.]][[-125.83425]]
i=32 [[16.]][[263.56424]]
i=33 [[33.]][[33.899773]]
i=34 [[50.]][[375.46954]]
i=35 [[21.]][[57.338554]]
i=36 [[39.]][[-2520.0771]]
i=37 [[16.]][[-4.0494137]]
i=38 [[206.]][[271.9367]]
i=39 [[98.]][[204.85777]]
i=40 [[191.]][[172.430

In [246]:
model.save('models/model.h5')