In [1]:
%matplotlib inline
import pandas as pd 
import numpy as np 
from matplotlib.pyplot import hist
import seaborn as sns
import matplotlib.pyplot as plt

import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers

print(tf.__version__)

import tensorflow_docs as tfdocs
import tensorflow_docs.plots
import tensorflow_docs.modeling

2.0.0


In [2]:
# Report how many GPU devices TensorFlow can see on this machine.
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

Num GPUs Available:  8


In [3]:
def norm(x):
    """Standardize ``x`` with the module-level ``train_stats`` table.

    Subtracts ``train_stats['mean']`` and divides by ``train_stats['std']``.
    ``train_stats`` is read from the notebook's global namespace at call time,
    so it must be defined before this function is called.
    """
    centered = x - train_stats['mean']
    return centered / train_stats['std']

In [4]:
def read_as_number(df_column):
    """Convert a column of human-formatted numbers (``'1,234'``, ``'5K'``) to floats.

    Thousands separators (``,``) are removed and the upper-case magnitude
    suffixes ``K``, ``M`` and ``G`` are rewritten as the exponents ``e3``,
    ``e6`` and ``e9`` before the text is parsed as ``float``.

    Parameters
    ----------
    df_column : pandas.Series
        Column to convert.

    Returns
    -------
    pandas.Series
        A float column when the input holds text (``object`` dtype or one of
        the pandas string dtypes); otherwise the input is returned unchanged.

    Raises
    ------
    ValueError
        If a value is still not a valid float literal after the substitutions
        (for example an empty string).
    """
    # Checking only ``dtype == 'O'`` misses the dedicated pandas string dtypes,
    # which would then be returned unconverted.
    holds_text = (pd.api.types.is_object_dtype(df_column)
                  or pd.api.types.is_string_dtype(df_column))
    if not holds_text:
        return df_column

    cleaned = df_column
    substitutions = ((',', ''), ('K', 'e3'), ('M', 'e6'), ('G', 'e9'))
    for token, replacement in substitutions:
        # regex=False keeps these literal replacements regardless of the
        # default that the installed pandas version uses for ``regex``.
        cleaned = cleaned.str.replace(token, replacement, regex=False)
    return cleaned.astype(float)

In [5]:
def normalize_dtypes(df):
    """Coerce the raw CSV columns of ``df`` to usable dtypes.

    The frame is modified in place and also returned:

    * ``Timestamp`` is parsed with ``pd.to_datetime``.
    * ``Likes`` and ``Popularity`` are passed through ``read_as_number``.
    * ``Time_epoch`` is added as the 64-bit integer representation of
      ``Timestamp`` (nanoseconds since the Unix epoch for the usual
      ``datetime64[ns]`` dtype).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``Timestamp``, ``Likes`` and ``Popularity`` columns.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after modification.
    """
    df['Timestamp'] = pd.to_datetime(df['Timestamp'])
    df['Likes'] = read_as_number(df['Likes'])
    df['Popularity'] = read_as_number(df['Popularity'])
    # Ask for int64 explicitly: plain ``int`` maps to a 32-bit integer on some
    # platforms, and datetimes cannot be converted to that.
    df['Time_epoch'] = df['Timestamp'].astype('int64')
    return df

In [6]:
def add_columns(df, col_list):
    """Ensure ``df`` has a ``genre_<value>`` column for every value in ``col_list``.

    Any missing ``genre_<value>`` column is created and filled with 0.
    Existing columns are left untouched. ``df`` is modified in place and
    also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to complete.
    col_list : iterable
        Genre values (without the ``genre_`` prefix).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after modification.
    """
    for col in col_list:
        indicator_name = 'genre_' + str(col)
        if indicator_name not in df.columns:
            # Create the column under the same prefixed name that was just
            # checked; writing to ``df[col]`` would add an unrelated
            # raw-genre column and leave the indicator missing.
            df[indicator_name] = 0
    return df

In [7]:
def one_hot_econde(df):
    """Return a new frame: ``df`` plus one ``genre_<value>`` indicator column
    for each distinct value in ``df['Genre']``.

    The original ``Genre`` column is kept and ``df`` itself is not modified.
    """
    genre_dummies = pd.get_dummies(df['Genre'], prefix='genre')
    pieces = [df, genre_dummies]
    return pd.concat(pieces, axis=1)

In [8]:
def normalize(y, train_stats, cols=None):
    """Return a copy of ``y`` with selected columns standardized.

    Each selected column becomes ``(value - mean) / std``, using the ``mean``
    and ``std`` entries of that column's row in ``train_stats``.

    Each standardized column is removed from its original position and
    re-appended at the end of the frame, in the order given by ``cols``.

    Parameters
    ----------
    y : pandas.DataFrame
        Frame to normalize; it is not modified.
    train_stats : pandas.DataFrame
        Statistics indexed by column name, with ``'mean'`` and ``'std'``
        columns.
    cols : sequence of str, optional
        Columns to standardize. Defaults to
        ``['Likes', 'Comments', 'Popularity', 'Followers']``.

    Returns
    -------
    pandas.DataFrame
        The normalized copy.
    """
    if cols is None:
        cols = ['Likes', 'Comments', 'Popularity', 'Followers']

    x = y.copy()
    for col in cols:
        mean = train_stats.loc[col, 'mean']
        std = train_stats.loc[col, 'std']
        # pop + assign moves the column to the end of the frame.
        x[col] = (x.pop(col) - mean) / std
    return x

In [9]:
# Building the model

def build_model(feature_names):
    """Build and compile the regression network.

    The model is a ``keras.Sequential`` stack of two 64-unit ReLU ``Dense``
    layers followed by a single-unit ``Dense`` output. The input width is
    ``len(feature_names)``. It is compiled with MSE loss, an RMSprop
    optimizer constructed with 0.001, and MAE/MSE metrics.

    Parameters
    ----------
    feature_names : sequence
        Names of the input features; only its length is used.

    Returns
    -------
    The compiled Keras model.
    """
    n_inputs = len(feature_names)

    model = keras.Sequential()
    model.add(layers.Dense(64, activation='relu', input_shape=[n_inputs]))
    model.add(layers.Dense(64, activation='relu'))
    model.add(layers.Dense(1))

    model.compile(
        loss='mse',
        optimizer=tf.keras.optimizers.RMSprop(0.001),
        metrics=['mae', 'mse'],
    )
    return model

In [10]:
# Relative paths of the training and test CSV files.
train_file = './data/Data_Train.csv'
test_file = './data/Data_Test.csv'

In [11]:
# Read the training and test sets from CSV.
# na_filter=False switches off pandas' missing-value detection, so empty
# fields stay as empty strings instead of becoming NaN.

train_orig = pd.read_csv(train_file, na_filter=False) 
test = pd.read_csv(test_file, na_filter=False) 

In [12]:
# Normalize data types: normalize_dtypes parses Timestamp, converts
# Likes/Popularity to numbers and adds the integer Time_epoch column.

train_orig = normalize_dtypes(train_orig)
test = normalize_dtypes(test)

In [13]:
# Drop outliers: keep only the training rows whose Views do not exceed 1e6.
# NOTE(review): the 1e6 cut-off is a hard-coded choice; its justification is
# not visible in this notebook.

outliers_msk = train_orig['Views'] > 1e6
train_wo_outliers = train_orig[~outliers_msk]

In [14]:
# Summary statistics, one row per numeric column.
# The training statistics leave out the target column 'Views'.

train_stats = (
    train_wo_outliers
    .describe()
    .drop(columns=['Views'])
    .transpose()
)

test_stats = test.describe().transpose()
test_stats

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Unique_ID,19615.0,748706.3,481497.1,8.0,322402.5,662016.0,1189599.0,1570002.0
Comments,19615.0,118.854,872.116,0.0,1.0,12.0,60.0,87128.0
Likes,19615.0,8962.272,51971.53,0.0,136.0,669.0,2826.0,2150000.0
Popularity,19615.0,959.0376,5044.125,0.0,14.0,88.0,400.0,186000.0
Followers,19615.0,483304.5,1169496.0,1.0,17784.0,90704.0,393655.0,9789123.0
Time_epoch,19615.0,1.464741e+18,8.938279e+16,9.46944e+16,1.429663e+18,1.484265e+18,1.524871e+18,1.551061e+18


In [15]:
# One-hot encode Genre in both frames.
# unique_genres holds every distinct Genre value seen in the filtered training
# set or the test set; it is passed to add_columns, which is meant to fill in
# genre columns that a frame lacks.

unique_genres = pd.concat([train_wo_outliers['Genre'], test['Genre']], sort=False).unique()

train_wo_outliers = one_hot_econde(train_wo_outliers)
train_wo_outliers = add_columns(train_wo_outliers, unique_genres)

test = one_hot_econde(test)
test = add_columns(test, unique_genres)

In [16]:
# Manual train/validation split (currently disabled): the whole
# outlier-filtered set is used as `train`. The commented-out lines are the
# earlier random 75/25 split.

# train_msk = np.random.rand(len(train_wo_outliers)) < 0.75
# train = train_wo_outliers[train_msk]
train = train_wo_outliers
# val = train_wo_outliers[~train_msk]

In [17]:
# Normalize features.
# Both frames are scaled with the *training* statistics, so the test set is
# transformed the same way as the data the model is fitted on.

norm_train = normalize(train, train_stats)
# norm_val = normalize(val, train_stats)
norm_test = normalize(test, train_stats)

In [18]:
# Extract labels.
# pop() removes 'Views' from norm_train in place, so running this cell a
# second time without re-creating norm_train raises KeyError.

train_labels = norm_train.pop('Views')
# val_labels = norm_val.pop('Views')

In [19]:
# Choose the model inputs: every column of norm_train from position 7 onward.
# NOTE(review): presumably the first 7 columns are the identifier, text and
# timestamp columns; confirm against the CSV schema, since this selection is
# positional.

feature_names = norm_train.columns[7:].tolist()
len(feature_names)

25

In [20]:
# Build the model; its input width is the number of selected features.

model = build_model(feature_names)

In [21]:
# Print the layer-by-layer structure and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 64)                1664      
_________________________________________________________________
dense_1 (Dense)              (None, 64)                4160      
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 65        
Total params: 5,889
Trainable params: 5,889
Non-trainable params: 0
_________________________________________________________________


In [22]:
# Trying the model out: run the untrained model on the first 10 training rows
# to check that the input columns are accepted and one value per row comes out.
example_batch = norm_train[feature_names][:10]
example_result = model.predict(example_batch)
example_result

W0104 22:19:30.658853 139952386103040 training.py:504] Falling back from v2 loop because of error: Failed to find data adapter that can handle input: <class 'pandas.core.frame.DataFrame'>, <class 'NoneType'>


array([[-0.16944888],
       [-0.163258  ],
       [-0.16359307],
       [ 0.10437435],
       [-0.08274501],
       [ 0.05082845],
       [-0.1600997 ],
       [ 0.38741535],
       [-0.0974264 ],
       [-0.06971568]], dtype=float32)

In [None]:
# Training model
# Fits for EPOCHS epochs with a fraction 0.2 of the training data held out for
# validation (validation_split). verbose=0 silences the default progress bar;
# the EpochDots callback reports progress instead.
# NOTE(review): norm_features_test is prepared here but not used in this cell.

EPOCHS = 1000

norm_features_train = norm_train[feature_names]
# norm_features_val = norm_val[feature_names]
norm_features_test = norm_test[feature_names]


history = model.fit(
  norm_features_train, train_labels,
  epochs=EPOCHS, validation_split = 0.2, verbose=0,
  callbacks=[tfdocs.modeling.EpochDots()])

W0104 22:20:26.893602 139952386103040 training.py:504] Falling back from v2 loop because of error: Failed to find data adapter that can handle input: <class 'pandas.core.frame.DataFrame'>, <class 'NoneType'>



Epoch: 0, loss:31685050718.2821,  mae:82692.9453,  mse:31685064704.0000,  val_loss:27267054674.6588,  val_mae:73904.4219,  val_mse:27267072000.0000,  
.........................................................................................
Epoch: 100, loss:3628802751.3015,  mae:25584.5801,  mse:3628801280.0000,  val_loss:3525323119.7334,  val_mae:25740.3926,  val_mse:3525323264.0000,  
.........................

In [None]:
# Display the object returned by model.fit.
history

In [None]:
# Number of rows in the training frame.
len(train)

In [None]:
# Number of rows in the test frame.
len(test)

In [None]:
# Column dtypes of the training frame.
train.dtypes

In [None]:
# Column dtypes of the test frame.
test.dtypes

In [None]:
# Count the rows with and without a Song_Name value.
# Both counts form one tuple (present, missing) so the cell displays both;
# as two separate statements only the last one would be shown.
# NOTE(review): the CSVs are read with na_filter=False, so blank names are
# empty strings, which isna() does not flag.
len(train[~train.Song_Name.isna()]), len(train[train.Song_Name.isna()])

In [None]:
# Histogram of the target column.
train.hist(column='Views')

In [None]:
# Distinct genre values present in the training frame.
train.Genre.unique()

In [None]:
# Overlay the distributions of Views and Popularity on one figure.
# The legend labels name the plotted columns (the earlier "Compact"/"SUV"
# labels did not describe this data).
kwargs = dict(hist_kws={'alpha':.6}, kde_kws={'linewidth':2})
plt.figure(figsize=(10,7), dpi= 80)
sns.distplot(train.Views, color="dodgerblue", label="Views", **kwargs)
sns.distplot(train.Popularity, color="orange", label="Popularity", **kwargs)
plt.legend()

In [None]:
# Distribution of Popularity on its own.
# The legend label names the plotted column (the earlier "Compact" label did
# not describe this data).
kwargs = dict(hist_kws={'alpha':.6}, kde_kws={'linewidth':2})
plt.figure(figsize=(10,7), dpi= 80)
sns.distplot(train.Popularity, color="dodgerblue", label="Popularity", **kwargs)
plt.legend()

In [None]:
# Side-by-side histograms of Views (left) and Popularity (right) for the
# outlier-filtered training data.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))
ax1.hist(train_wo_outliers.Views)
ax2.hist(train_wo_outliers.Popularity)
plt.show()

In [None]:
# Mean of Likes, read from the training statistics table in a single lookup.
train_stats.loc['Likes', 'mean']

In [None]:
# Pairwise correlations between the target (Views) and the numeric columns.
corr_matrix=train[['Views', 'Comments', 'Popularity', 'Followers', 'Likes', 'Time_epoch']].corr()
corr_matrix

In [None]:
# Show the numeric columns used in the correlation analysis.
train[['Views', 'Comments', 'Popularity', 'Followers', 'Likes', 'Time_epoch']]

In [None]:
# Scatter plot for every pair of numeric columns, with KDE curves on the diagonal.
sns.pairplot(train[['Views', 'Comments', 'Popularity', 'Followers', 'Likes', 'Time_epoch']], diag_kind="kde")

In [None]:
# Heatmap of corr_matrix, which must already be defined when this cell runs.
sns.heatmap(corr_matrix, cmap='PuOr')

In [None]:
# Summary statistics of Likes.
train.Likes.describe()

In [None]:
# Summary statistics of Genre.
train.Genre.describe()

In [None]:
# Summary statistics of the target column Views.
train.Views.describe()

In [None]:
# Summary statistics of Song_Name.
train.Song_Name.describe()

In [None]:
# Two sorted views of the training frame. sort_values returns a new frame and
# `train` itself is left unsorted.
# NOTE(review): only the last expression of a cell is displayed, so the result
# of the first sort below is computed and then discarded.
train.sort_values(by=['Name','Views','Timestamp'])
train.sort_values(by=['Time_epoch', 'Timestamp'])

In [None]:
# Integer representation of Timestamp. int64 is requested explicitly because
# plain `int` maps to a 32-bit integer on some platforms, and datetimes cannot
# be converted to that.
train.Timestamp.astype('int64')

In [None]:
# Views column in ascending order.
train.sort_values(by='Views')['Views']