# My project is about YouTube music videos, and here I've been trying to build a model that predicts likes and views 

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression, ElasticNetCV, MultiTaskElasticNetCV
import seaborn as sns
import category_encoders as ce
from scipy.stats import boxcox, skew
from sklearn.impute import SimpleImputer
from scipy.special import inv_boxcox  
from sklearn.multioutput import MultiOutputRegressor



In [6]:
# Location of the raw CSV export. This is a machine-specific absolute path,
# so it has to be adjusted when the notebook runs on another computer.
DATA_PATH = r"C:\Users\Dosya\Downloads\spotifyMusic\Spotify_Youtube.csv"
data = pd.read_csv(DATA_PATH)



### Dropping unlicensed and unofficial music videos

In [7]:
# Keep only rows flagged as both licensed and official. Rows where either
# flag is False or missing do not compare equal to True and are dropped.
is_licensed = data['Licensed'] == True
is_official = data['official_video'] == True
data = data[is_licensed & is_official]

### Separating categorical and numerical columns so that each type can be processed on its own later

In [8]:
# Split the columns by dtype so each group can be pre-processed separately.
#
# The dtype predicates from pandas are used instead of `dtype in [int, float]`:
# comparing a dtype with the Python built-ins only matches the platform-default
# widths (e.g. int64 is not matched where the default integer is 32-bit) and
# misses other numeric dtypes such as float32 or the nullable integer types.
column_dtypes = data.dtypes

# Text-like columns (stored with the generic `object` dtype).
categorical = [
    col for col, col_dtype in column_dtypes.items()
    if pd.api.types.is_object_dtype(col_dtype)
]

# Numeric feature columns. Booleans are excluded (they were not selected by
# the int/float comparison either) and the two prediction targets are left out.
numerical = [
    col for col, col_dtype in column_dtypes.items()
    if pd.api.types.is_numeric_dtype(col_dtype)
    and not pd.api.types.is_bool_dtype(col_dtype)
    and col not in ['Views', 'Likes']
]


### Creating the train/test split first, so that we can avoid data leakage

In [9]:
# Fixed seed so the split (and the later cross-validation) is reproducible.
randoming = 312
y_col = ['Views', 'Likes']

# Targets are taken out of the feature frame before splitting.
y = data[y_col]
X = data.drop(columns=y_col)

# 70 % of the rows go to training, 30 % to testing.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=randoming,
)

# Slice each split into its categorical and numerical parts.
X_train_cat, X_train_num = X_train[categorical], X_train[numerical]
X_test_cat, X_test_num = X_test[categorical], X_test[numerical]

In [10]:
# (rows, columns) of the frame after the licensed/official filter.
data.shape

(14140, 28)

### Since we have a relatively big data set, we cannot use OneHotEncoder: it creates one new column per distinct category value, and that takes up a lot of space

In [7]:
# OHE = OneHotEncoder(drop="first", handle_unknown = 'ignore', sparse_output = False)
# encoded_data = OHE.fit_transform(data[categorical])

# encoded_cols_names = OHE.get_feature_names_out(categorical)

#encoded_df = pd.DataFrame(encoded_data,
                          #columns = encoded_cols_names,
                           # index = data.index)

#final_frame = pd.concat([data.drop(columns = categorical), encoded_df], axis = 1)


### So I used HashingEncoder with 1000 bins instead. Different values can end up in the same bin (hash collisions), so the encoding is lossy and the model is not perfect from this point on

In [11]:
# Hash the categorical columns into a fixed number of output columns.
my_encoder = ce.HashingEncoder(
    n_components=1000,
    cols=categorical,
)

# Fit on the training split only; the test split is transformed with the
# already-fitted encoder.
X_categorical_train_encoded = my_encoder.fit_transform(X_train_cat)
X_categorical_test_encoded = my_encoder.transform(X_test_cat)




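### I have to deal with missing values, so the block below imputes them; it then adds polynomial features, scales everything and stacks the categorical and numerical parts together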

In [13]:
# 1) Fill missing numerical values with the per-column mean learned on train.
num_imputer = SimpleImputer(strategy='mean')
X_train_num_imputed = num_imputer.fit_transform(X_train_num)
X_test_num_imputed = num_imputer.transform(X_test_num)

# 2) Expand the numerical columns with all degree-2 terms (plus a bias column).
poly = PolynomialFeatures(degree=2, include_bias=True)
X_numerical_train_poly = poly.fit_transform(X_train_num_imputed)
X_numerical_test_poly = poly.transform(X_test_num_imputed)

# 3) Standardise the polynomial features. The scaler is fitted once, on the
#    training split; an identical second fit of the same scaler was redundant.
scaler = StandardScaler()
X_numerical_train_s = scaler.fit_transform(X_numerical_train_poly)
X_numerical_test_s = scaler.transform(X_numerical_test_poly)

# 4) Standardise the hashed categorical columns with their own scaler.
scaler_cat_encoded = StandardScaler()
X_categorical_train_s = scaler_cat_encoded.fit_transform(X_categorical_train_encoded)
X_categorical_test_s = scaler_cat_encoded.transform(X_categorical_test_encoded)

# 5) Final design matrices: categorical block first, numerical block second.
X_train_final = np.hstack([X_categorical_train_s, X_numerical_train_s])
X_test_final = np.hstack([X_categorical_test_s, X_numerical_test_s])

print("X_train_final shape:", X_train_final.shape)
print("X_test_final shape: ", X_test_final.shape)


X_train_final shape: (9898, 1120)
X_test_final shape:  (4242, 1120)


### Normalizing the targets (Views and Likes) with a Box-Cox transform


In [14]:
def safe_boxcox_series(train_series, test_series, eps=1e-6):
    """Box-Cox transform a train/test pair using parameters learned on train only.

    Missing values in both series are replaced by the median of the training
    series. If the training data contains values <= 0, both series are shifted
    by ``abs(min(train)) + eps`` so that the training data becomes strictly
    positive. The Box-Cox lambda is estimated on the (shifted) training data
    and then re-used for the test data.

    Parameters
    ----------
    train_series : pandas.Series
        Training values; used to derive the fill value, the shift and lambda.
    test_series : pandas.Series
        Test values; transformed with the parameters derived from train.
    eps : float, default 1e-6
        Small positive offset added on top of ``abs(min(train))`` when a shift
        is needed.

    Returns
    -------
    train_bc : numpy.ndarray
        Transformed training values.
    test_bc : numpy.ndarray
        Transformed test values.
    lam : float
        Box-Cox lambda estimated on the training data.
    shift : float
        Constant that was added to both series before the transform
        (0.0 when no shift was necessary).
    index : pandas.Index
        Index of the training series.

    Raises
    ------
    ValueError
        If the training series has no non-missing values, or if it still
        contains values <= 0 after shifting.
    """
    train_filled = train_series.astype(float)

    # The fill value is computed once, from the raw training data only, and is
    # re-used for the test data so nothing leaks from test into train.
    fill_value = train_filled.median()
    if np.isnan(fill_value):
        raise ValueError("Train series has no non-missing values.")

    train_filled = train_filled.fillna(fill_value)
    test_filled = test_series.astype(float).fillna(fill_value)

    # Box-Cox needs strictly positive input; shift only when train requires it.
    shift = 0.0
    train_min = train_filled.min()
    if train_min <= 0:
        shift = abs(train_min) + eps

    train_shifted = train_filled + shift
    test_shifted = test_filled + shift

    if (train_shifted <= 0).any():
        raise ValueError("Shift failed: still <= 0 values in train.")

    # The shift is derived from train, so a test value below the training
    # minimum can still be <= 0 here. With a fixed lambda such values are not
    # rejected but silently become NaN/-inf, so they are raised to the
    # smallest (shifted) training value instead.
    floor = train_shifted.min()
    test_shifted = test_shifted.where(test_shifted > 0, floor)

    train_bc, lam = boxcox(train_shifted.to_numpy())
    test_bc = boxcox(test_shifted.to_numpy(), lam)

    return train_bc, test_bc, lam, shift, train_series.index


# Box-Cox transform the Views target; lambda and shift are kept so that
# predictions can be mapped back to the original scale later.
(y_train_views_bc,
 y_test_views_bc,
 lambda_views,
 shift_views,
 idx_train_views) = safe_boxcox_series(y_train['Views'], y_test['Views'])
print("Skew train views:", skew(y_train_views_bc))
print("Skew test views: ", skew(y_test_views_bc))


# Same procedure for the Likes target.
(y_train_likes_bc,
 y_test_likes_bc,
 lambda_likes,
 shift_likes,
 idx_train_likes) = safe_boxcox_series(y_train['Likes'], y_test['Likes'])
print("Skew train likes:", skew(y_train_likes_bc))
print("Skew test likes: ", skew(y_test_likes_bc))


Skew train views: -0.017174084198226483
Skew test views:  -0.026036710596635242
Skew train likes: 0.01960559683903218
Skew test likes:  0.04058709751352874


In [55]:
# Number of rows in the Views column of the filtered frame.
data['Views'].shape

(14140,)

In [15]:
# Repeat of the Box-Cox skew check using generic variable names.
# The names y_train_bc / y_test_bc / lam are re-used for both targets, so
# after this cell they hold the Likes results; `shift` keeps the Views shift
# because the Likes call stores its shift in `shift_`.
y_train_bc, y_test_bc, lam, shift, idx_train_views = safe_boxcox_series(y_train['Views'], y_test['Views'])
print("Skew train views:", skew(y_train_bc))
# Report the test array produced by the call above rather than a variable
# left over from an earlier cell.
print("Skew test views: ", skew(y_test_bc))


y_train_bc, y_test_bc, lam, shift_, idx_train_likes = safe_boxcox_series(y_train['Likes'], y_test['Likes'])
print("Skew train likes:", skew(y_train_bc))
print("Skew test likes: ", skew(y_test_bc))



Skew train views: -0.017174084198226483
Skew test views:  -0.026036710596635242
Skew train likes: 0.01960559683903218
Skew test likes:  0.04058709751352874


In [54]:
# Number of rows in the Likes column of the filtered frame.
data['Likes'].shape

(14140,)

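### Aligning the combined (categorical + numerical) feature matrix with the targets: training rows with a missing Views or Likes value are dropped, then the features are imputed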

In [16]:
# Wrap the stacked feature matrices in DataFrames (positional 0..n-1 index).
X_train_final_df = pd.DataFrame(X_train_final)
X_test_final_df = pd.DataFrame(X_test_final)

# Positional view of the training frame and the matching row positions.
X_train_reset = X_train.reset_index(drop=True)
train_pos = np.arange(len(X_train_reset))

# True for training rows where both targets are present. The index is reset
# so the mask lines up positionally with the feature matrix.
mask_valid = (y_train['Views'].notna() & y_train['Likes'].notna()).reset_index(drop=True)
rows_to_keep = mask_valid.values

# Apply the same row filter to the features and to both transformed targets.
X_train_proc = X_train_final_df.loc[rows_to_keep].to_numpy()
y_views_proc = y_train_views_bc[rows_to_keep]
y_likes_proc = y_train_likes_bc[rows_to_keep]

# Note: the two target shapes printed here are those of the unfiltered arrays.
print("X_train_proc shape:", X_train_proc.shape)
print("y_train_views_bc shape:", y_train_views_bc.shape)
print("y_train_likes_bc shape:", y_train_likes_bc.shape)

# Mean-impute any remaining gaps in the features; statistics come from train.
x_imputer = SimpleImputer(strategy="mean")
X_train_imp = x_imputer.fit_transform(X_train_proc)
X_test_imp = x_imputer.transform(X_test_final_df.to_numpy())



X_train_proc shape: (9879, 1120)
y_train_views_bc shape: (9898,)
y_train_likes_bc shape: (9898,)


In [56]:
# NOTE(review): `X_mixed_dfs` is not defined in any visible cell of this
# notebook, so this line presumably relies on leftover kernel state and would
# raise NameError on a fresh "Restart & Run All" - confirm and remove/replace.
X_mixed_dfs.shape

(15266, 1680)

In [17]:
from sklearn.metrics import (mean_squared_log_error, 
                            r2_score,
                            mean_absolute_error,
                            mean_squared_error)


In [None]:
# Hyper-parameter grid and cross-validation settings shared by both estimators.
_enet_cv_settings = dict(
    alphas=[0.01, 0.1, 0.5, 1, 5, 10],
    l1_ratio=[0.1, 0.5, 0.7, 0.9, 1],
    cv=6,
    random_state=randoming,
    max_iter=5000,
)

# Single-target elastic net with built-in cross-validation.
model = ElasticNetCV(**_enet_cv_settings)

# Multi-target variant, used to predict Views and Likes jointly.
# (MultiTaskElasticNetCV is already imported in the notebook's import cell.)
model_multi = MultiTaskElasticNetCV(**_enet_cv_settings)




### Since we have a lot of features, it takes some time for the model to train and predict


In [None]:
# Two-column target matrix: column 0 = transformed Views, column 1 = transformed Likes.
y_train_bc = np.column_stack((y_views_proc, y_likes_proc))

print("Final shapes -> X:", X_train_imp.shape, " y:", y_train_bc.shape)

# Fit the joint model, then predict both targets for the test rows.
model_multi.fit(X_train_imp, y_train_bc)

y_pred_bc = model_multi.predict(X_test_imp)
print("y_pred_bc shape:", y_pred_bc.shape)

# Map predictions back to the original scale: undo Box-Cox, then undo the shift.
pred_views_bc = y_pred_bc[:, 0]
pred_likes_bc = y_pred_bc[:, 1]
y_pred_views_orig = inv_boxcox(pred_views_bc, lambda_views) - shift_views
y_pred_likes_orig = inv_boxcox(pred_likes_bc, lambda_likes) - shift_likes

# Show the first five predictions as whole numbers.
for i in range(5):
    views_i = int(y_pred_views_orig[i])
    likes_i = int(y_pred_likes_orig[i])
    print(f"Predicted (Views, Likes) #{i}:", views_i, likes_i)
    

In [None]:

# Evaluate the predictions on the original (back-transformed) scale.
#
# The sklearn metric functions reject NaN input. Rows with a missing target
# were removed from the training data only, so y_test may still contain
# missing Views/Likes, and the inverse Box-Cox can yield NaN for predictions
# outside its domain. Only rows where both the true value and the prediction
# are finite are scored.
views_true = y_test['Views'].to_numpy(dtype=float)
likes_true = y_test['Likes'].to_numpy(dtype=float)
views_pred = np.asarray(y_pred_views_orig, dtype=float)
likes_pred = np.asarray(y_pred_likes_orig, dtype=float)

views_ok = np.isfinite(views_true) & np.isfinite(views_pred)
likes_ok = np.isfinite(likes_true) & np.isfinite(likes_pred)
print(f"Rows scored -> views: {views_ok.sum()}/{views_ok.size}, "
      f"likes: {likes_ok.sum()}/{likes_ok.size}")

r2_score_views = r2_score(views_true[views_ok], views_pred[views_ok])
r2_score_likes = r2_score(likes_true[likes_ok], likes_pred[likes_ok])

print(f'The quality of views prediction: {r2_score_views}')
print(f'The quality of likes prediction: {r2_score_likes}')

mae_views = mean_absolute_error(views_true[views_ok], views_pred[views_ok])
# RMSE is taken as the square root of the MSE; the `squared=False` keyword is
# deprecated and no longer accepted by recent scikit-learn releases.
rmse_views = np.sqrt(mean_squared_error(views_true[views_ok], views_pred[views_ok]))

print(f"MAE Views: {mae_views}, RMSE Views: {rmse_views}")


# One of the last two cells contains a mistake. I did what I learned during the 'IBM Supervised ML' course, but even after careful debugging I still have not understood where I made the mistake.

## So, let's recap: It was very interesting to test my skills with a "Pocket project". 

### Supervised ML practice 1.

### The code in GitHub might be slightly different
