In [1]:
#Import necessary libraries
import pandas as pd
import numpy as np

#Preprocessing
from sklearn import model_selection,metrics
from sklearn.preprocessing import OrdinalEncoder
from sklearn import preprocessing
from sklearn.model_selection import train_test_split,KFold,StratifiedKFold
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler,OrdinalEncoder,LabelEncoder,RobustScaler
#Model
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error,roc_auc_score
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor, early_stopping, log_evaluation
import lightgbm as lgb
from sklearn.metrics import mean_squared_log_error,mean_squared_error
from catboost import CatBoostRegressor

## **Read Data**

In [2]:
# Load the competition tables: training rows, test rows, and the
# submission template.
train = pd.read_csv("../input/song-popularity-prediction/train.csv")
test = pd.read_csv("../input/song-popularity-prediction/test.csv")
sample = pd.read_csv("../input/song-popularity-prediction/sample_submission.csv")

# Report the (rows, columns) shape of each table on a single line.
print(*(frame.shape for frame in (train, test, sample)))

# Summary statistics, transposed so that each column of `train` is one row.
train.describe().T

(40000, 15) (10000, 14) (10000, 2)


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,40000.0,19999.5,11547.14972,0.0,9999.75,19999.5,29999.25,39999.0
song_duration_ms,35899.0,193165.847572,45822.127679,25658.0,166254.5,186660.0,215116.0,491671.0
acousticness,36008.0,0.276404,0.297928,-0.013551,0.039618,0.140532,0.482499,1.065284
danceability,35974.0,0.570951,0.19001,0.043961,0.42476,0.608234,0.718464,0.957131
energy,36025.0,0.683932,0.212662,-0.001682,0.539276,0.704453,0.870503,1.039741
instrumentalness,36015.0,0.036527,0.150024,-0.004398,0.000941,0.001974,0.003225,1.075415
key,35935.0,5.042605,3.372728,0.0,2.0,5.0,8.0,11.0
liveness,35914.0,0.198514,0.15167,0.027843,0.111796,0.135945,0.212842,1.065298
loudness,36043.0,-7.407596,3.877198,-32.117911,-9.578139,-6.345413,-4.620711,-0.877346
audio_mode,40000.0,0.32115,0.466924,0.0,0.0,0.0,1.0,1.0


In [3]:
# Print column dtypes and non-null counts to see which columns have missing values.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                40000 non-null  int64  
 1   song_duration_ms  35899 non-null  float64
 2   acousticness      36008 non-null  float64
 3   danceability      35974 non-null  float64
 4   energy            36025 non-null  float64
 5   instrumentalness  36015 non-null  float64
 6   key               35935 non-null  float64
 7   liveness          35914 non-null  float64
 8   loudness          36043 non-null  float64
 9   audio_mode        40000 non-null  int64  
 10  speechiness       40000 non-null  float64
 11  tempo             40000 non-null  float64
 12  time_signature    40000 non-null  int64  
 13  audio_valence     40000 non-null  float64
 14  song_popularity   40000 non-null  int64  
dtypes: float64(11), int64(4)
memory usage: 4.6 MB


## **Apply SimpleImputer**

In [4]:
from sklearn.impute import SimpleImputer

# Columns to impute: everything the test set provides except the identifier.
# Deriving the list from `test` keeps the target (and, on a re-run of this
# cell, the `kfold` column) out of the imputer, so both stay untouched.
impute_cols = [c for c in test.columns if c != "id"]

# Learn the per-column medians from the training data only, then reuse those
# same medians for the test set so both frames are filled consistently.
imputer = SimpleImputer(strategy='median')
train_im = train.copy()
test_im = test.copy()
train_im[impute_cols] = imputer.fit_transform(train[impute_cols])
test_im[impute_cols] = imputer.transform(test[impute_cols])

# Downstream cells expect the imputed frames under the original names.
train = train_im
test = test_im

# Add the fold-assignment column; -1 marks "not assigned yet".
train['kfold'] = -1
# Split the rows into 10 shuffled folds (fixed seed for reproducibility) and
# record, for every row, the fold in which it is used for validation.
kfold = KFold(n_splits=10, shuffle=True, random_state=3333)
for fold, (tr_i, va_i) in enumerate(kfold.split(X=train)):
    # KFold yields positional indices; map them to index labels so the
    # assignment is correct even if `train` does not have a default index.
    train.loc[train.index[va_i], 'kfold'] = fold

# Each fold should hold the same number of rows.
print(train.kfold.value_counts())
# Persist the imputed data together with the fold assignment.
train.to_csv("folds_10.csv", index=False)
print("successfully folds")

4    4000
9    4000
5    4000
1    4000
8    4000
7    4000
6    4000
0    4000
2    4000
3    4000
Name: kfold, dtype: int64
successfully folds


In [5]:
# Count the remaining missing values per column after imputation.
train.isna().sum()

id                  0
song_duration_ms    0
acousticness        0
danceability        0
energy              0
instrumentalness    0
key                 0
liveness            0
loudness            0
audio_mode          0
speechiness         0
tempo               0
time_signature      0
audio_valence       0
song_popularity     0
kfold               0
dtype: int64

## **Feature Separation**

In [6]:
# Reload the imputed training data together with its fold assignment.
df = pd.read_csv("./folds_10.csv")

# Every column except the identifier, the fold label and the target is used
# as a model input; the original column order is preserved.
non_feature_cols = ("id", "kfold", "song_popularity")
features = [col for col in df.columns if col not in non_feature_cols]

# Restrict the test frame to exactly those input columns.
test = test.loc[:, features]

## **Build_Model**

In [7]:
# Cross-validated training: for every fold, fit LightGBM on the remaining
# folds, score the held-out fold with ROC AUC and predict the test set.
prediction = []  # one array of test-set predictions per fold
score = []       # one validation ROC AUC per fold

# LightGBM parameters for the binary target. Built once because they are the
# same for every fold.
# The former `subsample` and `colsample_bytree` entries were dropped: they are
# aliases of `bagging_fraction` and `feature_fraction`, and LightGBM ignores
# an alias when the primary name is also given, so they never had any effect.
params_lgb = {
    "task": "train",
    "boosting_type": "gbdt",
    "objective": "binary",
    'learning_rate': 0.001635,
    "max_depth": 3,
    "feature_fraction": 0.2256038826485174,
    "bagging_fraction": 0.7705303688019942,
    "min_child_samples": 290,
    "reg_alpha": 14.68267919457715,
    "reg_lambda": 66.156,
    "max_bin": 772,
    "min_data_per_group": 177,
    "bagging_freq": 1,
    "cat_smooth": 96,
    "cat_l2": 17,
    "verbosity": -1,
    'random_state': 2022,
    # Upper bound on boosting rounds; early stopping usually ends sooner.
    'n_estimators': 5123,
}

# Fold labels are expected to be 0..k-1 in the `kfold` column.
n_folds = df.kfold.nunique()

for fold in range(n_folds):
    train_part = df[df.kfold != fold].reset_index(drop=True)
    valid_part = df[df.kfold == fold].reset_index(drop=True)

    ytrain = train_part.song_popularity
    yvalid = valid_part.song_popularity

    # Explicit copies: the scaled values are written into independent frames
    # rather than into slices of `df` / `test` (avoids SettingWithCopyWarning).
    xtrain = train_part[features].copy()
    xvalid = valid_part[features].copy()
    xtest = test.copy()

    # Fit the scaler on the training folds only and apply the same transform
    # to the validation fold and the test set.
    scaler = RobustScaler()
    xtrain[features] = scaler.fit_transform(xtrain[features])
    xvalid[features] = scaler.transform(xvalid[features])
    xtest[features] = scaler.transform(xtest[features])

    lgb_train = lgb.Dataset(xtrain, ytrain)
    lgb_val = lgb.Dataset(xvalid, yvalid)

    # Pass a copy of the parameters so the shared dict stays unchanged
    # between folds. Training stops once the validation metric has not
    # improved for 444 rounds; per-iteration logging is disabled.
    model = lgb.train(params=dict(params_lgb),
                      train_set=lgb_train,
                      valid_sets=[lgb_val],
                      callbacks=[early_stopping(stopping_rounds=444, verbose=False),
                                 log_evaluation(period=0)])

    # Predict with the best iteration found by early stopping.
    preds_valid = model.predict(xvalid, num_iteration=model.best_iteration)
    test_predict = model.predict(xtest, num_iteration=model.best_iteration)
    prediction.append(test_predict)

    roc1 = roc_auc_score(yvalid, preds_valid)
    score.append(roc1)
    print(f"fold|split:{fold},roc:{roc1}")

# Mean and standard deviation of the per-fold validation ROC AUC.
print(np.mean(score), np.std(score))



fold|split:0,roc:0.565306373959737




fold|split:1,roc:0.5835125243482633




fold|split:2,roc:0.5649374750802565




fold|split:3,roc:0.57579786234021




fold|split:4,roc:0.5911724769939897




fold|split:5,roc:0.5584901083635153




fold|split:6,roc:0.5651727126608519




fold|split:7,roc:0.5894034865694648




fold|split:8,roc:0.5705354956327718




fold|split:9,roc:0.5761422089055924
0.5740470724854653 0.01057656710083471


## **Predict output**

In [8]:
# Average the per-fold test predictions: stack them as columns (one column
# per fold) and take the row-wise mean.
final_predict = np.column_stack(prediction).mean(axis=1)
print(final_predict)

# Use explicit column indexing: attribute-style assignment only updates a
# column that already exists and would otherwise just set a Python attribute.
sample["song_popularity"] = final_predict
sample.to_csv("lgb_median_imputation_regression_v3.csv", index=False)
print('Success!')

[0.38986701 0.45619705 0.33826309 ... 0.34096964 0.4371645  0.31547874]
Success!
