# 04. Data Preprocessing

In [169]:
import pandas as pd
import numpy as np
import src.utils as utils
from datetime import date, datetime

import warnings
warnings.filterwarnings("ignore")

In [170]:
df_train_init = utils.pickle_load('data/processed/train.pkl').reset_index(drop = True)
df_test_init = utils.pickle_load('data/processed/test.pkl').reset_index(drop = True)

In [171]:
df_train_init.shape

(1634, 21)

### a. Handling Missing Values

In [172]:
# Separate y column or dependent variable
y_train = df_train_init.tracks_popularity.values
y_test = df_test_init.tracks_popularity.values

In [173]:
def cat_age(age):
    if age <= 20:
        cat = '<20'
    elif 20 < age <= 30:
        cat = '25-30'
    elif age > 30:
        cat = '>30'
    else:
        cat = 'unknown'

    return cat

def cat_days_after_debut(days):
    if days <= 365:
        cat = '<1year'
    elif 365 < days <= 365 * 5:
        cat = '1-5years'
    elif 365 * 5 < days <= 365 * 10:
        cat = '5-10years'
    elif days > 365 * 10:
        cat = '>10years'
    else:
        cat = 'unknown'
        
    return cat

    

In [209]:
def labelencoder(train, test, feature):

    le = LabelEncoder()
    le.fit(train[feature])
    train[feature] = le.transform(train[feature])
    test[feature] = le.transform(test[feature])

    return train[feature], test[feature]

def onehotencoder(train, test, feature):
    
    ohe = OneHotEncoder()
    ohe.fit(train[[feature]])
    train_encoded = pd.DataFrame(ohe.transform(train[[feature]]).toarray())
    train_encoded.columns = ohe.get_feature_names([feature])

    test_encoded = pd.DataFrame(ohe.transform(test[[feature]]).toarray())
    test_encoded.columns = ohe.get_feature_names([feature])

    return train_encoded, test_encoded


In [213]:
def preprocessing():

    # Mengganti kolom days_after_debut dan mean_age menjadi kolom kategorikal
    df['cat_age'] = df.apply(lambda row: cat_age(row['mean_age']), axis=1)
    df['cat_days_debut'] = df.apply(lambda row: cat_age(row['days_after_debut']), axis=1)

    # Drop kolom explicit dan time_signature karena dari EDA dianggap tidak berpengaruh
    df.drop(['mean_age', 'days_after_debut', 'explicit', 'time_signature', 'members', 'tracks_popularity'],axis = 1, inplace=True)

    # creating instance of labelencoder
    labelencoder = LabelEncoder()
    labelencoder.fit(['A#', 'C#', 'B', 'C', 'G', 'F', 'D', 'A', 'G#', 'E', 'F#', 'D#'])

    # Assigning numerical values and storing in another column
    df['key'] = labelencoder.transform(df['key'])

    onehotencoder = OneHotEncoder(sparse=False)
    modality_encoded = pd.DataFrame (onehotencoder.fit_transform(df[['modality']]))
    modality_encoded.columns = onehotencoder.get_feature_names(['modality'])

    company_encoded = pd.DataFrame (onehotencoder.fit_transform(df[['big5_company']]))
    company_encoded.columns = onehotencoder.get_feature_names(['big5_company'])

    age_encoded = pd.DataFrame (onehotencoder.fit_transform(df[['cat_age']]))
    age_encoded.columns = onehotencoder.get_feature_names(['cat_age'])

    debutdays_encoded = pd.DataFrame (onehotencoder.fit_transform(df[['cat_days_debut']]))
    debutdays_encoded.columns = onehotencoder.get_feature_names(['cat_days_debut'])

    df.drop(['modality', 'big5_company', 'cat_age', 'cat_days_debut'], axis = 1, inplace=True)
    df_concat = pd.concat([df, modality_encoded, company_encoded, age_encoded, debutdays_encoded], axis = 1)

    return df_concat

In [214]:
X_train = preprocessing(df_train_init)
#X_test = preprocessing(df_test_init)

TypeError: ufunc 'isnan' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''

In [184]:
X_train.isna().sum().sum()

0

In [190]:
len(df_train_init.columns)

21

# 05. Modelling

In [179]:
from tqdm.auto import tqdm
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import export_text
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [178]:
# baseline
from sklearn.metrics import mean_squared_error

baseline_pred = np.mean(y_train)
print(baseline_pred)
baseline_mse = mean_squared_error(y_train, 
                                  np.ones(len(y_train)) * baseline_pred)
print(baseline_mse)

51.15055079559364
247.1878607737356


## a. Linear Regression

In [185]:
def fit_model(X_train, y_train,
              X_test, y_test):
    # 1. Buat objek
    lr = LinearRegression()

    # 2. Lakukan cross-val
    scores = cross_val_score(estimator = lr,
                             X = X_train,
                             y = y_train,
                             cv = 5,
                             scoring = "neg_mean_squared_error")
    
    cv_score = - np.mean(scores)
    
    # 3. Fit model
    lr.fit(X = X_train,
           y = y_train)
    
    # 4. Cari train score
    y_train_pred = lr.predict(X_train)
    train_score = mean_squared_error(y_train, y_train_pred)
    
    # 4. Cari test score
    y_test_pred = lr.predict(X_test)
    test_score = mean_squared_error(y_test, y_test_pred)

    # 5. Ekstrak coefficient
    coef_ = lr.coef_
    intercept_ = lr.intercept_
    lr_params = np.append(coef_, intercept_)

    lr_params_df = pd.DataFrame(lr_params,
                                index = list(X_train.columns) + ["constant"],
                                columns = ["coefficient"])
    
    return lr, train_score, cv_score, test_score, lr_params_df


In [189]:
len(X_test.columns)

25

In [186]:
lr, train_score, cv_score, test_score, lr_params_df = fit_model(X_train = X_train,
                                                                y_train = y_train,
                                                                X_test = X_test,
                                                                y_test = y_test)
print(f"train score: {train_score:.3f}, cv score: {cv_score:.3f}, test_score: {test_score:.3f}")

ValueError: X has 25 features, but LinearRegression is expecting 26 features as input.