# Model Training
This notebook trains the LightGBM model. All feature engineering was done beforehand in SAS.

In [1]:
import numpy as np
import lightgbm as lgbm
import pandas as pd
import os

In [2]:
# Silence warnings for the rest of the session. The target is the pandas
# copy warnings, but no category is given, so every warning is ignored.
import warnings
warnings.simplefilter('ignore')

In [3]:
# Path of the project's data directory.
folder = os.path.join(
    "/projectnb",
    "cs542sp",
    "netflix_wrw2",
    "CS542-final-project",
    "data",
)

In [4]:
from sklearn.model_selection import train_test_split

def process_data(data):
    """Turn the raw ratings frame into train and validation splits.

    Rows containing any missing value are dropped, every column is passed
    through ``pd.to_numeric`` and downcast to the smallest unsigned integer
    dtype its values allow, and the result is split 75/25 with a fixed seed.

    Args:
        data: DataFrame containing the columns ``User_ID``, ``Movie_ID`` and
            ``Rated`` alongside the feature columns. Every column must be
            convertible by ``pd.to_numeric``.

    Returns:
        Tuple ``(X_train, X_valid, y_train, y_valid)``. The ``X`` frames hold
        every column except ``User_ID``, ``Movie_ID`` and ``Rated``; the
        ``y`` series hold ``Rated``.
    """
    data = data.dropna()

    for c in data.columns:
        # Replace the column outright instead of writing through
        # ``.loc[:, c]``: recent pandas releases treat the ``.loc`` form as
        # an in-place write that keeps the existing dtype, which would
        # silently discard the downcast.
        data[c] = pd.to_numeric(data[c], downcast="unsigned")

    # Print the resulting dtypes and memory footprint.
    data.info()

    X = data.drop(['User_ID', 'Movie_ID', 'Rated'], axis=1)
    Y = data["Rated"]

    X_train, X_valid, y_train, y_valid = train_test_split(
        X, Y, test_size=0.25, random_state=0
    )

    return X_train, X_valid, y_train, y_valid


In [5]:
%%time

# Read the first chunk (up to 50M rows) of the SAS dataset and process it
# into usable train/validation splits.

# The chunked reader holds the file open. Using it as a context manager
# (supported since pandas 1.2) closes the handle once the chunk is read.
with pd.read_sas('netflix_analysis_dataset2.sas7bdat', chunksize=50_000_000) as reader:
    data = next(reader)

X_train, X_valid, y_train, y_valid = process_data(data)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 49371488 entries, 0 to 49999999
Data columns (total 17 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   User_ID                 uint32 
 1   Movie_ID                uint16 
 2   Year                    uint16 
 3   Rated                   uint8  
 4   Ratings_for_Movie       uint32 
 5   Ratings_for_Movie_2005  uint32 
 6   Netflix_Release_Year    uint16 
 7   Movie_Rating_Time       float64
 8   Movie_Ratings_per_Day   float64
 9   Release_Year            uint16 
 10  AVG_Rating_for_Movie    float64
 11  Ratings_from_User       uint16 
 12  Ratings_from_User_2005  uint16 
 13  AVG_Rating_from_User    float64
 14  User_Rating_Time        float64
 15  User_Ratings_per_Day    float64
 16  User_Entry_Year         uint16 
dtypes: float64(6), uint16(7), uint32(3), uint8(1)
memory usage: 3.8 GB
CPU times: user 3min 23s, sys: 1min 50s, total: 5min 13s
Wall time: 5min 13s


### Note: the number of threads should be set to the number of CPUs available for training.

In [6]:
# Number of boosting rounds requested from the trainer.
num_rounds = 500

# Number of threads LightGBM uses while training.
num_threads = 8

In [None]:
%%time

# Wrap the splits in LightGBM datasets. The validation set references the
# training set, and free_raw_data=False keeps the raw frames attached.
train_data = lgbm.Dataset(X_train, label=y_train, free_raw_data=False)
valid_set = lgbm.Dataset(X_valid, label=y_valid, reference=train_data, free_raw_data=False)

# Training parameters.
params = {
    "objective": 'binary',
    "num_leaves": 2047,
    "max_depth": 12,
    "learning_rate": 0.1,
    "bagging_fraction": 0.7,
    "bagging_freq": 5,
    "metric": "binary_logloss",
    "num_threads": num_threads,
    "boosting": "gbdt",
    "min_data_in_leaf": 500,
    "verbose": -1,
}

# Filled by the record_evaluation callback with the per-round metric values.
model_evaluation = {}

# Early stopping, periodic logging and metric recording are passed as
# callbacks: the early_stopping_rounds / verbose_eval / evals_result keyword
# arguments were removed from lgbm.train in LightGBM 4.0. The callbacks
# below need LightGBM 3.3 or newer.
model = lgbm.train(
    params,
    train_data,
    num_boost_round=num_rounds,
    valid_sets=[valid_set],
    callbacks=[
        lgbm.early_stopping(stopping_rounds=10),
        lgbm.log_evaluation(period=10),
        lgbm.record_evaluation(model_evaluation),
    ],
)

Training until validation scores don't improve for 10 rounds
[10]	valid_0's binary_logloss: 0.639096
[20]	valid_0's binary_logloss: 0.630269
[30]	valid_0's binary_logloss: 0.626752
[40]	valid_0's binary_logloss: 0.624834
[50]	valid_0's binary_logloss: 0.623482
[60]	valid_0's binary_logloss: 0.622594
[70]	valid_0's binary_logloss: 0.621729
[80]	valid_0's binary_logloss: 0.621046
[90]	valid_0's binary_logloss: 0.620506
[100]	valid_0's binary_logloss: 0.619906
[110]	valid_0's binary_logloss: 0.619277
[120]	valid_0's binary_logloss: 0.6188


In [None]:
# Plot the feature importance of the trained model.
lgbm.plot_importance(booster=model)

In [None]:
from sklearn.metrics import accuracy_score

def evaluate(model, X_train, X_valid, y_train, y_valid, num = 1_000_000):
    """Print the model's accuracy on random samples of both splits.

    Up to ``num`` rows are drawn at random from the training split and from
    the validation split. Predictions above 0.5 count as the positive class.

    Args:
        model: Trained model exposing ``predict``.
        X_train: Training features, indexed like ``y_train``.
        X_valid: Validation features, indexed like ``y_valid``.
        y_train: Training labels.
        y_valid: Validation labels.
        num: Maximum number of rows to sample from each split.

    Returns:
        None. The two scores are printed.
    """
    # Never request more rows than a split holds: Series.sample raises when
    # asked for a larger sample than the population (without replacement).
    train_index = y_train.sample(min(num, len(y_train))).index
    test_index = y_valid.sample(min(num, len(y_valid))).index

    train_prediction = model.predict(X_train.loc[train_index])
    valid_prediction = model.predict(X_valid.loc[test_index])

    train = accuracy_score(y_train.loc[train_index], train_prediction > 0.5)
    test = accuracy_score(y_valid.loc[test_index], valid_prediction > 0.5)

    # Printing both scores side by side makes overfitting visible: a train
    # score well above the validation score is the warning sign.
    print('\tTrain score:\t{:.3f}'.format(train))
    print('\tTest score:\t{:.3f}'.format(test))


In [None]:
# Score the model on 10,000 sampled rows from each split.
evaluate(
    model,
    X_train,
    X_valid,
    y_train,
    y_valid,
    num=10_000,
)

In [None]:
# sklearn metrics report on a random sample of 100,000 validation rows.
from sklearn.metrics import classification_report

test_index = y_valid.sample(100000).index

valid_prediction = model.predict(X_valid.loc[test_index])

# Predictions above 0.5 are treated as the positive class.
predicted_labels = valid_prediction > 0.5
report = classification_report(y_valid[test_index], predicted_labels)
print(report)

In [17]:
# Save the model to the models folder. The folder is created first so the
# save does not fail when it is missing (e.g. on a fresh checkout).
os.makedirs('models', exist_ok=True)
model.save_model('models/m50M_500.txt')

<lightgbm.basic.Booster at 0x2b4e67822ac0>