In [1]:
import numpy as np
import lightgbm as lgbm
import pandas as pd

In [2]:
'''
File paths
'''
import os

folder = os.path.join("/projectnb","cs542sp","netflix_wrw2", "CS542-final-project", "data")

In [3]:
%%time

# # read in the full data :0
# reader = pd.read_sas('data/netflix_analysis_dataset.sas7bdat', chunksize=100_000_000)
# data = next(reader)

data = pd.read_sas('data/netflix_analysis_dataset.sas7bdat')

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 243642300 entries, 0 to 243642299
Data columns (total 13 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   User_ID                 float64
 1   Movie_ID                float64
 2   Year                    float64
 3   Rated                   float64
 4   Ratings_for_Movie       float64
 5   Ratings_for_Movie_2005  float64
 6   Netflix_Release_Year    float64
 7   Release_Year            float64
 8   AVG_Rating_for_Movie    float64
 9   Ratings_from_User       float64
 10  Ratings_from_User_2005  float64
 11  AVG_Rating_from_User    float64
 12  User_Entry_Year         float64
dtypes: float64(13)
memory usage: 23.6 GB
CPU times: user 7min 5s, sys: 1min 1s, total: 8min 6s
Wall time: 8min 8s


In [4]:
data = data.dropna()

for c in data.columns:
    data.loc[:,c] = pd.to_numeric(data.loc[:,c], downcast="unsigned")
    
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 243639408 entries, 0 to 243642299
Data columns (total 13 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   User_ID                 uint32 
 1   Movie_ID                uint16 
 2   Year                    uint16 
 3   Rated                   uint8  
 4   Ratings_for_Movie       uint32 
 5   Ratings_for_Movie_2005  uint32 
 6   Netflix_Release_Year    uint16 
 7   Release_Year            uint16 
 8   AVG_Rating_for_Movie    float64
 9   Ratings_from_User       uint16 
 10  Ratings_from_User_2005  uint16 
 11  AVG_Rating_from_User    float64
 12  User_Entry_Year         uint16 
dtypes: float64(2), uint16(7), uint32(3), uint8(1)
memory usage: 11.6 GB


In [5]:
X = data.drop(['Rated'], axis=1)
Y = data.loc[:,"Rated"]

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(X, Y, test_size = 0.25, random_state = 0)

In [7]:
from lightgbm import Dataset

train_data = Dataset(X_train, label=y_train)

valid_set = Dataset(X_valid, label=y_valid, reference=train_data)

In [None]:
%%time
'''
Train the model using the "lgb.train" api for more control.
'''

params = {
    "objective":'binary',
    "num_leaves": 4095,
    "max_depth": 12,
    "learning_rate": 0.1,
#     "bagging_fraction": 0.5,
#     "bagging_freq": 5,
    "metric":"binary_logloss",
    "num_threads":4,
    "boosting": "goss",
}
# goss is faster than gbdt but worse for small data

model = lgbm.train(
    params,
    train_data,
    num_boost_round=1000,
    valid_sets=[valid_set],
    categorical_feature=[0,1],
    early_stopping_rounds = 10,
)

[LightGBM] [Info] Number of positive: 75359793, number of negative: 107369763
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 23469
[LightGBM] [Info] Number of data points in the train set: 182729556, number of used features: 12
[LightGBM] [Info] Using GOSS
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.412412 -> initscore=-0.354005
[LightGBM] [Info] Start training from score -0.354005
[1]	valid_0's binary_logloss: 0.670084
Training until validation scores don't improve for 10 rounds
[2]	valid_0's binary_logloss: 0.663811
[3]	valid_0's binary_logloss: 0.658608
[4]	valid_0's binary_logloss: 0.654282
[5]	valid_0's binary_logloss: 0.650567
[6]	valid_0's binary_logloss: 0.647404
[7]	valid_0's binary_logloss: 0.644735
[8]	valid_0's binary_logloss: 0.642519
[9]	valid_0's binary_logloss: 0.640468
[10]	valid_0's binary_logloss: 0.638719
[11]	valid_0's binary_logloss: 0.637387
[12]	valid_0'

[66]	valid_0's binary_logloss: 0.625972
[67]	valid_0's binary_logloss: 0.625968
[68]	valid_0's binary_logloss: 0.625964
[69]	valid_0's binary_logloss: 0.625961
[70]	valid_0's binary_logloss: 0.625958
[71]	valid_0's binary_logloss: 0.625954
[72]	valid_0's binary_logloss: 0.625951
[73]	valid_0's binary_logloss: 0.625948
[74]	valid_0's binary_logloss: 0.625945
[75]	valid_0's binary_logloss: 0.625942
[76]	valid_0's binary_logloss: 0.625939
[77]	valid_0's binary_logloss: 0.625937
[78]	valid_0's binary_logloss: 0.625933
[79]	valid_0's binary_logloss: 0.62593
[80]	valid_0's binary_logloss: 0.625927
[81]	valid_0's binary_logloss: 0.625924
[82]	valid_0's binary_logloss: 0.625911
[83]	valid_0's binary_logloss: 0.625908
[84]	valid_0's binary_logloss: 0.625905
[85]	valid_0's binary_logloss: 0.625902
[86]	valid_0's binary_logloss: 0.625899
[87]	valid_0's binary_logloss: 0.625786
[88]	valid_0's binary_logloss: 0.625783
[89]	valid_0's binary_logloss: 0.62578
[90]	valid_0's binary_logloss: 0.625778
[9

[137]	valid_0's binary_logloss: 0.625647
[138]	valid_0's binary_logloss: 0.625645
[139]	valid_0's binary_logloss: 0.625643
[140]	valid_0's binary_logloss: 0.625641
[141]	valid_0's binary_logloss: 0.625639
[142]	valid_0's binary_logloss: 0.625637
[143]	valid_0's binary_logloss: 0.625635
[144]	valid_0's binary_logloss: 0.625633
[145]	valid_0's binary_logloss: 0.625631
[146]	valid_0's binary_logloss: 0.625629
[147]	valid_0's binary_logloss: 0.625628
[148]	valid_0's binary_logloss: 0.625625
[149]	valid_0's binary_logloss: 0.625623
[150]	valid_0's binary_logloss: 0.625621
[151]	valid_0's binary_logloss: 0.62562
[152]	valid_0's binary_logloss: 0.625618
[153]	valid_0's binary_logloss: 0.625617
[154]	valid_0's binary_logloss: 0.625615
[155]	valid_0's binary_logloss: 0.625613
[156]	valid_0's binary_logloss: 0.625611
[157]	valid_0's binary_logloss: 0.625609
[158]	valid_0's binary_logloss: 0.625608
[159]	valid_0's binary_logloss: 0.625606
[160]	valid_0's binary_logloss: 0.625605
[161]	valid_0's b

[208]	valid_0's binary_logloss: 0.625525
[209]	valid_0's binary_logloss: 0.625524
[210]	valid_0's binary_logloss: 0.625523
[211]	valid_0's binary_logloss: 0.625522
[212]	valid_0's binary_logloss: 0.625521
[213]	valid_0's binary_logloss: 0.625519
[214]	valid_0's binary_logloss: 0.625518
[215]	valid_0's binary_logloss: 0.625516
[216]	valid_0's binary_logloss: 0.625515


In [66]:
model.feature_importance()

array([3724, 2008,  837, 2989, 2285,  895, 2474, 3171, 2685, 2124, 1999,
        691], dtype=int32)

In [67]:
train_prediction = model.predict(X_train)

In [68]:
valid_prediction = model.predict(X_valid)

In [69]:
from sklearn.metrics import accuracy_score

'''
Check for overfitting
'''
print("Overfit check:")
print('Train score:\t{:.3f}'.format(accuracy_score(y_train, train_prediction > 0.5)))
print('Test score:\t{:.3f}'.format(accuracy_score(y_valid, valid_prediction > 0.5)))

Overfit check:
Train score:	0.655
Test score:	0.649


In [70]:
'''
Metrics output
'''
from sklearn.metrics import classification_report
print(classification_report(y_valid, prediction))

              precision    recall  f1-score   support

           0       0.65      0.83      0.73   2944556
           1       0.59      0.36      0.45   2055379

    accuracy                           0.63   4999935
   macro avg       0.62      0.59      0.59   4999935
weighted avg       0.62      0.63      0.61   4999935



In [71]:
model.save_model('models/20M_test.txt')

<lightgbm.basic.Booster at 0x2b3e58c86040>