In [1]:
import numpy as np
import lightgbm as lgbm
import pandas as pd

In [2]:
# File paths
import os

# Absolute path to the project's data directory.
# NOTE(review): `folder` is not referenced by the cells visible in this notebook
# (the loader reads a path relative to the working directory) — confirm it is still needed.
folder = os.path.join(
    "/projectnb",
    "cs542sp",
    "netflix_wrw2",
    "CS542-final-project",
    "data",
)

In [5]:
%%time

# # read in the full data :0
reader = pd.read_sas('netflix_analysis_dataset2.sas7bdat', chunksize=10_000_000)
data = next(reader)

# data = pd.read_sas('netflix_analysis_dataset2.sas7bdat')

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000000 entries, 0 to 9999999
Data columns (total 17 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   User_ID                 float64
 1   Movie_ID                float64
 2   Year                    float64
 3   Rated                   float64
 4   Ratings_for_Movie       float64
 5   Ratings_for_Movie_2005  float64
 6   Netflix_Release_Year    float64
 7   Movie_Rating_Time       float64
 8   Movie_Ratings_per_Day   float64
 9   Release_Year            float64
 10  AVG_Rating_for_Movie    float64
 11  Ratings_from_User       float64
 12  Ratings_from_User_2005  float64
 13  AVG_Rating_from_User    float64
 14  User_Rating_Time        float64
 15  User_Ratings_per_Day    float64
 16  User_Entry_Year         float64
dtypes: float64(17)
memory usage: 1.3 GB
CPU times: user 18.4 s, sys: 2.23 s, total: 20.7 s
Wall time: 20.8 s


In [6]:
# Tried exporting the frame to CSV to speed up reloads, but reading the CSV back turned
# out to be slower than read_sas, so the export is left disabled.
# data.to_csv('data/full_data.csv')

In [10]:
%%time
# Disabled experiment: reload the CSV export with explicit narrow dtypes.
# NOTE(review): np.uint8 is too small for User_ID (the automatic downcast further down
# reports uint32 for that column) — correct the map before re-enabling this cell.
# type_map = {
#     "User_ID": np.uint8,
#     "Movie_ID": np.uint16,
#     "Year": np.uint16,
#     "Rated": np.uint8,
#     "Ratings_for_Movie": np.uint32,
#     "Ratings_for_Movie_2005": np.uint32,
#     "Netflix_Release_Year": np.uint16,
#     "Release_Year": np.uint16,
#     "AVG_Rating_for_Movie": np.float64,
#     "Ratings_from_User": np.uint16,
#     "Ratings_from_User_2005": np.uint16,
#     "AVG_Rating_from_User": np.float64,
#     "User_Entry_Year": np.uint16,
# }
# type_map=None

# data = pd.read_csv('data/full_data.csv',dtype=type_map)

CPU times: user 3min 44s, sys: 1min 6s, total: 4min 51s
Wall time: 4min 51s


In [7]:
# Drop incomplete rows, then shrink every column to the smallest unsigned integer dtype
# that can represent its values; columns holding fractional values stay float64.
#
# The columns are rebuilt through `apply` instead of being assigned with
# `data.loc[:, c] = ...`: on recent pandas versions that form writes into the existing
# float64 column and keeps its dtype, so the downcast would be silently discarded.
data = data.dropna().apply(pd.to_numeric, downcast="unsigned")

data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9879614 entries, 0 to 9999999
Data columns (total 17 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   User_ID                 uint32 
 1   Movie_ID                uint16 
 2   Year                    uint16 
 3   Rated                   uint8  
 4   Ratings_for_Movie       uint32 
 5   Ratings_for_Movie_2005  uint32 
 6   Netflix_Release_Year    uint16 
 7   Movie_Rating_Time       float64
 8   Movie_Ratings_per_Day   float64
 9   Release_Year            uint16 
 10  AVG_Rating_for_Movie    float64
 11  Ratings_from_User       uint16 
 12  Ratings_from_User_2005  uint16 
 13  AVG_Rating_from_User    float64
 14  User_Rating_Time        float64
 15  User_Ratings_per_Day    float64
 16  User_Entry_Year         uint16 
dtypes: float64(6), uint16(7), uint32(3), uint8(1)
memory usage: 782.0 MB


In [19]:
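# Only relevant when `data` was reloaded from the CSV export: presumably drops the index
# column that the to_csv/read_csv round trip adds as 'Unnamed: 0'.
# data = data.drop(['Unnamed: 0'], axis=1)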

In [8]:

# Target vector: the `Rated` column. Feature matrix: every remaining column.
Y = data["Rated"]
X = data.drop(columns=["Rated"])

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for validation; the fixed seed keeps the split reproducible.
(X_train, X_valid,
 y_train, y_valid) = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=0,
)

In [25]:
from lightgbm import Dataset

# Training data; any feature typing (e.g. categorical columns) must be declared here.
train_data = Dataset(X_train, label=y_train)

# Validation data takes its feature handling from the training set via `reference=`.
# The former `categorical_feature=[0, 1]` argument was set on the validation side only,
# so it did not affect the trained model and has been removed. To treat
# User_ID / Movie_ID as categorical, pass the option to `train_data` instead.
valid_set = Dataset(X_valid, label=y_valid, reference=train_data)

In [26]:
# Logical CPU count reported for this machine (for reference when choosing `num_threads`).
os.cpu_count()

32

In [27]:
%%time
'''
Train the model using the "lgb.train" api for more control. Try 20 cycles, no max depth
'''

params = {
    "objective":'binary',
    "num_leaves": 1023,
    "max_depth": 12,
    "learning_rate": 0.1,
#   "bagging_fraction": 0.5,
#     "bagging_freq": 5,
    "metric":"binary_logloss",
    "num_threads":8,
    "boosting": "goss",
    "ignore_column": [0,1],
}
# goss is faster than gbdt but worse for small data
# cat feature user name and movie name

model = lgbm.train(
    params,
    train_data,
    num_boost_round=1000,
    valid_sets=[valid_set],
    early_stopping_rounds = 10,
    init_model=model
)

[LightGBM] [Info] Number of positive: 3051094, number of negative: 4358616
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3168
[LightGBM] [Info] Number of data points in the train set: 7409710, number of used features: 16
[LightGBM] [Info] Using GOSS
[301]	valid_0's binary_logloss: 0.600419
Training until validation scores don't improve for 10 rounds
[302]	valid_0's binary_logloss: 0.600368
[303]	valid_0's binary_logloss: 0.600283
[304]	valid_0's binary_logloss: 0.60022
[305]	valid_0's binary_logloss: 0.600176
[306]	valid_0's binary_logloss: 0.600132
[307]	valid_0's binary_logloss: 0.600036
[308]	valid_0's binary_logloss: 0.599988
[309]	valid_0's binary_logloss: 0.599966
[310]	valid_0's binary_logloss: 0.599904
[311]	valid_0's binary_logloss: 0.599853
[312]	valid_0's binary_logloss: 0.599847
[313]	valid_0's binary_logloss: 0.59983
[314]	valid_0's binary_logloss: 0.5998
[315]	valid_0'

[434]	valid_0's binary_logloss: 0.596584
[435]	valid_0's binary_logloss: 0.59655
[436]	valid_0's binary_logloss: 0.596522
[437]	valid_0's binary_logloss: 0.596505
[438]	valid_0's binary_logloss: 0.596486
[439]	valid_0's binary_logloss: 0.596462
[440]	valid_0's binary_logloss: 0.596439
[441]	valid_0's binary_logloss: 0.596418
[442]	valid_0's binary_logloss: 0.596401
[443]	valid_0's binary_logloss: 0.596375
[444]	valid_0's binary_logloss: 0.596369
[445]	valid_0's binary_logloss: 0.596357
[446]	valid_0's binary_logloss: 0.59632
[447]	valid_0's binary_logloss: 0.596324
[448]	valid_0's binary_logloss: 0.596313
[449]	valid_0's binary_logloss: 0.596292
[450]	valid_0's binary_logloss: 0.596281
[451]	valid_0's binary_logloss: 0.596274
[452]	valid_0's binary_logloss: 0.596254
[453]	valid_0's binary_logloss: 0.596228
[454]	valid_0's binary_logloss: 0.596201
[455]	valid_0's binary_logloss: 0.596157
[456]	valid_0's binary_logloss: 0.596136
[457]	valid_0's binary_logloss: 0.596129
[458]	valid_0's bi

[582]	valid_0's binary_logloss: 0.593962
[583]	valid_0's binary_logloss: 0.59393
[584]	valid_0's binary_logloss: 0.59391
[585]	valid_0's binary_logloss: 0.593904
[586]	valid_0's binary_logloss: 0.593887
[587]	valid_0's binary_logloss: 0.593869
[588]	valid_0's binary_logloss: 0.593851
[589]	valid_0's binary_logloss: 0.59384
[590]	valid_0's binary_logloss: 0.593794
[591]	valid_0's binary_logloss: 0.593794
[592]	valid_0's binary_logloss: 0.593762
[593]	valid_0's binary_logloss: 0.593763
[594]	valid_0's binary_logloss: 0.593748
[595]	valid_0's binary_logloss: 0.593724
[596]	valid_0's binary_logloss: 0.593716
[597]	valid_0's binary_logloss: 0.59367
[598]	valid_0's binary_logloss: 0.593618
[599]	valid_0's binary_logloss: 0.593611
[600]	valid_0's binary_logloss: 0.5936
[601]	valid_0's binary_logloss: 0.593564
[602]	valid_0's binary_logloss: 0.593515
[603]	valid_0's binary_logloss: 0.593508
[604]	valid_0's binary_logloss: 0.593509
[605]	valid_0's binary_logloss: 0.593493
[606]	valid_0's binary

[733]	valid_0's binary_logloss: 0.591481
[734]	valid_0's binary_logloss: 0.591465
[735]	valid_0's binary_logloss: 0.591462
[736]	valid_0's binary_logloss: 0.591457
[737]	valid_0's binary_logloss: 0.591418
[738]	valid_0's binary_logloss: 0.591392
[739]	valid_0's binary_logloss: 0.591375
[740]	valid_0's binary_logloss: 0.591353
[741]	valid_0's binary_logloss: 0.591352
[742]	valid_0's binary_logloss: 0.591336
[743]	valid_0's binary_logloss: 0.591343
[744]	valid_0's binary_logloss: 0.591326
[745]	valid_0's binary_logloss: 0.59131
[746]	valid_0's binary_logloss: 0.591305
[747]	valid_0's binary_logloss: 0.591295
[748]	valid_0's binary_logloss: 0.591274
[749]	valid_0's binary_logloss: 0.591267
[750]	valid_0's binary_logloss: 0.591241
[751]	valid_0's binary_logloss: 0.591217
[752]	valid_0's binary_logloss: 0.591194
[753]	valid_0's binary_logloss: 0.591168
[754]	valid_0's binary_logloss: 0.591167
[755]	valid_0's binary_logloss: 0.591149
[756]	valid_0's binary_logloss: 0.591149
[757]	valid_0's b

[881]	valid_0's binary_logloss: 0.589705
[882]	valid_0's binary_logloss: 0.589705
[883]	valid_0's binary_logloss: 0.58969
[884]	valid_0's binary_logloss: 0.589653
[885]	valid_0's binary_logloss: 0.589653
[886]	valid_0's binary_logloss: 0.589656
[887]	valid_0's binary_logloss: 0.589621
[888]	valid_0's binary_logloss: 0.589601
[889]	valid_0's binary_logloss: 0.589608
[890]	valid_0's binary_logloss: 0.589591
[891]	valid_0's binary_logloss: 0.589587
[892]	valid_0's binary_logloss: 0.589581
[893]	valid_0's binary_logloss: 0.589569
[894]	valid_0's binary_logloss: 0.589556
[895]	valid_0's binary_logloss: 0.589544
[896]	valid_0's binary_logloss: 0.589538
[897]	valid_0's binary_logloss: 0.589541
[898]	valid_0's binary_logloss: 0.589533
[899]	valid_0's binary_logloss: 0.589527
[900]	valid_0's binary_logloss: 0.589524
[901]	valid_0's binary_logloss: 0.589518
[902]	valid_0's binary_logloss: 0.589485
[903]	valid_0's binary_logloss: 0.589463
[904]	valid_0's binary_logloss: 0.589465
[905]	valid_0's b

[1030]	valid_0's binary_logloss: 0.588571
[1031]	valid_0's binary_logloss: 0.588564
[1032]	valid_0's binary_logloss: 0.588558
[1033]	valid_0's binary_logloss: 0.588549
[1034]	valid_0's binary_logloss: 0.58854
[1035]	valid_0's binary_logloss: 0.588532
[1036]	valid_0's binary_logloss: 0.588524
[1037]	valid_0's binary_logloss: 0.588509
[1038]	valid_0's binary_logloss: 0.588501
[1039]	valid_0's binary_logloss: 0.588499
[1040]	valid_0's binary_logloss: 0.588483
[1041]	valid_0's binary_logloss: 0.588466
[1042]	valid_0's binary_logloss: 0.588466
[1043]	valid_0's binary_logloss: 0.588472
[1044]	valid_0's binary_logloss: 0.588467
[1045]	valid_0's binary_logloss: 0.588463
[1046]	valid_0's binary_logloss: 0.588464
[1047]	valid_0's binary_logloss: 0.588457
[1048]	valid_0's binary_logloss: 0.588463
[1049]	valid_0's binary_logloss: 0.58846
[1050]	valid_0's binary_logloss: 0.588465
[1051]	valid_0's binary_logloss: 0.588429
[1052]	valid_0's binary_logloss: 0.588425
[1053]	valid_0's binary_logloss: 0.5

In [30]:
# Shape of the per-feature importance array: one entry per feature the model uses.
model.feature_importance().shape

(16,)

In [31]:
# Model scores for the training rows (thresholded at 0.5 in the evaluation cells below).
train_prediction = model.predict(X_train)

In [32]:
# Model scores for the held-out validation rows (thresholded at 0.5 in the evaluation cells below).
valid_prediction = model.predict(X_valid)

In [33]:
from sklearn.metrics import accuracy_score

# Overfitting check: compare accuracy on the rows the model was fitted on with accuracy
# on the held-out rows. A row is labelled positive when its score exceeds 0.5.
train_accuracy = accuracy_score(y_train, train_prediction > 0.5)
valid_accuracy = accuracy_score(y_valid, valid_prediction > 0.5)

print("Overfit check:")
print('Train score:\t{:.3f}'.format(train_accuracy))
print('Test score:\t{:.3f}'.format(valid_accuracy))

Overfit check:
Train score:	0.740
Test score:	0.684


In [34]:
# Metrics output: per-class precision / recall / F1 on the validation split,
# using a 0.5 threshold on the model scores.
from sklearn.metrics import classification_report

valid_labels = valid_prediction > 0.5
print(classification_report(y_valid, valid_labels))

              precision    recall  f1-score   support

           0       0.70      0.80      0.75   1453609
           1       0.65      0.51      0.57   1016295

    accuracy                           0.68   2469904
   macro avg       0.67      0.66      0.66   2469904
weighted avg       0.68      0.68      0.68   2469904



In [35]:
# Create the output directory first; the save cannot succeed if it is missing.
os.makedirs('models', exist_ok=True)
# NOTE(review): the file name suggests 1000 rounds, while the training log shows
# iterations past 1000 after warm-starting — confirm the name still fits.
model.save_model('models/m1000.txt')

<lightgbm.basic.Booster at 0x2ba6e8164c70>