In [1]:
import numpy as np
import lightgbm as lgbm
import pandas as pd

In [2]:
'''
File paths
'''
import os

# Absolute path of the project's data directory, one path component per line.
# NOTE(review): the load/save cells visible in this notebook use relative
# 'data/...' paths and do not reference `folder` -- confirm whether it is
# still needed.
folder = os.path.join(
    "/projectnb",
    "cs542sp",
    "netflix_wrw2",
    "CS542-final-project",
    "data",
)

In [6]:
%%time

# Load the complete SAS dataset into memory with a single call.
# Earlier experiment, kept for reference: open the file in 1000-row chunks and
# take only the first chunk.
# reader = pd.read_sas('data/netflix_analysis_dataset.sas7bdat', chunksize=1000)
# data = next(reader)

data = pd.read_sas('data/netflix_analysis_dataset.sas7bdat')

# Print the column dtypes and the memory footprint of the loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 243642300 entries, 0 to 243642299
Data columns (total 13 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   User_ID                 float64
 1   Movie_ID                float64
 2   Year                    float64
 3   Rated                   float64
 4   Ratings_for_Movie       float64
 5   Ratings_for_Movie_2005  float64
 6   Netflix_Release_Year    float64
 7   Release_Year            float64
 8   AVG_Rating_for_Movie    float64
 9   Ratings_from_User       float64
 10  Ratings_from_User_2005  float64
 11  AVG_Rating_from_User    float64
 12  User_Entry_Year         float64
dtypes: float64(13)
memory usage: 23.6 GB
CPU times: user 7min 3s, sys: 59.4 s, total: 8min 2s
Wall time: 8min 3s


In [7]:
# Cache the frame as CSV, hoping for faster reloads (the author's note records
# that it turned out to be a slowdown instead).
# index=True is the pandas default, written out here because the saved row
# index is what comes back as the extra 'Unnamed: 0' column on re-read.
data.to_csv('data/full_data.csv', index=True)

In [10]:
%%time
# Explicit per-column dtypes for the CSV reload. Currently disabled: type_map
# is None, so pandas infers every dtype itself.
# NOTE(review): presumably disabled because some rows contain missing values
# (the dropna() in the next cell removes rows) and numpy unsigned integer
# dtypes cannot hold NaN -- confirm before re-enabling.
# NOTE(review): User_ID was listed as np.uint8 here, but the downcast in the
# next cell ends up at uint32 for that column, so it is corrected below.
# type_map = {
#     "User_ID": np.uint32,
#     "Movie_ID": np.uint16,
#     "Year": np.uint16,
#     "Rated": np.uint8,
#     "Ratings_for_Movie": np.uint32,
#     "Ratings_for_Movie_2005": np.uint32,
#     "Netflix_Release_Year": np.uint16,
#     "Release_Year": np.uint16,
#     "AVG_Rating_for_Movie": np.float64,
#     "Ratings_from_User": np.uint16,
#     "Ratings_from_User_2005": np.uint16,
#     "AVG_Rating_from_User": np.float64,
#     "User_Entry_Year": np.uint16,
# }
type_map=None

data = pd.read_csv('data/full_data.csv',dtype=type_map)

CPU times: user 3min 44s, sys: 1min 6s, total: 4min 51s
Wall time: 4min 51s


In [11]:
# Remove every row that contains a missing value so that integer-valued
# columns can be stored in integer dtypes.
data = data.dropna()

# Shrink each column to the smallest unsigned integer dtype that can hold its
# values; columns that cannot be represented that way keep their dtype.
# Assign with data[c] = ... rather than data.loc[:, c] = ...: on recent pandas
# the .loc form writes into the existing column in place and keeps the
# original dtype, which would silently throw the downcast away.
for c in data.columns:
    data[c] = pd.to_numeric(data[c], downcast="unsigned")

# Report the resulting dtypes and memory footprint.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 243639408 entries, 0 to 243642299
Data columns (total 14 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   Unnamed: 0              uint32 
 1   User_ID                 uint32 
 2   Movie_ID                uint16 
 3   Year                    uint16 
 4   Rated                   uint8  
 5   Ratings_for_Movie       uint32 
 6   Ratings_for_Movie_2005  uint32 
 7   Netflix_Release_Year    uint16 
 8   Release_Year            uint16 
 9   AVG_Rating_for_Movie    float64
 10  Ratings_from_User       uint16 
 11  Ratings_from_User_2005  uint16 
 12  AVG_Rating_from_User    float64
 13  User_Entry_Year         uint16 
dtypes: float64(2), uint16(7), uint32(4), uint8(1)
memory usage: 12.5 GB


In [19]:
# Discard the row-index column that the CSV round trip added.
# errors='ignore' keeps the cell safe to re-run: without it a second execution
# raises KeyError because the column is already gone.
data = data.drop(columns=['Unnamed: 0'], errors='ignore')

In [20]:

# Feature matrix: every column except the target.
X = data.drop(columns=['Rated'])
# Target vector: the 'Rated' column on its own.
Y = data['Rated']

In [22]:
from sklearn.model_selection import train_test_split

# Hold out a quarter of the rows for validation; the fixed random_state keeps
# the split identical from run to run.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=0,
)

In [34]:
from lightgbm import Dataset

# Identify the categorical columns by name rather than by position. Positions
# [0, 1] only mean User_ID and Movie_ID while the column order is unchanged and
# the 'Unnamed: 0' index column has already been dropped; names stay correct
# in either case.
categorical_columns = ['User_ID', 'Movie_ID']

train_data = Dataset(X_train, label=y_train, categorical_feature=categorical_columns)

# The validation set is built with the training set as its reference, as
# LightGBM expects for validation data.
valid_set = Dataset(X_valid, label=y_valid, reference=train_data, categorical_feature=categorical_columns)

In [26]:
# Show how many logical CPUs the OS reports.
# NOTE(review): presumably consulted when picking LightGBM's num_threads -- confirm.
os.cpu_count()

32

In [35]:
%%time
'''
Train the model using the "lgb.train" api for more control. Try 20 cycles, no max depth
'''

params = {
    "objective":'binary',
    "num_leaves": 1023,
    "max_depth": -1,
    "learning_rate": 0.1,
#   "bagging_fraction": 0.5,
#     "bagging_freq": 5,
    "metric":"binary_logloss",
    "num_threads":8,
    "boosting": "goss",
}
# goss is faster than gbdt but worse for small data
# cat feature user name and movie name

model = lgbm.train(
    params,
    train_data,
    num_boost_round=100,
    valid_sets=[valid_set],
    early_stopping_rounds = 10,
    init_model=model
)

[LightGBM] [Info] Number of positive: 75359793, number of negative: 107369763
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 23469
[LightGBM] [Info] Number of data points in the train set: 182729556, number of used features: 12
[LightGBM] [Info] Using GOSS




[21]	valid_0's binary_logloss: 0.631871
Training until validation scores don't improve for 10 rounds
[22]	valid_0's binary_logloss: 0.631076
[23]	valid_0's binary_logloss: 0.630186
[24]	valid_0's binary_logloss: 0.629554
[25]	valid_0's binary_logloss: 0.628903
[26]	valid_0's binary_logloss: 0.628317
[27]	valid_0's binary_logloss: 0.627733
[28]	valid_0's binary_logloss: 0.627212
[29]	valid_0's binary_logloss: 0.626797
[30]	valid_0's binary_logloss: 0.626304
[31]	valid_0's binary_logloss: 0.625925
[32]	valid_0's binary_logloss: 0.625599
[33]	valid_0's binary_logloss: 0.625277
[34]	valid_0's binary_logloss: 0.625014
[35]	valid_0's binary_logloss: 0.624686
[36]	valid_0's binary_logloss: 0.624431
[37]	valid_0's binary_logloss: 0.624113
[38]	valid_0's binary_logloss: 0.623887
[39]	valid_0's binary_logloss: 0.623666
[40]	valid_0's binary_logloss: 0.623416
[41]	valid_0's binary_logloss: 0.623226
[42]	valid_0's binary_logloss: 0.623072
[43]	valid_0's binary_logloss: 0.622873
[44]	valid_0's bina

Exception ignored in: <function Booster.__del__ at 0x2af3f60a51f0>
Traceback (most recent call last):
  File "/usr4/cs542sp/ejbosia/.local/lib/python3.8/site-packages/lightgbm/basic.py", line 2281, in __del__
    try:
KeyboardInterrupt: 


KeyboardInterrupt: 

In [36]:
# Per-feature importance measured as the number of splits that use each
# feature ("split" is LightGBM's default importance type, written out here).
model.feature_importance(importance_type="split")

array([2029, 3970,  620,  274,  143,  153,  231,  207,  675,  854,  358,
        686], dtype=int32)

In [37]:
# Score the training rows with the trained booster. The model was trained with
# the 'binary' objective, and the evaluation cells threshold these scores at 0.5.
train_prediction = model.predict(X_train)

In [38]:
# Score the held-out validation rows with the trained booster; the evaluation
# cells threshold these scores at 0.5.
valid_prediction = model.predict(X_valid)

In [39]:
from sklearn.metrics import accuracy_score

# Overfitting check: accuracy on the training split next to accuracy on the
# held-out split, with predicted scores cut at 0.5. The second line is
# labelled "Test" although it is computed on the validation split.
print("Overfit check:")
for label_template, truth, scores in (
    ('Train score:\t{:.3f}', y_train, train_prediction),
    ('Test score:\t{:.3f}', y_valid, valid_prediction),
):
    print(label_template.format(accuracy_score(truth, scores > 0.5)))

Overfit check:
Train score:	0.641
Test score:	0.640


In [42]:
'''
Metrics output
'''
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 on the validation split, with the
# predicted scores turned into class labels by a 0.5 cut-off.
print(
    classification_report(
        y_true=y_valid,
        y_pred=valid_prediction > 0.5,
    )
)

              precision    recall  f1-score   support

           0       0.65      0.85      0.74  35790103
           1       0.62      0.34      0.44  25119749

    accuracy                           0.64  60909852
   macro avg       0.63      0.60      0.59  60909852
weighted avg       0.63      0.64      0.61  60909852



In [43]:
# Make sure the output directory exists before writing: save_model does not
# create missing directories, so on a fresh checkout the write would fail.
os.makedirs('models', exist_ok=True)
model.save_model('models/full_70iter.txt')

<lightgbm.basic.Booster at 0x2af3f6dc84c0>