In [2]:
import numpy as np
import pandas as pd
import lightgbm as lgbm

In [9]:
'''
File paths
'''
import os

folder = os.path.join("/projectnb","cs542sp","netflix_wrw2", "CS542-final-project")

data_path = os.path.join(folder, "netflix_analysis_dataset.sas7bdat")

In [10]:
print(folder)

/projectnb/cs542sp/netflix_wrw2/CS542-final-project


In [11]:
next(os.walk(folder))[2]

['who_rated_what_2006.sas7bdat',
 'netflix_analysis_dataset.sas7bdat',
 'who_rated_what_2006_ans_use.sas7bdat']

In [14]:
%%time

'''
# load the data in chunks ~ get a generator
with pd.read_sas(os.path.join(folder, "combined_netflix_data.sas7bdat"), chunksize=1000000) as reader:
    for i,chunk in enumerate(reader):
        print(i, end=' ')
        if i % 10 == 29:
            print()
'''

# read a chunk of 5M lines
reader = pd.read_sas(data_path, chunksize=10_000_000)

data = next(reader)

print(data.head())
print("Memory consumed by test set      :   {} MB" .format(data.memory_usage(index=True).sum()/ 1024**2))

   User_ID  Movie_ID    Year  Rated  Ratings_for_Movie  \
0      6.0   15758.0  2005.0    0.0            58576.0   
1      6.0    7828.0  2004.0    0.0            43354.0   
2      6.0   15764.0  2004.0    0.0            59352.0   
3      6.0   15764.0  2004.0    0.0            59352.0   
4      6.0    4141.0  2005.0    0.0            45636.0   

   Ratings_for_Movie_2005  Netflix_Release_Year  Release_Year  \
0                 28592.0                1999.0        1984.0   
1                 22975.0                1999.0        1999.0   
2                 16044.0                2003.0        2003.0   
3                 16044.0                2003.0        2003.0   
4                 19845.0                2000.0        2000.0   

   AVG_Rating_for_Movie  Ratings_from_User  Ratings_from_User_2005  \
0              4.067878              626.0                   310.0   
1              3.496079              626.0                   310.0   
2              2.906777              626.0        

In [15]:
# check if any columns are none
for c in data.columns:
    print(c, "null values:", data[c].isnull().values.any())

User_ID null values: False
Movie_ID null values: False
Year null values: False
Rated null values: False
Ratings_for_Movie null values: False
Ratings_for_Movie_2005 null values: False
Netflix_Release_Year null values: False
Release_Year null values: True
AVG_Rating_for_Movie null values: False
Ratings_from_User null values: False
Ratings_from_User_2005 null values: False
AVG_Rating_from_User null values: False
User_Entry_Year null values: False


In [19]:
Y = data["Rated"]
X = data.drop(['Rated'], axis=1)

X.head()

Unnamed: 0,User_ID,Movie_ID,Year,Ratings_for_Movie,Ratings_for_Movie_2005,Netflix_Release_Year,Release_Year,AVG_Rating_for_Movie,Ratings_from_User,Ratings_from_User_2005,AVG_Rating_from_User,User_Entry_Year
0,6.0,15758.0,2005.0,58576.0,28592.0,1999.0,1984.0,4.067878,626.0,310.0,3.41853,2004.0
1,6.0,7828.0,2004.0,43354.0,22975.0,1999.0,1999.0,3.496079,626.0,310.0,3.41853,2004.0
2,6.0,15764.0,2004.0,59352.0,16044.0,2003.0,2003.0,2.906777,626.0,310.0,3.41853,2004.0
3,6.0,15764.0,2004.0,59352.0,16044.0,2003.0,2003.0,2.906777,626.0,310.0,3.41853,2004.0
4,6.0,4141.0,2005.0,45636.0,19845.0,2000.0,2000.0,3.541743,626.0,310.0,3.41853,2004.0


In [21]:
# test and train set
lgbm_data = lgbm.Dataset(X,label=Y)

# save testing data
lgbm_data.save_binary(os.path.join(folder, 'temp.bin'))

[LightGBM] [Info] Saving data to binary file /projectnb/cs542sp/netflix_wrw2/CS542-final-project/temp.bin


<lightgbm.basic.Dataset at 0x2b40d46c7c10>

In [26]:
# get validation data

# read a chunk of 5M lines
reader = pd.read_sas(data_path, chunksize=1_000_000)

vdata = next(reader)

print("Memory consumed by test set      :   {} MB" .format(data.memory_usage(index=True).sum()/ 1024**2))

vdata.head()

Memory consumed by test set      :   991.8214111328125 MB


Unnamed: 0,User_ID,Movie_ID,Year,Rated,Ratings_for_Movie,Ratings_for_Movie_2005,Netflix_Release_Year,Release_Year,AVG_Rating_for_Movie,Ratings_from_User,Ratings_from_User_2005,AVG_Rating_from_User,User_Entry_Year
0,6.0,15758.0,2005.0,0.0,58576.0,28592.0,1999.0,1984.0,4.067878,626.0,310.0,3.41853,2004.0
1,6.0,7828.0,2004.0,0.0,43354.0,22975.0,1999.0,1999.0,3.496079,626.0,310.0,3.41853,2004.0
2,6.0,15764.0,2004.0,0.0,59352.0,16044.0,2003.0,2003.0,2.906777,626.0,310.0,3.41853,2004.0
3,6.0,15764.0,2004.0,0.0,59352.0,16044.0,2003.0,2003.0,2.906777,626.0,310.0,3.41853,2004.0
4,6.0,4141.0,2005.0,0.0,45636.0,19845.0,2000.0,2000.0,3.541743,626.0,310.0,3.41853,2004.0


In [None]:
validation_data = lgbm_data.create_valid(vdata.drop([]'Rated'), label = vdata['Rated'])

In [24]:
# build the model
param = {
    'num_leaves': 31,
    'objective': 'binary',
    'metric': 'auc',
}

num_round = 5

bst = lgbm.train(param, lgbm_data, num_round, valid_sets=[validation_data])

[LightGBM] [Info] Number of positive: 4109670, number of negative: 5890330
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2154
[LightGBM] [Info] Number of data points in the train set: 10000000, number of used features: 12


LightGBMError: Cannot open data file validation.svm

In [7]:
# test the result