In [53]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data
from torch.autograd import Variable

In [3]:
# Importing the dataset
print('Importing Dataset =====>')
movies = pd.read_csv('ml-1m/movies.dat', sep = '::', header = None, engine = 'python', encoding = 'latin-1')
users = pd.read_csv('ml-1m/users.dat', sep = '::', header = None, engine = 'python', encoding = 'latin-1')
ratings = pd.read_csv('ml-1m/ratings.dat', sep = '::', header = None, engine = 'python', encoding = 'latin-1')
# Preparing the training set and the test set
print('Reading testing and training datasets =====>')
# Training and test set for 100k users
training_set_df = pd.read_csv('ml-100k/u1.base', delimiter = '\t', header=None)
test_set_df = pd.read_csv('ml-100k/u1.test', delimiter = '\t', header=None)

Importing Dataset =====>
Reading testing and training datasets =====>


In [4]:
# Convert training set and test set in numpy arrays
training_set_ar = np.array(training_set_df, dtype = 'int')
test_set_ar = np.array(test_set_df, dtype = 'int')
# Getting the number of users and movies
nb_users = int(max(max(training_set_ar[:,0]), max(test_set_ar[:,0])))
nb_movies = int(max(max(training_set_ar[:,1]), max(test_set_ar[:,1])))
nb_userAttributes = 4

In [7]:
# Get arrays for gender flags
users['female_user'] = (users[1] == 'F').astype(int)
users['male_user'] = (users[1] == 'M').astype(int)

In [8]:
# extract unique genre values
print('Extracting unique genres =====>')
genre = movies[2]
unique_genre = genre.unique()
genre_values = []
for movie_genre in unique_genre:
    mg = movie_genre.split("|")
    for g in mg:
        if g not in genre_values:
            genre_values.append(g)
            
genre_values = sorted(genre_values, key=str.lower)
print(genre_values)
print(len(genre_values))

Extracting unique genres =====>
['Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
18


In [10]:
# get genre vector
def get_genre_vector(genre_row_val):
    mg = genre_row_val.split("|")
    gen_vec = np.zeros(len(genre_values))
    gen_index = 0
    for g in genre_values:
        if g in mg:
            gen_vec[gen_index] = 1
        gen_index += 1
    return gen_vec
# unit tests for above function
'''print(get_genre_vector("Action|Adventure|Romance"))
print(get_genre_vector("Animation|Children's|Comedy"))
print(get_genre_vector("Thriller"))
print(get_genre_vector("Animation|Children's|Comedy|Romance"))'''

'print(get_genre_vector("Action|Adventure|Romance"))\nprint(get_genre_vector("Animation|Children\'s|Comedy"))\nprint(get_genre_vector("Thriller"))\nprint(get_genre_vector("Animation|Children\'s|Comedy|Romance"))'

In [11]:
# Add Genre Vector to movies dataframe
print('Creating Genre vector on movies df ====>')
movie_data = movies[2]
movie_col = []
gen_index = 0
for movie_gen in movie_data:
    gen_vec = get_genre_vector(movie_gen)
    movie_col.append(gen_vec)
    gen_index += 1
    
movies['genre_vector'] = movie_col

Creating Genre vector on movies df ====>


In [13]:
def addgenrevector(data):
    genre_array = []
    movie_id_list = data[1].tolist()
    for movie_id in movie_id_list:
        genre_array.append(movies.loc[movies[0] == movie_id]['genre_vector'])
    data['genre_vector'] = genre_array
    return data
        
print('Adding Genre Vector to training and testing datasets =====>')
training_set_gen_df = addgenrevector(training_set_df)
training_set_gen_ar = np.array(training_set_gen_df)
test_set_gen_df = addgenrevector(test_set_df)
test_set_gen_ar = np.array(test_set_gen_df)

Adding Genre Vector to training and testing datasets =====>


In [25]:
test_set_gen_df

Unnamed: 0,0,1,2,3,genre_vector
0,1,6,5,887431973,"5 [1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ..."
1,1,10,3,875693118,"9 [1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,1,12,5,878542960,"11 [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0,..."
3,1,14,5,874965706,"13 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0,..."
4,1,17,3,875073198,"16 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0,..."
...,...,...,...,...,...
19995,458,648,4,886395899,"642 [1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0..."
19996,458,1101,4,886397931,"1085 [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0...."
19997,459,934,3,879563639,"922 [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0..."
19998,460,10,3,882912371,"9 [1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [131]:
def createmultidimensionalmatrix(data):
    print(data.shape)
    gen_data = []
    for id_users in range(1, nb_users + 1):
        id_movies = data[1][data[0] == id_users]
        id_ratings = data[2][data[0] == id_users]
        user_genre_list = data['genre_vector'][data[0] == id_users][data[2] >= 3]
        female_user = float(users['female_user'][users[0] == id_users])
        male_user = float(users['male_user'][users[0] == id_users])
        user_age = float(users[2][users[0] == id_users])
#         reg_months = float(users[3][users[0] == id_users])
        user_genre_sum = np.zeros(len(genre_values))
        for usr_gen_vec in user_genre_list:
            if len(usr_gen_vec):
                user_genre_sum = user_genre_sum + np.array(usr_gen_vec)
        data_reshaped = np.zeros(nb_movies)
        # Create a matrix with users in rows and ratings for each movie in columns
        data_reshaped[id_movies - 1] = id_ratings
        # Add columns of user genre only for good ratings
        if user_genre_sum[0].shape:
            data_reshaped = np.append(data_reshaped, user_genre_sum[0])
        else:
            data_reshaped = np.append(data_reshaped, user_genre_sum)
            
        data_reshaped = np.append(data_reshaped, [female_user])
        data_reshaped = np.append(data_reshaped, [male_user])
        data_reshaped = np.append(data_reshaped, [user_age])
#         data_reshaped = np.append(data_reshaped, [reg_months])
        gen_data.append(list(data_reshaped))
    return gen_data
        
        
print('Creating 2D matrix ======>')    
training_gen_data = createmultidimensionalmatrix(training_set_gen_df)
test_gen_data = createmultidimensionalmatrix(test_set_gen_df)

(80000, 5)
(20000, 5)


In [182]:
list_col = pd.read_csv('list_col.csv', header=None)
list_col_v = list(np.array(list_col).reshape(1,-1))
list_col_v

[array(['Toy Story (1995)', 'GoldenEye (1995)', 'Four Rooms (1995)', ...,
        'Female', 'Male', 'Age'], dtype=object)]

In [185]:
training_df = pd.DataFrame(data = training_gen_data,
                      columns = list_col_v[0])

In [186]:
training_df

Unnamed: 0,Toy Story (1995),GoldenEye (1995),Four Rooms (1995),Get Shorty (1995),Copycat (1995),Shanghai Triad (Yao a yao yao dao waipo qiao) (1995),Twelve Monkeys (1995),Babe (1995),Dead Man Walking (1995),Richard III (1995),...,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,Female,Male,Age
0,5.0,3.0,4.0,3.0,3.0,0.0,4.0,1.0,5.0,0.0,...,2.0,3.0,20.0,6.0,14.0,0.0,0.0,1.0,0.0,1.0
1,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,1.0,0.0,9.0,1.0,7.0,0.0,0.0,0.0,1.0,56.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,2.0,2.0,1.0,0.0,1.0,0.0,1.0,25.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,5.0,2.0,0.0,0.0,1.0,0.0,1.0,45.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2.0,10.0,3.0,11.0,1.0,3.0,0.0,1.0,25.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
938,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,...,1.0,1.0,9.0,2.0,4.0,0.0,0.0,1.0,0.0,25.0
939,0.0,0.0,0.0,2.0,0.0,0.0,4.0,5.0,3.0,0.0,...,0.0,1.0,13.0,6.0,23.0,4.0,0.0,0.0,1.0,1.0
940,5.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,1.0,1.0,4.0,0.0,1.0,0.0,1.0,0.0,1.0,18.0
941,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.0,1.0,13.0,5.0,12.0,3.0,3.0,1.0,0.0,50.0


In [54]:
# Converting the data into Torch tensors
print('Creating torch tensors ======>')
training_set_1 = torch.FloatTensor(training_gen_data)
test_set_1 = torch.FloatTensor(test_gen_data)



In [148]:
training_set_1

tensor([[ 5.,  3.,  4.,  ...,  0.,  1., 10.],
        [ 4.,  0.,  0.,  ...,  1., 56., 16.],
        [ 0.,  0.,  0.,  ...,  1., 25., 15.],
        ...,
        [ 5.,  0.,  0.,  ...,  1., 18., 20.],
        [ 0.,  0.,  0.,  ...,  0., 50.,  6.],
        [ 0.,  5.,  0.,  ...,  0., 45., 12.]])

In [104]:
class SAE(nn.Module):
    def __init__(self, ):
        super(SAE, self).__init__()
        self.fc1 = nn.Linear(1703, 800)
        self.fc2 = nn.Linear(800, 400)
        self.fc3 = nn.Linear(400, 800)
        self.fc4 = nn.Linear(800, 1703)
        self.activation = nn.Sigmoid()
        
    def forward(self, x):
        x = self.activation(self.fc1(x))
        x = self.activation(self.fc2(x))
        x = self.activation(self.fc3(x))
        x = self.fc4(x)
        return x
sae = SAE()
criterion = nn.MSELoss()
optimizer = optim.RMSprop(sae.parameters(), lr=0.01, weight_decay=0.5)

In [105]:
nb_epoch = 200
for epoch in range(1, nb_epoch+1):
    train_loss = 0
    s = 0.
    for id_user in range(nb_users):
        input = Variable(training_set_1[id_user]).unsqueeze(0)
        target = input.clone()
        #Select only rating related columns to compute loss
        target_ratings = target[:, :nb_movies]
        if torch.sum(target.data > 0) > 0:
            output = sae(input)
            output_ratings = output[:, :nb_movies]
            target.require_grad = False
            output[target == 0] = 0
            loss = criterion(output_ratings, target_ratings)
            mean_corrector = nb_movies/float(torch.sum(target.data > 0) + 1e-10)
            loss.backward()
            train_loss += np.sqrt(loss.data*mean_corrector)
            s += 1.
            optimizer.step()
    print('epoch: '+str(epoch)+' loss: '+str(train_loss/s))
    
    
# Testing the SAE
test_loss = 0
s = 0.
for id_user in range(nb_users):
    input = Variable(training_set_1[id_user]).unsqueeze(0)
    target = Variable(test_set_1[id_user]).unsqueeze(0)
    target_ratings = target[:, :nb_movies]
    if torch.sum(target.data > 0) > 0:
        output = sae(input)
        output_ratings = output[:, :nb_movies]
        target.require_grad = False
        output[target == 0] = 0
        loss = criterion(output_ratings, target_ratings)
        mean_corrector = nb_movies/float(torch.sum(target.data > 0) + 1e-10)
        test_loss += np.sqrt(loss.data*mean_corrector)
        s += 1.
print('test loss: '+str(test_loss/s))

epoch: 1 loss: tensor(1.6355)
epoch: 2 loss: tensor(1.1581)
epoch: 3 loss: tensor(1.0765)
epoch: 4 loss: tensor(0.9942)
epoch: 5 loss: tensor(0.9797)
epoch: 6 loss: tensor(0.9403)
epoch: 7 loss: tensor(0.9409)
epoch: 8 loss: tensor(0.9177)
epoch: 9 loss: tensor(0.9225)
epoch: 10 loss: tensor(0.9065)
epoch: 11 loss: tensor(0.9121)
epoch: 12 loss: tensor(0.9002)
epoch: 13 loss: tensor(0.9058)
epoch: 14 loss: tensor(0.8959)
epoch: 15 loss: tensor(0.9016)
epoch: 16 loss: tensor(0.8931)
epoch: 17 loss: tensor(0.8982)
epoch: 18 loss: tensor(0.8909)
epoch: 19 loss: tensor(0.8957)
epoch: 20 loss: tensor(0.8891)
epoch: 21 loss: tensor(0.8940)
epoch: 22 loss: tensor(0.8876)
epoch: 23 loss: tensor(0.8922)
epoch: 24 loss: tensor(0.8863)
epoch: 25 loss: tensor(0.8907)
epoch: 26 loss: tensor(0.8851)
epoch: 27 loss: tensor(0.8895)
epoch: 28 loss: tensor(0.8842)
epoch: 29 loss: tensor(0.8884)
epoch: 30 loss: tensor(0.8832)
epoch: 31 loss: tensor(0.8876)
epoch: 32 loss: tensor(0.8825)
epoch: 33 loss: t

In [106]:
### model's state_dict 
print("Model's state_dict:")
for param_tensor in sae.state_dict():
    print(param_tensor, "\t", sae.state_dict()[param_tensor].size())

### optimzer's state_dict 출력
print("Optimizer's state_dict:")
for var_name in optimizer.state_dict():
    print(var_name, "\t", optimizer.state_dict()[var_name])

Model's state_dict:
fc1.weight 	 torch.Size([800, 1703])
fc1.bias 	 torch.Size([800])
fc2.weight 	 torch.Size([400, 800])
fc2.bias 	 torch.Size([400])
fc3.weight 	 torch.Size([800, 400])
fc3.bias 	 torch.Size([800])
fc4.weight 	 torch.Size([1703, 800])
fc4.bias 	 torch.Size([1703])
Optimizer's state_dict:
state 	 {0: {'step': 188600, 'square_avg': tensor([[6.2502e-06, 6.2500e-06, 6.2499e-06,  ..., 6.2501e-06, 6.2499e-06,
         6.2546e-06],
        [6.1226e-06, 6.2618e-06, 6.2756e-06,  ..., 6.2650e-06, 6.2353e-06,
         6.7901e-06],
        [6.2504e-06, 6.2499e-06, 6.2498e-06,  ..., 6.2499e-06, 6.2499e-06,
         6.2516e-06],
        ...,
        [6.2497e-06, 6.2499e-06, 6.2499e-06,  ..., 6.2499e-06, 6.2500e-06,
         6.2513e-06],
        [6.2497e-06, 6.2499e-06, 6.2499e-06,  ..., 6.2499e-06, 6.2499e-06,
         6.2518e-06],
        [6.2517e-06, 6.2499e-06, 6.2504e-06,  ..., 6.2502e-06, 6.2499e-06,
         6.2394e-06]])}, 1: {'step': 188600, 'square_avg': tensor([6.2500e-06

In [107]:
sae.eval()

SAE(
  (fc1): Linear(in_features=1703, out_features=800, bias=True)
  (fc2): Linear(in_features=800, out_features=400, bias=True)
  (fc3): Linear(in_features=400, out_features=800, bias=True)
  (fc4): Linear(in_features=800, out_features=1703, bias=True)
  (activation): Sigmoid()
)

In [108]:
PATH = './model/movie_recommendation.pth'
torch.save(sae.state_dict(), PATH)

In [122]:
outputs = sae(training_set_1[939])
outputs[:10]

tensor([3.8050, 3.3847, 2.9091, 3.4274, 3.2505, 3.2813, 4.0260, 3.8497, 4.0095,
        3.9220], grad_fn=<SliceBackward>)