In [1]:
import os
import re
import pandas as pd
import numpy as np
import nltk
import pickle

%load_ext autoreload
%autoreload 2

from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import pairwise_distances

In [2]:
movies_df = pd.read_csv("/Users/georgenteves/rec_sys_proj/data/serendipity-sac2018/movies.csv",sep=',')

In [3]:
movies_df = movies_df.drop(['releaseDate', 'imdbId', 'tmdbId', 'Unnamed: 8','Unnamed: 9','Unnamed: 10','Unnamed: 11'], axis = 1) 

In [4]:
answers_df = pd.read_csv("/Users/georgenteves/rec_sys_proj/data/serendipity-sac2018/answers.csv",sep=',')

In [5]:
answers_df = answers_df.drop(['rating', 'predictedRating', 's_ser_rel', 'q','s_ser_find','s_ser_imp','s_ser_rec', 'm_ser_rel', 'm_ser_find', 'm_ser_imp' , 'm_ser_rec'], axis = 1) 

In [6]:
movies_rated_by_user = {}
answers_df_timestamp_ascending = answers_df.sort_values('timestamp')
answers_df_timestamp_ascending = answers_df_timestamp_ascending.reset_index(drop=True)

for answer in range(0, len(answers_df_timestamp_ascending)):
    if int(answers_df_timestamp_ascending['userId'][answer]) in movies_rated_by_user:
        movies_rated_by_user[int(answers_df_timestamp_ascending['userId'][answer])].append(int(answers_df_timestamp_ascending['movieId'][answer]))
    else:
        movies_rated_by_user[int(answers_df_timestamp_ascending['userId'][answer])] = []
        movies_rated_by_user[int(answers_df_timestamp_ascending['userId'][answer])].append(answers_df_timestamp_ascending['movieId'][answer])        

# SImilarity movies per user

In [7]:
def jaccard_similarity_with_previous_rated_movies(user_movies_list, current_movie):
    
    total_similarity_sum = 0
    total_movies_number = 0
    
    for movie in user_movies_list:
        number_of_criteria = 0
        current_similarity_sum = 0
        
        if (movies_df.index[movies_df['movieId'] == current_movie][0] - 1) >= 0:
            list1 = movies_df['directedBy'][movies_df.index[movies_df['movieId'] == movie][0] - 1].split(',')
            list2 = movies_df['directedBy'][movies_df.index[movies_df['movieId'] == current_movie][0] - 1].split(',')
            intersection = len(list(set(list1).intersection(list2)))
            union = (len(list1) + len(list2)) - intersection
            current_similarity_sum += float(intersection) / union
            number_of_criteria += 1

            list1 = movies_df['starring'][movies_df.index[movies_df['movieId'] == movie][0] - 1].split(',')
            list2 = movies_df['starring'][movies_df.index[movies_df['movieId'] == current_movie][0] - 1].split(',')
            intersection = len(list(set(list1).intersection(list2)))
            union = (len(list1) + len(list2)) - intersection
            current_similarity_sum += float(intersection) / union
            number_of_criteria += 1

            list1 = movies_df['genres'][movies_df.index[movies_df['movieId'] == movie][0] - 1].split(',')
            list2 = movies_df['genres'][movies_df.index[movies_df['movieId'] == current_movie][0] - 1].split(',')
            intersection = len(list(set(list1).intersection(list2)))
            union = (len(list1) + len(list2)) - intersection
            current_similarity_sum += float(intersection) / union
            number_of_criteria += 1

            current_similarity = current_similarity_sum / number_of_criteria
            total_similarity_sum += current_similarity
            total_movies_number += 1
    
    if total_movies_number > 0:
        answer = total_similarity_sum / total_movies_number
    else:
        answer = 0
    
    return answer

In [8]:
def similarity_with_previous_rated_movies_by_user(userId, current_movie):
    similarity = 0
    if userId in movies_rated_by_user:
        if current_movie not in movies_rated_by_user[userId]:
            similarity = jaccard_similarity_with_previous_rated_movies(movies_rated_by_user[userId], current_movie)
        else:
            previous_movies_rated_by_user = []
            current_movie_index = movies_rated_by_user[userId].index(current_movie)
            for movie_index in range(0, len(movies_rated_by_user[userId])):
                if movie_index < current_movie_index:
                    previous_movies_rated_by_user.append(movies_rated_by_user[userId][movie_index])
                else:
                    break
            if len(previous_movies_rated_by_user) > 0:
                similarity = jaccard_similarity_with_previous_rated_movies(previous_movies_rated_by_user, current_movie)  
                
    return similarity

### Popularity in ml-latest-small

In [9]:
movies_ratings = pd.read_csv("/Users/georgenteves/rec_sys_proj/data/ml-latest-small/ratings.csv",sep=',')

In [10]:
movies_all = pd.read_csv("/Users/georgenteves/rec_sys_proj/data/ml-latest-small/movies.csv",sep=',')

In [11]:
rated_movies_popularity = {}

for movie in range(0, len(movies_ratings)):
    if int(movies_ratings['movieId'][movie]) not in rated_movies_popularity:
        times_rated = len(movies_ratings.loc[movies_ratings['movieId'] == int(movies_ratings['movieId'][movie])])
        rated_movies_popularity[int(movies_ratings['movieId'][movie])] = -np.log10(times_rated / movies_ratings['userId'].nunique())

# Serendipity index

In [12]:
questions = ['s1', 's2', 's3', 's4', 's5', 's6', 's7', 's8']
movies_serendipity_sum = {}
movies_serendipity_users = {}
movies_serendipity = {}

for answer in range(0, len(answers_df_timestamp_ascending)):
    number_of_questions = 0
    current_user_serendipity_sum = 0
    
    for question in questions:
        if (np.isnan(answers_df_timestamp_ascending[question][answer]) == False):
            number_of_questions += 1
            current_user_serendipity_sum += answers_df_timestamp_ascending[question][answer]/5
    
    if number_of_questions != 0:
        current_user_serendipity = current_user_serendipity_sum / number_of_questions
    
    if int(answers_df_timestamp_ascending['movieId'][answer]) not in movies_serendipity_sum:
        movies_serendipity_sum[int(answers_df_timestamp_ascending['movieId'][answer])] = current_user_serendipity
    else:
        movies_serendipity_sum[int(answers_df_timestamp_ascending['movieId'][answer])] += current_user_serendipity
        
    if int(answers_df_timestamp_ascending['movieId'][answer]) not in movies_serendipity_users:
        movies_serendipity_users[int(answers_df_timestamp_ascending['movieId'][answer])] = 1
    else:
        movies_serendipity_users[int(answers_df_timestamp_ascending['movieId'][answer])] += 1

for movie in movies_serendipity_users:
    movies_serendipity[movie] = movies_serendipity_sum[movie] / movies_serendipity_users[movie]


# Categories

In [13]:
import random

number_of_categories = 5
movie_ids = movies_df["movieId"].unique()
random.shuffle(movie_ids)

In [14]:
categories = np.random.randint(low = 1,high = number_of_categories+1, size=len(movie_ids))

In [15]:
movies_with_categories = pd.DataFrame({'Movie Id': movie_ids[:], 'Category': categories[:],'Values':np.ones(len(movie_ids))}) 

In [16]:
categorized_movies = movies_with_categories.pivot(
    index='Category',
    columns='Movie Id',
    values='Values'
).fillna(0).astype(int)

In [17]:
cat_dict = {}
for cat_id in movies_with_categories['Category'].unique():
    cat_dict[cat_id] = movies_with_categories.loc[movies_with_categories['Category']==cat_id]['Movie Id'].tolist()

In [18]:
pip install pymprog

Note: you may need to restart the kernel to use updated packages.


In [19]:
from pymprog import *

In [20]:
movies_df['directedBy'].fillna('', inplace=True)
movies_df['starring'].fillna('', inplace=True)
movies_df['genres'].fillna('', inplace=True)

In [21]:
model_df = pd.DataFrame(
    [int(row['userId']), int(row['movieId']), similarity_with_previous_rated_movies_by_user(int(row['userId']), int(row['movieId'])), rated_movies_popularity[int(row['movieId'])], movies_serendipity[int(row['movieId'])]] for index, row in answers_df_timestamp_ascending.iterrows() if int(row['movieId']) in rated_movies_popularity)

In [22]:
model_df.columns = ['userId', 'movieId', 'similarity_previous_rated_by_user', 'popularity', 'movie_serendipity']

In [23]:
model_df.loc[(model_df.movie_serendipity >= 0.45),'movie_serendipity']= int(1)
model_df.loc[(model_df.movie_serendipity < 0.45),'movie_serendipity']= int(0)

In [24]:
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(model_df, test_size = 0.2, random_state = 28)
print ('The size of train set and test set is {0} and {1} respectively.'.format(len(train_set),len(test_set)))

The size of train set and test set is 1152 and 288 respectively.


In [25]:
train_X = []
train_Y = []

for index, values in train_set.iterrows():
    train_X.append([values['similarity_previous_rated_by_user'], values['popularity']])
    train_Y.append(values['movie_serendipity'])

In [26]:
test_X = []
test_Y = []

for index, values in test_set.iterrows():
    test_X.append([values['similarity_previous_rated_by_user'], values['popularity']])
    test_Y.append(values['movie_serendipity'])

# Linear Regression

In [27]:
from sklearn.linear_model import LinearRegression
linear_regr_model = LinearRegression()
linear_regr_model.fit(train_X, train_Y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [28]:
linear_regr_model.score(test_X, test_Y)

-0.03436054346005424

In [29]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state=0).fit(train_X, train_Y)



In [30]:
clf.score(test_X,test_Y)

0.9131944444444444

In [31]:
from sklearn.metrics import mean_squared_error
y_pred = clf.predict(test_X)

In [32]:
mean_squared_error(test_Y, y_pred)

0.08680555555555555

# Non-Linear

In [33]:
from sklearn.tree import DecisionTreeClassifier

regressor = DecisionTreeClassifier(random_state = 0)  
regressor.fit(train_X, train_Y) 
regressor.score(test_X, test_Y) 

0.8541666666666666

In [34]:
y_pred = regressor.predict(test_X)
mean_squared_error(test_Y, y_pred)

0.14583333333333334

# Solver

In [42]:
#minified movies categories with the 100 least popular movies of each category
min_cat_dict = {}

for cat_id in cat_dict:
    min_cat_dict[cat_id] = []
    current_categories_movies = pd.DataFrame(
    [int(cat_dict[cat_id][movie_index]), rated_movies_popularity[cat_dict[cat_id][movie_index]]] for movie_index in range(0, len(cat_dict[cat_id])) if cat_dict[cat_id][movie_index] in rated_movies_popularity)
    current_categories_movies.columns = ['movieId', 'popularity']
    current_categories_movies_descending = current_categories_movies.sort_values(by='popularity')
    current_categories_movies_descending = current_categories_movies_descending.reset_index(drop=True)
    for movie in current_categories_movies_descending[0:2].iterrows():
        min_cat_dict[cat_id].append(int(movie[1]['movieId']))

In [43]:
all_movies_min = []

for cat_id in min_cat_dict:
    for movie_index in range(0, len(min_cat_dict[cat_id])):
        all_movies_min.append(min_cat_dict[cat_id][movie_index])

In [45]:
params = []
the_users = answers_df['userId'].unique()[0:5]

for index in range(0, 10):
    if index < len(the_users):
        params.append({
            "movieId": all_movies_min[index],
            "userId": the_users[index],
            "recommendation": 0
        })
    else:
        params.append({
            "movieId": all_movies_min[index],
            "userId": 0,
            "recommendation": 0
        })
    
x_df = pd.DataFrame(params)

In [46]:
# pivot ratings into movie features
x_pivot = x_df.pivot(
    index='movieId',
    columns='userId',
    values='recommendation'
).fillna(0)

In [47]:
from collections import defaultdict
x_dict = {}
for ind,row in x_pivot.iterrows():
    dd = row.to_dict()
    for us in dd:
        if us > 0:
            x_dict[(us,ind)] = dd[us]

In [48]:
x_dict_keys = []
for k in x_dict:
    x_dict_keys.append(k)

In [84]:
print(x_dict_keys)

[(117112, 110), (125112, 110), (144726, 110), (200400, 110), (205229, 110), (117112, 296), (125112, 296), (144726, 296), (200400, 296), (205229, 296), (117112, 318), (125112, 318), (144726, 318), (200400, 318), (205229, 318), (117112, 356), (125112, 356), (144726, 356), (200400, 356), (205229, 356), (117112, 480), (125112, 480), (144726, 480), (200400, 480), (205229, 480), (117112, 527), (125112, 527), (144726, 527), (200400, 527), (205229, 527), (117112, 590), (125112, 590), (144726, 590), (200400, 590), (205229, 590), (117112, 593), (125112, 593), (144726, 593), (200400, 593), (205229, 593), (117112, 1210), (125112, 1210), (144726, 1210), (200400, 1210), (205229, 1210), (117112, 2571), (125112, 2571), (144726, 2571), (200400, 2571), (205229, 2571)]


In [88]:
predicted = {}
for u in the_users:
    for c in min_cat_dict:
        for m in range(0, len(min_cat_dict[c])):
            predicted[(similarity_with_previous_rated_movies_by_user(u, min_cat_dict[c][m]), rated_movies_popularity[min_cat_dict[c][m]])] = regressor.predict([[similarity_with_previous_rated_movies_by_user(u, min_cat_dict[c][m]), rated_movies_popularity[min_cat_dict[c][m]]]])  

In [92]:
begin("assign")
verbose(True)  # be verbose
x = var('x', x_dict_keys, bool)

for u in the_users: sum(x[u,min_cat_dict[c][m]] for c in min_cat_dict for m in range(0, len(min_cat_dict[c]))) == 2 
for c in min_cat_dict: (1/len(min_cat_dict[cat_id]))*sum(predicted[(similarity_with_previous_rated_movies_by_user(u, min_cat_dict[c][m]), rated_movies_popularity[min_cat_dict[c][m]])]*x[u,min_cat_dict[c][m]] for u in the_users for m in range(0, len(min_cat_dict[c]))) >= 0.5
maximize((1/len(min_cat_dict[cat_id]))*sum(predicted[(similarity_with_previous_rated_movies_by_user(u, min_cat_dict[c][m]), rated_movies_popularity[min_cat_dict[c][m]])]*x[u,min_cat_dict[c][m]] for u in the_users for c in min_cat_dict for m in range(0, len(min_cat_dict[c]))))
solver(int, 
    #this branching option often helps 
    br_tech=glpk.GLP_BR_PCH, 
)
solve() # solve the model
print("###>Objective value: %f"%vobj())
sensitivity() # sensitivity report
end() #Good habit: do away with the model

R1: (x[205229,318] + x[205229,480] + x[205229,296] + x[205229,110] + x[205229,1210] + x[205229,590] + x[205229,356] + x[205229,593] + x[205229,2571] + x[205229,527]==2)
R2: (x[117112,318] + x[117112,480] + x[117112,296] + x[117112,110] + x[117112,1210] + x[117112,590] + x[117112,356] + x[117112,593] + x[117112,2571] + x[117112,527]==2)
R3: (x[144726,318] + x[144726,480] + x[144726,296] + x[144726,110] + x[144726,1210] + x[144726,590] + x[144726,356] + x[144726,593] + x[144726,2571] + x[144726,527]==2)
R4: (x[200400,318] + x[200400,480] + x[200400,296] + x[200400,110] + x[200400,1210] + x[200400,590] + x[200400,356] + x[200400,593] + x[200400,2571] + x[200400,527]==2)
R5: (x[125112,318] + x[125112,480] + x[125112,296] + x[125112,110] + x[125112,1210] + x[125112,590] + x[125112,356] + x[125112,593] + x[125112,2571] + x[125112,527]==2)
R6: 0.5 * ( 0.0 * x[205229,318] + 0.0 * x[205229,480] + 0.0 * x[117112,318] + 0.0 * x[117112,480] + x[144726,318] + x[144726,480] + x[200400,318] + 0.0 * x

model('assign') is not the default model.