In [1]:
import os
import re
import pandas as pd
import numpy as np
import nltk
import pickle

%load_ext autoreload
%autoreload 2

import parent_modules
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import pairwise_distances

from definitions import *
from helpers import *
from baseline_cf import *

# Load Movielens dataset for Item -Item Similarities
movielens_dataset = MovieLensRatingsDataset()
movielens_dataset_small = MovieLensRatingsDataset(keep=0.1)


## Simillarities

In [2]:
# Jaccard Similarity
movielens_dataset.jaccard_similarity.head()



movieId,1,2,3,5,6,7,9,10,11,12,...,159093,164179,166528,168250,168252,174055,176371,177765,179819,187593
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.301887,0.142857,0.12973,0.248826,0.132979,0.039548,0.296804,0.203046,0.067797,...,0.027778,0.076923,0.069892,0.038889,0.065934,0.045198,0.05,0.027933,0.033898,0.027933
2,0.301887,1.0,0.2,0.161017,0.220126,0.128,0.027523,0.325,0.178571,0.073394,...,0.027273,0.097345,0.104348,0.036036,0.119266,0.046296,0.083333,0.018182,0.037383,0.046729
3,0.142857,0.2,1.0,0.222222,0.165217,0.246154,0.130435,0.169231,0.129032,0.1875,...,0.0,0.04918,0.046875,0.018519,0.050847,0.039216,0.055556,0.0,0.02,0.0
5,0.12973,0.161017,0.222222,1.0,0.154545,0.298246,0.097561,0.150794,0.166667,0.086957,...,0.022222,0.055556,0.052632,0.021277,0.037736,0.022222,0.041667,0.0,0.023256,0.0
6,0.248826,0.220126,0.165217,0.154545,1.0,0.149123,0.0625,0.311688,0.192308,0.080808,...,0.03,0.096154,0.083333,0.029412,0.076923,0.040404,0.07,0.0,0.020202,0.030303


In [3]:
# Pearson Similarity
movielens_dataset.pearson_similarity.head()

movieId,1,2,3,5,6,7,9,10,11,12,...,159093,164179,166528,168250,168252,174055,176371,177765,179819,187593
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.178983,0.136128,0.153725,0.122948,0.099621,0.112117,0.185387,0.144952,0.149678,...,-0.016475,0.079961,0.028192,0.018223,0.057158,0.061225,0.057341,-0.010076,0.051466,0.014426
2,0.178983,1.0,0.173598,0.17939,0.101651,0.104321,-0.023199,0.258169,0.124763,0.0826,...,0.009875,0.129452,0.145067,0.022873,0.219752,0.048388,0.154305,-0.033434,0.110026,0.117877
3,0.136128,0.173598,1.0,0.359519,0.1224,0.373282,0.254815,0.129034,0.164854,0.202242,...,-0.055394,-0.008482,-0.016528,-0.043928,-0.004138,0.04574,0.033977,-0.050395,0.024357,-0.053386
5,0.153725,0.17939,0.359519,1.0,0.135922,0.458714,0.216356,0.144774,0.20356,0.117635,...,0.020997,0.006304,0.002984,-0.003354,-0.022626,-0.003864,0.002731,-0.045835,-0.001546,-0.048556
6,0.122948,0.101651,0.1224,0.135922,1.0,0.100664,0.138454,0.254995,0.15915,0.131322,...,0.017029,0.127574,0.083804,0.029891,0.10694,0.073934,0.124326,-0.082322,0.000549,0.028282


In [4]:
# Cosine Similarity
movielens_dataset.cosine_similarity.head()

movieId,1,2,3,5,6,7,9,10,11,12,...,159093,164179,166528,168250,168252,174055,176371,177765,179819,187593
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.446971,0.305854,0.302137,0.396841,0.271328,0.183813,0.46352,0.358906,0.225869,...,0.095783,0.213547,0.184635,0.130367,0.1903,0.152449,0.169952,0.090992,0.137871,0.114891
2,0.446971,1.0,0.301151,0.294109,0.317119,0.237974,0.056544,0.452402,0.295874,0.157833,...,0.090493,0.226755,0.246286,0.108297,0.298076,0.122652,0.228011,0.046223,0.169823,0.179888
3,0.305854,0.301151,1.0,0.418275,0.253466,0.433397,0.287134,0.269804,0.268005,0.244136,...,0.0,0.066637,0.063582,0.015093,0.067422,0.094188,0.092896,0.0,0.070658,0.0
5,0.302137,0.294109,0.418275,1.0,0.253221,0.50634,0.248879,0.269911,0.293094,0.161519,...,0.067381,0.074178,0.075035,0.048315,0.044274,0.043489,0.058785,0.0,0.042411,0.0
6,0.396841,0.317119,0.253466,0.253221,1.0,0.229346,0.193095,0.441042,0.316268,0.197054,...,0.093126,0.221449,0.190364,0.110655,0.19921,0.141733,0.199744,0.0,0.072779,0.100128


In [5]:
# Hamming Similarity
movielens_dataset.hamming_similarity.head()

movieId,1,2,3,5,6,7,9,10,11,12,...,159093,164179,166528,168250,168252,174055,176371,177765,179819,187593
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.490909,0.524675,0.532468,0.488312,0.519481,0.537662,0.462338,0.503896,0.537662,...,0.537662,0.532468,0.52987,0.537662,0.532468,0.542857,0.535065,0.532468,0.537662,0.532468
2,0.490909,1.0,0.703896,0.701299,0.615584,0.690909,0.714286,0.631169,0.651948,0.719481,...,0.714286,0.709091,0.703896,0.714286,0.716883,0.716883,0.719481,0.714286,0.719481,0.724675
3,0.524675,0.703896,1.0,0.854545,0.711688,0.838961,0.880519,0.667532,0.758442,0.880519,...,0.85974,0.841558,0.833766,0.857143,0.846753,0.864935,0.857143,0.862338,0.862338,0.862338
5,0.532468,0.701299,0.854545,1.0,0.724675,0.867532,0.893506,0.680519,0.781818,0.883117,...,0.880519,0.857143,0.849351,0.875325,0.85974,0.880519,0.872727,0.880519,0.880519,0.880519
6,0.488312,0.615584,0.711688,0.724675,1.0,0.719481,0.753247,0.649351,0.685714,0.745455,...,0.74026,0.732468,0.72987,0.737662,0.74026,0.742857,0.74026,0.735065,0.74026,0.742857


## Datasets Generation

### Generate CF, User-Similarities, Categories for 100% of the Dataset

In [18]:
# Calculate Baseline Df
# Read the Commends for the baseline_cf module functionality
baseline_cf_matrix = load_baseline_cf(
    from_pickle=True,
    df=movielens_dataset.full_df)
baseline_cf_matrix.shape

(385, 2221)

In [19]:
# Load Movielens dataset for User - User Similarities
movielens_user_based = MovieLensRatingsDataset(user_based=True)
movielens_user_based.pivot_df.shape

(2221, 385)

In [8]:
movie_ids = movielens_dataset.df["movieId"].unique()
categories = CategoriesDataset(movie_ids, 10)
categories_df = categories.categories_df
categories_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,236,237,238,239,240,241,242,243,244,245
0,23,30,35,53,69,71,72,78,94,104,...,0,0,0,0,0,0,0,0,0,0
1,2,14,33,42,43,50,56,67,74,82,...,0,0,0,0,0,0,0,0,0,0
2,3,8,9,10,12,18,39,54,57,75,...,0,0,0,0,0,0,0,0,0,0
3,5,21,22,29,31,38,55,63,73,90,...,0,0,0,0,0,0,0,0,0,0
4,1,6,11,17,19,20,34,46,52,65,...,0,0,0,0,0,0,0,0,0,0


### Generate CF, User-Similarities, Categories for x% partion of the Dataset

In [2]:
small_cf_pathname = os.path.join(MODELS_DIR, "baseline-cf-pickle-small")
baseline_cf_matrix_small = load_baseline_cf(
    from_pickle=False,
    pickle_file_name = small_cf_pathname,
    df=movielens_dataset_small.full_df)
baseline_cf_matrix_small.shape

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 1.2264
Item-based Model : Test Set - 1.2263665856098278
RMSE: 1.0509
Item-based Model : Training Set - 1.0509246683189424


(38, 196)

In [3]:
movielens_user_based_small = MovieLensRatingsDataset(
    df=movielens_dataset_small.full_df,
    preprocess_df=False,
    user_based=True
)
movielens_user_based_small.pivot_df.shape

(196, 38)

In [4]:
movie_ids_small = movielens_dataset_small.df["movieId"].unique()
categories_small = CategoriesDataset(movie_ids_small, 6)
categories_df_small = categories_small.categories_df
categories_df_small.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,58,59,60,61,62,63,64,65,66,67
0,0,1,2,7,10,13,16,17,21,22,...,166,168,171,179,180,182,185,186,190,195
1,4,5,6,9,14,15,20,25,27,30,...,163,167,172,174,177,178,184,187,191,193
2,3,8,11,12,18,19,26,31,32,36,...,192,194,0,0,0,0,0,0,0,0


## Problem-A definition and Solution

In [5]:
from pyomo.environ import *
from pyomo.solvers import plugins

L = 5

K = 0.03

D = 0.1
# ri_dataset = BaselineRIDataset(df=baseline_cf_matrix, default_limit=L)
ri_dataset_small = BaselineRIDataset(df=baseline_cf_matrix_small, default_limit=L)


def define_pyomo_model(
    ri_matrix:pd.DataFrame, 
    cs_matrix:pd.DataFrame,
    dis_matrix:pd.DataFrame,
    Lmax,
    Kmax,
    Dmax
):
    model = AbstractModel()

    # Define Parameters of the Model
    model.i = Param(within=NonNegativeIntegers, initialize = ri_matrix.shape[1] - 1)
    model.u = Param(within=NonNegativeIntegers, initialize = ri_matrix.shape[0] - 1)
    model.c = Param(within=NonNegativeIntegers, initialize = cs_matrix.shape[0] - 1)

    model.K = Param(initialize=Kmax)
    model.D = Param(initialize=Dmax)
    model.Lmax = Param(initialize=Lmax)
    model.L = RangeSet(0, model.Lmax -1)

    model.I = RangeSet(0, model.i)
    model.U = RangeSet(0, model.u)
    model.C = RangeSet(0, model.c)
    model.Cmax = RangeSet(0, cs_matrix.shape[1] - 1)


    model.ri = Param(model.U, model.I,
                    initialize=lambda model, i, j: ri_matrix.values[i][j])
    model.ds = Param(model.U, model.U,
                     initialize=lambda model, i, j: 1 - abs(dis_matrix.values[i][j]))

    model.cs = Param(model.C, model.Cmax, 
                     initialize=lambda model, i, j: cs_matrix.values[i][j])

    # model.x = Var(model.U, model.I, domain=UnitInterval)
    model.x = Var(model.U, model.I, domain=UnitInterval, bounds=(0, 1))

    # Define Constraints of the Model
    def coverage_constraint(model, c):
        cat_i = set(model.cs[c, i] for i in model.Cmax if model.cs[c, i] != 0)
        return sum( sum(model.x[u, i] for u in model.U) for i in cat_i) / len(cat_i) == model.K

    def diversity_constraint(model, c):
        cat_i = set(model.cs[c, i] for i in model.Cmax if model.cs[c, i] != 0)
        constant = model.K * len(cat_i)
        summations = sum(sum(sum(  model.x[u,i] * model.ds[u,v] * model.x[v,i] 
                                 for v in model.U if v!=u) 
                             for u in model.U) 
                         for i in cat_i)
        
        return (2 * summations/ len(cat_i) * constant * (constant - 1)) >= model.D
    
    def maximun_recommendation_constraint(model, u):
        return sum(sum( model.x[u, model.cs[c, i]] for i in model.Cmax) for c in model.C) == model.Lmax
        
    model.CoverageConstraint = Constraint(model.C, rule=coverage_constraint)
    model.DiversityConstraint = Constraint(model.C, rule=diversity_constraint)
    model.MaximumRecommendationConstraint = Constraint(model.U, rule=maximun_recommendation_constraint)
    
    # Define Objective of the Model
    def maximizer_obj(model):
        return sum(model.ri[i]*model.x[i] for i in model.U*model.I) / (model.Lmax * (model.u + 1))
    
    model.OBJ = Objective(rule=maximizer_obj, sense=maximize)
    
    return model


# model = define_pyomo_model(ri_dataset, categories_df, movielens_user_based.pearson_similarity)
model_small = define_pyomo_model(baseline_cf_matrix_small,
                                 categories_df_small,
                                 movielens_user_based_small.pearson_similarity,
                                 L,K, D)



### Full Dataset Solver

In [13]:
# # model_ins = model.create_instance()
# problem_solver = SolverFactory('glpk')
# solution = problem_solver.solve(model_ins, tee=True)

### Minified Dataset Sovler

In [6]:
from pyomo.contrib.preprocessing.plugins import zero_sum_propagator

model_ins = model_small.create_instance()
# zero_sum_propagator.ZeroSumPropagator().apply_to(model_ins)

In [7]:
problem_solver_small = SolverFactory('ipopt')
solution_small = problem_solver_small.solve(model_ins)

In [15]:
model_ins.x.pprint()


x : Size=7448, Index=x_index
    Key       : Lower : Value                  : Upper : Fixed : Stale : Domain
       (0, 0) :     0 :     0.3333317861105905 :     1 : False : False : UnitInterval
       (0, 1) :     0 :  7.671845463113957e-08 :     1 : False : False : UnitInterval
       (0, 2) :     0 :   7.67184542761804e-08 :     1 : False : False : UnitInterval
       (0, 3) :     0 :  5.520827884470576e-08 :     1 : False : False : UnitInterval
       (0, 4) :     0 :  7.047171323729504e-08 :     1 : False : False : UnitInterval
       (0, 5) :     0 :  8.769520581404793e-08 :     1 : False : False : UnitInterval
       (0, 6) :     0 :  7.047171053790419e-08 :     1 : False : False : UnitInterval
       (0, 7) :     0 :  7.671846170704233e-08 :     1 : False : False : UnitInterval
       (0, 8) :     0 :  5.520827868221766e-08 :     1 : False : False : UnitInterval
       (0, 9) :     0 :   7.04717117706802e-08 :     1 : False : False : UnitInterval
      (0, 10) :     0 :  7.6718

     (9, 147) :     0 :  6.632815677105336e-08 :     1 : False : False : UnitInterval
     (9, 148) :     0 : 5.2322613939143106e-08 :     1 : False : False : UnitInterval
     (9, 149) :     0 :  5.232260481203915e-08 :     1 : False : False : UnitInterval
     (9, 150) :     0 :  6.632816030933726e-08 :     1 : False : False : UnitInterval
     (9, 151) :     0 :  6.632816714477185e-08 :     1 : False : False : UnitInterval
     (9, 152) :     0 : 2.1863488349357155e-07 :     1 : False : False : UnitInterval
     (9, 153) :     0 :  7.199791031527724e-08 :     1 : False : False : UnitInterval
     (9, 154) :     0 :  1.712918433315506e-07 :     1 : False : False : UnitInterval
     (9, 155) :     0 :    6.6328162779272e-08 :     1 : False : False : UnitInterval
     (9, 156) :     0 :  7.199787964644781e-08 :     1 : False : False : UnitInterval
     (9, 157) :     0 :  6.632815901711293e-08 :     1 : False : False : UnitInterval
     (9, 158) :     0 :  6.632815619464275e-08 :     1

    (25, 138) :     0 :  7.671845705241193e-08 :     1 : False : False : UnitInterval
    (25, 139) :     0 :  7.047171216450932e-08 :     1 : False : False : UnitInterval
    (25, 140) :     0 :  5.520828070066852e-08 :     1 : False : False : UnitInterval
    (25, 141) :     0 :  7.671845063350206e-08 :     1 : False : False : UnitInterval
    (25, 142) :     0 :  7.047170859239033e-08 :     1 : False : False : UnitInterval
    (25, 143) :     0 : 5.5208278173705126e-08 :     1 : False : False : UnitInterval
    (25, 144) :     0 :  7.671854604073452e-08 :     1 : False : False : UnitInterval
    (25, 145) :     0 :  7.671859849348605e-08 :     1 : False : False : UnitInterval
    (25, 146) :     0 :  6.365953770819886e-08 :     1 : False : False : UnitInterval
    (25, 147) :     0 :  7.047171097630946e-08 :     1 : False : False : UnitInterval
    (25, 148) :     0 :  5.520828760691413e-08 :     1 : False : False : UnitInterval
    (25, 149) :     0 :  5.520827909080503e-08 :     1

In [23]:
from pyomo.solvers.plugins.solvers.gurobi_persistent import GurobiPersistent
gurobi_persi = GurobiPersistent(model=model_ins)
solution_small = gurobi_persi.solve(tee=True, options={
    'Heuristics': 0.5,
    'MIPFocus': 2,
    'Threads': 3,
    'SubMIPNodes': 100000000,
    'MIQCPMethod': 0,
    'PreMIQCPForm': 1,
#     'PreQLinearize': 2,
    'Presolve': 2,
    'Aggregate': 1,
    'InfUnbdInfo': 1
})

Parameter OutputFlag unchanged
   Value: 1  Min: 0  Max: 1  Default: 1
Changed value of parameter LogFile to /tmp/tmp3u75z8yn.log
   Prev:   Default: 
Changed value of parameter Heuristics to 0.5
   Prev: 0.05  Min: 0.0  Max: 1.0  Default: 0.05
Changed value of parameter MIPFocus to 2
   Prev: 0  Min: 0  Max: 3  Default: 0
Changed value of parameter Threads to 3
   Prev: 0  Min: 0  Max: 1024  Default: 0
Changed value of parameter SubMIPNodes to 100000000
   Prev: 500  Min: 0  Max: 2000000000  Default: 500
Changed value of parameter MIQCPMethod to 0
   Prev: -1  Min: -1  Max: 1  Default: -1
Changed value of parameter PreMIQCPForm to 1
   Prev: -1  Min: -1  Max: 2  Default: -1
Changed value of parameter Presolve to 2
   Prev: -1  Min: -1  Max: 2  Default: -1
Parameter Aggregate unchanged
   Value: 1  Min: 0  Max: 1  Default: 1
Changed value of parameter InfUnbdInfo to 1
   Prev: 0  Min: 0  Max: 1  Default: 0
Gurobi Optimizer version 9.0.1 build v9.0.1rc0 (linux64)
Optimize a model with 4

GurobiError: Q matrix is not positive semi-definite (PSD)