In [4]:
import os
import re
import pandas as pd
import numpy as np
import nltk
import pickle

%load_ext autoreload
%autoreload 2

import parent_modules
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import pairwise_distances

from definitions import *
from helpers import *
from baseline_cf import *

# Load Movielens dataset for Item -Item Similarities
movielens_dataset = MovieLensRatingsDataset()
movielens_dataset_small = MovieLensRatingsDataset(keep=0.3)


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Simillarities

In [2]:
# Jaccard Similarity
movielens_dataset.jaccard_similarity.head()



movieId,1,2,3,5,6,7,9,10,11,12,...,159093,164179,166528,168250,168252,174055,176371,177765,179819,187593
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.301887,0.142857,0.12973,0.248826,0.132979,0.039548,0.296804,0.203046,0.067797,...,0.027778,0.076923,0.069892,0.038889,0.065934,0.045198,0.05,0.027933,0.033898,0.027933
2,0.301887,1.0,0.2,0.161017,0.220126,0.128,0.027523,0.325,0.178571,0.073394,...,0.027273,0.097345,0.104348,0.036036,0.119266,0.046296,0.083333,0.018182,0.037383,0.046729
3,0.142857,0.2,1.0,0.222222,0.165217,0.246154,0.130435,0.169231,0.129032,0.1875,...,0.0,0.04918,0.046875,0.018519,0.050847,0.039216,0.055556,0.0,0.02,0.0
5,0.12973,0.161017,0.222222,1.0,0.154545,0.298246,0.097561,0.150794,0.166667,0.086957,...,0.022222,0.055556,0.052632,0.021277,0.037736,0.022222,0.041667,0.0,0.023256,0.0
6,0.248826,0.220126,0.165217,0.154545,1.0,0.149123,0.0625,0.311688,0.192308,0.080808,...,0.03,0.096154,0.083333,0.029412,0.076923,0.040404,0.07,0.0,0.020202,0.030303


In [None]:
# Pearson Similarity
movielens_dataset.pearson_similarity.head()

In [None]:
# Cosine Similarity
movielens_dataset.cosine_similarity.head()

In [None]:
# Hamming Similarity
movielens_dataset.hamming_similarity.head()

## Datasets Generation

### Generate CF, User-Similarities, Categories for 100% of the Dataset

In [5]:
# Calculate Baseline Df
# Read the Commends for the baseline_cf module functionality
baseline_cf_matrix = load_baseline_cf(
    load_from_pickle=True,
    df=movielens_dataset.full_df)
baseline_cf_matrix.shape

(385, 2221)

In [7]:
# Load Movielens dataset for User - User Similarities
movielens_user_based = MovieLensRatingsDataset(user_based=True)
movielens_user_based.pivot_df.shape

(2221, 385)

In [9]:
movie_ids = movielens_dataset.df["movieId"].unique()
categories = CategoriesDataset(movie_ids, 10)
categories_df = categories.categories_df
categories_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,221,222,223,224,225,226,227,228,229,230
0,11,13,15,27,30,36,40,46,49,55,...,2175,2176,2177,2188,2200,2201,2202,2211,2213,2214
1,3,10,47,53,58,59,72,88,96,100,...,2105,2109,2136,2145,2149,2186,2189,2199,2212,0
2,6,22,24,28,45,48,54,63,66,68,...,2178,2196,2203,0,0,0,0,0,0,0
3,8,16,17,19,25,26,29,39,51,57,...,0,0,0,0,0,0,0,0,0,0
4,4,18,37,43,64,80,81,87,98,107,...,2152,2179,2198,2204,2208,0,0,0,0,0


### Generate CF, User-Similarities, Categories for x% partion of the Dataset

In [11]:
small_cf_pathname = os.path.join(MODELS_DIR, "baseline-cf-pickle-small")
baseline_cf_matrix_small = load_baseline_cf(
    load_from_pickle=True,
    pickle_file_name = small_cf_pathname,
    df=movielens_dataset_small.full_df)
baseline_cf_matrix_small.shape

(115, 660)

In [None]:
# Add computation for User-Similarities

In [12]:
movie_ids_small = movielens_dataset_small.df["movieId"].unique()
categories_small = CategoriesDataset(movie_ids_small, 10)
categories_small = categories_small.categories_df
categories_small.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,70,71,72,73,74,75,76,77,78,79
0,21,28,31,39,43,49,52,57,62,79,...,0,0,0,0,0,0,0,0,0,0
1,1,4,10,12,25,48,55,58,70,82,...,0,0,0,0,0,0,0,0,0,0
2,3,19,23,40,89,97,104,115,116,124,...,0,0,0,0,0,0,0,0,0,0
3,6,16,17,20,33,45,46,47,59,61,...,0,0,0,0,0,0,0,0,0,0
4,0,11,14,24,26,37,51,67,84,86,...,0,0,0,0,0,0,0,0,0,0


## Problem-A definition and Solution

In [8]:
from pyomo.environ import *

L = 5

K = 0.03

model = AbstractModel()

# Define Parameters of the Model
model.i = Param(within=NonNegativeIntegers, initialize = baseline_cf_matrix.shape[1] - 1)
model.u = Param(within=NonNegativeIntegers, initialize = baseline_cf_matrix.shape[0] - 1)
model.c = Param(within=NonNegativeIntegers, initialize = categories_df.shape[0] - 1)

model.K = Param(initialize=K)
model.Lmax = Param(initialize=L)
model.L = RangeSet(0, model.Lmax -1)

model.I = RangeSet(0, model.i)
model.U = RangeSet(0, model.u)
model.C = RangeSet(0, model.c)
model.Cmax = RangeSet(0, categories_df.shape[1] - 1)


model.rs = Param(model.U, model.I,
                initialize=lambda model, i, j: baseline_cf_matrix.values[i][j])
model.ds = Param(model.U, model.U,
                 initialize=lambda model, i, j: 1 - abs(movielens_user_based.pearson_similarity.values[i][j]))

model.cs = Param(model.C, model.Cmax, 
                 initialize=lambda model, i, j: categories_df.values[i][j])

# model.x = Var(model.U, model.I, domain=UnitInterval)
model.x = Var(model.U, model.I, domain=Binary)

# Define Constraints of the Model
def coverage_constraint(model, c):
    cat_i = set(model.cs[c, i] for i in model.Cmax if model.cs[c, i] != 0)
    return sum( sum(model.x[u, i] for i in cat_i) for u in model.U) / len(cat_i), 2 == model.K

def maximun_recommendation_constraint(model, u):
    return sum(sum( model.x[u, model.cs[c, i]] for i in model.Cmax) for c in model.C) == model.Lmax
    

model.CoverageConstraint = Constraint(model.C, rule=coverage_constraint)
model.MaximumRecommendationConstraint = Constraint(model.U, rule=maximun_recommendation_constraint)

# Define Objective of the Model
def maximizer_obj(model):
    return sum(model.rs[i]*model.x[i] for i in model.U*model.I) / model.Lmax * (model.u + 1)

model.OBJ = Objective(rule=maximizer_obj, sense=maximize)





In [9]:
model_ins = model.create_instance()
problem_solver = SolverFactory('glpk')
solution = problem_solver.solve(model_ins, tee=True)

GLPSOL: GLPK LP/MIP Solver, v4.65
Parameter(s) specified in the command line:
 --write /tmp/tmp53tuidcq.glpk.raw --wglp /tmp/tmplamgk8cw.glpk.glp --cpxlp
 /tmp/tmp9rpnvkny.pyomo.lp
Reading problem data from '/tmp/tmp9rpnvkny.pyomo.lp'...
396 rows, 855086 columns, 1709786 non-zeros
855085 integer variables, all of which are binary
4276238 lines were read
Writing problem data to '/tmp/tmplamgk8cw.glpk.glp'...
3420743 lines were written
GLPK Integer Optimizer, v4.65
396 rows, 855086 columns, 1709786 non-zeros
855085 integer variables, all of which are binary
Preprocessing...
PROBLEM HAS NO PRIMAL FEASIBLE SOLUTION
Time used:   0.7 secs
Memory used: 415.6 Mb (435765812 bytes)
Writing MIP solution to '/tmp/tmp53tuidcq.glpk.raw'...
855491 lines were written


In [19]:
model_ins.x.lb

AttributeError: 'IndexedVar' object has no attribute 'lb'

In [22]:
type(solution)

pyomo.opt.results.results_.SolverResults