In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm

In [2]:
def get_weights(X):
    """Return each row's share of the grand total of ``X`` and print it.

    Parameters
    ----------
    X : array-like, 2-D
        Matrix whose rows are summed (``axis=1``).

    Returns
    -------
    numpy.ndarray
        One value per row: the row sum divided by the sum of all row sums.

    Notes
    -----
    The weights, rounded to three decimals, are printed as a side effect.
    """
    # No pre-allocation: the previous zeros array was overwritten immediately
    # and only served to require a ``.shape`` attribute on the input.
    sum_row = np.sum(X, axis=1)
    weights = sum_row / np.sum(sum_row)
    print(np.round(weights, 3))
    return weights

In [3]:
def ratio_consistency(n):
    """Return the reference index that ``ahp_method`` divides CI by.

    * ``n < 3``: returns ``0``.
    * ``3 <= n <= 15``: looked up in a fixed 13-entry table (presumably
      Saaty's random consistency index; verify against the source used).
    * ``n > 15``: extrapolated with an ordinary-least-squares line fitted to
      the sixth power of the table values, then taking the sixth root.
    """
    tabla = np.array([ 0.58, 0.9, 1.11, 1.24, 1.32, 1.41, 1.45, 1.49, 1.51, 1.48, 1.56, 1.57, 1.59])
    primer_n = 3
    ultimo_n = primer_n + len(tabla) - 1  # 15 with the 13-entry table

    if n < primer_n:
        return 0
    if n <= ultimo_n:
        return tabla[n - primer_n]

    # Beyond the table: regress table**6 on [1, n] and invert the power.
    regresores = sm.add_constant(np.arange(primer_n, len(tabla) + primer_n))
    ajuste = sm.OLS(tabla**6, regresores).fit()
    return ajuste.predict([1, n])[0]**(1/6)


In [4]:
def ahp_method(dataset, tol = 0.0008):
    """Derive priority weights from a pairwise-comparison matrix and report consistency.

    The matrix is squared repeatedly; after each squaring the normalised row
    sums (``get_weights``) are compared with the previous ones and the loop
    stops once the largest absolute change drops below ``tol``.

    Parameters
    ----------
    dataset : array-like, square 2-D
        Pairwise-comparison matrix. It is copied, never modified.
    tol : float, optional
        Convergence threshold on the infinity norm of the weight change.

    Returns
    -------
    tuple
        ``(weights, rc)`` where ``weights`` is the converged weight vector and
        ``rc`` is ``CI / ratio_consistency(n)``. For matrices with fewer than
        three columns the reference index is 0, so ``rc`` is reported as 0.0
        instead of dividing by zero.

    Raises
    ------
    ValueError
        If the weight change becomes NaN/inf, which would otherwise make the
        loop run forever.

    Notes
    -----
    Progress, ``lamb_max``, CI, CR and the final weights are printed.
    """
    X = np.copy(dataset)
    n = X.shape[1]
    print('#' *50)
    print('#' *18 + ' AHP Method ' + '#'*20)
    print('#' *50)
    previous_weights = get_weights(X)
    while True:
        X_pow = np.matmul(X, X)
        actual_weights = get_weights(X_pow)
        norma = np.max(np.abs(actual_weights - previous_weights))
        print('Norm Inf: ', norma)
        if not np.isfinite(norma):
            # NaN never compares below tol; fail loudly instead of spinning.
            raise ValueError('AHP iteration produced a non-finite weight change')
        if  norma < tol:
            break
        previous_weights = actual_weights
        # Rescale before the next squaring: normalised row sums are invariant
        # to a scalar factor, and this keeps the entries from overflowing.
        X = X_pow / np.sum(X_pow)
    weights = actual_weights
    X = np.copy(dataset)
    V1 = np.matmul(X, weights)
    V1_per  =  V1/weights
    # Estimate of the principal eigenvalue: mean of (X·w) / w.
    lamb_max = np.mean(V1_per)
    # n == 1 would divide by zero; a 1x1 matrix is trivially consistent.
    cons_ind = (lamb_max - n)/(n - 1) if n > 1 else 0.0
    indicador = ratio_consistency(n)
    # ratio_consistency returns 0 for n < 3; report CR = 0 in that case.
    rc = cons_ind/indicador if indicador != 0 else 0.0
    print('#' *50)
    print('lamb_max :', round(lamb_max, 3)  )
    print('CI :', round(cons_ind, 3) )
    print('CR : ', round(rc, 5) )
    print('#' *50)
    # Weights
    for i in range(0, weights.shape[0]):
        print('w(g'+str(i+1)+'): ', round(weights[i], 3))
    print('#' *50)
    return weights, rc


In [5]:
# Load the raw movie sheet.
dataset_original = pd.read_excel('imdbmoviesdata.xlsx')
# Columns at positions 1..8 of the raw sheet, captured while 'Nombre' is
# still a regular column (i.e. before it becomes the index).
columnas_interes = dataset_original.columns[1:9]
# Use the movie name as the row index and remove it from the columns.
dataset_original = dataset_original.set_index('Nombre')

In [6]:
# Work on a copy so dataset_original keeps the values as loaded.
dataset_normalized = dataset_original.copy()

# Min-max normalisation helpers (one per direction).
def norm_column_min(col):
    """Min-max scale ``col`` with the direction inverted.

    The largest value maps to 0 and the smallest to 1. A constant column
    yields a 0/0 division (no special handling).
    """
    mayor = col.max()
    rango = mayor - col.min()
    return (mayor - col) / rango

def norm_column_max(col):
    """Min-max scale ``col``: the smallest value maps to 0, the largest to 1.

    A constant column yields a 0/0 division (no special handling).
    """
    menor = col.min()
    rango = col.max() - menor
    return (col - menor) / rango

# Columns scaled so that larger raw values give larger scores.
column_norm_max = ['Popularidad', 'Ventas', 'Duración', 'PromedioVotos', 'AñoEstreno', 'Votos']
# Columns scaled with the direction inverted (smaller raw value -> larger score).
column_norm_min = ['Presupuesto']

# Apply each normaliser to its own list of columns, in the same order as before.
for normalizador, columnas in ((norm_column_max, column_norm_max),
                               (norm_column_min, column_norm_min)):
    for col in columnas:
        dataset_normalized[col] = normalizador(dataset_normalized[col])

#### ponderación simple

In [7]:
# Independent copy for the simple-weighting ranking, so changes made to it
# do not touch dataset_normalized.
pond_simple = dataset_normalized.copy()

In [8]:
# Genres listed from highest to lowest score. The spelling 'Familiy' is kept
# on purpose: the keys must match the labels stored in the 'Género' column.
_generos_por_preferencia = [
    'Animation', 'Fantasy', 'Science Fiction', 'Adventure',
    'Action', 'War', 'Mystery', 'Horror',
    'Thriller', 'Crime', 'Familiy', 'Comedy',
    'Romance', 'Drama', 'Western', 'Music',
]

# First entry gets 16, last entry gets 1.
dict_pond_simple_gen = {
    genero: len(_generos_por_preferencia) - posicion
    for posicion, genero in enumerate(_generos_por_preferencia)
}

# Replace each genre label by its score, then min-max scale the scores.
# Labels missing from dict_pond_simple_gen become NaN through .map().
puntaje_genero = pond_simple['Género'].map(dict_pond_simple_gen)
pond_simple['Género'] = norm_column_max(puntaje_genero)

In [9]:
# Raw importance score for each column of columnas_interes, in column order.
pond_fields = np.array([7, 1, 3, 2, 4, 8, 6, 5])
# Scale the scores so they add up to 1.
pond_fields = pond_fields / pond_fields.sum()

# Ranking = weighted sum of the normalised criteria; best score first.
pond_simple['Ranking'] = pond_simple[columnas_interes].dot(pond_fields)
pond_simple = pond_simple.sort_values(by='Ranking', ascending=False)

In [10]:
# Show the simple-weighting table, already sorted by 'Ranking' (descending).
pond_simple

Unnamed: 0_level_0,Popularidad,Presupuesto,Ventas,Duración,Género,Votos,PromedioVotos,AñoEstreno,Ranking
Nombre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Interstellar,0.756284,0.611786,0.223503,0.458333,0.800000,0.664958,0.921569,0.981818,0.734753
Jurassic World,1.000000,0.647082,0.544124,0.314103,0.733333,0.569027,0.627451,1.000000,0.726609
Inception,0.283645,0.623551,0.296756,0.391026,0.733333,1.000000,0.901961,0.909091,0.699220
Mad Max: Fury Road,0.861539,0.647082,0.136023,0.301282,0.733333,0.632879,0.745098,1.000000,0.698762
Avatar,0.285741,0.442369,1.000000,0.435897,0.733333,0.865840,0.745098,0.890909,0.697209
...,...,...,...,...,...,...,...,...,...
Perfect,0.003811,0.955328,0.004609,0.285256,0.133333,0.000307,0.196078,0.454545,0.154204
Cool as Ice,0.010723,0.985917,0.000393,0.208333,0.133333,0.000410,0.117647,0.563636,0.153875
One-Eyed Jacks,0.004955,0.985917,0.001510,0.368590,0.066667,0.000717,0.549020,0.018182,0.150548
The Night of the Iguana,0.005234,0.992976,0.004278,0.317308,0.133333,0.000512,0.470588,0.072727,0.150046


#### AHP

In [21]:
# Preference score per genre; the pairwise matrix is built from ratios of
# these scores. 'Familiy' is spelled as in the data and must stay that way.
dict_pond_preferential = dict([
    ('Animation', 50),
    ('Fantasy', 42),
    ('Science Fiction', 40),
    ('Adventure', 37),
    ('Action', 30),
    ('War', 25),
    ('Mystery', 23),
    ('Horror', 20),
    ('Thriller', 18),
    ('Crime', 17),
    ('Familiy', 15),
    ('Comedy', 10),
    ('Romance', 8),
    ('Drama', 5),
    ('Western', 3),
    ('Music', 1),
])

In [22]:
# Distinct genre labels, in order of first appearance.
# Read from dataset_original: dataset_normalized['Género'] is later replaced
# by numeric AHP weights, so reading it here would break on a re-run.
lista_generos = dataset_original['Género'].unique()

In [52]:
# Reciprocal pairwise-comparison matrix between genres.
n_les = len(lista_generos)
# Preference score of every genre, aligned with lista_generos.
puntajes = [dict_pond_preferential[genero] for genero in lista_generos]
matriz = np.eye(n_les)
for i in range(n_les):
    for j in range(i + 1, n_les):
        # Upper triangle: ratio of the two scores, rounded to two decimals.
        razon = np.round(np.abs(puntajes[i] / puntajes[j]), 2)
        matriz[i, j] = razon
        # Lower triangle: reciprocal of the (already rounded) ratio.
        matriz[j, i] = 1/matriz[i,j]

In [53]:
# Show the genre comparison matrix labelled with the genre names on both axes.
pd.DataFrame(matriz, columns=lista_generos, index=lista_generos)

Unnamed: 0,Action,Adventure,Animation,Comedy,Crime,Drama,Familiy,Fantasy,Horror,Music,Mystery,Romance,Science Fiction,Thriller,War,Western
Action,1.0,0.81,0.6,3.0,1.76,6.0,2.0,0.71,1.5,30.0,1.3,3.75,0.75,1.67,1.2,10.0
Adventure,1.234568,1.0,0.74,3.7,2.18,7.4,2.47,0.88,1.85,37.0,1.61,4.62,0.92,2.06,1.48,12.33
Animation,1.666667,1.351351,1.0,5.0,2.94,10.0,3.33,1.19,2.5,50.0,2.17,6.25,1.25,2.78,2.0,16.67
Comedy,0.333333,0.27027,0.2,1.0,0.59,2.0,0.67,0.24,0.5,10.0,0.43,1.25,0.25,0.56,0.4,3.33
Crime,0.568182,0.458716,0.340136,1.694915,1.0,3.4,1.13,0.4,0.85,17.0,0.74,2.12,0.42,0.94,0.68,5.67
Drama,0.166667,0.135135,0.1,0.5,0.294118,1.0,0.33,0.12,0.25,5.0,0.22,0.62,0.12,0.28,0.2,1.67
Familiy,0.5,0.404858,0.3003,1.492537,0.884956,3.030303,1.0,0.36,0.75,15.0,0.65,1.88,0.38,0.83,0.6,5.0
Fantasy,1.408451,1.136364,0.840336,4.166667,2.5,8.333333,2.777778,1.0,2.1,42.0,1.83,5.25,1.05,2.33,1.68,14.0
Horror,0.666667,0.540541,0.4,2.0,1.176471,4.0,1.333333,0.47619,1.0,20.0,0.87,2.5,0.5,1.11,0.8,6.67
Music,0.033333,0.027027,0.02,0.1,0.058824,0.2,0.066667,0.02381,0.05,1.0,0.04,0.12,0.02,0.06,0.04,0.33


In [54]:
# Run AHP on the genre matrix: `weights` holds one weight per entry of
# lista_generos and `rc` the consistency ratio. The iteration trace is printed.
weights, rc = ahp_method(matriz)

##################################################
################## AHP Method ####################
##################################################
[0.086 0.106 0.143 0.029 0.049 0.014 0.043 0.12  0.057 0.003 0.068 0.023
 0.128 0.05  0.072 0.009]
[0.087 0.107 0.145 0.029 0.049 0.014 0.044 0.122 0.058 0.003 0.067 0.023
 0.118 0.052 0.073 0.009]
Norm Inf:  0.009824871845244346
[0.087 0.107 0.145 0.029 0.049 0.014 0.044 0.122 0.058 0.003 0.067 0.023
 0.118 0.052 0.073 0.009]
Norm Inf:  4.333841476578115e-06
##################################################
lamb_max : 16.004
CI : 0.0
CR :  0.00015
##################################################
w(g1):  0.087
w(g2):  0.107
w(g3):  0.145
w(g4):  0.029
w(g5):  0.049
w(g6):  0.014
w(g7):  0.044
w(g8):  0.122
w(g9):  0.058
w(g10):  0.003
w(g11):  0.067
w(g12):  0.023
w(g13):  0.118
w(g14):  0.052
w(g15):  0.073
w(g16):  0.009
##################################################


In [37]:
# dict = generos : weight
dict_weights = {}
for i in range(len(weights)):
    dict_weights[lista_generos[i]] = weights[i]

def get_weight(genero):
    return dict_weights[genero]

dataset_normalized['Género'] = dataset_normalized['Género'].apply(get_weight)

In [38]:
# Column labels of dataset_normalized at this point. They are used later to
# select the columns multiplied by weights_peliculas and to label
# matriz_peliculas, so this must run before the 'Ranking' column is added.
campos_peliculas = dataset_normalized.columns

In [39]:
# Pairwise judgements between criteria, grouped by the first criterion of
# each pair. Only pairs (A, B) with A listed before B are stored.
_comparaciones = {
    'Popularidad': {
        'Presupuesto': 1, 'Ventas': 3, 'Duración': 6, 'Género': 9,
        'Votos': 1, 'PromedioVotos': 1, 'AñoEstreno': 9,
    },
    'Presupuesto': {
        'Ventas': 1, 'Duración': 0.3, 'Género': 0.2,
        'Votos': 0.5, 'PromedioVotos': 1, 'AñoEstreno': 0.1,
    },
    'Ventas': {
        'Duración': 5, 'Género': 1, 'Votos': 0.6,
        'PromedioVotos': 0.1, 'AñoEstreno': 0.7,
    },
    'Duración': {
        'Género': 5, 'Votos': 3, 'PromedioVotos': 3, 'AñoEstreno': 1,
    },
    'Género': {
        'Votos': 0.5, 'PromedioVotos': 0.7, 'AñoEstreno': 0.4,
    },
    'Votos': {
        'PromedioVotos': 1, 'AñoEstreno': 0.8,
    },
    'PromedioVotos': {
        'AñoEstreno': 4,
    },
}

# Flatten to 'A-B' keys, keeping the order in which the pairs are listed.
dict_peliculas = {
    primero + '-' + segundo: valor
    for primero, fila in _comparaciones.items()
    for segundo, valor in fila.items()
}

In [135]:
# Load the criteria comparison matrix from disk (first sheet column is the
# row labels) and run AHP on it. Note that `rc` from the genre run is
# overwritten here.
# NOTE(review): 'matriz_peliculas.xlsx' is written by a later cell from this
# same variable; the cell that first built the matrix is not in the notebook.
matriz_peliculas = pd.read_excel('matriz_peliculas.xlsx', index_col=0).to_numpy()
weights_peliculas, rc = ahp_method(matriz_peliculas)

##################################################
################## AHP Method ####################
##################################################
[0.237 0.119 0.105 0.034 0.083 0.192 0.199 0.03 ]
[0.288 0.085 0.09  0.034 0.065 0.194 0.21  0.035]
Norm Inf:  0.05090524295359786
[0.289 0.088 0.095 0.035 0.068 0.19  0.197 0.037]
Norm Inf:  0.012311244360826507
[0.289 0.088 0.094 0.035 0.068 0.19  0.198 0.037]
Norm Inf:  0.0005758684182028684
##################################################
lamb_max : 8.944
CI : 0.135
CR :  0.0956
##################################################
w(g1):  0.289
w(g2):  0.088
w(g3):  0.094
w(g4):  0.035
w(g5):  0.068
w(g6):  0.19
w(g7):  0.198
w(g8):  0.037
##################################################


In [56]:
# Persist the criteria matrix, labelled with campos_peliculas on both axes.
# NOTE(review): this is the file the matrix is loaded from earlier in the
# notebook, so on a fresh kernel it must already exist on disk.
pd.DataFrame(matriz_peliculas, index=campos_peliculas, columns=campos_peliculas ).to_excel('matriz_peliculas.xlsx')

In [41]:
# AHP ranking: weighted sum of the criteria columns with the AHP weights,
# sorted best-first and cut to the ten highest scores.
# dataset_normalized is rebound to that 10-row table.
dataset_normalized = (
    dataset_normalized
    .assign(Ranking=dataset_normalized[campos_peliculas].dot(weights_peliculas))
    .sort_values(by='Ranking', ascending=False)
    .head(10)
)

In [42]:
# First ten rows of the simple-weighting table (sorted by 'Ranking' earlier).
pond_simple.head(10)

Unnamed: 0_level_0,Popularidad,Presupuesto,Ventas,Duración,Género,Votos,PromedioVotos,AñoEstreno,Ranking
Nombre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Interstellar,0.756284,0.611786,0.223503,0.458333,0.8,0.664958,0.921569,0.981818,0.734753
Jurassic World,1.0,0.647082,0.544124,0.314103,0.733333,0.569027,0.627451,1.0,0.726609
Inception,0.283645,0.623551,0.296756,0.391026,0.733333,1.0,0.901961,0.909091,0.69922
Mad Max: Fury Road,0.861539,0.647082,0.136023,0.301282,0.733333,0.632879,0.745098,1.0,0.698762
Avatar,0.285741,0.442369,1.0,0.435897,0.733333,0.86584,0.745098,0.890909,0.697209
The Avengers,0.231307,0.48237,0.546291,0.375,0.866667,0.911448,0.784314,0.945455,0.685605
Star Wars: The Force Awakens,0.338518,0.52943,0.743537,0.352564,0.733333,0.541355,0.823529,1.0,0.640004
Guardians of the Galaxy,0.433683,0.600021,0.277993,0.304487,0.733333,0.574152,0.901961,0.981818,0.636838
The Dark Knight Rises,0.199571,0.411779,0.388631,0.445513,0.733333,0.688019,0.823529,0.945455,0.610323
The Lord of the Rings: The Return of the King,0.215679,0.778851,0.402239,0.560897,0.8,0.576612,0.901961,0.781818,0.604191


In [43]:
# Movie names (index labels) of the first ten rows under each method.
# dataset_normalized was already cut to ten rows, so head(10) keeps all of it.
op1 = dataset_normalized.head(10).index 
op2 = pond_simple.head(10).index

In [48]:
# Side-by-side table: one row per rank position, one column per method.
pd.DataFrame(np.array([op1, op2]).T, columns=['AHP', 'Ponderación Simple'])

Unnamed: 0,AHP,Ponderación Simple
0,Jurassic World,Interstellar
1,Interstellar,Jurassic World
2,Mad Max: Fury Road,Inception
3,Avatar,Mad Max: Fury Road
4,Star Wars: The Force Awakens,Avatar
5,Inception,The Avengers
6,Guardians of the Galaxy,Star Wars: The Force Awakens
7,The Dark Knight,Guardians of the Galaxy
8,The Avengers,The Dark Knight Rises
9,The Lord of the Rings: The Return of the King,The Lord of the Rings: The Return of the King


In [44]:
# Titles that appear in only one of the two top-10 lists:
# (only in AHP, only in simple weighting).

op1.difference(op2), op2.difference(op1)

(Index(['The Dark Knight'], dtype='object', name='Nombre'),
 Index(['The Dark Knight Rises'], dtype='object', name='Nombre'))