In [27]:
import numpy as np
import pandas as pd
import os
from sklearn.neighbors import BallTree,DistanceMetric,KDTree
from sklearn.preprocessing import StandardScaler
from sklearn.externals import joblib
from sklearn.metrics.pairwise import cosine_similarity

In [28]:
carreers = pd.read_csv('../../preprocessed_data/carreers_join.csv',index_col=0)
ids = carreers.index.values

In [26]:
np.save('../../preprocessed_data/ids',ids)

In [29]:
names = pd.read_csv('../../preprocessed_data/carreras2.csv')
names = names.apply(lambda x: "{}/{}".format(x['Nombre'],x['Universidad']),axis=1)

### Preprocessing
Se preprocesan lso datos numéricos para que tengan media 0 y varianza 1

In [30]:
scaler = StandardScaler()
columns_to_scale = ['Arancel','Empleabilidad','Ingreso','Corte 2017','Latitud','Longitud','Cant. Semestres']# + [str(x) for x in range(300)]
carreers[columns_to_scale] = scaler.fit_transform(carreers[columns_to_scale])
carreers = carreers.reset_index().values
df = pd.DataFrame(carreers)


In [24]:
carreers

array([[  1.00000000e+00,   1.00000000e+00,   2.00000000e+00, ...,
          3.00000000e+00,   1.21180442e-01,   1.81491229e+00],
       [  2.00000000e+00,   1.00000000e+00,   2.00000000e+00, ...,
          3.00000000e+00,   1.35118813e-01,   1.69674246e+00],
       [  3.00000000e+00,   1.00000000e+00,   2.00000000e+00, ...,
          3.00000000e+00,   1.36684704e-01,   1.72661586e+00],
       ..., 
       [  2.62900000e+03,   5.00000000e+01,   2.26000000e+02, ...,
          7.00000000e+00,   4.60908093e-15,   0.00000000e+00],
       [  2.63000000e+03,   5.00000000e+01,   2.26000000e+02, ...,
          9.00000000e+00,   4.60908093e-15,   0.00000000e+00],
       [  2.63100000e+03,   5.00000000e+01,   2.26000000e+02, ...,
          5.00000000e+00,   4.60908093e-15,   0.00000000e+00]])

In [23]:
df.to_csv('../../preprocessed_data/carreers_scaled.csv',index=False, header=df.columns.values)

In [5]:
joblib.dump(scaler,'../../serialized/balltree/scaler')

['../../serialized/balltree/scaler']

### Distance Metric
Para poder obtener distancias significativas sintácticamente entre las carreras es necesario usar una métrica de distancia apropiada que considere variables continuas y categóricas de manera similar.
Para eso se usará dissimilaridad de gower. Basada en lo que sale en https://stats.stackexchange.com/questions/173144/convert-categorical-data-to-numerical-data-to-compute-a-distance-then

Para las features de W2V se uso similaridad coseno, calculada mediante el método n_similarity del modelo. Para evitar tener cargado el modelo en memoria se calculo la similaridad pairwise y se serializo en una matriz de (n_carreras,n_carreras)

In [31]:
def categorical_distance(c_j,c_k):
    return int(not c_j==c_k)

def continuous_distance(x_j,x_k,r_i):
    return np.divide(np.absolute(x_j-x_k),r_i)

def gower_distance(X_j,X_k,**kwargs):
    distance = 0
    for col in categorical:
        distance += np.dot(W_i[column_hash[col]],categorical_distance(X_j[column_hash[col]],X_k[column_hash[col]]))
    for col in continuous:
        distance += np.dot(W_i[column_hash[col]],continuous_distance(X_j[column_hash[col]],X_k[column_hash[col]],R_i[column_hash[col]]))
    distance += W_i[11]*(1 - np.absolute(similarity_matrix[int(X_j[column_hash['ID']]),int(X_k[column_hash['ID']])]))
    
    return distance

In [32]:
column_hash = {
    'ID':0,
    'ID Universidad':1,
    'ID Campus':2,
    'Cant. Semestres':3,
    'Arancel':4,
    'Empleabilidad':5,
    'Ingreso':6,
    'Corte 2017':7,
    'area_id':8,
    'Latitud':9,
    'Longitud':10
}
categorical = ['ID Universidad','ID Campus','area_id']
continuous = ['Arancel','Empleabilidad','Ingreso','Corte 2017','Latitud','Longitud','Cant. Semestres']
W_i = np.array([1 for _ in range(12)],dtype='float32')
for col in categorical:
    W_i[column_hash[col]] = 0.66
for col in continuous:
    W_i[column_hash[col]] = 1
R_i = np.array([1 for _ in range(12)],dtype='float32')
W_i[11] = 4
for col in continuous:
    R_i[column_hash[col]] = np.max(carreers[column_hash[col]]) - np.min(carreers[column_hash[col]])

similarity_matrix = np.load('../../preprocessed_data/similarity_matrix.npy')
args = {'categorical':categorical,'continuous':continuous,'column_hash':column_hash, 'W_i':W_i, 'R_i':R_i,'similarity_matrix':similarity_matrix}
gower_distance(carreers[10],carreers[5],**args)

3.2255417201242502

In [33]:
metric = DistanceMetric.get_metric('pyfunc',func=gower_distance,**args)

In [34]:
tree = BallTree(carreers,metric=metric)

In [35]:
id_query = 3
q = carreers[id_query]
result = tree.query([q],k=5,return_distance=False)
print(names[ids[id_query]])
for i in result:
    print(np.array(names[ids[i]]))
    

Teoría de la Música/Universidad de Chile
['Teoría de la Música/Universidad de Chile'
 'Teoría e Historia del Arte/Universidad de Chile'
 'Ingeniería en Sonido/Universidad de Chile'
 'Diseño Teatral/Universidad de Chile'
 'Música y Composición/Universidad UNIACC']


In [319]:
joblib.dump(tree,'../../serialized/balltree/balltree')

['../../serialized/balltree/balltree']