## Importar lo importante

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import time
import datetime as dt
import pickle
import numpy as np
import borrador
from __future__ import division
from auxiliar_functions import *
from aux_functions_comparisson import *
import tfe
from geopy.distance import vincenty
from itertools import chain, combinations
import random
import scipy as sp

## Obtener datos

In [None]:
# Pickle streams are binary data, so the file is opened in 'rb' mode; text
# mode only happens to work on some platforms / interpreter versions.
# The file holds two objects pickled one after the other.
with open('data/shared_rois_and_min_distance.pickle', 'rb') as f:
    shared_rois = pickle.load(f)
    min_distance = pickle.load(f)

In [None]:
# Pickle streams are binary data, so the file is opened in 'rb' mode; text
# mode only happens to work on some platforms / interpreter versions.
# The file holds two objects pickled one after the other.
with open('data/feature_data.pickle', 'rb') as f:
    abril_vectors = pickle.load(f)
    septiembre_vectors = pickle.load(f)

In [None]:
# Pickle streams are binary data, so the file is opened in 'rb' mode; text
# mode only happens to work on some platforms / interpreter versions.
# The file holds two objects pickled one after the other.
with open('data/rois_meters_data.pickle', 'rb') as f:
    abril_the_rois = pickle.load(f)
    septiembre_the_rois = pickle.load(f)

In [None]:
# Size of the smaller of the two monthly datasets; the loops in the following
# cells iterate over range(limit).
limit = np.array([len(abril_vectors), len(septiembre_vectors)]).min()
limit

In [None]:
# Count the indices whose diagonal entry of shared_rois is at least one and
# at least two, respectively, then report both as a percentage of `limit`.
shared_1_rois_2_month = sum(1 for k in range(limit) if shared_rois[k, k] > 0)
shared_2_rois_2_month = sum(1 for k in range(limit) if shared_rois[k, k] > 1)
print("% de tarjetas que comparten un roi en ambos meses:" + " "
      + str(round(shared_1_rois_2_month*100/limit, 2)) + "%")
print("% de tarjetas que comparten dos roi en ambos meses:" + " "
      + str(round(shared_2_rois_2_month*100/limit, 2)) + "%")

### Histograma cantidad de vecinos de hogar y trabajo (2 rois) entre abril y septiembre

In [None]:
# One entry per index below `limit`: the number of items returned by
# get_neighbours_index for that index (called with 2 as its last argument).
n_of_neighbours = [
    len(get_neighbours_index(abril_the_rois, shared_rois, i, 2))
    for i in range(limit)
]

In [None]:
# Histogram of neighbour counts with explicit bin edges 0, 1, ..., 9.
plt.hist(n_of_neighbours, bins=range(10))

### Histograma cantidad de vecinos de hogar y trabajo (2 rois) entre abril y septiembre (solo tarjetas con 10 o más vecinos)

In [None]:
# Keep only the neighbour counts greater than 9.
n10_of_neighbours = [n for n in n_of_neighbours if n > 9]

In [None]:
# Histogram of the filtered neighbour counts using 30 equal-width bins.
plt.hist(n10_of_neighbours, bins=30)

## Feature Exploration

In [None]:
# Number of feature columns; matches the 19 names listed in features_names
# and the 19 keys of features_dict in this notebook.
N_FEATURES = 19

In [None]:
# Names of all features, in list order.
features_names = [
    "msal",
    "mlal",
    "kmDistance",
    "kmMaxDist",
    "kmMinDist",
    "rg",
    "unc_entropy",
    "random_entropy",
    "p100_diff_last_origin",
    "p100_diff_first_origin",
    "card_type",
    "start_time",
    "end_time",
    "traveled_days",
    "traveled_days_bs",
    "frequence_regularity",
    "p100_exclusive_bus_days",
    "p100_exclusive_metro_days",
    "P100_bus_trips",
]

In [None]:
# Maps each feature name to an integer index, numbered from 0 in the order
# the names are listed here.
features_dict = {
    name: index
    for index, name in enumerate((
        "msal",
        "mlal",
        "kmDistance",
        "kmMaxDist",
        "kmMinDist",
        "rg",
        "unc_entropy",
        "random_entropy",
        "p100_diff_last_origin",
        "p100_diff_first_origin",
        "card_type",
        "start_time",
        "end_time",
        "traveled_days",
        "traveled_days_bs",
        "frequence_regularity",
        "p100_exclusive_bus_days",
        "p100_exclusive_metro_days",
        "P100_bus_trips",
    ))
}

## Seleccionar Features para la comparación

In [None]:
# Features used for the comparison: every name in features_names except
# "card_type" and "traveled_days_bs".
selected_features = [
    "msal",
    "mlal",
    "kmDistance",
    "kmMaxDist",
    "kmMinDist",
    "rg",
    "unc_entropy",
    "random_entropy",
    "p100_diff_last_origin",
    "p100_diff_first_origin",
    "start_time",
    "end_time",
    "traveled_days",
    "frequence_regularity",
    "p100_exclusive_bus_days",
    "p100_exclusive_metro_days",
    "P100_bus_trips",
]


In [None]:
# Apply filter_features to each month's vectors with the same feature
# selection and the same name -> index mapping.
abril_selected_features, septiembre_selected_features = (
    filter_features(abril_vectors, selected_features, features_dict),
    filter_features(septiembre_vectors, selected_features, features_dict),
)

### Seleccionar con distancia euclidiana y similitud braycurtis entre los vecinos con las features ya normalizadas con min max

In [None]:
# Min-max normalise every selected feature column for both months.
#
# The output matrices are sized from the selected-feature array itself rather
# than from N_FEATURES. With N_FEATURES columns, any column beyond
# abril_selected_features.shape[1] is never written and stays at 1.0 in both
# months: harmless for the Euclidean distance (the difference is 0) but it
# adds to the sum(|u + v|) denominator of the Bray-Curtis distance and so
# distorts every Bray-Curtis value computed from these matrices.
# NOTE(review): assumes filter_features returns a 2-D array with one column
# per selected feature and that both months have the same number of columns.
abril_norm_vectors = np.ones((limit, abril_selected_features.shape[1]))
septiembre_norm_vectors = np.ones((limit, abril_selected_features.shape[1]))
for i in range(abril_selected_features.shape[1]):
    abril_norm_vectors[:, i] = normalizar_min_max(abril_selected_features[:, i])
    septiembre_norm_vectors[:, i] = normalizar_min_max(septiembre_selected_features[:, i])

In [None]:
# `import scipy as sp` alone does not guarantee that the scipy.spatial
# sub-package is loaded; import it explicitly so that sp.spatial.distance
# resolves regardless of what the other imported modules happen to load.
import scipy.spatial.distance

# Build the comparison matrix between April and September using the Euclidean
# distance on the min-max normalised vectors.
a_matrix_euclidiana = compare_vectors_with_neighbours_normalized(
    abril_norm_vectors, septiembre_norm_vectors,
    abril_the_rois, septiembre_the_rois,
    shared_rois, limit, 2, sp.spatial.distance.euclidean)
(n_identified_e, selected_distance_e,
 identified_indexs_e, abstenidos_e) = get_n_correct(a_matrix_euclidiana, limit)

# Report, as percentages of `limit`: the identified count, the remainder that
# is neither identified nor abstained, and the abstained count.
porcentaje_correcto = n_identified_e*100/limit
print("Dist Euclidiana con norma minmax general: " + str(round(porcentaje_correcto, 2)) + "%")
print("Falsos positivos: " + str(((limit - n_identified_e - len(abstenidos_e))*100/limit)) + "%")
print("Falsos negativos: " + str(len(abstenidos_e)*100/limit) + "%")

In [None]:
# `import scipy as sp` alone does not guarantee that the scipy.spatial
# sub-package is loaded; import it explicitly so that sp.spatial.distance
# resolves regardless of what the other imported modules happen to load.
import scipy.spatial.distance

# Build the comparison matrix between April and September using the
# Bray-Curtis distance on the min-max normalised vectors.
a_matrix_braycurtis = compare_vectors_with_neighbours_normalized(
    abril_norm_vectors, septiembre_norm_vectors,
    abril_the_rois, septiembre_the_rois,
    shared_rois, limit, 2, sp.spatial.distance.braycurtis)
(n_identified_b, selected_distance_b,
 identified_indexs_b, abstenidos_b) = get_n_correct(a_matrix_braycurtis, limit)

# Report, as percentages of `limit`: the identified count, the remainder that
# is neither identified nor abstained, and the abstained count.
porcentaje_correcto = n_identified_b*100/limit
print("Dist braycurtis con norma minmax general: " + str(round(porcentaje_correcto, 2)) + "%")
print("Falsos positivos: " + str(((limit - n_identified_b - len(abstenidos_b))*100/limit)) + "%")
print("Falsos negativos: " + str(len(abstenidos_b)*100/limit) + "%")

### Ver si son los mismos seleccionados

In [None]:
# Classify every index by how the Bray-Curtis (b) and Euclidean (e) results
# relate. In identified_indexs_*, the value -1 is treated as "abstained" and
# a value equal to the index itself as "correct".
corr_in_both = []      # same pick in both, and it is the correct one
abst_in_both = []      # both abstained
wrong_in_both = []     # same pick in both, but it is wrong
corr_id_in_e = []      # b abstained, e correct
corr_id_in_b = []      # e abstained, b correct
wrong_id_in_e = []     # b abstained, e wrong
wrong_id_in_b = []     # e abstained, b wrong
# Both metrics picked a candidate, but not the same one. These cases used to
# fall into wrong_id_in_b, which is reported as "wrong with braycurtis and
# abstained with euclidean" even though the Euclidean metric did not abstain.
disagree_in_both = []
for i in range(limit):
    picked_b = identified_indexs_b[i]
    picked_e = identified_indexs_e[i]
    if picked_b == picked_e:
        if picked_b == -1:
            abst_in_both.append(i)
        elif picked_b == i:
            corr_in_both.append(i)
        else:
            wrong_in_both.append(i)
    elif picked_b == -1:
        # Only the Euclidean metric made a pick.
        if picked_e == i:
            corr_id_in_e.append(i)
        else:
            wrong_id_in_e.append(i)
    elif picked_e == -1:
        # Only the Bray-Curtis metric made a pick.
        if picked_b == i:
            corr_id_in_b.append(i)
        else:
            wrong_id_in_b.append(i)
    else:
        disagree_in_both.append(i)
            

In [None]:
# Sizes of the three "same result under both metrics" groups, followed by
# their combined share of `limit` as a percentage.
print("Ambos correctos: " + str(len(corr_in_both)))
print("Ambos incorrectos: " + str(len(wrong_in_both)))
print("Ambos abstenidos: " + str(len(abst_in_both)))
print("Porcentaje en ambos igual: "
      + str((len(corr_in_both) + len(wrong_in_both) + len(abst_in_both))*100/limit))

In [None]:
# Sizes of the four groups collected for indices where the two metrics gave
# different results.
print("Correcto con euclidiana y abstenido con braycurtis: " + str(len(corr_id_in_e)))
print("Incorrecto con euclidiana y abstenido con braycurtis: " + str(len(wrong_id_in_e)))
print("Correcto con braycurtis y abstenido con euclidiana: " + str(len(corr_id_in_b)))
print("Incorrecto con braycurtis y abstenido con euclidiana: " + str(len(wrong_id_in_b)))


### Ver en qué orden queda el vector correcto

In [None]:
def get_position_right_answer(distance_matrix, n_rows=None):
    """Locate the correct candidate among the neighbours of each missed row.

    A row ``i`` is examined only when it has at least one neighbour and the
    column holding the row maximum is not ``i`` itself (the diagonal entry
    ``distance_matrix[i, i]`` is treated as the correct answer for row ``i``).

    Parameters
    ----------
    distance_matrix : 2-D numpy array
        Comparison matrix in which the value -1 marks "not a neighbour".
    n_rows : int, optional
        Number of leading rows to examine. Defaults to the notebook-level
        ``limit`` so that existing one-argument calls keep working.

    Returns
    -------
    list
        ``[positions, n_neighbours]``, two lists with one entry per examined
        row. ``positions[k]`` is the 0-based position of the correct value in
        the ascending sort of that row's neighbour values (so the row maximum
        sits at the last position), or -1 when the correct candidate is not
        a neighbour of the row. ``n_neighbours[k]`` is the number of
        neighbours of that row.
    """
    if n_rows is None:
        n_rows = limit
    positions = []
    n_neighbours = []
    for i in range(n_rows):
        row = distance_matrix[i, :]
        the_index = np.argmax(row)
        if row[the_index] == -1:
            # Even the largest entry is the "no neighbour" marker.
            continue
        if the_index == i:
            # The correct candidate already holds the row maximum.
            continue
        # np.where returns a tuple with one index array per dimension; take
        # the first (and only) one to get the neighbour column indices.
        neighbours = np.where(row != -1)[0]
        # Collected per row, so values from earlier rows never leak in.
        neighbour_values = row[neighbours]
        n_neighbours.append(len(neighbours))
        correct_value = row[i]
        if correct_value == -1:
            positions.append(-1)
        else:
            # Number of strictly smaller neighbour values == index of the
            # first occurrence of correct_value in the ascending sort.
            positions.append(int(np.sum(neighbour_values < correct_value)))
    return [positions, n_neighbours]

In [None]:
# Run the position analysis on the Bray-Curtis matrix and unpack the two
# returned lists: positions (p) and neighbour counts (nn).
p,nn = get_position_right_answer(a_matrix_braycurtis)