In [1]:
import pandas as pd
import numpy as np
from itertools import chain
import ipywidgets as widgets
from ipywidgets import AppLayout
import IPython.display as pyDis

from context import community_module
from community_module.community_detection.similarityCommunityDetection import SimilarityCommunityDetection
from community_module.similarity.emotionSimilarity import EmotionSimilarity
from community_module.visualization.gephiVisualization import GephiVisualization

from sklearn.metrics.pairwise import cosine_similarity

In [2]:


# Leemos los ficheros de respuestas de usuarios
users_df = pd.read_csv('../../data/MNCN/user_profiles.csv')
users_grouped_df = pd.read_csv('../../data/MNCN/user_profiles_grouped.csv')

# Datos necesarios para los filtros
q103_index = list(range(1,7))
q107a_index = list(range(7, 13))
q107b_index = list(range(13, 16))
q110a_index = list(range(16, 18))
q110b_index = list(range(18, len(users_df.columns)))

# Questions
questions = {
    'q103': q103_index,
    'q107a': q107a_index,
    'q107b': q107b_index,
    'q110a': q110a_index,
    'q110b': q110b_index
}

# Explains index
explain_indexes = {
    'q103': [7, 9],
    'q107a': [5, 6],
    'q107b': [8],
    'q110a': [],
    'q110b': []
}

# Diccionario de respuestas
answers = {
    'q103_0': 'Reducir el tiempo de la ducha',
    'q103_1': 'Comprar menos ropa',
    'q103_2': 'No usar productos con muchos envases',
    'q103_3': 'Ir a pie a más sitios',
    'q103_4': 'Reducir los residuos que genero',
    'q103_5': 'Reciclar de forma correcta',
    'q107a_0': 'En coche',
    'q107a_1': 'Andando',
    'q107a_2': 'En bicicleta',
    'q107a_3': 'En autobús',
    'q107a_4': 'En metro',
    'q107a_5': 'En patinete',
    'q107b_0': 'Estaría dispuesto a cambiar de medio de transporte',
    'q107b_1': 'No estaría dispuesto a cambiar de medio de transporte',
    'q107b_2': 'Quizá estaría dispuesto a cambiar de medio de transporte',
    'q110a_0': 'Si he tenido una mascota exótica',
    'q110a_1': 'No he tenido una mascota exótica',
    'q110b_0': 'Adoptaria una totuga de florida',
    'q110b_1': 'Adoptaria un perro común',
    'q110b_2': 'Adoptaria un gato común',
    'q110b_3': 'Adoptaria una cotorra argentina',
    'q110b_4': 'Adoptaria una cacatúa',
    'q110b_5': 'Adoptaria un mono capuchino',
}

# Funcion de similitud y n clusters a detectar
sim = 'cosine'

In [3]:
groups_to_explain_df = None
users_groups_df = None

def search_groups(data, indexes, explains, percentage=1):
    # Comprobamos que no hay algún usuario que solo contenga ceros
    filter_no_answers = data.apply(lambda x: x.values.sum() != 0, axis=1)
    correct_data = data[filter_no_answers]
    correct_explains = explains[filter_no_answers]

    n_clusters = 2
    finish_search = False

    while not finish_search:

        #print("Clusters:", n_clusters)
        community_detection = SimilarityCommunityDetection(correct_data)
        result = community_detection.calculate_communities(metric=sim, n_clusters=n_clusters)

        correct_data['group'] = result.values()
        correct_explains['group'] = correct_data['group']

        # Comprobamos que para cada grupo tenemos al menos, una respuesta común
        # answers_per_group_df = correct_data.groupby(by='group').sum().reset_index()
        answers_per_group_df = correct_explains.groupby(by='group').sum().reset_index()
        max_answers_per_group = answers_per_group_df.apply(lambda x: np.max(x[1:].values), axis=1).values

        #print("Max answers:", max_answers_per_group.values)

        members_per_group = correct_explains.groupby(by='group').count().iloc[:,0].values
        min_same_answers_per_group = np.round(members_per_group * percentage)

        finish_search = np.greater_equal(max_answers_per_group, min_same_answers_per_group).sum() == len(members_per_group)

        if not finish_search:
            n_clusters += 1

    # Genero un DF para extraer las explicaciones de los grupos
    groups_to_explain_df = correct_explains.groupby(by='group').sum().reset_index()
    groups_to_explain_df['members'] = members_per_group

    # Genero la tabla de usuario/grupo
    result_df = pd.DataFrame.from_dict(result, orient='index', columns=['group'])
    users_group = users_df.join(result_df)
    
    return groups_to_explain_df, users_group

def explain_groups(groups_to_explain_df):
    for i in range(len(groups_to_explain_df)):

        if int(groups_to_explain_df.iloc[i]['members']) >= 2:
            row = groups_to_explain_df.iloc[i]
            max_value = row[1:-1].max()

            row_ans = row[1:-1]
            row_ans = row_ans[row_ans == max_value]

            print("-----------")
            print("## GROUP", i)
            print("# N. MEMBERS:", int(groups_to_explain_df.iloc[i]['members']))
            print("# COMMON PROPERTIES:")

            for r in row_ans.index.values:
                print("  -", r)

In [4]:


# Preparo la interfaz
questions_wid = widgets.SelectMultiple(
    options=list(questions.keys()),
    descriptions='Seleccionar preguntas',
    disable=False
)

percentage_wid = widgets.FloatSlider(
    value=0.94,
    min=0.0,
    max=1.0,
    step=0.01,
    description='Min % respuestas comunes'
)

button = widgets.Button(
    description = 'Buscar'
)


def btn_event(obj):
    # Incluimos los índices de las preguntas seleccionadas
    indexes = list()
    indexes_explains = list()
    
    for q in questions_wid.value:
        indexes.extend(questions[q])
        indexes_explains.extend(explain_indexes[q])
        
    # Filtramos el dataset, seleccionando solo las preguntas que queremos
    data = users_df.iloc[:,indexes]
    explains = users_grouped_df.iloc[:,indexes_explains]
    
    # Aplicamos la búsqueda de grupos
    groups_to_explain_df, users_groups_df = search_groups(data, indexes, explains, percentage=percentage_wid.value)
    
    # TODO: Aquí filtrar grupos por número mínimo de miembros
    # Obtenemos los ids de grupos que queremos eliminar (menos de 2 usuarios)
    filter_groups = users_groups_df.groupby(by='group').count()['UserId'] < 2
    groups_to_filter = filter_groups.index.values[filter_groups]
    
    # Imprimir el número de usuarios que se filtran
    # Quitamos el grupo del dataset
    users_out = users_groups_df[users_groups_df['group'].isin(groups_to_filter)].index.values
    print('Users without groups:', len(users_out))
    users_groups_df = users_groups_df[~users_groups_df['group'].isin(groups_to_filter)]
    
    # Explicamos los grupos obtenidos
    explain_groups(groups_to_explain_df)
    
    # Pintamos en Gephi
    gv = GephiVisualization()
    
    # Preparamos los datos de usuarios y distancias
    users = users_groups_df[['UserId', 'School', 'Grade', 'Type', 'Zone', 'group']].values
    data = data[~data.index.isin(users_out)]
    distances = cosine_similarity(data)
    
    gv.load_community(users, distances, users_properties=['School', 'Grade', 'Type', 'Zone', 'community'])
    
button.on_click(btn_event)
AppLayout(hader=None, left_sidebar=questions_wid, center=None, right_sidebar=percentage_wid, footer=button)

AppLayout(children=(Button(description='Buscar', layout=Layout(grid_area='footer'), style=ButtonStyle()), Sele…

Users without groups: 5
-----------
## GROUP 0
# N. MEMBERS: 3
# COMMON PROPERTIES:
  - Reduce Consumption
  - Recycle
-----------
## GROUP 1
# N. MEMBERS: 30
# COMMON PROPERTIES:
  - Reduce Consumption
-----------
## GROUP 2
# N. MEMBERS: 2
# COMMON PROPERTIES:
  - Reduce Consumption
-----------
## GROUP 3
# N. MEMBERS: 4
# COMMON PROPERTIES:
  - Reduce Consumption
  - Recycle
-----------
## GROUP 5
# N. MEMBERS: 7
# COMMON PROPERTIES:
  - Reduce Consumption
  - Recycle
  - Change Transport
-----------
## GROUP 7
# N. MEMBERS: 3
# COMMON PROPERTIES:
  - Recycle
  - Change Transport
-----------
## GROUP 8
# N. MEMBERS: 3
# COMMON PROPERTIES:
  - Reduce Consumption
  - Recycle
  - Change Transport
-----------
## GROUP 12
# N. MEMBERS: 4
# COMMON PROPERTIES:
  - Reduce Consumption
  - Change Transport
