In [1]:
# IMPORTS
from manufacturing_company.src.common.const import *
from manufacturing_company.src.network.social_network import *
from manufacturing_company.src.classification_algorithms.standard_classification import *

from sklearn.metrics import jaccard_similarity_score, jaccard_score, f1_score



In [55]:
# COLLECTIVE CLASSIFICATION
# Position table read from the raw data folder, indexed by ID; lines in the
# CSV that start with '#' are treated as comments and skipped.
df_positions = pd.read_csv(
    'manufacturing_company/data/raw/positions.csv',
    sep=';',
    comment='#',
    index_col=ID,
)

# For every month count from 1 to SIZE, read that period's feature table and
# communication log, then build a network from the log.
# NOTE(review): every pass rebinds df_features, df_communication and G, so
# only the objects from the final pass (i == SIZE) remain after the loop.
for i in range(1, SIZE + 1):
    df_features = pd.read_csv(
        'manufacturing_company/data/intermediate/05_features/' + str(i) + '_months_features.csv',
        sep=';',
        index_col=ID,
    )
    df_communication = pd.read_csv(
        'manufacturing_company/data/intermediate/03_minimum_activity/' + str(i) + '_months_communication.csv',
        sep=';',
    )
    G = create_network(df_communication, weight=False)
    
    

In [56]:
def select_nodes_based_on_utility_score(utility_score_name, utility_score, pct, levels):
    """Pick the top-scoring share of rows on every position level.

    Parameters
    ----------
    utility_score_name : label
        Column of ``utility_score`` to rank by; higher values rank first.
    utility_score : pandas.DataFrame
        Frame whose index holds the node ids and which contains the
        ``utility_score_name`` column and a ``POSITION`` column.
    pct : float
        Share of each level's rows to select. The count per level is
        ``round(rows_on_level * pct)``; Python's ``round`` resolves exact
        halves to the nearest even integer (e.g. 2.5 -> 2).
    levels : int
        Levels ``1..levels`` (inclusive) are processed.

    Returns
    -------
    dict
        Maps each selected index label to its position level.
    """
    # A stable sort keeps rows with equal scores in their input order, so the
    # selection is reproducible when a tie straddles the cut-off.
    ranked = utility_score.sort_values(utility_score_name, ascending=False, kind='mergesort')

    known_nodes_map = {}
    for position in range(1, levels + 1):
        on_level = ranked[ranked[POSITION] == position]
        n_selected = round(len(on_level) * pct)
        selected_ids = on_level.index[:n_selected]
        known_nodes_map.update(dict.fromkeys(selected_ids, position))

    return known_nodes_map


In [57]:
def message_passing(G, known_nodes, threshold, max_iterations=1000):
    """Spread labels from known nodes to the remaining nodes of ``G``.

    Every node of ``G`` must carry a ``'label'`` attribute (``-1`` marks an
    unlabelled node) and a ``'utility_score'`` attribute. Updates are
    synchronous; in each round:

    1. every node whose label is not ``-1`` sends its label to each neighbour
       that is not in ``known_nodes``;
    2. every node compares the number of received ``1`` labels with
       ``round(number of received 2 labels / threshold)`` and takes the label
       with the larger figure. On a tie it adopts the received label only if
       all received labels are identical; otherwise it keeps its label.

    Only labels ``1`` and ``2`` take part in the vote; any other label can be
    adopted solely through the "all received labels identical" branch.

    The loop stops when

    * no node is ``-1`` and the micro-averaged Jaccard score between the
      unknown nodes' labels before and after the round is at least 0.99, or
    * a round changes nothing (a fixed point: further rounds would repeat it,
      so nodes still at ``-1`` can never be labelled), or
    * ``max_iterations`` rounds have been executed.

    The last two cases previously made the loop run forever when unlabelled
    nodes remained; they now emit a ``RuntimeWarning`` and return the labels
    reached so far.

    Parameters
    ----------
    G : graph
        Graph object providing ``nodes(data=...)``, ``to_undirected()`` and
        ``neighbors(node)``.
    known_nodes : container
        Node ids whose labels are fixed; only membership tests are performed.
    threshold : number
        Divisor applied to the count of received ``2`` labels before voting.
    max_iterations : int, optional
        Upper bound on the number of rounds (default 1000).

    Returns
    -------
    dict
        Node id -> label, with keys ordered by ascending utility score.
    """
    import warnings

    df_nodes = pd.DataFrame(G.nodes(data='utility_score'), columns=[ID, 'utility_score'])
    df_nodes = df_nodes.set_index(ID)

    # Updates are synchronous, so this order only fixes the key order of the
    # returned dict; it does not influence which labels are assigned.
    node_order = df_nodes.sort_values('utility_score', ascending=True).index

    nodes = pd.DataFrame(G.nodes(data='label'), columns=[ID, 'label'])
    nodes = nodes.set_index(ID)
    nodes = nodes.loc[node_order, 'label'].to_dict()

    G = G.to_undirected()

    for _ in range(max_iterations):
        old_labels = [value for key, value in nodes.items() if key not in known_nodes]

        # Step 1: collect the labels every non-known node receives this round.
        label_hist = {node: [] for node in nodes}
        for node, label in nodes.items():
            if label != -1:
                for neighbor in G.neighbors(node):
                    if neighbor not in known_nodes:
                        label_hist[neighbor].append(label)

        # Step 2: vote. Known nodes never receive labels, so none of the
        # branches below fires for them and their labels stay fixed.
        for node in nodes:
            received = label_hist[node]
            count_1 = received.count(1)
            count_2 = round(received.count(2) / float(threshold))
            if count_1 > count_2:
                nodes[node] = 1
            elif count_2 > count_1:
                nodes[node] = 2
            elif len(set(received)) == 1:
                nodes[node] = received[0]

        new_labels = [value for key, value in nodes.items() if key not in known_nodes]

        if new_labels == old_labels:
            # Fixed point: the next round would reproduce this one exactly.
            # This also covers the case of no unknown nodes, where the
            # Jaccard score would be computed on empty inputs.
            if -1 in nodes.values():
                warnings.warn(
                    'message_passing stopped with unlabelled nodes: no further '
                    'label can be propagated to them.',
                    RuntimeWarning,
                )
            return nodes

        if -1 not in nodes.values() and jaccard_score(old_labels, new_labels, average='micro') >= 0.99:
            return nodes

    warnings.warn(
        'message_passing did not converge within max_iterations rounds; '
        'returning the labels reached so far.',
        RuntimeWarning,
    )
    return nodes


In [58]:
def collective_classification(G, df_features, pct, levels, df_positions, threshold):
    """Evaluate label propagation once per feature column and print macro F1.

    For every column that ``df_features`` has on entry, the column is used as
    the utility score: the top ``pct`` share of nodes per level is taken as
    known, their labels are written onto ``G``, ``message_passing`` fills in
    the remaining nodes, and the propagated labels of the non-known nodes are
    scored against the ``POSITION`` column with a macro-averaged F1.

    Side effects: the ``'label'`` and ``'utility_score'`` node attributes of
    ``G`` are overwritten on every pass, and results are printed. Nothing is
    returned.
    """
    # Column names are captured before the helper call so that whatever it
    # adds to the frame is not treated as a utility score.
    score_columns = df_features.columns

    # The frame returned here is expected to carry a POSITION column; it is
    # read below.
    df_features = assign_management_levels(levels, df_features, df_positions)
    true_positions = df_features[POSITION]

    for utility_score_name in score_columns:
        utility_score = df_features[[utility_score_name, POSITION]]
        seed_labels = select_nodes_based_on_utility_score(utility_score_name, utility_score, pct, levels)

        # Reset every node to "unlabelled" (-1), then stamp the seed labels
        # and the current utility score onto the graph.
        nx.set_node_attributes(G, -1, 'label')
        nx.set_node_attributes(G, seed_labels, 'label')
        nx.set_node_attributes(G, utility_score[utility_score_name], 'utility_score')

        propagated = message_passing(G, seed_labels, threshold)

        predicted = pd.DataFrame.from_dict(propagated, orient='index', columns=[POSITION])
        predicted.index.name = ID

        # Score only the nodes whose label had to be inferred.
        is_seed = predicted.index.isin(seed_labels)
        predicted = predicted.loc[~is_seed]

        # After the merge, column 0 holds the propagated labels and column 1
        # the POSITION values taken from df_features.
        df_merged = pd.merge(predicted, true_positions, on=ID)
        f1 = f1_score(df_merged.iloc[:, 0], df_merged.iloc[:, 1], average='macro')

        print('PCT: ', pct)
        print('F1: ', f1)
        print('Utiliti score: ', utility_score_name)
        print('***********************************')
        

In [1]:
# collective_classification(G, df_features, 0.6, 2, df_positions, 3)

In [284]:
# Scratch check of the selection helper on the toy frame `df` with share `pct`.
# NOTE(review): `df` and `pct` are assigned in a cell that appears further down
# in this notebook, so this cell fails on a fresh top-to-bottom run.
select_nodes_based_on_utility_score(BETWEENNESS, df, pct, 2)

{80: 1, 10: 1, 50: 2, 40: 2, 90: 2}

In [76]:
# Toy ten-node fixture used to try out the selection helper by hand.
# NOTE(review): this rebinds the notebook-level name df_positions, which the
# data-loading cell also assigns from positions.csv.
pct = 0.5

df_positions = pd.DataFrame(
    {
        ID: [10,20,30,40,50,60,70,80,90,100],
        POSITION: [1,2,1,2,2,2,2,1,2,1],
    }
)

# Scores plus positions, indexed by ID.
df = pd.DataFrame(
    {
        ID: [10,20,30,40,50,60,70,80,90,100],
        BETWEENNESS: [5,6,2,8,9,4,6,7,8,3],
        POSITION: [1,2,1,2,2,2,2,1,2,1],
    }
).set_index(ID)

In [52]:
import networkx as nx

In [57]:
# Scratch call: passes an id -> value mapping to set_node_attributes under the
# throwaway attribute name 'asdf'. NOTE(review): this modifies the shared graph G.
nx.set_node_attributes(G, {80: 1, 10: 1, 50: 2, 40: 2, 90: 2}, 'asdf')

In [226]:
# pd.DataFrame(G.nodes(data='utility_score'), columns=[ID, 'utility_score'], index=ID).sort_values('utility_score', ascending=False).index

In [343]:
# Scratch check of how NumPy rounds a non-tie value.
import numpy as np
np.round(2.6)

3.0

In [453]:
# Scratch inspection: list the neighbours of node 39 in G.
list(G.neighbors(39))

[81,
 136,
 156,
 62,
 16,
 157,
 116,
 140,
 28,
 95,
 17,
 97,
 132,
 61,
 121,
 123,
 119,
 101,
 85,
 89,
 152,
 130,
 155,
 47,
 166,
 103,
 133,
 73,
 8,
 128,
 112,
 27,
 83,
 22,
 108,
 58,
 52,
 32,
 3,
 113,
 150,
 104,
 78,
 94,
 74,
 36,
 45,
 56,
 50,
 77,
 64,
 14,
 37,
 129,
 105,
 102,
 144,
 18,
 66,
 148,
 137,
 161,
 60,
 54,
 91,
 82,
 86,
 141,
 69,
 13,
 19,
 33,
 107,
 163,
 98,
 67,
 135]

In [10]:
# Scratch check: remainder of 4 divided by 2.
4%2

0