In [281]:
# IMPORTS
from manufacturing_company.src.common.const import *
from manufacturing_company.src.network.social_network import *
from manufacturing_company.src.classification_algorithms.standard_classification import *

from sklearn.metrics import jaccard_similarity_score, jaccard_score, f1_score

In [322]:
# COLLECTIVE CLASSIFICATION
# Position table: ';'-separated, lines starting with '#' are skipped, indexed by ID.
df_positions = pd.read_csv(
    'manufacturing_company/data/raw/positions.csv',
    sep=';',
    comment='#',
    index_col=ID,
)

# Load the per-month feature table and communication log, then build the network.
# Every iteration rebinds df_features, df_communication and G, so once the loop
# finishes they hold the data of the last month (month == SIZE) only.
for month in range(1, SIZE + 1):
    features_path = 'manufacturing_company/data/intermediate/05_features/' + str(month) + '_months_features.csv'
    communication_path = 'manufacturing_company/data/intermediate/03_minimum_activity/' + str(month) + '_months_communication.csv'

    df_features = pd.read_csv(features_path, sep=';', index_col=ID)
    df_communication = pd.read_csv(communication_path, sep=';')

    G = create_network(df_communication, weight=False)
    
    

In [323]:
def select_nodes_based_on_utility_score(utility_score_name, utility_score, pct, levels):
    """Pick, per management level, the top-scoring share of nodes as "known".

    Parameters
    ----------
    utility_score_name : column of ``utility_score`` to rank by.
    utility_score : DataFrame holding that column plus a ``POSITION`` column;
        its index values are used as node ids.
    pct : fraction of each level's rows to select (``round(count * pct)`` rows).
    levels : levels ``1 .. levels`` (inclusive) are processed.

    Returns
    -------
    dict mapping node id -> level for every selected node.
    """
    ranked = utility_score.sort_values(utility_score_name, ascending=False)

    known_nodes_map = {}
    for level in range(1, levels + 1):
        level_members = ranked[ranked[POSITION] == level]
        # Built-in round() on the float product, as in the per-level quota rule.
        quota = round(len(level_members) * pct)

        # Rows are already in descending score order, so the head is the top share.
        for node_id in level_members.index[:quota]:
            known_nodes_map[node_id] = level

    return known_nodes_map


In [324]:
def message_passing(G, known_nodes, threshold, max_iterations=1000):
    """Propagate labels from labelled nodes to their unlabelled neighbours.

    Nodes are visited in descending order of their ``'utility_score'`` node
    attribute. Every node whose current label is not ``-1`` pushes that label
    to each neighbour that is not in ``known_nodes``:

    * label ``1`` overwrites the neighbour's label immediately;
    * any other label is appended to the neighbour's history and is applied
      once it occurs at least ``threshold`` times in that history.

    A neighbour's history is cleared whenever its label is overwritten.
    Passes repeat until the micro-averaged Jaccard score between the labels of
    the non-known nodes before and after a pass is at least 0.99.

    Parameters
    ----------
    G : graph exposing ``nodes(data=...)``, ``to_undirected()`` and
        ``neighbors(node)``; nodes carry ``'utility_score'`` and ``'label'``.
    known_nodes : container of node ids whose labels must never be changed.
    threshold : number of occurrences a non-1 label needs before it is applied.
    max_iterations : upper bound on the number of passes (``None`` disables the
        bound). When it is reached a ``RuntimeWarning`` is issued and the
        current labels are returned instead of looping forever.

    Returns
    -------
    dict mapping node id -> label, in descending utility-score order.
    """
    import warnings

    df_nodes = pd.DataFrame(G.nodes(data='utility_score'), columns=[ID, 'utility_score'])
    df_nodes = df_nodes.set_index(ID)
    order_desc = df_nodes.sort_values('utility_score', ascending=False).index

    nodes = pd.DataFrame(G.nodes(data='label'), columns=[ID, 'label'])
    nodes = nodes.set_index(ID)
    nodes = nodes.loc[order_desc, 'label'].to_dict()
    label_hist = {k: [] for k in nodes.keys()}

    # Nothing to infer: avoid scoring two empty label lists below.
    unknown_nodes = [key for key in nodes if key not in known_nodes]
    if not unknown_nodes:
        return nodes

    G = G.to_undirected()

    passes = 0
    while True:
        old_labels = [nodes[key] for key in unknown_nodes]

        # Only values are reassigned while iterating, never keys, so the
        # iteration order stays stable and later nodes see updated labels.
        for node, label in nodes.items():
            if label == -1:
                continue
            for neighbor in G.neighbors(node):
                if neighbor in known_nodes:
                    continue
                if label == 1:
                    # print('LABEL: ' + str(label))
                    nodes[neighbor] = label
                    label_hist[neighbor] = []
                else:
                    label_hist[neighbor].append(label)
                    if label_hist[neighbor].count(label) >= threshold:
                        # print('in majority')
                        nodes[neighbor] = label
                        label_hist[neighbor] = []

        new_labels = [nodes[key] for key in unknown_nodes]
        print(old_labels)
        print(new_labels)
        if jaccard_score(old_labels, new_labels, average='micro') >= 0.99:
            break

        # Labels and histories form a finite deterministic state, so a cycle
        # would otherwise repeat forever; cap the number of passes.
        passes += 1
        if max_iterations is not None and passes >= max_iterations:
            warnings.warn(
                'message_passing stopped after %d passes without converging' % passes,
                RuntimeWarning,
            )
            break

    return nodes


In [325]:
def collective_classification(G, df_features, pct, levels, df_positions, threshold):
    """Run label propagation once per feature column and report macro-F1.

    For every column of ``df_features`` (treated as a utility score):

    1. choose the known nodes with ``select_nodes_based_on_utility_score``;
    2. reset the ``'label'`` attribute of all nodes in ``G`` to ``-1``, write the
       known labels and the utility score onto ``G`` (``G`` is modified);
    3. infer the remaining labels with ``message_passing``;
    4. compare the inferred labels of the non-known nodes with the
       ``POSITION`` column and print the macro-averaged F1.

    Parameters
    ----------
    G : networkx graph whose nodes are the ids used in ``df_features``.
    df_features : DataFrame indexed by ID, one column per utility score.
    pct : fraction of nodes per level that is treated as known.
    levels : number of management levels.
    df_positions : passed through to ``assign_management_levels``.
    threshold : passed through to ``message_passing``.

    Returns
    -------
    dict mapping utility-score column name -> macro F1 of that run.
    """
    # Captured before assign_management_levels runs, so whatever column(s) it
    # adds (presumably POSITION - confirm in that helper) are not iterated.
    feature_names = df_features.columns

    df_features = assign_management_levels(levels, df_features, df_positions)

    f1_by_utility_score = {}

    for utility_score_name in feature_names:
        utility_score = df_features[[utility_score_name, POSITION]]
        known_nodes = select_nodes_based_on_utility_score(utility_score_name, utility_score, pct, levels)

        # Order matters: the blanket -1 first, then the known labels on top.
        nx.set_node_attributes(G, -1, 'label')
        nx.set_node_attributes(G, known_nodes, 'label')
        nx.set_node_attributes(G, utility_score[utility_score_name], 'utility_score')

        nodes = message_passing(G, known_nodes, threshold)
        # print(nodes)
        nodes = pd.DataFrame.from_dict(nodes, orient='index', columns=[POSITION])
        nodes.index.name = ID

        # Score only the nodes whose label had to be inferred.
        nodes = nodes.loc[~nodes.index.isin(known_nodes)]

        # Default inner join on ID; column 0 is the inferred label, column 1
        # the POSITION value taken from df_features.
        df_merged = pd.merge(nodes, df_features[POSITION], on=ID)
        print(df_merged)
        y_pred = df_merged.iloc[:, 0]
        y_true = df_merged.iloc[:, 1]
        # scikit-learn expects (y_true, y_pred); macro-F1 is the same either
        # way, but the undefined-metric warnings now name the correct side.
        f1 = f1_score(y_true, y_pred, average='macro')
        print('PCT: ', pct)
        print('F1: ', f1)
        print('Utiliti score: ', utility_score_name)
        print('***********************************')

        f1_by_utility_score[utility_score_name] = f1

    return f1_by_utility_score
        

In [327]:
# collective_classification(G, df_features, 0.6, 2, df_positions, 2)

[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1]
[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
     ManagementLevel_x  ManagementLevel_y
ID                                       
163                  2                  1
137                  2                  1
143                  2                  1
86                   2                  1
66                   2                  2
153                  2                  2
113                  2              

  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
     ManagementLevel_x  ManagementLevel_y
ID                                       
125                  1                  2
134                  1                  2
63                   1                  2
92                   1                  2
34                   1                  2
27                   1                  1
141                  1              

  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1]
[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
     ManagementLevel_x  ManagementLevel_y
ID                                       
121                  2                  1
156                  2                  1
65                   2                  2
126                  2                  2
154                  2                  2
147                  2                  2
105                  2              

  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1]
[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
     ManagementLevel_x  ManagementLevel_y
ID                                       
69                   2                  1
86                   2                  1
143                  2                  1
137                  2                  1
112                  2                  2
67                   2                  2
35                   2              


[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
     ManagementLevel_x  ManagementLevel_y
ID                                       
162                  2                  1
121                  2                  1
143                  2                  1
37                   2                  2
3                    2                  2
47                   2                  1
56                   2                  2
15                   2                  2
156                  2                  1
88                   2                  2
153                  2                  2
16                   2                  2
105                  2                  2
113                  2                  2
58                   2                  2
32                   2                  2
137                  2                  1
116                  2                  2
72            

  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


In [284]:
# Ad-hoc check of the selection helper on the 10-row toy frame. `df` and `pct`
# are defined in the fixture cell that appears further down in this file (it has
# a lower execution count), so this cell fails on a fresh top-to-bottom run.
select_nodes_based_on_utility_score(BETWEENNESS, df, pct, 2)

{80: 1, 10: 1, 50: 2, 40: 2, 90: 2}

In [76]:
# Toy fixture: ten nodes with a betweenness score and a management level,
# indexed by ID.
df = pd.DataFrame(
    {
        ID: [10,20,30,40,50,60,70,80,90,100],
        BETWEENNESS: [5,6,2,8,9,4,6,7,8,3],
        POSITION: [1,2,1,2,2,2,2,1,2,1],
    }
).set_index(ID)

# Same ids and levels as a plain table; ID stays a regular column here.
# Note: this rebinds the df_positions name that the loading cell also assigns.
df_positions = pd.DataFrame(
    {
        ID: [10,20,30,40,50,60,70,80,90,100],
        POSITION: [1,2,1,2,2,2,2,1,2,1],
    }
)

# Share of nodes per level treated as known.
pct = 0.5

In [52]:
import networkx as nx

In [57]:
# Scratch experiment with a dict of node id -> value: stores the values under the
# throwaway attribute name 'asdf' on G (this writes onto the shared graph object).
nx.set_node_attributes(G, {80: 1, 10: 1, 50: 2, 40: 2, 90: 2}, 'asdf')

In [226]:
# pd.DataFrame(G.nodes(data='utility_score'), columns=[ID, 'utility_score'], index=ID).sort_values('utility_score', ascending=False).index

In [252]:
# Display the current df_positions table (whichever cell assigned it last).
df_positions

Unnamed: 0_level_0,ManagementLevel
ID,Unnamed: 1_level_1
86,1
7,1
27,1
36,1
69,1
70,1
85,1
104,1
121,1
148,1
