In [3]:
# IMPORTS
from manufacturing_company.src.common.const import *
from manufacturing_company.src.network.social_network import *
from manufacturing_company.src.classification_algorithms.standard_classification import *
from manufacturing_company.src.classification_algorithms.NodeInfo import NodeInfo
from sklearn.metrics import jaccard_similarity_score, jaccard_score, f1_score

In [154]:
# COLLECTIVE CLASSIFICATION
df_positions = pd.read_csv('manufacturing_company/data/raw/positions.csv', sep=';', comment='#', index_col=ID)

for i in range(1, SIZE + 1):
    df_features = pd.read_csv('manufacturing_company/data/intermediate/05_features/' + str(i) + '_months_features.csv', sep=';', index_col=ID)
    
    df_communication = pd.read_csv('manufacturing_company/data/intermediate/03_minimum_activity/' + str(i) + '_months_communication.csv', sep=';')
    G = create_network(df_communication, weight=False)
    
    

In [155]:
def select_nodes_based_on_utility_score(utility_score_name, utility_score, pct, levels):  
    utility_score = utility_score.sort_values(utility_score_name, ascending=False)

    known_nodes_map = dict()
    
    for position in range(1, levels + 1):
        employees_on_given_position = utility_score[utility_score[POSITION] == position]
        all_nodes = len(employees_on_given_position)
        known_nodes = round(all_nodes * pct)
        
        known_nodes_id = employees_on_given_position.iloc[:known_nodes].index
        known_nodes_map.update({id: position for id in known_nodes_id})
        
    return known_nodes_map


In [159]:
def message_passing(G, known_nodes, threshold, minority_labels): 
    df_nodes = pd.DataFrame(G.nodes(data='utility_score'), columns=[ID, 'utility_score'])
    df_nodes = df_nodes.set_index(ID)
    
    order_desc = df_nodes.sort_values('utility_score', ascending=True).index
    
    nodes = pd.DataFrame(G.nodes(data='label'), columns=[ID, 'label'])
    nodes = nodes.set_index(ID)
    nodes = nodes.loc[order_desc, 'label'].to_dict()
    # print(nodes)
    label_counter = {node_id: NodeInfo() for node_id in nodes.keys()}
    
    G = G.to_undirected()
    
    end = False
    
    while not end:
        old_labels = [value for (key, value) in nodes.items() if key not in known_nodes]
        for node, label in nodes.items():
            if label != -1:
                neighbors = G.neighbors(node)
                for neighbor in neighbors:
                    if neighbor not in known_nodes:
                        label_counter[neighbor].labels.append(label)
        
        # UPDATE LABELS
        for node, label in nodes.items():
            unique_labels = len(set(label_counter[node].labels)) == 1

            # TODO method calculate label frequency
            label_freq = None
            if len(set(nodes.values())) == 2:
                label_freq = {1: 0, 2: 0}
            if len(set(nodes.values())) == 3:
                label_freq = {1: 0, 2: 0, 3: 0}
                
            for l in label_freq.keys():
                size = label_counter[node].labels.count(l)
                if l not in minority_labels:
                    size = round(size / float(threshold))
                label_freq[l] = size

            same_freq = len(set(label_freq.values())) == 1

            # TODO function select update strategy
            if unique_labels:
                nodes[node] = label_counter[node].labels[0]
                label_counter[node].unchanged_iter = 0
            elif same_freq:
                # TODO update unchanged state
                label_counter[node].unchanged_iter += 1

                if label_counter[node].unchanged_iter > 100:
                    neighbors = G.neighbors(node)

                    max_utility_score = -1
                    node_label = None

                    for neighbor_id in neighbors:
                        neighbor = G.nodes[neighbor_id]
                        if neighbor['utility_score'] > max_utility_score:
                            max_utility_score = neighbor['utility_score']
                            node_label = neighbor['label']

                    nodes[node] = node_label
                    label_counter[node].unchanged_iter = 0
            else:
                nodes[node] = max(label_freq.items(), key=operator.itemgetter(1))[0]
                label_counter[node].unchanged_iter = 0

            label_counter[node].labels = []
            
        new_labels = [value for (key, value) in nodes.items() if key not in known_nodes]
        # print(old_labels)
        # print(new_labels)
        # print(list(nodes.values()).count(-1))
        if (jaccard_score(old_labels, new_labels, average='micro') >= 0.99) & (-1 not in nodes.values()):
            end = True
        
    return nodes


In [157]:
def collective_classification(G, df_features, pct, levels, df_positions, threshold, minority_labels):
    feature_names = df_features.columns
    
    df_features = assign_management_levels(levels, df_features, df_positions)
    
    for utility_score_name in feature_names:
        utility_score = df_features[[utility_score_name, POSITION]]     
        known_nodes = select_nodes_based_on_utility_score(utility_score_name, utility_score, pct, levels)
        
        nx.set_node_attributes(G, -1, 'label')
        nx.set_node_attributes(G, known_nodes, 'label')
        nx.set_node_attributes(G, utility_score[utility_score_name], 'utility_score')
        
        nodes = message_passing(G, known_nodes, threshold, minority_labels)
        # print(nodes)
        nodes = pd.DataFrame.from_dict(nodes, orient='index', columns=[POSITION])
        nodes.index.name = ID
        
        nodes = nodes.loc[~nodes.index.isin(known_nodes)]
        
        df_merged = pd.merge(nodes, df_features[POSITION], on=ID)
        # print(df_merged)
        f1 = f1_score(df_merged.iloc[:, 0], df_merged.iloc[:, 1], average='macro')
        print('PCT: ', pct)
        print('F1: ', f1)
        print('Utiliti score: ', utility_score_name)
        print('***********************************')
        

In [158]:
collective_classification(G, df_features, pct=0.6, levels=2, df_positions=df_positions, threshold=3, minority_labels=[1])

PCT:  0.6
F1:  0.8500000000000001
Utiliti score:  in_degree
***********************************
PCT:  0.6
F1:  0.6651162790697674
Utiliti score:  out_degree
***********************************
PCT:  0.6
F1:  0.7775718257645967
Utiliti score:  betweenness
***********************************
PCT:  0.6
F1:  0.4606741573033708
Utiliti score:  closeness
***********************************


  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


PCT:  0.6
F1:  0.4606741573033708
Utiliti score:  eigenvector
***********************************
PCT:  0.6
F1:  0.1272727272727273
Utiliti score:  clustering_coeff
***********************************


  'recall', 'true', average, warn_for)


PCT:  0.6
F1:  0.7491289198606272
Utiliti score:  pagerank
***********************************


  'recall', 'true', average, warn_for)


PCT:  0.6
F1:  0.4606741573033708
Utiliti score:  hubs
***********************************
PCT:  0.6
F1:  0.8024691358024691
Utiliti score:  authorities
***********************************
PCT:  0.6
F1:  0.7433155080213902
Utiliti score:  max_clique
***********************************


PCT:  0.6
F1:  0.5909090909090909
Utiliti score:  cliques_count
***********************************
PCT:  0.6
F1:  0.4606741573033708
Utiliti score:  overtime
***********************************
PCT:  0.6
F1:  0.5909090909090909
Utiliti score:  work_at_weekend
***********************************


  'recall', 'true', average, warn_for)


PCT:  0.6
F1:  0.4606741573033708
Utiliti score:  neighborhood_variability_sender
***********************************
PCT:  0.6
F1:  0.6886005560704356
Utiliti score:  neighborhood_variability_recipient
***********************************
PCT:  0.6
F1:  0.4606741573033708
Utiliti score:  neighborhood_variability_all
***********************************


  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


In [284]:
select_nodes_based_on_utility_score(BETWEENNESS, df, pct, 3)

{80: 1, 10: 1, 50: 2, 40: 2, 90: 2}

In [76]:
df = pd.DataFrame({ID: [10,20,30,40,50,60,70,80,90,100], BETWEENNESS: [5,6,2,8,9,4,6,7,8,3], POSITION: [1,2,1,2,2,2,2,1,2,1]})
df = df.set_index(ID)
df_positions = pd.DataFrame({ID: [10,20,30,40,50,60,70,80,90,100], POSITION: [1,2,1,2,2,2,2,1,2,1]})
pct = 0.5

In [52]:
import networkx as nx

In [57]:
nx.set_node_attributes(G, {80: 1, 10: 1, 50: 2, 40: 2, 90: 2}, 'asdf')

In [226]:
# pd.DataFrame(G.nodes(data='utility_score'), columns=[ID, 'utility_score'], index=ID).sort_values('utility_score', ascending=False).index

In [343]:
import numpy as np
np.round(2.6)

3.0

In [453]:
list(G.neighbors(39))

[81,
 136,
 156,
 62,
 16,
 157,
 116,
 140,
 28,
 95,
 17,
 97,
 132,
 61,
 121,
 123,
 119,
 101,
 85,
 89,
 152,
 130,
 155,
 47,
 166,
 103,
 133,
 73,
 8,
 128,
 112,
 27,
 83,
 22,
 108,
 58,
 52,
 32,
 3,
 113,
 150,
 104,
 78,
 94,
 74,
 36,
 45,
 56,
 50,
 77,
 64,
 14,
 37,
 129,
 105,
 102,
 144,
 18,
 66,
 148,
 137,
 161,
 60,
 54,
 91,
 82,
 86,
 141,
 69,
 13,
 19,
 33,
 107,
 163,
 98,
 67,
 135]

In [27]:
neighbors = G.neighbors(77)

max_utility_score = -1
label = None

for neighbor in neighbors:
    node = G.nodes[neighbor]
    if node['utility_score'] > max_utility_score:
        max_utility_score = node['utility_score']
        label = node['label']
        
print(label, max_utility_score)

1 0.6453193150025995


In [126]:
d = {1: 5, 2: 8, 3: 7}
hist = [1,1,2,2,2,2,2,2]
threshold = 3
minority_labels = [1]
unique_labels = len(set(hist)) == 1
levels = 2

In [127]:
count_1 = hist.count(1)
count_2 = round(hist.count(2) / float(threshold))
print(count_1, count_2)

2 2


In [137]:
label_freq = None
if levels == 2:
    label_freq = {1: 0, 2: 0}
if levels == 3:
    label_freq = {1: 0, 2: 0, 3: 0}
for l in label_freq.keys():
    size = hist.count(l)
    if l not in minority_labels:
        size = round(size / float(threshold))
    label_freq[l] = size
same_freq = len(set(label_freq.values())) == 1
label_freq

{1: 2, 2: 2}

In [138]:
if count_1 > count_2:
    print('if count_1 > count_2')
elif count_2 > count_1:
    print('if count_2 > count_1')
elif unique_labels:
    print('unique_labels')
else:
    print('unchanged_iter')

unchanged_iter


In [139]:
if unique_labels:
    print('unique_labels')
elif same_freq:
    print('unchanged_iter')
else:
    print(max(label_freq.items(), key=operator.itemgetter(1))[0])

unchanged_iter


In [140]:
nodes['labels'].unique()

NameError: name 'nodes' is not defined

In [136]:
set(label_freq.values())

{2}