In [1]:
#Purpose of this code: Try different community detection algorithms 

In [1]:
import pandas as pd
import numpy as np
import networkx as nx
from itertools import combinations
import os
import datetime
from community import community_louvain
from networkx.algorithms.community import greedy_modularity_communities
from networkx.algorithms.community import label_propagation_communities
from networkx.algorithms.community import k_clique_communities
import random
from collections import Counter

In [2]:
#Read in the network that we will be performing community detection on.
G = nx.read_gexf("../Networks/All_CPD_Network_NewEdgeWeights_NoEdgesBelow2.gexf")

In [3]:
#We now want to try several different community detection algorithms. 
random.seed(1)

#THIS IS THE LOUVAIN METHOD. 
#Returns dictionary of node to community assignment
louvain = community_louvain.best_partition(G, weight = "weight", resolution = 0.0075, randomize = 0)

#Greedy Modularity Communities
#Returns each community as a set of nodes in it. 
greedy = [list(x) for x in greedy_modularity_communities(G, weight= "weight")]

#Label Propagation
#Returns each community as a set of nodes in it. 
lab_prop = [list(x) for x in list(label_propagation_communities(G))]

#K-clique
#Returns each community as a set of nodes in it. 
kClique = [list(x) for x in list(k_clique_communities(G, 3))]


In [4]:
# Read in roster data and add column for other crew members
roster = pd.read_csv("../Datasets/Final_Roster.csv").drop(columns=['Unnamed: 0', 'X'])

#I also want to add officers who may be in the complaint dataset and not the roster dataset.
md = pd.read_csv("../Datasets/Complaint_Dataset.csv")
md = md[['officer', 'UID']]
md = md.drop_duplicates()
rosterUIDs = list(roster['UID'])
md = md[~md.UID.isin(rosterUIDs)]
roster = roster.append(md)

#Here are the officers of the known crews. Lists provided by Chicago Tribune, and the Invisible Institute.
FinneganCrew = [3456, 23841, 27778, 20038, 12074, 22282, 1868, 29612, 8562, 17042, 25206, 12825, 25306, 22235, 3454]
wattsOfficers = [7780, 24399, 2334, 3564, 10361, 13777, 15883, 16181, 19331, 20481, 23933, 26902, 27101, 27871, 30215, 31456, 3584]
skullCap = [25503, 25732, 25962, 27439, 32384]
austinSeven = [5722, 13082, 19484, 23417, 26243, 28927, 31438]

def crew(row):
    uid = row['UID']
    #return the crew name of the UID. If not in a crew, return "None"
    if uid in FinneganCrew:
        return "Finnegan"
    elif uid in wattsOfficers:
        return "Watts"
    elif uid in skullCap:
        return "Skullcap"
    elif uid in austinSeven:
        return "Austin_Seven"
    return "None"
roster['Crew'] = roster.apply (lambda row: crew(row), axis=1)

In [5]:
#Now I want to add a column to the roster for which community each officer belongs to for each category
def returnList(uid, listToCheck):
    index =  [listToCheck.index(x) for x in listToCheck if str(uid) in x]
    if len(index)>0:
        return index[0]
    return "None"

roster['Louvain_0075'] = roster.apply(lambda row: louvain.get(str(row['UID'])), axis = 1)
roster['Greedy'] = roster.apply(lambda row: returnList(row['UID'], greedy), axis = 1)
roster['Label_Propagation'] = roster.apply(lambda row: returnList(row['UID'], lab_prop), axis = 1)
roster['kClique'] = roster.apply(lambda row: returnList(row['UID'], kClique), axis = 1)

In [6]:
def jaccardSimilarity(list1, list2):
    #returns the Jaccard similarity of two lists
    union = len(set(list1+ list2))
    intersection = len([x for x in list1 if x in list2])
    return intersection/union

In [7]:
#We want to know the jaccard indexes of each crew with each method
crews = ["Watts","Austin_Seven","Skullcap","Finnegan"]
def getMode(listy):
    first =  Counter(listy).most_common(1)[0][0]
    if first == "None":
        if len(Counter(listy).most_common(1)[0])>1:
            return Counter(listy).most_common(1)[0][1]
    return first
for index in [14,15,16,17]:
    for crew in crews:
        actualCrew = roster[roster['Crew'] == crew]
        actualUIDs = list(actualCrew['UID'])
        detectedCrew = roster[roster.iloc[:, index]==getMode(actualCrew.iloc[:, index])]
        detectedUIDs = list(detectedCrew['UID'])
        print("Method: ", roster.columns[index], "-", crew, " - Jaccard Similarity: ", jaccardSimilarity(actualUIDs, detectedUIDs))

Method:  Louvain_0075 - Watts  - Jaccard Similarity:  0.5
Method:  Louvain_0075 - Austin_Seven  - Jaccard Similarity:  0.21428571428571427
Method:  Louvain_0075 - Skullcap  - Jaccard Similarity:  0.23076923076923078
Method:  Louvain_0075 - Finnegan  - Jaccard Similarity:  0.13333333333333333
Method:  Greedy - Watts  - Jaccard Similarity:  0.32142857142857145
Method:  Greedy - Austin_Seven  - Jaccard Similarity:  0.03684210526315789
Method:  Greedy - Skullcap  - Jaccard Similarity:  0.0410958904109589
Method:  Greedy - Finnegan  - Jaccard Similarity:  0.057803468208092484
Method:  Label_Propagation - Watts  - Jaccard Similarity:  0.5
Method:  Label_Propagation - Austin_Seven  - Jaccard Similarity:  0.23076923076923078
Method:  Label_Propagation - Skullcap  - Jaccard Similarity:  0.2727272727272727
Method:  Label_Propagation - Finnegan  - Jaccard Similarity:  0.11494252873563218
Method:  kClique - Watts  - Jaccard Similarity:  0.5294117647058824
Method:  kClique - Austin_Seven  - Jaccard

In [8]:
#Louvain was the winner! Now it's time to try out a bunch of resolutions so that we can identify which is the best.
# Spoiler alert: The resolution of 0.006 had the best average jaccard index
louvainRoster = roster[['UID', 'Crew']]
random.seed(1)
louvain05 = community_louvain.best_partition(G, weight = "weight", resolution = 0.05, randomize = 0)
random.seed(1)
louvain02 = community_louvain.best_partition(G, weight = "weight", resolution = 0.02, randomize = 0)
random.seed(1)
louvain01 = community_louvain.best_partition(G, weight = "weight", resolution = 0.01, randomize = 0)
random.seed(1)
louvain0075 = community_louvain.best_partition(G, weight = "weight", resolution = 0.0075, randomize = 0)
random.seed(1)
louvain005 = community_louvain.best_partition(G, weight = "weight", resolution = 0.005, randomize = 0)
random.seed(1)
louvain0025 = community_louvain.best_partition(G, weight = "weight", resolution = 0.0025, randomize = 0)

louvainRoster['Louvain_05'] = roster.apply(lambda row: louvain05.get(str(row['UID'])), axis = 1)
louvainRoster['Louvain_02'] = roster.apply(lambda row: louvain02.get(str(row['UID'])), axis = 1)
louvainRoster['Louvain_01'] = roster.apply(lambda row: louvain01.get(str(row['UID'])), axis = 1)
louvainRoster['Louvain_0075'] = roster.apply(lambda row: louvain0075.get(str(row['UID'])), axis = 1)
louvainRoster['Louvain_005'] = roster.apply(lambda row: louvain005.get(str(row['UID'])), axis = 1)
louvainRoster['Louvain_0025'] = roster.apply(lambda row: louvain0025.get(str(row['UID'])), axis = 1)

for index in [2,3,4,5,6,7]:
    for crew in crews:
        actualCrew = louvainRoster[louvainRoster['Crew'] == crew]
        actualUIDs = list(actualCrew['UID'])
        detectedCrew = louvainRoster[louvainRoster.iloc[:, index]==getMode(actualCrew.iloc[:, index])]
        detectedUIDs = list(detectedCrew['UID'])
        print("Resolution: ", louvainRoster.columns[index], "-", crew, " - Jaccard Similarity: ", jaccardSimilarity(actualUIDs, detectedUIDs))
        
        
#Zoom in between 0.0025 and 0.0075 to identify the actual maximum
random.seed(1)
louvain007 = community_louvain.best_partition(G, weight = "weight", resolution = 0.007, randomize = 0)
random.seed(1)
louvain0065 = community_louvain.best_partition(G, weight = "weight", resolution = 0.0065, randomize = 0)
random.seed(1)
louvain006 = community_louvain.best_partition(G, weight = "weight", resolution = 0.006, randomize = 0)
random.seed(1)
louvain0055 = community_louvain.best_partition(G, weight = "weight", resolution = 0.0055, randomize = 0)
random.seed(1)
louvain0045 = community_louvain.best_partition(G, weight = "weight", resolution = 0.0045, randomize = 0)
random.seed(1)
louvain0040 = community_louvain.best_partition(G, weight = "weight", resolution = 0.0040, randomize = 0)
random.seed(1)
louvain0035 = community_louvain.best_partition(G, weight = "weight", resolution = 0.0035, randomize = 0)
random.seed(1)
louvain0030 = community_louvain.best_partition(G, weight = "weight", resolution = 0.0030, randomize = 0)

louvainRoster['Louvain_007'] = roster.apply(lambda row: louvain007.get(str(row['UID'])), axis = 1)
louvainRoster['Louvain_0065'] = roster.apply(lambda row: louvain0065.get(str(row['UID'])), axis = 1)
louvainRoster['Louvain_006'] = roster.apply(lambda row: louvain006.get(str(row['UID'])), axis = 1)
louvainRoster['Louvain_0055'] = roster.apply(lambda row: louvain0055.get(str(row['UID'])), axis = 1)
louvainRoster['Louvain_0045'] = roster.apply(lambda row: louvain0045.get(str(row['UID'])), axis = 1)
louvainRoster['Louvain_0040'] = roster.apply(lambda row: louvain0040.get(str(row['UID'])), axis = 1)
louvainRoster['Louvain_0035'] = roster.apply(lambda row: louvain0035.get(str(row['UID'])), axis = 1)
louvainRoster['Louvain_0030'] = roster.apply(lambda row: louvain0030.get(str(row['UID'])), axis = 1)


for index in [8,9,10,11,12,13, 14,15]:
    for crew in crews:
        actualCrew = louvainRoster[louvainRoster['Crew'] == crew]
        actualUIDs = list(actualCrew['UID'])
        detectedCrew = louvainRoster[louvainRoster.iloc[:, index]==getMode(actualCrew.iloc[:, index])]
        detectedUIDs = list(detectedCrew['UID'])
        print("Resolution: ", louvainRoster.columns[index], "-", crew, " - Jaccard Similarity: ", jaccardSimilarity(actualUIDs, detectedUIDs))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

Resolution:  Louvain_05 - Watts  - Jaccard Similarity:  0.5
Resolution:  Louvain_05 - Austin_Seven  - Jaccard Similarity:  0.10256410256410256
Resolution:  Louvain_05 - Skullcap  - Jaccard Similarity:  0.2727272727272727
Resolution:  Louvain_05 - Finnegan  - Jaccard Similarity:  0.3125
Resolution:  Louvain_02 - Watts  - Jaccard Similarity:  0.5
Resolution:  Louvain_02 - Austin_Seven  - Jaccard Similarity:  0.21052631578947367
Resolution:  Louvain_02 - Skullcap  - Jaccard Similarity:  0.3
Resolution:  Louvain_02 - Finnegan  - Jaccard Similarity:  0.25
Resolution:  Louvain_01 - Watts  - Jaccard Similarity:  0.5
Resolution:  Louvain_01 - Austin_Seven  - Jaccard Similarity:  0.1875
Resolution:  Louvain_01 - Skullcap  - Jaccard Similarity:  0.23076923076923078
Resolution:  Louvain_01 - Finnegan  - Jaccard Similarity:  0.11764705882352941
Resolution:  Louvain_0075 - Watts  - Jaccard Similarity:  0.5
Resolution:  Louvain_0075 - Austin_Seven  - Jaccard Similarity:  0.4
Resolution:  Louvain_007

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

Resolution:  Louvain_007 - Watts  - Jaccard Similarity:  0.5
Resolution:  Louvain_007 - Austin_Seven  - Jaccard Similarity:  0.36363636363636365
Resolution:  Louvain_007 - Skullcap  - Jaccard Similarity:  0.23076923076923078
Resolution:  Louvain_007 - Finnegan  - Jaccard Similarity:  0.13333333333333333
Resolution:  Louvain_0065 - Watts  - Jaccard Similarity:  0.5
Resolution:  Louvain_0065 - Austin_Seven  - Jaccard Similarity:  0.25
Resolution:  Louvain_0065 - Skullcap  - Jaccard Similarity:  0.23076923076923078
Resolution:  Louvain_0065 - Finnegan  - Jaccard Similarity:  0.13333333333333333
Resolution:  Louvain_006 - Watts  - Jaccard Similarity:  0.5
Resolution:  Louvain_006 - Austin_Seven  - Jaccard Similarity:  0.36363636363636365
Resolution:  Louvain_006 - Skullcap  - Jaccard Similarity:  0.23076923076923078
Resolution:  Louvain_006 - Finnegan  - Jaccard Similarity:  0.05263157894736842
Resolution:  Louvain_0055 - Watts  - Jaccard Similarity:  0.5
Resolution:  Louvain_0055 - Austin

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [211]:
#Since the Louvain Resolution of 0.006 performs the best, that is what we will use moving forward. We return to the roster and add a column for that louvain community called "Community_ID". I also want to drop the existing Louvain column
roster = roster.drop(columns=['Louvain_0075'])
roster['Community_ID'] = roster.apply(lambda row: louvain006.get(str(row['UID'])), axis = 1)
roster.Community_ID = roster.Community_ID.fillna('None')

In [217]:
#Now I want to export this dataset
roster.to_csv("../Datasets/Officer_Community_Assignments.csv", index = False)