In [1]:
import os
import re

import numpy as np
import pandas as pd

from graph import *

In [2]:
# configurações de exibição das tabelas
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', 50)

# Lendo os arquivos

In [3]:
# listando os diretórios de usuários
base_path = './data/enron-sample/'
dirs = os.listdir('./data/enron-sample')

In [4]:
# criando uma lista com os arquivos
# (user_path, dirs, (path, dirs, files))

files = []
for dir in dirs:
    files.append(list(os.walk(base_path + dir)))

# Criando um `DataFrame`

In [5]:
# criando um df a partir dos emails

df = pd.DataFrame()

complete_paths = []
contents = []
for i in range(len(dirs)):
    for k in range(len(files[i])):
        user_dir_path = files[i][k][0].replace('\\', '/')
        data_files = files[i][k][2]

        for file_name in data_files:
            complete_path = user_dir_path + '/' + file_name

            with open(complete_path, 'r') as f:
                content = f.read()

            complete_paths.append(complete_path)
            contents.append(content)

df['path'] = complete_paths
df['content'] = contents

In [6]:
# extraindo emails de remetente e destinatário
df['from'] = df['content'].str.extract('From: (.*)\n')
df['to'] = df['content'].str.extract('To: (.*)\n')

# limpando a str dos emails
df['from'] = df['from'].str.strip()
df['to'] = df['to'].str.strip()

# substituindo emails nulos por np.nan
df['from'] = df['from'].replace('', np.nan)
df['from'] = df['from'].replace(' ', np.nan)
df['to'] = df['to'].replace('', np.nan)
df['to'] = df['to'].replace(' ', np.nan)

replace_nan = lambda x: x.replace('..', '.') if str(x) != 'nan' else x

df['from'] = df['from'].map(replace_nan)
df['to'] = df['to'].map(replace_nan)

In [7]:
df.head()

Unnamed: 0,path,content,from,to
0,./data/enron-sample/brawner-s/all_documents/1,Message-ID: <9225178.1075856114648.JavaMail.ev...,tdoremus@tfsbrokers.com,
1,./data/enron-sample/brawner-s/all_documents/10,Message-ID: <24241310.1075856114864.JavaMail.e...,jons@amerexenergy.com,sandra.f.brawner@enron.com
2,./data/enron-sample/brawner-s/all_documents/100,Message-ID: <25430194.1075856121754.JavaMail.e...,enron.announcements@enron.com,all.houston@enron.com
3,./data/enron-sample/brawner-s/all_documents/101,Message-ID: <22484553.1075856121777.JavaMail.e...,sandra.brawner@enron.com,kennethbrawner@msn.com
4,./data/enron-sample/brawner-s/all_documents/102,Message-ID: <7884756.1075856121799.JavaMail.ev...,sandra.brawner@enron.com,kennethbrawner@msn.com


In [8]:
df.shape

(12517, 4)

In [9]:
# criando uma lista de emails únicos

unique_emails_from = df['from'].unique()
unique_emails_to = df['to'].unique()

unique_emails = np.append(unique_emails_from, unique_emails_to)

print(len(unique_emails))
unique_emails[:10]

3191


array(['tdoremus@tfsbrokers.com', 'jons@amerexenergy.com',
       'enron.announcements@enron.com', 'sandra.brawner@enron.com',
       'perfmgmt@enron.com', 'jared.kaiser@enron.com',
       'michael.garberding@enron.com', 'outlook.team@enron.com',
       'jeffrey.shankman@enron.com', 'airam.arteaga@enron.com'],
      dtype=object)

# 1. Criando o grafo

In [10]:
# 1. criando o grafo

graph = Graph()

for email in unique_emails:

    # cria um caso não exista um vértice do email
    if email not in list(graph.adjacency_matrix.keys()):
        graph.add_vertex(email)

for row in df.iterrows():
    row_data = row[1]
    
    email_from = row_data['from']
    email_to = row_data['to']

    # caso ambos emails existam
    if str(email_from) != 'nan' and str(email_to) != 'nan':

        # se a aresta já existir, recria aumentando o peso
        if graph.has_edge(email_from, email_to):
            weight = graph.weight(email_from, email_to)

            graph.remove_edge(email_from, email_to)
            graph.add_edge(email_from, email_to, weight+1, on_error='pass')

        # cria a aresta se essa não existir
        else:
            graph.add_edge(email_from, email_to, 1)

In [11]:
matrix = graph.get_adjacency_matrix()

# primeiros 10 contatos de 'sandra.brawner@enron.com'
list(zip(list(matrix['sandra.brawner@enron.com'].keys()),
         list(list(matrix['sandra.brawner@enron.com'].values()))))[:10]

[('andrea.ring@enron.com', 8),
 ('parking.transportation@enron.com', 3),
 ('jons@amerexyenergy.com', 4),
 ('rrussin@natsource.com', 3),
 ('ina.rangel@enron.com', 3),
 ('louis.dicarlo@enron.com', 3),
 ('ksmalek@aep.com', 3),
 ('tk.lohman@enron.com', 6),
 ('john.griffith@enron.com', 4),
 ('david.delainey@enron.com', 4)]

# 2. Informações do grafo

In [12]:
# 2. a) número de vértices do grafo

print(graph.nvertex)
print(len(graph.adjacency_matrix))

2836
2836


In [13]:
# 2. b) número de arestas do grafo
graph.nedges

2847

In [14]:
# 20 nós (emails) de maior grau

nodes = list(graph.adjacency_matrix.keys())

degrees = []
for node in nodes:
    degrees.append(graph.degree(node))

degrees = pd.Series(data=degrees, index=nodes)
degrees.sort_values(ascending=False)[:20]

drew.fossum@enron.com           308
darron.giron@enron.com          285
james.derrick@enron.com         258
mike.carson@enron.com           166
lindy.donoho@enron.com          155
martin.cuilla@enron.com         111
c.giron@enron.com                99
f.brawner@enron.com              46
sandra.brawner@enron.com         44
mcuilla@enron.com                42
no.address@enron.com             40
shelley.corman@enron.com         22
mcuilla@ect.enron.com            21
40enron@enron.com                21
audrey.robertson@enron.com       21
dennis.lee@enron.com             20
victor.lamadrid@enron.com        20
powerprices@amerexenergy.com     19
lorna.brennan@enron.com          16
a.howard@enron.com               16
dtype: int64

In [15]:
# graph.depth_first_search(start='jons@amerexenergy.com',
#                          end='the.mailout@enron.com')

In [16]:
# graph.breadth_first_search(start='jons@amerexenergy.com',
#                            end='the.mailout@enron.com')

In [17]:
# path, cost = graph.dijkstra(start='andrea.ring@enron.com',
#                             end='darron.giron@enron.com')