In [1]:
from py2neo import Graph, authenticate, Relationship
import csv
import pandas as pd

In [6]:
authenticate("localhost:7474", "neo4j", "admin")

In [7]:
graph = Graph()

In [8]:
allData = pd.DataFrame.from_csv("./data.csv")
len(allData)

4159802

In [9]:
# Clean not well parsed lines  
allData = allData.dropna()  
len(allData)

4150011

In [10]:
# Only  686502 loaded in Neo4j
df = allData[:686502]

In [11]:
df.head()

Unnamed: 0,id,month,day,hour,AccessPoint,status,Message,action,Building Type,MAC address
0,986990216,Apr,11.0,07:56:56,AdmBldg19AP3,(Info):,Station 0040961e58be Authenticated\r\n,authenticated,Admin,0040961e58be
1,986990247,Apr,11.0,07:57:27,AdmBldg19AP3,(Info):,Station 0040961e58be Authenticated\r\n,authenticated,Admin,0040961e58be
2,986990247,Apr,11.0,07:57:27,AdmBldg19AP3,(Info):,Station 0040961e58be Associated\r\n,associated,Admin,0040961e58be
3,986990293,Apr,11.0,07:58:13,AdmBldg19AP3,(Info):,Station 0040961e58be Authenticated\r\n,authenticated,Admin,0040961e58be
4,986990364,Apr,11.0,07:59:24,AdmBldg19AP3,(Info):,Station 0040961e58be Authenticated\r\n,authenticated,Admin,0040961e58be


In [12]:
import datetime
import calendar
months = {v: k for k,v in enumerate(calendar.month_abbr)}
dt = []
for index, row in df.iterrows():
    dt.append(datetime.datetime(year=2017, month=months.get(row["month"]), day=int(row["day"]),
                           hour=int(row["hour"][0:2]), minute=int(row["hour"][3:5]), second=int(row["hour"][7:])))
    
df["datetime"] = dt

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [13]:
df.head()

Unnamed: 0,id,month,day,hour,AccessPoint,status,Message,action,Building Type,MAC address,datetime
0,986990216,Apr,11.0,07:56:56,AdmBldg19AP3,(Info):,Station 0040961e58be Authenticated\r\n,authenticated,Admin,0040961e58be,2017-04-11 07:56:06
1,986990247,Apr,11.0,07:57:27,AdmBldg19AP3,(Info):,Station 0040961e58be Authenticated\r\n,authenticated,Admin,0040961e58be,2017-04-11 07:57:07
2,986990247,Apr,11.0,07:57:27,AdmBldg19AP3,(Info):,Station 0040961e58be Associated\r\n,associated,Admin,0040961e58be,2017-04-11 07:57:07
3,986990293,Apr,11.0,07:58:13,AdmBldg19AP3,(Info):,Station 0040961e58be Authenticated\r\n,authenticated,Admin,0040961e58be,2017-04-11 07:58:03
4,986990364,Apr,11.0,07:59:24,AdmBldg19AP3,(Info):,Station 0040961e58be Authenticated\r\n,authenticated,Admin,0040961e58be,2017-04-11 07:59:04


# Fill Neo4j Database

In [None]:
# Create MAC addresses nodes
for add in df["MAC address"].unique():
    graph.run('''
    CREATE (MAC: address {name:{N}})
    RETURN MAC    
    ''', { "N": add})    

In [17]:
# Mapping AccessPoint:Building

import numpy as np

temp = []
for ap, bu in zip(df["AccessPoint"], df["Building Type"]) : 
    temp.append([ap,bu])
accessPoint_and_buildingType = [list(x) for x in set(tuple(x) for x in temp)]
accessPoint_and_buildingType

[['ResBldg93AP2', 'Residential'],
 ['LibBldg1AP8', 'Library'],
 ['AcadBldg34AP1', 'Academic'],
 ['ResBldg56AP11', 'Residential'],
 ['AcadBldg8AP5', 'Academic'],
 ['AdmBldg7AP1', 'Admin'],
 ['ResBldg94AP6', 'Residential'],
 ['AdmBldg12AP2', 'Admin'],
 ['AdmBldg14AP1', 'Admin'],
 ['AcadBldg9AP1', 'Academic'],
 ['LibBldg3AP2', 'Library'],
 ['ResBldg23AP2', 'Residential'],
 ['AcadBldg18AP5', 'Academic'],
 ['ResBldg25AP1', 'Residential'],
 ['ResBldg13AP2', 'Residential'],
 ['ResBldg55AP4', 'Residential'],
 ['AcadBldg30AP6', 'Academic'],
 ['ResBldg33AP1', 'Residential'],
 ['AcadBldg10AP15', 'Academic'],
 ['ResBldg23AP1', 'Residential'],
 ['ResBldg21AP2', 'Residential'],
 ['AcadBldg2AP1', 'Academic'],
 ['ResBldg15AP2', 'Residential'],
 ['AcadBldg33AP1', 'Academic'],
 ['ResBldg30AP2', 'Residential'],
 ['ResBldg20AP4', 'Residential'],
 ['ResBldg93AP1', 'Residential'],
 ['SocBldg4AP1', 'Social'],
 ['ResBldg58AP1', 'Residential'],
 ['AcadBldg26AP6', 'Academic'],
 ['AdmBldg24AP2', 'Admin'],
 ['Aca

In [None]:
# Create AP
for line in accessPoint_and_buildingType:
    graph.run('''
    CREATE (AP:AccessPoint {buildingType:{BU}, nameAP:{N}})
    RETURN AP    
    ''', {"N": line[0], "BU": line[1]})

In [None]:
# Create relationships between MAC addresses and AP

for index, row in df.iterrows():
    (
        graph.run('''
        MATCH (MAC) , (AP)           
        WHERE MAC.name = $name and AP.nameAP = $nameAP 
        CREATE (MAC)-[action:RELTYPE{action:$action, status:$status, date:$datetime, message:$message}]->(AP)
        RETURN action
        ''', parameters = {'name': row['MAC address'], 'nameAP': row['AccessPoint'],
                           'action': row['action'], 'status': row['status'],
                           'datetime': str(row['datetime']), 'message': row['Message'] }).data()
    )

# Database requests and aggregating results

In [18]:
#  List of MAC addresses
users =(
        graph.run('''
        MATCH (MAC:address)           
        RETURN MAC.name
        ''').data()
    )

In [21]:
users[0]["MAC.name"]

u'0040961e58be'

In [23]:
# transform to a list of  
usernames = [item.values() for item in users]
usernames = [sublist[0] for sublist in usernames]
print usernames

[u'0040961e58be', u'nothing', u'00601db0635a', u'00409699dfba', u'004096daa8fe', u'00409630cdc9', u'004096342c16', u'00409699915b', u'00409664173d', u'004096a0b921', u'003065343950', u'003065b40dda', u'003065fc1494', u'0040968cf0ea', u'0030650c0321', u'0040962dc86b', u'0030655b9345', u'00409622cb2b', u'003065e8f0ca', u'00022db6cc78', u'004096d06bb2', u'004096da8a8e', u'00601d9bf252', u'00306526784b', u'003065708c70', u'004096ff6784', u'003065d55e08', u'00409649fe8d', u'00306506fde2', u'004096229b1c', u'0030655b8297', u'0040964fb1f3', u'0030651794b6', u'004096828008', u'0040963582b8', u'003065f88ab1', u'00409616fc02', u'003065901355', u'003065e2d2b5', u'0030659f0025', u'00601df57e26', u'003065c010fb', u'004096f20d21', u'003065475356', u'003065e79136', u'00601d9a221b', u'00306582237e', u'00306572a9d8', u'0030659970a6', u'004096aec485', u'003065e0de26', u'004096c99fa5', u'00409653f0d3', u'0040966fb204', u'004096906d36', u'00409696d938', u'004096fde907', u'004096fee63e', u'00601d29a832', u

In [24]:
# How many differents Access Points for a given MAC address ?
nbAPConnected = []
for name in usernames :
    nbAPConnected.append(
    
    (
        graph.run('''
        MATCH (MAC)-[r]->(AP)   
        WHERE MAC.name=$name
        RETURN count(DISTINCT AP.nameAP)
        ''', parameters = {'name': name}).data()
    )
    )

In [25]:
nbAPConnected[0]

[{u'count(DISTINCT AP.nameAP)': 19}]

In [26]:
nbAPConnected = [item[0] for item in nbAPConnected]
print nbAPConnected

[[{u'count(DISTINCT AP.nameAP)': 19}], [{u'count(DISTINCT AP.nameAP)': 8}], [{u'count(DISTINCT AP.nameAP)': 2}], [{u'count(DISTINCT AP.nameAP)': 10}], [{u'count(DISTINCT AP.nameAP)': 6}], [{u'count(DISTINCT AP.nameAP)': 4}], [{u'count(DISTINCT AP.nameAP)': 25}], [{u'count(DISTINCT AP.nameAP)': 107}], [{u'count(DISTINCT AP.nameAP)': 106}], [{u'count(DISTINCT AP.nameAP)': 155}], [{u'count(DISTINCT AP.nameAP)': 3}], [{u'count(DISTINCT AP.nameAP)': 2}], [{u'count(DISTINCT AP.nameAP)': 23}], [{u'count(DISTINCT AP.nameAP)': 23}], [{u'count(DISTINCT AP.nameAP)': 11}], [{u'count(DISTINCT AP.nameAP)': 5}], [{u'count(DISTINCT AP.nameAP)': 1}], [{u'count(DISTINCT AP.nameAP)': 9}], [{u'count(DISTINCT AP.nameAP)': 27}], [{u'count(DISTINCT AP.nameAP)': 3}], [{u'count(DISTINCT AP.nameAP)': 20}], [{u'count(DISTINCT AP.nameAP)': 7}], [{u'count(DISTINCT AP.nameAP)': 8}], [{u'count(DISTINCT AP.nameAP)': 78}], [{u'count(DISTINCT AP.nameAP)': 10}], [{u'count(DISTINCT AP.nameAP)': 8}], [{u'count(DISTINCT AP

In [27]:
nbAP = [item.values() for item in nbAPConnected]  
nbAP = [sublist[0] for sublist in nbAP]
# Transform Long to Int in order to use these data in np matrix later  
nbAP = [np.asscalar(np.int16(val)) for val in nbAP]
print nbAP

[19, 8, 2, 10, 6, 4, 25, 107, 106, 155, 3, 2, 23, 23, 11, 5, 1, 9, 27, 3, 20, 7, 8, 78, 10, 8, 12, 7, 34, 4, 59, 21, 66, 45, 77, 5, 11, 31, 81, 3, 6, 1, 5, 2, 14, 1, 20, 54, 3, 2, 5, 29, 39, 26, 8, 4, 8, 30, 9, 2, 129, 3, 4, 11, 6, 66, 2, 32, 10, 4, 11, 10, 39, 52, 9, 7, 2, 52, 2, 80, 18, 4, 37, 1, 5, 4, 7, 3, 74, 6, 1, 1, 17, 9, 5, 8, 40, 100, 2, 4, 6, 7, 5, 4, 37, 7, 6, 1, 10, 4, 1, 5, 3, 5, 2, 46, 4, 8, 2, 4, 12, 4, 2, 10, 14, 15, 2, 6, 32, 3, 3, 1, 6, 5, 8, 3, 44, 59, 15, 25, 11, 8, 16, 24, 14, 13, 29, 18, 17, 31, 7, 2, 21, 2, 25, 10, 62, 9, 16, 1, 16, 12, 2, 9, 25, 37, 21, 46, 3, 1, 9, 44, 7, 2, 6, 9, 13, 17, 46, 1, 35, 5, 24, 6, 16, 18, 20, 12, 1, 36, 13, 7, 10, 2, 16, 3, 22, 8, 4, 9, 21, 1, 5, 10, 2, 13, 20, 7, 7, 8, 1, 28, 3, 14, 12, 2, 13, 8, 2, 5, 1, 3, 10, 7, 7, 10, 2, 1, 14, 15, 44, 25, 13, 1, 8, 5, 30, 2, 22, 4, 2, 6, 9, 28, 16, 3, 2, 50, 16, 9, 7, 1, 12, 1, 2, 9, 14, 13, 15, 2, 4, 3, 21, 1, 68, 15, 18, 37, 6, 1, 4, 3, 5, 21, 12, 2, 1, 13, 44, 12, 5, 2, 7, 3, 20, 2, 22, 3,

In [30]:
# As only 686502 on 4159802 log lines have been inserted in Neo4j, some MAC addresses are not implied in a connexion with an AP   
print len(usernames)
print len(nbAP)
nonZero = len([i for i, e in enumerate(nbAP) if e != 0])
nonZero

1916
1916


425

# Create a results DataFrame in order to store aggregating results

In [None]:
results = pd.DataFrame({'MAC': usernames,
     'nbAPConnected': nbAP})

In [33]:
results.head()

Unnamed: 0,MAC,nbAPConnected
0,0040961e58be,19
1,nothing,8
2,00601db0635a,2
3,00409699dfba,10
4,004096daa8fe,6


In [35]:
admin, academic, social, library, residential = [], [], [], [], []

for name in usernames :
    admin.append(( 
        graph.run('''
        MATCH (MAC)-[r]->(AP)   
        WHERE MAC.name=$name AND AP.buildingType = "Admin"
        RETURN MAC.name
        ''', parameters = {'name': name}).data()
    ))
    academic.append(( 
        graph.run('''
        MATCH (MAC)-[r]->(AP)   
        WHERE MAC.name=$name AND AP.buildingType = "Academic"
        RETURN MAC.name
        ''', parameters = {'name': name}).data()
    ))
    social.append(( 
        graph.run('''
        MATCH (MAC)-[r]->(AP)   
        WHERE MAC.name=$name AND AP.buildingType = "Social"
        RETURN MAC.name
        ''', parameters = {'name': name}).data()
    ))
    library.append(( 
        graph.run('''
        MATCH (MAC)-[r]->(AP)   
        WHERE MAC.name=$name AND AP.buildingType = "Library"
        RETURN MAC.name
        ''', parameters = {'name': name}).data()
    ))
    residential.append(( 
        graph.run('''
        MATCH (MAC)-[r]->(AP)   
        WHERE MAC.name=$name AND AP.buildingType = "Residential"
        RETURN MAC.name
        ''', parameters = {'name': name}).data()
    ))

In [37]:
from numpy import array
results["coToAdmin"] =  array([bool(len(item)) for item in admin ]).astype(int)
results["coToAcademic"] = array([bool(len(item)) for item in academic ]).astype(int) 
results["coToSocial"] = array([bool(len(item)) for item in social ]).astype(int) 
results["coToLibrary"] = array([bool(len(item)) for item in library ]).astype(int) 
results["coToResidential"] = array([bool(len(item)) for item in residential]).astype(int) 

In [38]:
# Users only connected to 'Academic' or 'Admin' buildings

AcaOrAdm = []
for name in usernames :
    AcaOrAdm.append(
    
    ( 
        graph.run('''
        MATCH (MAC)-[r]->(AP)   
        WHERE MAC.name=$name AND AP.buildingType IN ["Library", "Residential", "Social"]
        RETURN MAC.name
        ''', parameters = {'name': name}).data()
    )
    )

isOnlyAcaOrAdm = array([not bool(len(item)) for item in AcaOrAdm ]).astype(int)
results["only Academic or Admin"] = isOnlyAcaOrAdm

In [39]:
results.head()

Unnamed: 0,MAC,nbAPConnected,coToAdmin,coToAcademic,coToSocial,coToLibrary,coToResidential,only Academic or Admin
0,0040961e58be,19,1,1,0,1,1,0
1,nothing,8,1,1,0,1,0,0
2,00601db0635a,2,1,0,0,1,0,0
3,00409699dfba,10,1,1,0,1,0,0
4,004096daa8fe,6,0,1,0,0,0,1


# Handle Dates

In [40]:
dates = df[(df["MAC address"] == usernames[0]) & (df.action!= "deauthenticating")].datetime.tolist()
dates[0]

Timestamp('2017-04-11 07:56:06')

In [41]:
dates[0].to_pydatetime()

datetime.datetime(2017, 4, 11, 7, 56, 6)

In [46]:
from datetime import datetime

allWe = []
allDates = []
isWe = []
isNight = []
for user in usernames :
    dates = df[(df["MAC address"] == user) & (df.action!= "deauthenticating")].datetime.tolist()
    weekend = []
    night = []
    for date in dates :
        weekno = date.to_pydatetime().weekday()
        hour = date.to_pydatetime().hour
        weekend.append(weekno>=5)
        night.append(hour<8 or hour>19)
 #       night.append(hour<7 or hour>20)
    allWe.append(weekend)
    d= {x:weekend.count(x) for x in weekend}
    resTrue = 0.0
    resFalse = 1.0
    if  d.get(True) != None :
        resTrue = d.get(True)
    if  d.get(False) != None :
        resFalse = d.get(False)
    isWe.append(resTrue / resFalse)
    allDates.append(night)
    
   # print {x:night.count(x) for x in night}
    
    d= {x:night.count(x) for x in night}
    resTrue = 0.0
    resFalse = 1.0
    if  d.get(True) != None :
        resTrue = d.get(True)
    if  d.get(False) != None :
        resFalse = d.get(False)
  #  print resTrue / resFalse
    isNight.append(resTrue / resFalse)
    


In [47]:
results["is Night"]= isNight
results["is Weekend"] = isWe

In [48]:
results.head()

Unnamed: 0,MAC,nbAPConnected,coToAdmin,coToAcademic,coToSocial,coToLibrary,coToResidential,only Academic or Admin,is Night,is Weekend
0,0040961e58be,19,1,1,0,1,1,0,0.0,0.0
1,nothing,8,1,1,0,1,0,0,0.0,0.0
2,00601db0635a,2,1,0,0,1,0,0,0.0,0.0
3,00409699dfba,10,1,1,0,1,0,0,0.0,0.0
4,004096daa8fe,6,0,1,0,0,0,1,0.0,0.0


In [49]:
results.to_csv("./results.csv")