In [1]:
import os
import sys
# Put the parent directory of the notebook's working directory on sys.path
# (presumably so the project-local `BuildDB` package can be imported below).
module_path = os.path.abspath(os.pardir)
# Parenthesised single-argument print behaves the same on Python 2 and 3.
print(module_path)
# Guard against appending the same entry again when the cell is re-run.
if module_path not in sys.path:
    sys.path.append(module_path)

/home/ryan/Projects/sa_voting


In [2]:
from BuildDB.build_db import mongo_connect
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
# Show full column contents instead of truncating long strings in output.
# NOTE(review): -1 is the legacy "no limit" spelling; newer pandas releases
# expect None here -- confirm the installed pandas version before changing it.
pd.set_option('display.max_colwidth', -1)

In [3]:
def get_conversation(sample):
    """Flatten one sampled document's conversation events into a DataFrame.

    Parameters
    ----------
    sample : dict
        Must provide a 'conversations' entry: an iterable of mappings, each
        holding the keys 'content', 'from_addr', 'to_addr', 'transport_type',
        'transport_name', 'timestamp' and 'session_event'.

    Returns
    -------
    pandas.DataFrame
        One row per event, in input order, indexed by 'timestamp' with the
        remaining six fields as columns. An empty 'conversations' list gives
        an empty frame with those same columns.

    Raises
    ------
    KeyError
        If 'conversations' is missing from ``sample`` or an event lacks one
        of the expected keys.
    """
    header = ['content', 'from_addr', 'to_addr', 'transport_type',
              'transport_name', 'timestamp', 'session_event']

    # Collect all rows first and build the frame once: appending to a
    # DataFrame inside the loop copies the whole frame on every iteration
    # and relies on DataFrame.append, which newer pandas no longer provides.
    records = [[event[field] for field in header]
               for event in sample['conversations']]

    return pd.DataFrame(records, columns=header).set_index('timestamp')


def alpha(conversation_df):
    """Summarise responses per closed session.

    Counts rows whose 'session_event' is 'resume' (responses) and rows whose
    'session_event' is 'close' (sessions).

    Returns
    -------
    tuple
        ``(responses / sessions, sessions)`` as floats, or the integer pair
        ``(0, 0)`` when no 'close' event is present.
    """
    def count_events(kind):
        # Number of rows carrying the given session_event value.
        return len(conversation_df[conversation_df['session_event'] == kind])

    responses = count_events('resume')
    sessions = count_events('close')

    # Without a closed session the ratio is undefined; report zeros.
    if not sessions:
        return 0, 0

    sessions = float(sessions)
    return responses / sessions, sessions


def get_total_time_in_system(df, time_format=None):
    """Whole seconds between the first 'new' session event and the last row.

    Parameters
    ----------
    df : pandas.DataFrame
        Conversation frame with a 'session_event' column and an index of
        timestamp strings.
    time_format : str, optional
        ``strptime`` pattern for the index values. Defaults to the
        module-level ``TIME_STAMP_FORMAT``.

    Returns
    -------
    int
        Elapsed seconds (fractional part dropped), or 0 when the frame has
        no 'new' event.
    """
    if time_format is None:
        time_format = TIME_STAMP_FORMAT

    new_events = df[df['session_event'] == 'new']
    if len(new_events) == 0:
        return 0

    # start is the first instance we observe a new connection
    start = datetime.strptime(new_events.index[0], time_format)
    # finish is the very last row of the frame
    finish = datetime.strptime(df.index[-1], time_format)

    # timedelta.seconds only holds the within-day remainder (0..86399), so
    # spans longer than a day would wrap; total_seconds() includes the days.
    return int((finish - start).total_seconds())


def beta(conversation_df):
    """Return ``(total_time_interacting, total_time_in_system)`` in seconds.

    ``total_time_in_system`` comes from ``get_total_time_in_system``.
    ``total_time_interacting`` is the sum, over sessions, of the whole
    seconds between a 'new' event and the next 'close' event. A 'close'
    with no preceding open 'new' is ignored; a repeated 'new' before a
    'close' restarts the session clock.

    Parameters
    ----------
    conversation_df : pandas.DataFrame
        Frame with a 'session_event' column whose index values are timestamp
        strings matching ``TIME_STAMP_FORMAT``.

    Returns
    -------
    tuple
        ``(float, int)`` -- interacting time and time in system.
    """
    total_time_in_system = get_total_time_in_system(conversation_df)
    total_time_interacting = 0.0
    start_time = None
    in_session = False

    for timestamp, row in conversation_df.iterrows():
        event = row['session_event']

        if event == 'new':
            start_time = datetime.strptime(timestamp, TIME_STAMP_FORMAT)
            in_session = True
        elif event == 'close' and in_session:
            finish_time = datetime.strptime(timestamp, TIME_STAMP_FORMAT)
            # total_seconds() keeps the day component that timedelta.seconds
            # silently drops; int() preserves the whole-second granularity.
            total_time_interacting += int((finish_time - start_time).total_seconds())
            in_session = False

    return total_time_interacting, total_time_in_system

def surfing(conversation_df, address):
    """Tell whether messages went to more than one distinct other recipient.

    Looks at the 'to_addr' column, discards entries equal to ``address`` and
    the literal string 'None', and returns True when more than one distinct
    value remains, False otherwise.
    """
    recipients = conversation_df['to_addr']
    others = recipients[recipients != address]
    distinct_others = set(others[others != u'None'])
    return len(distinct_others) > 1

In [4]:
# Notebook-wide constants.
# CONNECT: the connection factory imported from BuildDB.build_db; calling it
# is expected to return four values (see the cell that unpacks CONNECT()).
CONNECT = mongo_connect
# strptime pattern used by the helper functions to parse the timestamp
# strings that index each conversation frame.
TIME_STAMP_FORMAT = "%Y-%m-%d %H:%M:%S.%f"

In [5]:
# Open the connection. CONNECT() yields four handles; only `conn`,
# `conversation` and `raw` are used in this cell.
conn, db, conversation, raw = CONNECT()
try:
    # if you want to make queries to the db, do it here
    # Draw 1000 documents from `conversation` through a $sample aggregation
    # stage and materialise them so they outlive the connection.
    random_sample = list(conversation.aggregate([ { '$sample': { 'size': 1000 } } ]))
    # Distinct 'session_event' values present in `raw`.
    types_of_events = raw.distinct('session_event')
finally:
    # Always release the connection, even if one of the queries raised.
    conn.close()

In [6]:
# Print the per-address metrics for every sampled document. Each print takes
# a single parenthesised argument so the output is the same on Python 2 and
# the cell also parses on Python 3.
for sample in random_sample:
    print("address: {}".format(sample['poi']))
    conversation_data_frame = get_conversation(sample)
    print(len(conversation_data_frame))
    print(alpha(conversation_data_frame))
    print(beta(conversation_data_frame))
    print(surfing(conversation_data_frame, sample['poi']))

address: m40389497002
38
(0.0, 3.0)
(0.0, 0)
False
address: +27724160243
66
(5.333333333333333, 3.0)
(360.0, 84267)
True
address: m55853062002
39
(0, 0)
(0.0, 0)
False
address: m62034846002
54
(0.0, 1.0)
(0.0, 0)
False
address: +27790752108
36
(3.0, 1.0)
(120.0, 29233)
True
address: +27712325723
9
(4.0, 1.0)
(76.0, 76)
False
address: +27796666712
8
(2.0, 1.0)
(56.0, 66059)
False
address: +27815198172
145
(4.7, 10.0)
(833.0, 27520)
True
address: m18068402002
58
(0, 0)
(0.0, 0)
False
address: +27797247289
92
(0.3157894736842105, 19.0)
(671.0, 31601)
True
address: +27793130535
33
(4.0, 1.0)
(119.0, 2843)
False
address: m52264790002
162
(0.0, 1.0)
(0.0, 0)
False
address: +27820942707
6
(1.0, 1.0)
(119.0, 38870)
False
address: +27791836734
35
(3.0, 1.0)
(105.0, 5593)
False
address: +27713962222
95
(1.5, 12.0)
(825.0, 68563)
True
address: +27729533084
6
(1.0, 1.0)
(119.0, 17618)
False
address: m26141289002
122
(0.0, 2.0)
(0.0, 0)
False
address: +27763222169
5
(0.0, 1.0)
(119.0, 72935)
False


KeyboardInterrupt: 

In [8]:
import csv
# Export one row of metrics per sampled document to out.csv.
with open('out.csv', 'wb') as csvfile:
    metrics_writer = csv.writer(
        csvfile,
        delimiter=',',
        quotechar='|',
        quoting=csv.QUOTE_MINIMAL,
    )
    # Header row: a1/a2 come from alpha(), b1/b2 from beta().
    metrics_writer.writerow(['id', 'length', 'a1', 'a2', 'b1', 'b2', 'surfing'])

    for sample in random_sample:
        poi = sample['poi']
        conversation_data_frame = get_conversation(sample)
        n_events = len(conversation_data_frame)
        a1, a2 = alpha(conversation_data_frame)
        b1, b2 = beta(conversation_data_frame)
        is_surfing = surfing(conversation_data_frame, poi)
        metrics_writer.writerow([poi, n_events, a1, a2, b1, b2, is_surfing])
