In [1]:
from apps.mongo_connect import mongo_connect
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

In [2]:
# constants here
# Connection factory; the query cell unpacks its return value as
# (conn, db, conversation, raw).
CONNECT = mongo_connect
# strptime format of the message timestamps, e.g. '2014-05-23 13:36:07.120457'.
TIME_STAMP_FORMAT = "%Y-%m-%d %H:%M:%S.%f"

In [3]:
def get_conversation(sample):
    """Flatten one sampled document into a DataFrame of its messages.

    Parameters
    ----------
    sample : dict
        Document holding a ``'conversations'`` list. Every entry must be a
        mapping that provides each of the fields named in ``header`` below;
        any additional fields are ignored.

    Returns
    -------
    pandas.DataFrame
        One row per message, in the order of ``sample['conversations']``,
        indexed by ``timestamp`` with the six remaining fields as columns.
        An empty conversation list yields an empty frame with those columns.

    Raises
    ------
    KeyError
        If ``sample`` has no ``'conversations'`` entry or a message lacks one
        of the expected fields.
    """
    header = ['content', 'from_addr', 'to_addr', 'transport_type',
              'transport_name', 'timestamp', 'session_event']

    # Collect every row first and build the frame once: growing a DataFrame
    # inside the loop is quadratic and relies on DataFrame.append, which
    # recent pandas releases no longer provide.
    rows = [[message[field] for field in header]
            for message in sample['conversations']]

    return pd.DataFrame(rows, columns=header).set_index('timestamp')

### We start off by getting a random_sample and the different types of session events from the db
### PLACE ALL QUERIES TO DB HERE

In [4]:
# CONNECT() is unpacked into a connection handle plus three further handles;
# `db` is not used in this cell.
# NOTE(review): `conversation` and `raw` are presumably MongoDB collections --
# confirm in apps.mongo_connect.
conn, db, conversation, raw = CONNECT()
try:
    # if you want to make queries to the db, do it here
    # Single-stage aggregation: $sample asks for 10 randomly chosen documents,
    # so re-running this cell generally returns a different sample.
    random_sample = list(conversation.aggregate([ { '$sample': { 'size': 10 } } ]))
    # Distinct values of the `session_event` field as seen through `raw`.
    types_of_events = raw.distinct('session_event')
finally:
    # Close the connection whether or not the queries succeeded.
    conn.close()

# Length of each conversation

When we read a document from MongoDB into Python, we get a dict with the following structure:
SAMPLE

    - id
    - poi: the unique id, based on the MSISDN
    - conversations: a list of dicts sorted by timestamp
    
CONVERSATION

u'content': None
u'from_addr': u'+27762463746'
u'from_addr_type': None
u'group': None
u'helper_metadata': {u'go': 
                            {u'conversation_key': u'5685a593efa1441cbcd5fd2452d6cd4a'
                             u'conversation_type': u'jsbox',
                             u'user_account': u'aa30257efe8644c1a86e8dd63cd63836'}}
u'in_reply_to': None,
u'inbound_push_trigger': True,
u'message_id': u'c97784bbfeff4b638a4137cf2544e280',
u'message_type': u'user_message',
u'message_version': u'20110921'
u'provider': None,
u'routing_metadata': {}
u'session_event': None
u'timestamp': u'2014-05-23 13:36:07.120457'
u'to_addr': u'None'
u'to_addr_type': None
u'transport_metadata': {}
u'transport_name': None
u'transport_type': None    

In [5]:
# Number of messages stored for each sampled document.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
for sample in random_sample:
    print(len(sample['conversations']))

6
35
4
34
14
18
21
6
33
38


# Example Conversation

In [6]:
# Intended to show full message text instead of a truncated preview.
# NOTE(review): -1 is the older pandas spelling for "no width limit"; newer
# releases expect None -- confirm against the installed pandas version.
pd.set_option('display.max_colwidth', -1)
# Display the fourth sampled document as a message table.
get_conversation(random_sample[3])

Unnamed: 0_level_0,content,from_addr,to_addr,transport_type,transport_name,session_event
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2014-04-10 07:15:16.607820,,+27795401251,*120*7692*3#,ussd,truteq_7692_transport,new
2014-04-10 07:15:17.867908,Welcome to Voting is Power! Start by choosing your language:\n1. English\n2. Afrikaans\n3. Zulu,*120*7692*3#,+27795401251,ussd,truteq_7692_transport,
2014-04-10 07:15:23.471401,1,+27795401251,*120*7692*3#,ussd,truteq_7692_transport,resume
2014-04-10 07:15:24.442971,Elections! Does ur vote matter?\n1. YES every vote matters\n2. NO. I'll vote anyway\n3. NO. I'm NOT voting\n4. I'm NOT REGISTRD to vote\n5. I'm 2 YOUNG to vote,*120*7692*3#,+27795401251,ussd,truteq_7692_transport,
2014-04-10 07:15:38.096212,1,+27795401251,*120*7692*3#,ussd,truteq_7692_transport,resume
2014-04-10 07:15:39.680410,Please accept the terms and conditions to get started\n1. Accept & Join\n2. Read t&c\n3. Quit,*120*7692*3#,+27795401251,ussd,truteq_7692_transport,
2014-04-10 07:15:46.898641,1,+27795401251,*120*7692*3#,ussd,truteq_7692_transport,resume
2014-04-10 07:15:48.292653,"Tx 4 joining! We need ur voting ward. Type ur home address & we'll work it out. This is prvt, only ur voting ward will be stored &u will be anonymous",*120*7692*3#,+27795401251,ussd,truteq_7692_transport,
2014-04-10 07:16:47.810747,User timeout,+27795401251,*120*7692*3#,ussd,truteq_7692_transport,close
2014-04-10 07:16:48.720179,Tnx 4 volunteering 2 be a citizen reporter for the '14 elections! Start by answering questions or report election activity! Dial back in to *120*7692*3# begin!,,+27795401251,sms,ambient_go_smpp_transport,


### Extract One Conversation Dataframe

In [7]:
# Message table for the first sampled document; used as the worked example
# for the measures defined in the following cells.
conversation_data_frame = get_conversation(random_sample[0])

# Building Functions

In [8]:
def alpha(conversation_df):
    """Average number of responses per closed session, and the session count.

    A "response" is a row whose ``session_event`` equals ``'resume'``; a
    "session" is counted for every row whose ``session_event`` equals
    ``'close'``.

    Parameters
    ----------
    conversation_df : pandas.DataFrame
        Frame with a ``session_event`` column.

    Returns
    -------
    tuple of (float, float)
        ``(responses / sessions, sessions)``. When there is no ``'close'``
        row the average is undefined and is returned as NaN (with a session
        count of 0.0) instead of raising ZeroDivisionError.
    """
    events = conversation_df['session_event']
    # Summing the boolean masks counts matching rows without building
    # intermediate filtered frames.
    total_number_of_responses = float((events == 'resume').sum())
    number_of_sessions = float((events == 'close').sum())

    if number_of_sessions == 0:
        return float('nan'), number_of_sessions
    return total_number_of_responses / number_of_sessions, number_of_sessions

In [9]:
# (average responses per session, number of sessions) for the example conversation.
alpha(conversation_data_frame)

(1.0, 1.0)

In [10]:
# Keep both alpha() results under descriptive names.
average_response, total_number_of_sessions = alpha(conversation_data_frame)

In [11]:
def _elapsed_whole_seconds(start, finish):
    """Whole seconds from ``start`` to ``finish``, days included."""
    delta = finish - start
    # timedelta.seconds only holds the sub-day remainder (0..86399), so the
    # days component has to be added back for spans of a day or more.
    return delta.days * 86400 + delta.seconds


def get_total_time_in_system(df, time_format=None):
    """Seconds between the first ``'new'`` event and the last row of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame indexed by timestamp strings, with a ``session_event`` column.
        The last row is taken as the final report, so rows are expected to be
        in chronological order.
    time_format : str, optional
        ``strptime`` format of the index values. Defaults to the notebook's
        ``TIME_STAMP_FORMAT``.

    Returns
    -------
    int
        Elapsed whole seconds (microseconds are dropped).

    Raises
    ------
    IndexError
        If ``df`` contains no ``'new'`` event.
    """
    fmt = TIME_STAMP_FORMAT if time_format is None else time_format
    # start is the first instance we observe a new connection
    new_events = df[df['session_event'] == 'new']
    start = datetime.strptime(new_events.index[0], fmt)
    # finish is the very last report of the system
    finish = datetime.strptime(df.index[-1], fmt)
    return _elapsed_whole_seconds(start, finish)


def beta(conversation_df, time_format=None):
    """Return total time interacting and total time in system, in seconds.

    Time interacting is the sum, over sessions, of the whole seconds between
    a ``'new'`` event and the next ``'close'`` event. A ``'new'`` seen while a
    session is still open restarts the clock; a ``'close'`` with no open
    session is ignored.

    Parameters
    ----------
    conversation_df : pandas.DataFrame
        Frame indexed by timestamp strings, with a ``session_event`` column.
    time_format : str, optional
        ``strptime`` format of the index values. Defaults to the notebook's
        ``TIME_STAMP_FORMAT``.

    Returns
    -------
    tuple of (float, int)
        ``(total_time_interacting, total_time_in_system)``.

    Raises
    ------
    IndexError
        If ``conversation_df`` contains no ``'new'`` event.
    """
    fmt = TIME_STAMP_FORMAT if time_format is None else time_format
    total_time_in_system = get_total_time_in_system(conversation_df, fmt)

    total_time_interacting = 0.0
    start_time = None
    in_session = False
    for timestamp, row in conversation_df.iterrows():
        event = row['session_event']
        if event == 'new':
            start_time = datetime.strptime(timestamp, fmt)
            in_session = True
        elif event == 'close' and in_session:
            finish_time = datetime.strptime(timestamp, fmt)
            total_time_interacting += _elapsed_whole_seconds(start_time, finish_time)
            in_session = False

    return total_time_interacting, total_time_in_system

In [12]:
# (total time interacting, total time in system) for the example conversation.
beta(conversation_data_frame)

(119.0, 76364)

# Some summary about our samples

In [13]:
# Per-sample summary: address, message count, alpha() and beta() measures.
# The print(...) call form gives the same output on Python 2 and Python 3.
for sample in random_sample:
    print("address: {}".format(sample['poi']))
    conversation_data_frame = get_conversation(sample)
    print(len(conversation_data_frame))
    print(alpha(conversation_data_frame))
    print(beta(conversation_data_frame))
    # print("") rather than print(): on Python 2 a bare print() would emit "()".
    print("")

address: +27797713859
6
(1.0, 1.0)
(119.0, 76364)

address: +27720724481
35
(2.75, 4.0)
(401.0, 70262)

address: +27712465078
4
(0.0, 1.0)
(17.0, 71230)

address: +27795401251
34
(3.0, 1.0)
(91.0, 23754)

address: +27822573967
14
(5.0, 1.0)
(120.0, 43225)

address: +27826670079
18
(1.0, 2.0)
(208.0, 21981)

address: +27715658749
21
(3.5, 2.0)
(178.0, 41760)

address: +27791770496
6
(1.0, 1.0)
(39.0, 80701)

address: +27760315286
33
(3.0, 1.0)
(119.0, 67211)

address: +27760740794
38
(1.3333333333333333, 3.0)
(172.0, 21150)



Every conversation session starts with `new` in `session_event`. If it is a live session, all inbound messages will have the session event `resume`, followed by `close` when the session times out
or [ends?].



### So we can get two measures

1) average number of responses given per open session

2) total number of sessions

### Other time related measures

1) total time interacting with system (last contact - first contact) [exclude outbound messages that are ignored]

2) total time in system from initiation until system close [fixed date]

3) average response time -- we need to do some work here to figure out what is considered a response 

### Do people surf channels?

If yes, then we need to segment our counts by channel.
That is, +27797251256 (if there is no error) first subscribes to the lottery, then doesn't enroll in the lottery and tries the … [note left unfinished]

### Inductive what have we learned

When an SMS message was sent, if there was no response for some unknown period of time, it appears that we get an inbound "None" message back before the next outbound message is sent.

### Look at Gen's code for cleaning again and test using
27797251256 

Check about free channel after election! 27797251256 is an example 