In [1]:
import os
import pandas as pd
import pymongo
from sshtunnel import SSHTunnelForwarder
from datetime import datetime, timedelta
from pymongo import MongoClient

In [2]:
TIME_STAMP_FORMAT = "%Y-%m-%d %H:%M:%S.%f"

In [3]:
def mongo_connect():
    # Try to connect to MongoDB,  exit if not successful.
    try:
        conn = MongoClient('localhost', 27018)
        # print "Connected successfully to MongoDB, instance created!"

    except(pymongo.errors.ConnectionFailure, e):
        print("Could not connect to MongoDB: {}".format(e))

    name = 'SA_Voting_Data'
    db = conn[name]  
    conversations = db.conversations_collection
    raw = db.raw_collection
    features = db.features_collection

    return conn, db, conversations, raw, features

In [4]:
def get_addresses():
    conn, db, conversations, raw, features = mongo_connect()
    try:
        c = conversations.distinct('poi')
    finally:
        conn.close()
    
    return list(c)

def get_conversation(poi):
    conn, db, conversations, raw, features = mongo_connect()
    try:
        c = conversations.find({'poi': poi})
    finally:
        conn.close()
    df = pd.DataFrame(list(c))
    header = ['poi','content','from_addr','to_addr','transport_type','transport_name','session_event','_id']
    df = df.set_index('timestamp')
    
    # we drop duplicates of timestamp which is the index
    df = df[~df.index.duplicated(keep='first')]
    return df[header]

In [5]:
addresses = get_addresses()

In [6]:
conn, db, conversations, raw, features = mongo_connect()
try:
    c = conversations.find({'poi': {'$in': addresses[0:100]}})
    df = pd.DataFrame(list(c))
except:
    conn.close()

In [7]:
from features.utils import make_payload, setup_conversation

In [8]:
df.head()

Unnamed: 0,_id,content,from_addr,from_addr_type,group,helper_metadata,in_reply_to,inbound_push_trigger,message_id,message_type,...,poi,provider,routing_metadata,session_event,timestamp,to_addr,to_addr_type,transport_metadata,transport_name,transport_type
0,595d3fb841e9c43b197c9f41,,+27748024688,,,"{'go': {'conversation_type': 'jsbox', 'is_paid...",,,bd1e8b1ca02d402f9d2540086911b5f5,user_message,...,27748024688,,{'go_hops': [[['TRANSPORT_TAG:truteq_7692:*120...,new,2014-04-16 05:14:37.642580,*120*7692*2#,,{},truteq_7692_transport,ussd
1,595d3fb941e9c43b197c9f42,Welcome to Voting is Power! Start by choosing ...,*120*7692*2#,,,"{'go': {'conversation_type': 'jsbox', 'is_paid...",bd1e8b1ca02d402f9d2540086911b5f5,,d703b551e90a40dc973f26ffa7686514,user_message,...,27748024688,,{'go_hops': [[['CONVERSATION:jsbox:12b7d1093b0...,,2014-04-16 05:14:40.360669,+27748024688,,{},truteq_7692_transport,ussd
2,595d3fb941e9c43b197c9f43,3,+27748024688,,,"{'go': {'conversation_type': 'jsbox', 'is_paid...",,,96749ad26d8b437e9e4f7a106152ea34,user_message,...,27748024688,,{'go_hops': [[['TRANSPORT_TAG:truteq_7692:*120...,resume,2014-04-16 05:14:55.293199,*120*7692*2#,,{},truteq_7692_transport,ussd
3,595d3fb941e9c43b197c9f44,Yiskhathi sokhetho! Liyasiza yini ivoti lakho?...,*120*7692*2#,,,"{'go': {'conversation_type': 'jsbox', 'is_paid...",96749ad26d8b437e9e4f7a106152ea34,,66f7b7f05cd04a7c8f676f89db10d9ed,user_message,...,27748024688,,{'go_hops': [[['CONVERSATION:jsbox:12b7d1093b0...,,2014-04-16 05:14:56.778567,+27748024688,,{},truteq_7692_transport,ussd
4,595d3fb941e9c43b197c9f45,1,+27748024688,,,"{'go': {'conversation_type': 'jsbox', 'is_paid...",,,9b33a280846344819234b5711c5e5d3e,user_message,...,27748024688,,{'go_hops': [[['TRANSPORT_TAG:truteq_7692:*120...,resume,2014-04-16 05:15:13.950075,*120*7692*2#,,{},truteq_7692_transport,ussd


 GET ONE POI and SETUP CONVERSATION

In [9]:
example_df = setup_conversation(df[df['poi'] == addresses[10]])
example_df.head()

Unnamed: 0_level_0,poi,content,from_addr,to_addr,transport_type,transport_name,session_event,_id
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2014-04-10 06:58:15.551037,27766763040,,+27766763040,*120*4729#,ussd,truteq_4729_transport,new,595d6b3f833f86645bd049f4
2014-04-10 06:58:16.756844,27766763040,Welcome to Voting is Power! Start by choosing ...,*120*4729#,+27766763040,ussd,truteq_4729_transport,,595d6b3f833f86645bd049f5
2014-04-10 06:58:38.750785,27766763040,1,+27766763040,*120*4729#,ussd,truteq_4729_transport,resume,595d6b3f833f86645bd049f6
2014-04-10 06:58:39.620157,27766763040,Elections! Does ur vote matter?\n1. YES every ...,*120*4729#,+27766763040,ussd,truteq_4729_transport,,595d6b3f833f86645bd049f7
2014-04-10 06:59:10.644552,27766763040,1,+27766763040,*120*4729#,ussd,truteq_4729_transport,resume,595d6b3f833f86645bd049f8


# feature time to response for each push and reminder messages #4
# feature: average time to answer a question #3

In [10]:
# %load features/utils.py
import pandas as pd
from datetime import datetime
from features.settings import TIME_STAMP_FORMAT
import pickle


def get_conversation(conversations, poi):
    c = conversations.find({'poi': poi})
    df = pd.DataFrame(list(c))
    header = ['poi', 'content', 'from_addr', 'to_addr', 'transport_type', 'transport_name', 'session_event', '_id']
    df = df.set_index('timestamp')
    # we drop duplicates of timestamp which is the index
    df = df[~df.index.duplicated(keep='first')]
    return df[header]


def count_mxit_responses(conversation_df):
    try:
        count = len(conversation_df[(conversation_df['transport_type'] == 'mxit') & (conversation_df['content'] != None) & (conversation_df['to_addr'] == 'vipvoice2014')])
    except:
        count = 0
    return count


def number_of_mxit_sessions(conversation_df):
    if 'mxit' in conversation_df['transport_type'].unique():
        return 1
    else:
        return 0


def alpha(conversation_df):
    """average number of responses given per open session and total number of sessions"""
    total_number_of_responses = len(conversation_df[conversation_df['session_event']=='resume'])
    number_of_sessions_sms = len(conversation_df[conversation_df['session_event']=='close'])
    total_number_of_responses += count_mxit_responses(conversation_df)
    number_of_sessions = number_of_sessions_sms + number_of_mxit_sessions(conversation_df)
    if float(number_of_sessions) != 0:
        return float(total_number_of_responses)/float(number_of_sessions), float(number_of_sessions)
    else:
        return 0, 0


def get_total_time_in_system(df):
    try:
        # start is the first instance we observe a new_connection
        start = datetime.strptime(df[df['session_event'] == 'new'].index[0], TIME_STAMP_FORMAT)
        # finish is the very last report of the system
        finish = datetime.strptime(df.index[-1], TIME_STAMP_FORMAT)
        delta = finish - start
        return delta.seconds
    except:
        return 0


def beta(conversation_df):
    """Returns total time interacting and total time in system """
    total_time_in_system = get_total_time_in_system(conversation_df)
    total_time_interacting = 0.0
    start_time = 0.0
    finish_time = 0.0
    in_session = False
    for rows in conversation_df.iterrows():

        if rows[1]['session_event'] == 'new':
            start_time = datetime.strptime(rows[0], TIME_STAMP_FORMAT)
            in_session = True

        if rows[1]['session_event'] == 'close' and in_session is True:
            finish_time = datetime.strptime(rows[0], TIME_STAMP_FORMAT)
            delta = finish_time - start_time
            time_interacting_in_session = delta.seconds
            total_time_interacting += delta.seconds
            in_session = False

    return total_time_interacting, total_time_in_system


def channels(conversation_df):
    channel_list = conversation_df[(conversation_df['to_addr'] == '*120*7692*2#') | (conversation_df['to_addr'] == '*120*7692*3#') | (conversation_df['to_addr'] == '*120*4729#')]['to_addr'].tolist()
    previous = None
    ch1_to_ch2 = False
    ch2_to_ch1 = False
    ch2_to_ch3 = False
    ch3_to_ch2 = False
    ch1_to_ch3 = False
    ch3_to_ch1 = False
    for x in channel_list[1:]:
        next_one = x
        if next_one == previous:
            previous = next_one
            pass
        else:
            # ch1 = '*120*7692*2#' ch2 = '*120*7692*3#' ch3 = '*120*4729#'
            if (previous == '*120*7692*2#' and next_one == '*120*7692*3#'):
                ch1_to_ch2 = True
            if(next_one == '*120*7692*2#' and previous == '*120*7692*3#'):
                ch2_to_ch1 = True
            if (previous == '*120*7692*3#' and next_one == '*120*4729#'):
                ch2_to_ch3 = True
            if (next_one == '*120*7692*3#' and previous == '*120*4729#'):
                ch3_to_ch2 = True
            if (previous == '*120*7692*2#' and next_one == '*120*4729#'):
                ch3_to_ch1 = True
            if (next_one == '*120*7692*2#' and previous == '*120*4729#'):
                ch1_to_ch3 = True
            previous = next_one

    return ch1_to_ch2, ch2_to_ch1, ch2_to_ch3, ch3_to_ch2, ch1_to_ch3, ch3_to_ch1


def get_average_response_time(conversation_df):
    questions = conversation_df[conversation_df['from_addr'] != conversation_df['poi']]
    questions_loc = [conversation_df.index.get_loc(loc) for loc in questions.index.tolist()]
    potential_answers_loc = [conversation_df.index.get_loc(loc) + 1 for loc in questions.index.tolist()]
    cleaned_questions = list(set(questions_loc) - set(potential_answers_loc))
    cleaned_answers = [q + 1 for q in cleaned_questions]

    if len(conversation_df) - 1 in cleaned_questions:
        cleaned_questions.remove(len(conversation_df) - 1)

    cleaned_answers = [q + 1 for q in cleaned_questions]
    times = []

    for q, a in zip(cleaned_questions, cleaned_answers):
        r1 = conversation_df.iloc[q]
        r2 = conversation_df.iloc[a]
        r1_date = datetime.strptime(r1.name, TIME_STAMP_FORMAT)
        r2_date = datetime.strptime(r2.name, TIME_STAMP_FORMAT)
        times.append([(r2_date - r1_date).total_seconds(), r1.content, r2.content])

    df = pd.DataFrame(times)
    if len(times) == 0:
        return 0, df
    else:

        df.columns = ['times', 'question', 'answer']
        df.times.mean()
        return df.times.mean(), df


def addresses_and_types(conversation_df):
    addresses = conversation_df['to_addr'].unique().tolist()
    invalid_to_addresses = [conversation_df.poi.iloc[0], '', ' ', 'None']
    for invalid in invalid_to_addresses:
        try:
            addresses.remove(invalid)
        except:
            pass
    return addresses, conversation_df['transport_type'].unique().tolist()


def make_payload(conversation_df):
    address = conversation_df['poi'][0]
    average_response, total_number_of_sessions = alpha(conversation_df)
    total_time_interacting, total_time_in_system = beta(conversation_df)
    ch1_to_ch2, ch2_to_ch1, ch2_to_ch3, ch3_to_ch2, ch1_to_ch3, ch3_to_ch1 = channels(conversation_df)
    average_response_time, response_df = get_average_response_time(conversation_df)
    addresses, message_types = addresses_and_types(conversation_df)

    payload = {
        'poi': address,
        'average_response_count': average_response,
        'total_number_of_sessions': total_number_of_sessions,
        'total_time_interacting': total_time_interacting,
        'total_time_in_system': total_time_in_system,
        'ch1_to_ch2': ch1_to_ch2,
        'ch2_to_ch1': ch2_to_ch1,
        'ch2_to_ch3': ch2_to_ch3,
        'ch3_to_ch2': ch3_to_ch2,
        'ch1_to_ch3': ch1_to_ch3,
        'ch3_to_ch1': ch3_to_ch1,
        'to_address_values': addresses,
        'transport_types': message_types,
        'average_response_time': average_response_time,
        'response_data': response_df.to_dict(orient='records'),
    }
    pickle.dump(payload, open("./features/data/out_{}.pkl".format(address), "wb"))
    return payload


def setup_conversation(conversation_df):
    header = ['poi', 'content', 'from_addr', 'to_addr', 'transport_type', 'transport_name', 'session_event', '_id']
    conversation_df = conversation_df.set_index('timestamp')
    # we drop duplicates of timestamp which is the index
    df = conversation_df[~conversation_df.index.duplicated(keep='first')]
    return df[header]


average number of responses given per open session total number of sessions

In [11]:
average_response, total_number_of_sessions = alpha(example_df)
(average_response, total_number_of_sessions)

(1.3333333333333333, 6.0)

In [12]:
beta(example_df)

(518.0, 85076)

# surfing : need to make sure it ignores the endline #2

In [13]:
channels(example_df)

(False, False, False, False, False, False)

GET all channels which the person dialed in who they talked to?

In [14]:
set(example_df['to_addr'].values)

{'*120*4729#', '+27766763040', 'None', None, '*120*4729*1#', '*120*7692#'}

Get all transport types

In [15]:
set(example_df['transport_name'].values)

{'truteq_7692_transport',
 None,
 'ambient_go_smpp_transport',
 'truteq_4729_transport'}

In [16]:
set(example_df['transport_type'].values)

{'ussd', None, 'sms'}

#### CHANNELS

\*120\*7692\*  residual short code

\*120\*4729\*1# E-day Monitoring

\*120\*4729\*3# Endline

\*120\*7692\*1# VIP live magazine

\*120\*7692\*2# Main Channel Control

\*120\*7692\*3# Main Channel Lottery

\*120\*4729# Main Channel Subsidary 

\*120\*4279# OR have your voice heard on vip

# RESPONSE TIME

In [17]:
get_average_response_time(example_df)

(105650.55264869564,
            times                                           question  \
 0   2.199394e+01  Welcome to Voting is Power! Start by choosing ...   
 1   3.102439e+01  Elections! Does ur vote matter?\n1. YES every ...   
 2   3.414702e+01  Please accept the terms and conditions to get ...   
 3   2.830700e+01  Tx 4 joining! We need ur voting ward. Type ur ...   
 4   1.391170e+06  Tnx 4 volunteering 2 be a citizen reporter for...   
 5   3.454443e+01  Welcome to VIP!\n1. Answer & win!\n2. VIP Quiz...   
 6   9.408301e+00                        I am...\n1. Male\n2. Female   
 7   2.477766e+01  How old are you?\n1. u14\n2. 15-19\n3. 20-29\n...   
 8   3.007038e+01  Did you vote in the 2009 election?\n1. Yes\n2....   
 9   2.952224e+01  Did you vote in the 2009 election?\n1. Yes\n2....   
 10  1.264236e+01  I am...\n1. Black African\n2. Coloured\n3. Ind...   
 11  2.917772e+01  Thank you for telling VIP a bit more about you...   
 12  1.652889e+04  VIP:Voice wants 2 know: 

In [None]:
# %%time
# from concurrent.futures import ProcessPoolExecutor

# e = ProcessPoolExecutor()
# addresses = get_addresses()
# results = list(e.map(make_features, addresses[0:100]))

In [None]:
# %%time
# from concurrent.futures import ThreadPoolExecutor
# e = ThreadPoolExecutor(36)

# futures = []
# for address in addresses[0:100]:
#     futures.append(e.submit(make_features, address))

# r = [f.result() for f in futures]

In [None]:
# %%time
# from concurrent.futures import ThreadPoolExecutor
# e = ThreadPoolExecutor(16)

# futures = []
# for address in addresses[0:100]:
#     futures.append(e.submit(make_features, address))

# r = [f.result() for f in futures]

In [None]:
# %%time
# from concurrent.futures import ThreadPoolExecutor
# e = ThreadPoolExecutor(8)

# futures = []
# for address in addresses[0:100]:
#     futures.append(e.submit(make_features, address))

# r = [f.result() for f in futures]