In [10]:
import pandas as pd
import pandas as pd
from datetime import datetime
from pymongo import MongoClient
import pickle
from datetime import datetime, timedelta
from pymongo.errors import ConnectionFailure
import pickle
from concurrent.futures import ProcessPoolExecutor

This notebook shows the issue Etienne is having when trying to find which users switched between the 3 lottery channels on USSD. The lottery channels have the poi `*120*7692*2#`, `*120*7692*3#`, and `*120*4729#`. The problem is that Etienne is not able to isolate any user who messaged more than one of these channels.

This notebook was originally made to expose the problem to Aaron. The code contained here is not production code and is copypasted from the final_feature_creation.py script. 

In [29]:
def mongo_connect():
    """
    Open a client to the local MongoDB server and return handles to the
    collections used in this notebook. (This is Ryan's connection function.)

    :return: tuple ``(conn, db, conversations, raw, features)`` where ``conn`` is
        the MongoClient, ``db`` is its 'test' database, and the last three are the
        ``cleaned_conversations``, ``conversations_collection`` and
        ``features_collection`` collections of that database.
    :raises ConnectionFailure: if creating the client fails.
    """
    # Try to connect to MongoDB; propagate the failure if not successful.
    try:
        conn = MongoClient('localhost', 27017)
    except ConnectionFailure:
        print("Could not connect to MongoDB")
        # Re-raise: continuing would hit ``conn`` while it is unbound and mask
        # the real cause behind an UnboundLocalError.
        raise
    # NOTE(review): depending on the pymongo version the constructor may connect
    # lazily, in which case a dead server only surfaces on the first query -- confirm.

    name = 'test'
    db = conn[name]
    conversations = db.cleaned_conversations
    raw = db.conversations_collection
    features = db.features_collection

    return conn, db, conversations, raw, features



def get_conversation(poi):
    """
    Load every document of the cleaned_conversations collection whose ``poi``
    field equals ``poi`` and return them as a DataFrame.

    :param poi: value matched against the ``poi`` field of each document
    :return: DataFrame indexed by the documents' ``timestamp`` field (only the
        first row is kept when a timestamp repeats), restricted to the columns
        listed in ``header``. When no document matches, an empty DataFrame with
        those columns is returned.
    """
    conn, db, conversations, raw, features = mongo_connect()
    try:
        # find() hands back a lazy cursor, so it has to be drained while the
        # client is still open -- not after conn.close().
        records = list(conversations.find({'poi': poi}))
    finally:
        conn.close()

    header = ['poi', 'content', 'from_addr', 'to_addr', 'transport_type', 'transport_name', 'session_event', '_id',
              'helper_metadata']
    if not records:
        # Without documents there is no 'timestamp' column to index on.
        return pd.DataFrame(columns=header)

    df = pd.DataFrame(records).set_index(u'timestamp')
    # we drop duplicates of timestamp which is the index
    df = df[~df.index.duplicated(keep='first')]
    return df[header]



# strptime layout of the timestamp strings used as the conversation index.
TIME_STAMP_FORMAT = "%Y-%m-%d %H:%M:%S.%f"


def index_to_date(conversation_df_index):
    """
    Parse every timestamp string of a conversation index into a datetime.

    :param conversation_df_index: iterable of timestamp strings laid out as
        TIME_STAMP_FORMAT
    :return: list of datetime objects, in the same order as the input
    """
    parsed_stamps = []
    for raw_stamp in conversation_df_index:
        parsed_stamps.append(datetime.strptime(raw_stamp, TIME_STAMP_FORMAT))
    return parsed_stamps





This is the function that is supposed to find which users have switched. This function takes the "to_addr" field in the conversation and subsets for the messages that are sent to one of the 3 relevant channels. It then looks for some change in channel from one message to another. As far as I can tell this function should be able to detect switches as it is. 

In [18]:

def channels(conversation_df):
    """
    Looks for whether users moved between the free lottery channels. Should not
    apply to mxit users.

    Only rows whose ``to_addr`` is one of the three lottery channels are
    considered; rows addressed anywhere else are dropped before consecutive
    rows are compared. Rows are compared in the order they appear in
    ``conversation_df``, so the caller must pass the frame sorted
    chronologically.

    :param conversation_df: DataFrame with a ``to_addr`` column
    :return: tuple of six booleans
        ``(ch1_to_ch2, ch2_to_ch1, ch2_to_ch3, ch3_to_ch2, ch1_to_ch3, ch3_to_ch1)``;
        each is True when at least one such switch occurs between two
        consecutive lottery messages.
    """
    ch1 = '*120*7692*2#'
    ch2 = '*120*7692*3#'
    ch3 = '*120*4729#'

    lottery_rows = conversation_df[conversation_df['to_addr'].isin([ch1, ch2, ch3])]
    channel_list = lottery_rows['to_addr'].tolist()

    # Pair every message with its successor, starting from the very first
    # message, so that the switch between message 0 and message 1 is not lost.
    transitions = set(zip(channel_list, channel_list[1:]))

    ch1_to_ch2 = (ch1, ch2) in transitions
    ch2_to_ch1 = (ch2, ch1) in transitions
    ch2_to_ch3 = (ch2, ch3) in transitions
    ch3_to_ch2 = (ch3, ch2) in transitions
    ch1_to_ch3 = (ch1, ch3) in transitions
    ch3_to_ch1 = (ch3, ch1) in transitions

    return ch1_to_ch2, ch2_to_ch1, ch2_to_ch3, ch3_to_ch2, ch1_to_ch3, ch3_to_ch1


The main problem is that none of the conversations in our current data actually have more than 1 of the 3 relevant channels in their "to_addr" field. Here I pull a conversation that is marked as switching channels in Ryan's spreadsheet. The get_conversation function returns all messages in our database that are associated with a given poi.

In [19]:
# Pull every stored message for a poi that Ryan's spreadsheet marks as a switcher.
example_conversation = get_conversation('+27727885259')
# Parse the string timestamps so that sorting the index is chronological.
example_conversation.index = index_to_date(example_conversation.index)
example_conversation = example_conversation.sort_index()

print(example_conversation['to_addr'])


2014-05-15 14:36:15.938703    +27727885259
2014-05-21 13:02:00.271478    +27727885259
2014-06-22 19:38:00.461363            None
Name: to_addr, dtype: object


One thing that I have noticed is that a common pattern in these messages is that the ussd transport type is not present. Typically there is only the sms transport type. It might be that there are other messages on ussd by that user which are found under a different poi. 

In [22]:
# Show the transport used by each message of the same conversation.
print(example_conversation['transport_type'])

2014-05-15 14:36:15.938703    sms
2014-05-21 13:02:00.271478    sms
2014-06-22 19:38:00.461363    sms
Name: transport_type, dtype: object


This is not always the case however. Here is another example of a conversation that is marked as switching channels in Ryan's spreadsheet but which does not have messages sent to more than 1 of the 3 lottery channels:

In [26]:
# Second poi marked as a switcher in Ryan's spreadsheet.
other_conversation = get_conversation('+27823909270')
# Parse the string timestamps so that sorting the index is chronological.
other_conversation.index = index_to_date(other_conversation.index)
other_conversation = other_conversation.sort_index()

print(other_conversation['to_addr'])
print(other_conversation['transport_type'])


2014-04-21 17:07:19.008762      *120*7692#
2014-04-21 17:07:20.056518    +27823909270
2014-04-21 17:07:29.371202      *120*7692#
2014-04-21 17:07:30.346099    +27823909270
2014-04-21 17:07:38.841444      *120*7692#
2014-04-21 17:07:40.070932    +27823909270
2014-04-21 17:07:48.920139      *120*7692#
2014-04-21 17:07:50.536028    +27823909270
2014-04-21 17:08:12.031299      *120*7692#
2014-04-21 17:08:13.553137    +27823909270
2014-04-21 17:08:20.022931      *120*7692#
2014-04-21 17:08:21.409536    +27823909270
2014-04-21 17:08:33.596968      *120*7692#
Name: to_addr, dtype: object
2014-04-21 17:07:19.008762    ussd
2014-04-21 17:07:20.056518    ussd
2014-04-21 17:07:29.371202    ussd
2014-04-21 17:07:30.346099    ussd
2014-04-21 17:07:38.841444    ussd
2014-04-21 17:07:40.070932    ussd
2014-04-21 17:07:48.920139    ussd
2014-04-21 17:07:50.536028    ussd
2014-04-21 17:08:12.031299    ussd
2014-04-21 17:08:13.553137    ussd
2014-04-21 17:08:20.022931    ussd
2014-04-21 17:08:21.409536 

I also explored the `helper_metadata` field to see whether it could help retrieve the messages that went to other users, but I could not find anything of interest in it.

In [28]:
# Inspect the metadata attached to the first message of the conversation.
print(other_conversation['helper_metadata'].iloc[0])

{u'go': {u'conversation_type': u'jsbox', u'is_paid': True, u'user_account': u'aa30257efe8644c1a86e8dd63cd63836', u'conversation_key': u'50cddc12bb514ee3a389c6702dcffbdb'}, u'tag': {u'tag': [u'truteq_7692_base', u'*120*7692#']}, u'truteq': {u'genfields': {}}, u'optout': {u'optout': False}}


The user_account and conversation_key fields seem to be shared across hundreds of users. None of the other fields seem like they would be helpful. Finally I looked up conversations in the original data but they also do not indicate that the user sent messages to multiple channels. 