In [11]:
import random
from datetime import datetime, timedelta
import io
import numpy as np
import heapq
import json
import operator
import pandas as pd
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from stop_words import get_stop_words
import json

import numpy.random as nprnd

stop_words = get_stop_words('english')


In [18]:
path_to_data = '../data/'

##########################
# load files #                           
##########################

training = pd.read_csv(path_to_data + 'training_set.csv', sep=',', header=0)
#training_info = pd.read_csv(path_to_data + 'training_info.csv', sep=',', header=0)
training_info = pd.read_csv(path_to_data+"training_info2.csv",sep=',', header=0, index_col=0)
test = pd.read_csv(path_to_data + 'test_set.csv', sep=',', header=0)
#test_info = pd.read_csv(path_to_data + 'test_info.csv', sep=',', header=0)
test_info = pd.read_csv(path_to_data+"test_info2.csv",sep=',', header=0, index_col=0)

with io.open('../data/sent_to.json') as json_data:
    sent_to = json.load(json_data)
    
with io.open('../data/received_from.json') as json_data:
    received_from = json.load(json_data)

In [19]:
# Correct dates and put datetime format
# We do that because we noticed test_set is only composed of email posterior to the ones of train_set. 
# Datetime format allows to simulate posteriority in our train/test split
from datetime import datetime

for row in training_info.sort(['date']).iterrows():
    date = row[1]['date']
    if date[:3] == '000':
        date = '2' + date[1:]
        
    training_info.loc[row[0], 'date'] = datetime.strptime(date, '%Y-%m-%d %H:%M:%S')

for row in test_info.sort(['date']).iterrows():
    date = row[1]['date']
        
    test_info.loc[row[0], 'date'] = datetime.strptime(date, '%Y-%m-%d %H:%M:%S')



### Functions

In [4]:
def most_similar_sklearn(array_embedding_sparse, mail_tfidf, n):
    
    similarities = cosine_similarity(array_embedding_sparse, mail_tfidf)
    if int(round(sorted(similarities[:,0], reverse=True)[0])) ==1:
        closest_ids = similarities[:,0].argsort()[::-1][1:]
    else:
        closest_ids = similarities[:,0].argsort()[::-1]
    
    return closest_ids[:n], similarities

def get_sender(query_mid, training):
    for row in training.iterrows():
        mids = row[1]['mids'].split()
        for mid in mids:
            if int(mid) == query_mid:
                sender = row[1]['sender']
                break
    return sender


def get_10_recipients(closest_ids_per_sender, training_info, similarities):
    dic_of_recipients = {}
    #weight = len(closest_ids_per_sender)+1
    for idx in closest_ids_per_sender:
        recipients = training_info.loc[idx,'recipients'].split()
        for recipient in recipients:
            if '@' in recipient:
                dic_of_recipients[recipient] = dic_of_recipients.get(recipient, 0) + similarities[idx][0]

    return dic_of_recipients

def get_recency_features(X_train_info_sender, mail_date, n_recency_features):    
    dic_recency = {}
    df_last_sent_emails = X_train_info_sender[X_train_info_sender.date<= mail_date].sort_values(by = 'date', ascending = False)[:n_recency_features]
    for row in df_last_sent_emails.iterrows():
        recipients = row[1]['recipients'].split()
        for recipient in recipients:
            if '@' in recipient:
                dic_recency[recipient] = dic_recency.get(recipient, 0) + 1
    norm = sum(dic_recency.values())
    for k,v in dic_recency.iteritems():
        dic_recency[k] = float(v)/norm
    
    return dic_recency

def mean_ap(suggested_10_recipients, ground_truth):
    MAP = 0
    correct_guess = 0
    for i, suggestion in enumerate(suggested_10_recipients):
        if suggestion in ground_truth:
            correct_guess +=1
            MAP += float(correct_guess)/(i+1)
    MAP = float(MAP)/min(10, len(ground_truth))
    return MAP

## Create Features

In [104]:
submission = True
training_info = training_info.sort_values(by='date')

if submission:
    # submission procedure
    X_train_info = training_info
    X_test_info = test_info
    
else:
    # test procedure
    split_date=datetime(2001, 8, 25)
    X_train_info = training_info[training_info.date <= split_date]
    #X_test_info = training_info[training_info.date > split_date]
    
    #Randomize selection of test set:
    X_test_info = training_info[training_info.date > split_date]
    mask = nprnd.choice(range(X_test_info.shape[0]), size=1000, replace=False)
    X_test_info.index = range(X_test_info.shape[0])
    X_test_info = X_test_info[X_test_info.index.isin(mask)]

    X_train_info = training_info

In [105]:
if submission:
    tfidf = TfidfVectorizer(stop_words = stop_words)
    array_embedding_sparse = tfidf.fit_transform(np.concatenate((X_train_info['body'].values,X_test_info['body'].values)))
    array_embedding_sparse = array_embedding_sparse[:X_train_info.shape[0]]
else:
    #With porter stemming:
    #tfidf = TfidfVectorizer(tokenizer= tokenize, stop_words = stop_words)
    #Without stemming:
    tfidf = TfidfVectorizer(stop_words = stop_words)
    array_embedding_sparse = tfidf.fit_transform(X_train_info['body'].values)

## Compute New features

In [253]:
all_mean_ap = []
all_ground_truth = []
all_suggestions = []
results = pd.DataFrame(columns=['recipients'])
results.index.name = 'mid'

index = 0
new_features_df = pd.DataFrame(columns=['recipients', 'KNNScore','NSF', 'NRF','recency', 'label'])
new_features_all = np.zeros((0,4))
labels_all = np.zeros((0,1))
# number of closest neighbors to collect recipients from:
n = 30
n_recency_features = 50

#re-arrange train index
X_train_info.index = range(X_train_info.shape[0])

t_all = datetime.now()

count = 1
query_id = 19516
t_100 = datetime.now()
for query_id in X_train_info.index.values:
    
    
    count+=1
    if count%100==0:
        print count
        print datetime.now()-t_100
        t_100 = datetime.now()

    # STEP 1: preliminary assignment
    #t1 = datetime.now()
    mail = X_train_info['body'][query_id]
    mail_date = X_train_info['date'][query_id]
    query_mid = X_train_info['mid'][query_id]
    ground_truth = X_train_info['recipients'][query_id].split()

    mail_tfidf = tfidf.transform([mail])
    #t_step1 = datetime.now()-t1
    #print "STEP 1 took ", t_step1
    # END OF STEP 1

    # STEP 2: get sender
    #t2 = datetime.now()
    sender = X_train_info[X_train_info.mid == query_mid]['sender'].values[0]
    X_train_info_sender = X_train_info[X_train_info.sender == sender]
    index_sender = X_train_info_sender.index.values
    X_train_info_sender.index = range(X_train_info_sender.shape[0])
    array_embedding_sparse_sender = array_embedding_sparse[index_sender]
    #t_step2 = datetime.now() - t2
    #print "STEP 2 took ", t_step2
    # END OF STEP 2 --- > divided by 100

    # STEP 3: Rank mails by cosine similarity with searched mail
    #t3 = datetime.now()
    closest_ids_per_sender, similarities = most_similar_sklearn(array_embedding_sparse_sender, mail_tfidf, n)
    #t_step3 = datetime.now() - t3
    #print "STEP 3 took ", t_step3
    # END OF STEP 3

    # STEP 5: compute recency features
    #t5 = datetime.now()
    dic_recency = get_recency_features(X_train_info_sender, mail_date, n_recency_features)
    #t_step5 = datetime.now() - t5
    #print "STEP 5 took ", t_step5
    # END OF STEP 5

    # STEP 7: Create dictionnary of all recipient for the 30 most similar emails, and get frequency
    #t7 = datetime.now()
    dic_of_recipients = get_10_recipients(closest_ids_per_sender, X_train_info_sender, similarities)
    #t_step7 = datetime.now() - t7
    #print "STEP 7 took ", t_step7
    # END OF STEP 7

    # STEP 8: Create new features
    #t8 = datetime.now()
    new_features_per_mail = np.zeros((len(dic_of_recipients), 4))
    labels_per_mail = np.zeros((len(dic_of_recipients), 1))
    index = 0
    for k,v in dic_of_recipients.iteritems():
        KNNScore = v
        NSF = sent_to[sender][k]
        NRF = 0
        if sender in received_from.keys():
            NRF = received_from[sender].get(k, 0)
        
        if k in ground_truth:
            labels_per_mail[index, :] = 1
            
        recency = 0
        if k in dic_recency.keys():
            recency = dic_recency[k]
        
        #new_features_df.loc[index] = [k, KNNScore, NSF, NRF, recency, label]
        new_features_per_mail[index, :] = [KNNScore, NSF, NRF, recency]
        index +=1
    
    new_features_all = np.concatenate((new_features_all, new_features_per_mail))
    labels_all = np.concatenate((labels_all, labels_per_mail))

    #t_step8 = datetime.now() - t8
    #print "STEP 8 took ", t_step8
    # END OF STEP 8

print "total took:", datetime.now()-t_all

100
0:00:03.943789
200
0:00:04.043890
300
0:00:04.034041
400
0:00:04.302646
500
0:00:04.049652
600
0:00:04.111524
700
0:00:04.095455
800
0:00:03.935326
900
0:00:03.561163
1000
0:00:04.306302
1100
0:00:03.499020
1200
0:00:03.818536
1300
0:00:03.353365
1400
0:00:03.832335
1500
0:00:03.617624
1600
0:00:03.593555
1700
0:00:03.835898
1800
0:00:03.613794
1900
0:00:03.654243
2000
0:00:03.715972
2100
0:00:04.153084
2200
0:00:04.057123
2300
0:00:04.004729
2400
0:00:04.083353
2500
0:00:04.008825
2600
0:00:03.898112
2700
0:00:04.266798
2800
0:00:03.930580
2900
0:00:03.762277
3000
0:00:03.767075
3100
0:00:03.996096
3200
0:00:03.983355
3300
0:00:03.824236
3400
0:00:04.247085
3500
0:00:04.063782
3600
0:00:04.413951
3700
0:00:04.292282
3800
0:00:03.947952
3900
0:00:04.416478
4000
0:00:04.681308
4100
0:00:03.777606
4200
0:00:03.690493
4300
0:00:04.021080
4400
0:00:04.478510
4500
0:00:04.468931
4600
0:00:04.566133
4700
0:00:03.935251
4800
0:00:03.838850
4900
0:00:03.626482
5000
0:00:03.844605
5100
0:00

In [255]:
t1 = datetime.now()
np.save('../data/new_features1', new_features_all)
np.save('../data/labels1', labels_all)
print datetime.now() - t1

0:00:00.120710


In [33]:
new_features_all = np.load('../data/new_features1.npy')
labels_all = np.ravel(np.load('../data/labels1.npy'))

## Train Algorithm

In [106]:
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier

In [101]:
#SVM = LinearSVC(dual=False)
#SVM.fit(new_features_all, labels_all)

# C parameter: smaller values mean bigger regularization
#LR = LogisticRegression(C=0.01)
#LR.fit(new_features_all, labels_all)

ABC = AdaBoostClassifier()
ABC.fit(new_features_all, labels_all)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)

## Test

Create test features

In [107]:
n = 30
n_recency_features = 50
#re-arrange train index
X_train_info.index = range(X_train_info.shape[0])
t_all = datetime.now()
t_100 = datetime.now()
results = pd.DataFrame(columns=['recipients'])
results.index.name = 'mid'
all_mean_ap = []
all_ground_truth = []
all_suggestions = []

count=0
for query_id in X_test_info.index.values:

    count+=1
    if count%100==0:
        print count
        print datetime.now()-t_100
        t_100 = datetime.now()

    mail = X_test_info['body'][query_id]
    mail_date = X_test_info['date'][query_id]
    query_mid = X_test_info['mid'][query_id]
    
    mail_tfidf = tfidf.transform([mail])
    
    sender = X_test_info[X_test_info.mid == query_mid]['sender'].values[0]
    X_train_info_sender = X_train_info[X_train_info.sender == sender]
    index_sender = X_train_info_sender.index.values
    X_train_info_sender.index = range(X_train_info_sender.shape[0])
    array_embedding_sparse_sender = array_embedding_sparse[index_sender]
    
    closest_ids_per_sender, similarities = most_similar_sklearn(array_embedding_sparse_sender, mail_tfidf, n)
    
    dic_of_recipients = get_10_recipients(closest_ids_per_sender, X_train_info_sender, similarities)
    
    dic_recency = get_recency_features(X_train_info_sender, mail_date, n_recency_features)
    
    new_features_per_mail = np.zeros((len(dic_of_recipients), 4))
    index = 0
    for k,v in dic_of_recipients.iteritems():
        KNNScore = v
        NSF = sent_to[sender][k]
        NRF = 0
        if sender in received_from.keys():
            NRF = received_from[sender].get(k, 0)
    
        recency = 0
        if k in dic_recency.keys():
            recency = dic_recency[k]
        
        new_features_per_mail[index, :] = [KNNScore, NSF, NRF, recency]
        index +=1
    
    # Once the features are computed, we can predict the 10 recipients
    order = ABC.predict_proba(new_features_per_mail)[:,1].argsort()[::-1]
    recipients = np.array(dic_of_recipients.keys())
    suggested_10_recipients = recipients[order][:10]

    if submission:
        string_recipients = ''
        for k in suggested_10_recipients:
            string_recipients+=k + ' '

        results.loc[query_mid, 'recipients'] = string_recipients
    else:
        ground_truth = X_test_info['recipients'][query_id].split()
        all_suggestions.append(suggested_10_recipients)
        ground_truth.append(ground_truth)
        all_mean_ap.append(mean_ap(suggested_10_recipients, ground_truth))

print "total took:", datetime.now()-t_all

100
0:00:06.332867
200
0:00:05.614538
300
0:00:07.367845
400
0:00:06.722850
500
0:00:05.717509
600
0:00:06.016117
700
0:00:06.384617
800
0:00:06.870067
900
0:00:04.882427
1000
0:00:05.125166
1100
0:00:05.723837
1200
0:00:08.723643
1300
0:00:07.015919
1400
0:00:06.414040
1500
0:00:06.274868
1600
0:00:07.033063
1700
0:00:06.630035
1800
0:00:04.665406
1900
0:00:06.069735
2000
0:00:05.619131
2100
0:00:06.601064
2200
0:00:06.526404
2300
0:00:06.815101
total took: 0:02:30.971996


In [109]:
results.to_csv('../submission/learning_first_try.csv')

In [103]:
np.mean(all_mean_ap)

0.3652579746157722

## Get NSF and NRF
- NSF: number of messages sent by sender to this recipient, divided by all messages sent by sender 
- NRF: number of messages received from the recipient to the sender, divided by all messages received by sender

In [7]:
sent_to = {}
for row in training.iterrows():
    sender = row[1]['sender']
    sent_to[sender] = {}
    for mid in row[1]['mids'].split():
        recipients = training_info[training_info.mid == int(mid)]['recipients'].values[0].split()
        for recipient in recipients:
            if '@' in recipient:
                if recipient in sent_to[sender].keys():
                    sent_to[sender][recipient] += 1
                else:
                    sent_to[sender][recipient] = 1

In [8]:
received_from = {}
for sender in sent_to.keys():
    for recipient in sent_to[sender].keys():
        if recipient not in received_from.keys():
            received_from[recipient] = {}
        if sender not in received_from[recipient].keys():
            received_from[recipient][sender] = sent_to[sender][recipient]
        else:
            received_from[recipient][sender] += sent_to[sender][recipient]

In [9]:
# normalization:
for sender in sent_to.keys():
    norm = sum(sent_to[sender].values())
    for recipient in sent_to[sender].keys():
        sent_to[sender][recipient] = float(sent_to[sender][recipient])/norm

In [10]:
# normalization:
for recipient in received_from.keys():
    norm = sum(received_from[recipient].values())
    for sender in received_from[recipient].keys():
        received_from[recipient][sender] = float(received_from[recipient][sender])/norm

In [13]:
with io.open('../data/sent_to.json', 'w', encoding='utf-8') as f:
    f.write(unicode(json.dumps(sent_to, ensure_ascii=False)))
with io.open('../data/received_from.json', 'w', encoding='utf-8') as f:
    f.write(unicode(json.dumps(received_from, ensure_ascii=False)))

In [15]:
with io.open('../data/sent_to.json') as json_data:
    sent_to = json.load(json_data)