In [48]:
###########
#  Readme #
###########
# This program works as follows:
# 1) The training mail data are imported together with the recipient data of each mail,
#    and the two datasets are joined.
# 2) To build a directed graph weighted by the number of mails between sender and recipient,
#    the number of mails from each sender to each recipient is computed, then the graph is built.
# 3) The link between each sender and recipient is measured with the Jaccard and
#    Adamic/Adar coefficients.
# 4) For the text features, the contents are tokenised, dropping the tokens that contain
#    numeric or special characters and removing some irrelevant words,
#    then the remaining tokens are stemmed.
# 5) The word-count matrix is extracted, then an LDA model with 30 topics is fitted.
# 6) For each recipient, the centroid of the topics of the mails received from a given sender
#    is computed from the topic distributions
#    obtained from the LDA model.
# 7) For each mail, the cosine similarity between the topics of the mail and
#    each possible sender-recipient centroid is computed.
# 8) The results are then concatenated into a dataset holding the text and network features.
# 9) The dataset is balanced to hold 50% positives (actual recipients) and 50% negatives.
# 10) A RandomForest model is fitted to classify a mail-recipient pair as binary:
#     1 if the candidate is a recipient, 0 otherwise.
# 11) The test data are imported and joined in the same way as the training dataset.
# 12) The network features obtained on the training set are added to the test dataset by a join.
# 13) The text features are obtained by using the LDA model fitted on the training sample
#     to extract the topics of the test mails, and by computing the cosine similarity with each
#     sender-recipient centroid of the training sample.
# 14) Finally, the scores are obtained with the trained RandomForest model; for each mail the scores
#     are sorted in decreasing order and the top 10 recipients are written to a csv file.

In [49]:
import itertools
import numpy as np
import pandas as pd
import networkx as nx
from datetime import date
import nltk
from nltk.corpus import twitter_samples, stopwords
from nltk.stem.porter import PorterStemmer
from nltk.collocations import *
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

from nltk import tokenize

##Data Import##
# Directory holding the Kaggle csv files: edit this single constant to run the notebook elsewhere.
DATA_DIR="/Users/thomaslaurent/Documents/Cours-M2/Web-mining/kaggle"

# Both files are read without a header row; the columns are named in the next cell.
training_info=pd.read_csv(DATA_DIR+"/training_info_sid.csv",parse_dates=True,header=None)
training_set=pd.read_csv(DATA_DIR+"/training_set_sid.csv",header=None)


In [50]:
####################################
#Training data import and reshaping#
####################################

##Data reshaping for training_info##


#Changing column names for imported data#
# The csv files were read with header=None, so the positional columns are named here.
training_info.columns=['Mail_id','Dates','Contents','Recipients']
training_set.columns=['Sender','Mail_id']

# Parse the dates, then store them back as 'MM/DD/YYYY' strings.
training_info['Dates']=pd.to_datetime(training_info['Dates'])
training_info['Dates']=training_info['Dates'].dt.strftime('%m/%d/%Y')

#Changing recipient variable into list#
# Each Recipients cell is split on single spaces into a list.
training_info['Recipients']=training_info['Recipients'].str.split(" ")

##Data reshaping for training_set##

#Changing Mail_id variable into list#
# Each Mail_id cell is split on single spaces into a list of id strings
# (they are converted to numbers after being exploded to rows).
training_set['Mail_id']=training_set['Mail_id'].str.split(" ")

In [51]:
##Creating a function for transposing list variable to rows - one element of the list by row##
def explode(df, lst_cols, fill_value=''):
    """Expand list-valued columns so that every list element gets its own row.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    lst_cols : str or list of str
        Column(s) whose cells hold lists. The first one drives how many times
        each row is repeated, so when several columns are given their lists
        must have the same length on every row.
    fill_value : scalar, default ''
        Replacement for missing values when at least one list is empty
        (applied to the whole result).

    Returns
    -------
    pandas.DataFrame
        A frame with the same columns, in the same order, as ``df``. Rows whose
        list is empty are kept once, after the expanded rows, with
        ``fill_value`` in the list column(s).
    """
    # make sure lst_cols is a list
    if lst_cols and not isinstance(lst_cols, list):
        lst_cols = [lst_cols]

    # Nothing to expand on a frame without rows
    if len(df) == 0:
        return df.copy()

    # all columns except lst_cols
    idx_cols = df.columns.difference(lst_cols)

    # calculate lengths of lists
    lens = df[lst_cols[0]].str.len()

    # Repeat the scalar columns once per list element and flatten the lists.
    # Flattening in pure Python keeps the element types unchanged even when
    # some of the lists are empty.
    exploded = pd.DataFrame({
        col: np.repeat(df[col].values, lens.values)
        for col in idx_cols
    }).assign(**{
        col: list(itertools.chain.from_iterable(df[col].values))
        for col in lst_cols
    })

    if not (lens > 0).all():
        # at least one list in cells is empty: keep those rows, list column(s) filled.
        # pd.concat replaces DataFrame.append, which no longer exists in pandas 2.x.
        exploded = pd.concat([exploded, df.loc[lens == 0, idx_cols]], sort=False) \
                     .fillna(fill_value)

    return exploded.loc[:, df.columns]

##Transposing each mail_id and recipient to row ##
# dataset_recipient_mail: one row per (Sender, Mail_id)
# training_info_mail: one row per (mail, recipient), other mail columns repeated
dataset_recipient_mail=explode(training_set,["Mail_id"])
training_info_mail=explode(training_info,["Recipients"])

##Merge sender and recipient info for each mail##
# Mail ids come out of str.split as strings; convert them to numbers before joining on Mail_id.
dataset_recipient_mail["Mail_id"]=pd.to_numeric(dataset_recipient_mail["Mail_id"])
# Left join: every (Sender, Mail_id) row is kept and paired with each recipient of that mail.
merged_recipient_sender=pd.merge(dataset_recipient_mail,training_info_mail[["Mail_id","Recipients"]],on="Mail_id",how='left')




In [52]:
##Creation of contingency table to calculate weight based on the number of email exchanged between##
##sender and recipient##
# Rows are senders, columns are recipients, cells count the (sender, recipient) rows.
df=pd.crosstab(merged_recipient_sender["Sender"],merged_recipient_sender["Recipients"])
# Make the table square over the union of senders and recipients (missing cells = 0),
# so that it can be read as an adjacency matrix.
idx = df.columns.union(df.index)
df = df.reindex(index = idx, columns=idx, fill_value=0)
# Divide every column by its column total: each cell becomes the share of the
# recipient's counted mails that came from that sender.
df = df.iloc[:,:].divide(df.iloc[:,:].sum(axis=0),axis=1)
# Columns whose total is 0 produce NaN (0/0); set them back to 0.
df=df.fillna(0)




In [53]:
############################
#Network feature extraction#
############################

##Creation of a directed graph between senders and recipients##
##and calculation of Jaccard and Adamic/Adar coefficients##
G = nx.from_pandas_adjacency(df,create_using=nx.DiGraph())
# The link-prediction coefficients below are computed on the undirected version of the graph
g_copy = nx.Graph(G)

##Calculating Jaccard and adamic coefficient for each edge##
# Every ordered pair of distinct nodes, in node order
list_combination=list(itertools.permutations(g_copy.nodes(),2))

# Both functions yield one (u, v, score) triple per requested pair, in the order given
list_jaccard=[score for _,_,score in nx.jaccard_coefficient(g_copy,list_combination)]
list_adamic=[score for _,_,score in nx.adamic_adar_index(g_copy,list_combination)]

#Creating a list for existing edges#
list_classe=[pair for pair in list_combination if g_copy.has_edge(*pair)]

#Creating a dataset class_nodes containing each coefficient
#for each node and classe as 1 if the edge exists#
class_nodes=pd.DataFrame(list_combination,columns=["Sender","Recipient"]).assign(
    Jaccard=pd.Series(list_jaccard),
    Adamic=pd.Series(list_adamic),
)
class_nodes_existing=pd.DataFrame(list_classe,columns=["Sender","Recipient"])
class_vector=pd.Series(np.ones(class_nodes_existing.shape[0]))
class_nodes_existing["classe"]=class_vector

# Pairs without an edge get classe = 0 through the fillna
class_nodes=class_nodes.merge(class_nodes_existing,on=["Sender","Recipient"],how="left").fillna(0)




In [54]:
#########################
#Text feature extraction#
#########################

##Identifying topics for each mail for non void mail##
# Keep only the mails whose Contents is not null. reset_index() renumbers the rows
# from 0 and keeps the previous index as a column named "index".
token_df=training_info
token_df = token_df[token_df['Contents'].notnull()]
token_df=token_df.reset_index()

#Creating stop_words list and extending the list with "http","AM" and "PM"#
# stop_words is read by cleaning_words, defined below
stop_words = stopwords.words('english')
stop_words.extend(["AM","PM","http"])

##Creating a function to select only alphanumeric words, removing stop words and stemming##
def cleaning_words(content):
    """Tokenise ``content`` and return the stems of the tokens worth counting.

    A token is kept when it is purely alphabetic and is not a stop word; the
    kept tokens are then stemmed with the Porter stemmer.

    Parameters
    ----------
    content : str
        Text of one mail.

    Returns
    -------
    list of str
        Stems of the kept tokens, in their order of appearance.
    """
    # Stop words are matched case-insensitively. The extra entries ("AM", "PM") are
    # uppercase while this function is used as the tokenizer of a CountVectorizer
    # created with its default settings, which lowercases the text beforehand; an
    # exact-case comparison therefore never removed them.
    stop_set = {word.lower() for word in stop_words}
    porter = PorterStemmer()
    return [
        porter.stem(token)
        for token in nltk.word_tokenize(content)
        if token.isalpha() and token.lower() not in stop_set
    ]

##LDA method ##
# Word-count matrix tokenised by cleaning_words: the vocabulary is capped at n_features
# terms; max_df=0.5 / min_df=2 discard the terms that are too frequent / too rare across mails.
n_features=5000
tf_vectorizer = CountVectorizer(max_df=0.5, min_df=2, max_features=n_features,tokenizer=cleaning_words)
tf = tf_vectorizer.fit_transform(token_df["Contents"])
# Convert sparse matrix to gensim corpus
# (documents_columns=False: the documents are the rows of tf)
import gensim
corpus = gensim.matutils.Sparse2Corpus(tf, documents_columns=False)

# Mapping from word IDs to words
# (vocabulary_ maps word -> column id, so the dictionary is inverted)
id_map = dict((v, k) for k, v in tf_vectorizer.vocabulary_.items())

# LDA training using 30 topics
# random_state is fixed so that the fitted topics are reproducible
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=30, id2word=id_map, passes=25, random_state=34)

#Creating a function to return a dataframe of topic distribution by document
def lda_corpus(corpus):
    """Return the topic distribution of every document of ``corpus``.

    Uses the module-level ``ldamodel``.

    Parameters
    ----------
    corpus : gensim corpus
        Documents in bag-of-words form, e.g. a ``Sparse2Corpus``.

    Returns
    -------
    pandas.DataFrame
        One row per document (index 0..n-1, in corpus order) and one column per
        topic id of the model (0..num_topics-1). Topics that the model does not
        report for a document are set to 0.
    """
    # One {topic_id: probability} mapping per document, in corpus order
    doc_topics = [dict(doc) for doc in ldamodel[corpus]]
    topic_doc_df = pd.DataFrame(doc_topics, index=range(len(doc_topics)))

    # Always expose one column per topic of the model: a topic that is never reported
    # for any document would otherwise be missing, and the later selections by topic id
    # (columns 0..29) would raise a KeyError.
    lda_df = topic_doc_df.reindex(columns=range(ldamodel.num_topics)).fillna(0)
    return lda_df

#Creating dataframe of topic distribution for document in the training data
lda_df=lda_corpus(corpus)

#Merging topic vector to original training_info dataframe#
# token_df and lda_df are put side by side by index (both are numbered from 0 in the same
# mail order); only Mail_id, the "index" column left by reset_index and the topics are kept.
merged_info_lda=pd.concat([token_df,lda_df],axis=1)
del merged_info_lda["Contents"]
del merged_info_lda["Recipients"]
del merged_info_lda["Dates"]

# One row per (mail, recipient); mails absent from token_df (null Contents) get an
# all-zero topic vector through the fillna.
original_info_lda=pd.merge(training_info_mail,merged_info_lda,on=["Mail_id"],how="left").fillna(0)
del original_info_lda["index"]
del original_info_lda["Contents"]

#Mapping senders to original_info_lda dataframe#
original_info_lda=pd.merge(original_info_lda,dataset_recipient_mail,on=["Mail_id"],how="left")

#Merging clustering features to original_info_lda dataframe#
# Adds Recipient, Jaccard, Adamic and classe from class_nodes; pairs missing from
# class_nodes keep NaN in those columns (left join).
original_info_lda=pd.merge(original_info_lda,class_nodes,left_on=["Sender","Recipients"],right_on=["Sender","Recipient"],how="left")
del original_info_lda["Recipients"]
del original_info_lda["Dates"]

In [240]:
##Calculating topic similarities##

#Calculate centroid of topics for each recipient#
# (the former bare `original_info_lda.fillna(0)` was removed: its result was discarded)
topic_columns=list(range(0,30))
listvar=["Sender","Mail_id"]
listvar.extend(topic_columns)

# original_info_lda holds one row per (mail, recipient). Keep a single topic vector per
# mail before attaching its recipients: merging the repeated rows on Mail_id alone made a
# mail with k recipients count k times in each of its sender-recipient sums.
LDA_by_recipient=original_info_lda[listvar].drop_duplicates(subset=["Sender","Mail_id"])
LDA_by_recipient=pd.merge(LDA_by_recipient,merged_recipient_sender[["Mail_id","Recipients"]],left_on="Mail_id",right_on="Mail_id",how="left")

# Sum the topic vectors of each (recipient, sender) pair. Only the direction of the sum
# matters for the cosine similarity computed afterwards. Mail_id is no longer summed.
LDA_by_recipient=LDA_by_recipient.groupby(["Recipients","Sender"])[topic_columns].sum()
# The recipient column keeps the name "Recipients": the former columns= rename had no
# effect because Recipients is an index level at this point.
LDA_by_recipient=LDA_by_recipient.rename(index=str).reset_index()

In [242]:
from sklearn.metrics.pairwise import cosine_similarity

#Creating the function to calculate cosine similarity
def cosine_recipient(dataframe):
    """Cosine similarity between each mail and the topic sums of its sender's pairs.

    Uses the module-level ``LDA_by_recipient`` (one row per sender-recipient
    pair, with columns ``Sender``, ``Recipients`` and the topic columns 0..29).

    Parameters
    ----------
    dataframe : pandas.DataFrame
        One row per mail, with columns ``Mail_id``, ``Sender`` and the topic
        columns 0..29.

    Returns
    -------
    pandas.DataFrame
        Columns ``Mail_id``, ``Sender``, ``Recipient``, ``Cosine``: one row for
        every input row and every pair of ``LDA_by_recipient`` that has the same
        sender, ordered by input row and then by pair.
    """
    listvar=list(range(0,30))
    # similarity[i, j]: row i of dataframe against pair j of LDA_by_recipient
    similarity=cosine_similarity(dataframe[listvar],LDA_by_recipient[listvar])

    # Work on positional arrays so that nothing depends on the index labels of the
    # frames (the identifiers used to be attached by index alignment).
    mail_ids=np.asarray(dataframe["Mail_id"])
    mail_senders=np.asarray(dataframe["Sender"],dtype=object)
    pair_senders=np.asarray(LDA_by_recipient["Sender"],dtype=object)
    pair_recipients=np.asarray(LDA_by_recipient["Recipients"],dtype=object)

    # Keep the (mail, pair) cells whose pair belongs to the sender of the mail. Selecting
    # them directly avoids stacking the whole matrix, which also dragged the "index" and
    # "level_0" bookkeeping columns along as if they were similarities.
    mail_pos,pair_pos=np.nonzero(mail_senders[:,None]==pair_senders[None,:])

    return pd.DataFrame({
        "Mail_id":mail_ids[mail_pos],
        "Sender":pair_senders[pair_pos],
        "Recipient":pair_recipients[pair_pos],
        "Cosine":similarity[mail_pos,pair_pos],
    },columns=["Mail_id","Sender","Recipient","Cosine"])

#Calculating cosine similarities for the train dataframe
# original_info_lda has one row per (mail, recipient), so a mail with several
# recipients is passed several times.
cosine_LDA_mails=cosine_recipient(original_info_lda)


In [245]:
##################
# Data reshaping #
##################

##Creating a dataframe for each email and each possible recipient##
Mail_analysis=cosine_LDA_mails.merge(class_nodes,left_on=["Sender","Recipient"],right_on=["Sender","Recipient"],how="inner")
# classe from class_nodes flags an edge of the graph; it is replaced below by the per-mail label
del Mail_analysis["classe"]

#Creating class coded as 1 for real recipients#
# assign() works on a new frame (no SettingWithCopyWarning) and labels every row whatever
# its index; duplicated (mail, recipient) rows are dropped so that the left join below
# cannot multiply the rows of Mail_analysis.
training_info_class=training_info_mail[["Mail_id","Recipients"]].drop_duplicates().assign(classe=1.0)
Mail_analysis=Mail_analysis.rename(index=str, columns={"Recipient": "Recipients"})
Mail_analysis=Mail_analysis.merge(training_info_class,left_on=["Mail_id","Recipients"],right_on=["Mail_id","Recipients"],how="left")
# Candidates that are not actual recipients of the mail have no match: label them 0
Mail_analysis["classe"]=Mail_analysis["classe"].fillna(0)

#Removing data where recipient is identical to sender#
Mail_analysis=Mail_analysis[Mail_analysis["Recipients"]!=Mail_analysis["Sender"]]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()


In [246]:
##Creating dataset for model training##
#Balancing negative and positive class 50%#
Positive_set=Mail_analysis[Mail_analysis["classe"]==1]
length_positive=Positive_set.shape[0]

# Draw as many negatives as there are positives, capped at the number of negatives
# available (sample() raises when asked for more rows than exist). The seed makes the
# draw, and therefore the trained model, reproducible.
Negative_pool=Mail_analysis[Mail_analysis["classe"]==0]
Negative_set=Negative_pool.sample(min(length_positive,Negative_pool.shape[0]),random_state=0)

train_set=pd.concat([Positive_set,Negative_set])


In [247]:
#######################
# Training the model  #
#######################

#Classification using random forest#
from sklearn.ensemble import RandomForestClassifier

# Target: classe (1 = actual recipient of the mail, 0 otherwise).
# Predictors: the text feature (Cosine) and the two network features.
features = ["Cosine","Adamic","Jaccard"]
y = train_set["classe"]
X = train_set[features]

# Create a random forest Classifier
# (200 trees, fixed seed)
clf = RandomForestClassifier(n_estimators=200, random_state=0)

# Training the classifier
clf.fit(X, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [249]:
##########################
#  Importing test data   #
##########################


#Importing test datasets
# Read without a header row, like the training files
test_info=pd.read_csv("/Users/thomaslaurent/Documents/Cours-M2/Web-mining/kaggle/test_info_sid.csv",parse_dates=True,header=None)
test_set=pd.read_csv("/Users/thomaslaurent/Documents/Cours-M2/Web-mining/kaggle/test_set_sid.csv",header=None)

##Data reshaping for test_info##

#Changing column names of datasets#
# test_info has no Recipients column: the recipients are what is being predicted
test_info.columns=['Mail_id','Dates','Contents']
test_set.columns=['Sender','Mail_id']

#Changing Mail_id variable into list#
# then one row per (Sender, Mail_id)
test_set['Mail_id']=test_set['Mail_id'].str.split(" ")
test_recipient_mail=explode(test_set,["Mail_id"])

# Mail ids come out of str.split as strings; convert them to numbers for the joins
test_recipient_mail["Mail_id"]=pd.to_numeric(test_recipient_mail["Mail_id"])

#######################################
#  Adding network and text features   #
#######################################

#Adding network features to test dataset
# Left join on Sender: each test mail is paired with every class_nodes row of its sender,
# i.e. with every candidate Recipient and the coefficients computed on the training graph.
test_network_feature=test_recipient_mail.merge(class_nodes,left_on=["Sender"],right_on=["Sender"],how="left")

##Identifying topics for each mail for non void mail##
# NOTE: token_df is rebound here and now holds the test mails with non-null Contents
token_df=test_info
token_df = token_df[token_df['Contents'].notnull()]
token_df=token_df.reset_index()

# Create the matrix of token counts for test dataset
# transform() reuses the vectorizer fitted on the training mails (it is not refitted).
# NOTE: X is rebound here; it previously held the training feature matrix.
X = tf_vectorizer.transform(token_df["Contents"])

# Convert sparse matrix to gensim corpus.
corpus = gensim.matutils.Sparse2Corpus(X, documents_columns=False)

# Generating topic distribution by mail in the test dataset
# (with the LDA model fitted on the training mails)
lda_df=lda_corpus(corpus)



In [250]:
#Merging topic vector to the test_info dataframe (mails with non-null Contents)#
# token_df and lda_df are put side by side by index; only Mail_id and the topic
# columns are kept ("index" is the column left by reset_index).
test_info_lda=pd.concat([token_df,lda_df],axis=1)
del test_info_lda["Contents"]
del test_info_lda["Dates"]
del test_info_lda["index"]

In [251]:
#Mapping senders to topics dataframe#
test_info_lda=test_info_lda.merge(test_recipient_mail,on="Mail_id",how="left")

#Calculating cosine similarities for each email content to the centroid of each potential recipient
#in the test dataset
# The centroids come from LDA_by_recipient, built on the training mails.
# NOTE: this rebinds cosine_LDA_mails, which previously held the training similarities.
cosine_LDA_mails=cosine_recipient(test_info_lda)

In [324]:
#Combine network and text features#
# Inner join: keeps, for each mail, the candidates for which a cosine similarity exists
test_dataset=pd.merge(test_network_feature[["Mail_id","Jaccard","Adamic","classe","Recipient"]],cosine_LDA_mails,left_on=["Mail_id","Recipient"],right_on=["Mail_id","Recipient"],how="inner")

#Data handling for email without content-using only feature network for all possible pairs
# "~" is the boolean negation of the mask (unary "-" is not a valid operator on booleans)
notextfeat=test_recipient_mail[~test_recipient_mail["Mail_id"].isin(cosine_LDA_mails["Mail_id"].unique())]
# test_network_feature already holds one row per (mail, candidate): join on the mail as
# well as on the sender. Joining on Sender alone repeated every candidate once per test
# mail of that sender, so the top-10 of these mails could list the same recipient several times.
notextfeat=notextfeat.merge(test_network_feature[["Mail_id","Jaccard","Adamic","classe","Sender","Recipient"]],on=["Mail_id","Sender"],how="inner")
test_dataset=pd.concat([test_dataset,notextfeat])

#Replacing missing values by 0#
# (in particular Cosine for the mails handled without text features)
test_dataset=test_dataset.fillna(0)


In [334]:
##########################################################
#  Predicted the score for each mail and recipient pair  #
##########################################################

#Predicting potential recipients using trained Random Forest model#
features = ["Cosine","Adamic","Jaccard"]
X = test_dataset[features]
predictions=clf.predict_proba(X)

#Adding score of being recipient to the test dataset#
# Column 1 of predict_proba is kept as the score
# (presumably the probability of classe == 1 - confirm against clf.classes_).
predictions=pd.DataFrame(predictions)
predictions=predictions[1]
# Both objects are renumbered from 0 so that the column-wise concat lines them up row by row;
# each reset_index also adds an "index" column, which is not used afterwards.
test_dataset=test_dataset.reset_index()
predictions=predictions.reset_index()
predicted_set=pd.concat([test_dataset,predictions],axis=1)
predicted_set=predicted_set.rename(index=str, columns={1: "Score"})

#Ranking prediction by decreasing score by email and keeping the 10 first recipients#
predicted_list=predicted_set[["Mail_id","Recipient","Score"]].sort_values(["Mail_id", "Score"], ascending=[True, False])
predicted_list=predicted_list.groupby("Mail_id").head(10).reset_index(drop=True)
predicted_list=predicted_list[["Mail_id","Recipient"]]

#Reshaping dataframe to output the list of the best 10 predicted recipients by email
# One row per mail: the recipients are gathered in a list (best score first), then
# joined into a single space-separated string.
output_list=pd.DataFrame(predicted_list.groupby("Mail_id").apply(lambda x: list(x.Recipient)))
output_list=output_list.reset_index()
output_list.columns=["Id","Recipients"]
output_list["Recipients"]=output_list.Recipients.str.join(" ")

#Saving results as a csv#
# Written to the current working directory
output_list.to_csv("pred_recipients_Thomas_Laurent.csv",index=False)