# Tutorial 1

In this session, we will look at the WikiLeaks dataset and learn how to gather statistics about it, preprocess the emails, and extract useful information.

## Loading JSON file

In the data folder you will find a JSON file.

In [9]:
import pandas as pd

# Relative path of the JSON file that is passed to load_json_data() below.
path_data = '../../data/clean_json.json'

def load_json_data(path_to_file):
    """Read the e-mail JSON file into a DataFrame and normalise two columns.

    The ``from`` column is lower-cased, and every ``body`` value is converted
    to ``str`` with each run of whitespace collapsed to a single space.
    """
    def _squash_whitespace(value):
        # split() without arguments splits on any whitespace run and drops
        # leading/trailing blanks, so re-joining yields single spaces only.
        return " ".join(str(value).split())

    frame = pd.read_json(path_to_file, encoding='ascii')
    frame['from'] = frame['from'].str.lower()
    frame['body'] = frame['body'].apply(_squash_whitespace)
    return frame

Loading dataset from data folder

In [10]:
# Load the e-mails into a DataFrame ('from' lower-cased, 'body' whitespace-normalised).
data = load_json_data(path_data)

In [None]:
from collections import defaultdict, Counter
from nltk.corpus import stopwords
from matplotlib import pyplot as plt
import string

# Tokens dropped by Dataset.get_vocabulary_count(stop_words=True):
# NLTK's English stop words plus each single character of string.punctuation.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be downloaded locally — confirm.
stop_words_list = stopwords.words('english') + list(string.punctuation) #TODO: add other words?

class Dataset(object):
    """Wrapper around the e-mail DataFrame offering simple corpus statistics.

    The DataFrame must provide the columns ``from``, ``from_name``, ``to``,
    ``body`` and ``subject``. Each ``to`` cell is iterated as a sequence of
    ``(name, email)`` pairs.
    """

    def __init__(self, dataframe):
        self.data = dataframe
        self.user_emails = list(set(self.data['from']))
        self._generate_email2name()
        self.word_count = Counter()

    def _register_name(self, email, name):
        """Record ``name`` (double quotes removed) as an alias of lower-cased ``email``."""
        aliases = self.EMAIL2NAME[email.lower()]
        name = name.replace('"', '')
        if name not in aliases:
            aliases.append(name)

    def _generate_email2name(self):
        """Build ``EMAIL2NAME``: lower-cased address -> list of display names seen."""
        self.EMAIL2NAME = defaultdict(list)  # a list, in case there are aliases
        # Senders first, so that a sender's own display name comes first in its alias list.
        for email, name in zip(self.data['from'], self.data['from_name']):
            self._register_name(email, name)
        for receiver in self.data['to']:
            for name, email in receiver:
                self._register_name(email, name)

    def get_top_spammers(self, ntop=9999):
        """Print and return the ``ntop`` most frequent senders.

        Returns:
            A list of ``[count, email, name]`` lists, most active sender first.
            ``name`` is the first known alias of the address, or ``''`` when
            the address has no registered name.
        """
        print("Count \t Email \t \t \t Name")
        # head() stops at ntop rows instead of walking the whole table;
        # max(..., 0) keeps "nothing" as the result for non-positive ntop.
        ranking = (self.data.groupby(self.data['from'])['from'].count()
                   .reset_index(name='count')
                   .sort_values(['count'], ascending=False)
                   .head(max(ntop, 0)))
        list_spammers = []
        for sender, count in zip(ranking['from'], ranking['count']):
            # .get() neither raises for unknown addresses nor inserts empty
            # entries into the defaultdict; keys are stored lower-cased.
            names = self.EMAIL2NAME.get(sender.lower())
            name = names[0] if names else ''
            print("%i \t %s \t %s" % (count, sender, name))
            list_spammers.append([count, sender, name])
        return list_spammers

    def get_total_vocabulary(self):
        """Return all bodies followed by all subjects as one space-separated string.

        The result is also stored in ``self.vocabulary``.
        """
        bodies = self.data['body'].str.cat(sep=' ')
        subjects = self.data['subject'].str.cat(sep=' ')
        # The explicit separator keeps the last body word and the first
        # subject word from being fused into one token.
        self.vocabulary = bodies + ' ' + subjects
        return self.vocabulary

    def get_vocabulary_count(self, stop_words=False):
        """Count whitespace-separated tokens of the vocabulary.

        Args:
            stop_words: when true, tokens listed in ``stop_words_list`` are
                skipped (the comparison is case-sensitive).

        Returns:
            The ``Counter`` of tokens, also stored in ``self.word_count``.
        """
        if not hasattr(self, 'vocabulary'):
            # Build the vocabulary on demand so callers need not remember to
            # call get_total_vocabulary() first.
            self.get_total_vocabulary()
        tokens = self.vocabulary.split()  # split() drops empty tokens
        if stop_words:
            ignored = set(stop_words_list)
            tokens = [token for token in tokens if token not in ignored]
        self.word_count = Counter(tokens)
        return self.word_count

    def get_top_words(self, stop_words=False):
        """Print and return the 20 most common tokens as ``(word, count)`` pairs.

        Existing counts in ``self.word_count`` are reused, so ``stop_words``
        only takes effect when no counts have been computed yet.
        """
        if not self.word_count:
            self.get_vocabulary_count(stop_words=stop_words)
        top = self.word_count.most_common(20)
        print('Word \t Count')
        for word, count in top:
            print('%s \t %i' % (word, count))
        return top

    def generate_reduced_dataset(self, list_of_users):
        """Placeholder (exercise): meant to return a smaller dataframe; currently returns ``None``."""
        pass

def plot_time(dataframe):
    """Placeholder (exercise): does nothing yet and returns ``None``.

    The commented-out lines sketch the intended result: a 24-bin histogram of
    an ``hour`` value derived from the ``date`` column, titled 'Emails per hour'.
    """
    #new = dataframe[['date']]
    #new['hour'] = # TODO 
    #new['hour'].hist(bins=24)
    #plt.title('Emails per hour')
    pass

Let's explore this dataset a bit.

1. For example, who sends out most emails?
2. Which words are most common?
3. When were most emails received (by day of the week and by hour)?

In particular, how can we improve the output of question 2 if the most common words aren't particularly interesting?

In [None]:
# Initiate the dataset
DataObject = Dataset(data)

# Exercise: replace the `...` placeholder with the number of senders to list
# (get_top_spammers compares it against an integer counter).
tab = DataObject.get_top_spammers(ntop=...)
# Prints and returns the 20 most common words as (word, count) pairs.
word_count = DataObject.get_top_words()

Now let's try to get a feeling of what these people are talking about.

In this example, we will build a simple topic-mining model and use spaCy to pick up on relevant entities.

In particular:
1. Aggregate the communication between two people
2. Perform topic modelling on the subset of exchanged emails
3. Perform named entity extraction on the subset

The output of this task is to find pairs of people and the keywords/topics they are talking about in their emails.

In [None]:
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

# Load spaCy's English pipeline through the 'en' shortcut name.
# NOTE(review): the 'en' shortcut and the `nlp.entity` attribute look like
# spaCy 1.x/2.x API — confirm against the installed spaCy version.
nlp = spacy.load('en')
# presumably the entity labels known to the NER component — TODO confirm
list_of_entities = nlp.entity.cfg[u'actions']['1']
# Entity labels kept by get_keywords(); currently every label from list_of_entities.
relevant_entities = list_of_entities

def clean_text(text):
    """Return ``text`` unchanged (placeholder hook where text cleaning can be added)."""
    return text

def display_topics(model, feature_names, no_top_words):
    """Collect the highest-weighted feature names of every topic of ``model``.

    For each row of ``model.components_`` the ``no_top_words`` features with
    the largest weights are taken, heaviest first. The per-topic results are
    concatenated into one flat list, in topic order.
    """
    top_terms = []
    for weights in model.components_:
        # argsort() is ascending: reverse it, then keep the leading entries.
        heaviest_first = weights.argsort()[::-1][:no_top_words]
        top_terms.extend(feature_names[position] for position in heaviest_first)
    return top_terms

def get_keywords(sentence):
    """Group the entities spaCy finds in ``sentence`` by their label.

    Returns a ``defaultdict(list)`` mapping entity label -> list of entity
    texts, restricted to labels contained in ``relevant_entities``.
    """
    grouped = defaultdict(list)
    for entity in nlp(sentence).ents:
        label = entity.label_
        if label not in relevant_entities:
            continue
        grouped[label].append(entity.text)
    return grouped

def get_topics(emails, no_topics=5, no_top_words=3):
    """Extract topic words from a collection of e-mails with NMF and LDA.

    Args:
        emails: iterable whose entries are either an e-mail body string or a
            sequence whose first element is the body string. Entries that
            carry no usable text are skipped.
        no_topics: number of topics fitted by each model.
        no_top_words: number of top words kept per topic.

    Returns:
        A flat list: the top words of every NMF topic followed by the top
        words of every LDA topic (words may repeat). ``[]`` when no text
        could be collected from ``emails``.
    """
    sentences = []
    for em in emails:
        try:
            # Accept both a bare string and a [body] wrapper; indexing a bare
            # string with [0] would keep only its first character.
            text = em if isinstance(em, str) else em[0]
            sentences.extend(text.split('.'))
        except (AttributeError, IndexError, KeyError, TypeError):
            continue  # entry has no text to contribute
    if not sentences:
        return []

    # NOTE(review): `get_feature_names()` and NMF's `alpha` argument have been
    # removed in recent scikit-learn releases — confirm the installed version.

    # NMF is able to use tf-idf
    tfidf_vectorizer = TfidfVectorizer(stop_words='english', lowercase=False)
    tfidf = tfidf_vectorizer.fit_transform(sentences)
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()

    # LDA can only use raw term counts because it is a probabilistic graphical model
    tf_vectorizer = CountVectorizer(stop_words='english', lowercase=False)
    tf = tf_vectorizer.fit_transform(sentences)
    tf_feature_names = tf_vectorizer.get_feature_names()

    # Run NMF
    nmf = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)
    # Run LDA
    lda = LatentDirichletAllocation(n_components=no_topics, max_iter=10, learning_method='online', learning_offset=50.,random_state=0).fit(tf)

    topics1 = display_topics(nmf, tfidf_feature_names, no_top_words)
    topics2 = display_topics(lda, tf_feature_names, no_top_words)
    return topics1 + topics2

class user(object):
    """Per-sender view of the corpus: who a sender writes to, and about what.

    Args:
        data: object exposing the e-mail DataFrame as ``data.data`` (columns
            ``from``, ``body`` and ``to`` are read).
        email: the sender address to analyse.

    Attributes filled during construction:
        correspondents_count: Counter, correspondent -> e-mails sent to them.
        correspondents_emails: correspondent -> list of ``[body]`` entries
            (e-mails sent to them and e-mails received from them).
        correspondents_keywords: correspondent -> {entity label: [texts]}.
        correspondents_topics: correspondent -> up to five ``(word, count)``
            pairs taken from ``get_topics``.
    """

    def __init__(self, data, email):
        self.user = email
        self.emails = data.data.loc[data.data['from'] == self.user]['body']
        self.vocabulary_raw = self.get_vocabulary(data)
        self.keywords = defaultdict(list)
        self.correspondents_count = Counter()
        self.correspondents_emails = defaultdict(list)
        self.correspondents_keywords = defaultdict(dict)
        self.get_connections(data)
        self.correspondents_topics = defaultdict(list)
        self.get_topics_correspondents()
        self.connections = self.correspondents_count.keys()

    def get_vocabulary(self, data):
        """Return every body sent by this user joined into one space-separated string."""
        return data.data.loc[data.data['from'] == self.user]['body'].str.cat(sep=' ')

    def _add_keywords(self, correspondent, keywords):
        """Merge ``keywords`` (label -> texts) into the correspondent's keyword lists."""
        if correspondent not in self.correspondents_keywords:
            # Start every correspondent with an empty list per relevant label.
            self.correspondents_keywords[correspondent] = {
                label: [] for label in relevant_entities
            }
        slots = self.correspondents_keywords[correspondent]
        for label, texts in keywords.items():
            if label in relevant_entities:
                slots[label] += texts

    def get_connections(self, data):
        """Collect correspondents, the exchanged e-mails and their entities.

        Pass 1 walks the e-mails sent by this user, pass 2 the e-mails the
        found correspondents sent back.

        NOTE(review): only the first entry of each ``to`` cell is inspected,
        as in the original code — confirm whether further recipients should
        be counted too.
        """
        self.keywords_per_receiver = defaultdict(dict)  # not populated in this class
        frame = data.data
        # Columns are read by name, so extra or reordered columns do not matter.
        me = self.user.lower() if isinstance(self.user, str) else self.user

        # Pass 1: e-mails sent by this user.
        sent = frame.loc[frame['from'] == self.user]
        for body, corres in zip(sent['body'], sent['to']):
            if not corres:
                continue
            recipients = [r.lower() for r in corres[0]
                          if isinstance(r, str) and '@' in r]
            if not recipients:
                continue
            # Cleaning and entity extraction depend on the body only, so they
            # run once per e-mail rather than once per recipient.
            text = clean_text(body)
            keywords = get_keywords(text)
            for r in recipients:
                self.correspondents_count[r] += 1
                self.correspondents_emails[r].append([text])
                self._add_keywords(r, keywords)

        # Pass 2: e-mails those correspondents addressed to this user.
        for receiver in list(self.correspondents_emails):
            received = frame.loc[frame['from'] == receiver]
            for body, corres in zip(received['body'], received['to']):
                if not corres:
                    continue
                # Addresses are lower-cased in pass 1; compare the same way here.
                addressed_to = [r.lower() if isinstance(r, str) else r
                                for r in corres[0]]
                if me not in addressed_to:
                    continue
                text = clean_text(str(body))
                # Stored as [text], the same shape pass 1 uses, so consumers
                # can treat every entry alike.
                self.correspondents_emails[receiver].append([text])
                self._add_keywords(receiver, get_keywords(text))

    def get_topics_correspondents(self):
        """Store up to five ``(word, count)`` topic pairs per correspondent."""
        for corres in self.correspondents_keywords:
            try:
                topics = get_topics(self.correspondents_emails[corres])
            except Exception:
                # Deliberately best effort: a correspondent whose e-mails
                # cannot be modelled simply gets no topics.
                topics = []
            self.correspondents_topics[corres] = Counter(topics).most_common(5)

Suppose now we are interested in looking at one person in particular. For example, some people were particularly central to the controversy, such as:

Debbie Wasserman (email: hrtsleeve@gmail.com)     
Brad Marshall (email: marshall@dnc.org)  
Luis Miranda (mirandal@dnc.org) (he's just the top spammer :) )


In [94]:
# Exercise: replace the `...` placeholder with the e-mail address of the person to inspect.
userA = user(DataObject,...)

Now let's do the last part of this session: let's see if we can extract some interesting topics from the emails.

In [None]:
import pickle   
    
# Addresses of the 50 most active senders, most active first.
tab = DataObject.get_top_spammers(ntop=50)
top_s = [row[1] for row in tab]

# One record per (sender, correspondent) pair for the five most active senders.
graph = []
for email in top_s[:5]:
    userA = user(DataObject, email)
    for key, count in userA.correspondents_count.items():
        graph.append({
            'email': email,
            'correspondent': key,
            'topics': userA.correspondents_topics[key],
            'keywords': userA.correspondents_keywords[key],
            'count': count,
        })

# Context managers close the file handles even if (un)pickling fails.
with open('graph_topics_dict_t.pkl', 'wb') as fh:
    pickle.dump(graph, fh)
# Read the file back as a round-trip check. Only unpickle files you created
# yourself: pickle can execute arbitrary code while loading.
with open('graph_topics_dict_t.pkl', 'rb') as fh:
    a = pickle.load(fh)

In [None]:
def get_top_words(dictionary, exclude=None):
    """Print and return the characteristic words of one sender/correspondent record.

    Args:
        dictionary: record with the keys ``'email'``, ``'correspondent'``,
            ``'topics'`` (words or ``(word, count)`` tuples) and
            ``'keywords'`` (label -> list of entity texts).
        exclude: words to leave out of the result; nothing is excluded by default.

    Returns:
        The list of words: the topic words followed by the most frequent
        entity text of every label that has at least two entries, minus
        ``exclude``. ``None`` when the record has an empty ``'email'``.
    """
    if len(dictionary['email']) == 0:
        return None
    excluded = list(exclude) if exclude is not None else []

    # Build a new list so the caller's 'topics' list is not mutated, and
    # unwrap (word, count) tuples so that `exclude` can match topic words.
    all_words = [topic[0] if isinstance(topic, tuple) else topic
                 for topic in dictionary['topics']]

    for key in dictionary['keywords']:
        temp = dictionary['keywords'][key]
        if len(temp) < 2:
            continue
        count = Counter(temp)
        print(count)
        for a, b in count.most_common(1): #for example 
            print(a)
            all_words.append(a)

    all_words = [word for word in all_words if word not in excluded]

    print('Email: ', dictionary['email'], 'To: ', dictionary['correspondent'],\
          'Words: ', all_words)
    return all_words

In [None]:
# Report the top words of every record, leaving out the two addresses and
# the names of the two people involved.
for e in graph:
    exclude_words = [e['email'], e['correspondent']]
    for address in (e['email'], e['correspondent']):
        # .get() neither raises for an address without a registered name nor
        # inserts an empty entry into the EMAIL2NAME defaultdict.
        names = DataObject.EMAIL2NAME.get(address)
        if not names:
            continue
        display_name = names[0]
        parts = display_name.split(',')
        # Keep the raw pieces and add stripped ones: "Last, First" splits into
        # pieces with leading blanks that would never equal a single token.
        exclude_words += [display_name] + parts + [part.strip() for part in parts]
    get_top_words(e, exclude = exclude_words)