## Загрузка данных

In [1]:
import pandas as pd
aliases = pd.read_csv('data/Aliases.csv')
emailreceivers = pd.read_csv('data/EmailReceivers.csv')
emails = pd.read_csv('data/Emails.csv')
persons = pd.read_csv('data/Persons.csv')

In [2]:
emails.columns

Index(['Id', 'DocNumber', 'MetadataSubject', 'MetadataTo', 'MetadataFrom',
       'SenderPersonId', 'MetadataDateSent', 'MetadataDateReleased',
       'MetadataPdfLink', 'MetadataCaseNumber', 'MetadataDocumentClass',
       'ExtractedSubject', 'ExtractedTo', 'ExtractedFrom', 'ExtractedCc',
       'ExtractedDateSent', 'ExtractedCaseNumber', 'ExtractedDocNumber',
       'ExtractedDateReleased', 'ExtractedReleaseInPartOrFull',
       'ExtractedBodyText', 'RawText'],
      dtype='object')

In [3]:
data = pd.DataFrame(emails[['MetadataSubject', 'ExtractedBodyText']])
data.columns = ['subject', 'body']
data.fillna('', inplace=True)

In [4]:
# data.head(20)

## Предобработка и анализ

Предобработаем сперва темы писем. В них содержатся шумовые токены вроде "FWD:" или "Re:". Избавимся от них с помощью регулярных выражений.
Затем предобработаем тексты писем, в которых также содержатся шумовые строки, начинающиеся с подстрок "Re:" или "H:". Стоит также удалить шумовые тэги, такие как "From:", штампы "U.S. Department...", отметки времени и/или даты. Также удалим строки с номерами документов вроде "Case No" и строки с числами.

In [4]:
import re
def is_date(string):
    try:
        parse(string)
        return True
    except:
        return False
    
def has_numbers(string):
    return bool(re.search(r'\d', string))

def process_body(elem):
    lines = elem.splitlines()
    trashpattern = re.compile(r"^(Doc No\\..*|Case No\\.|H <.*@.*>|.*@.*|From .|To:.|For:.*|Subject:.|U.S. Department of State$)")
    datepattern = re.compile(r"Date: \d{2}/\d{2}/\d{4}")
    timepattern = re.compile(r".*\d{1,2}:(\d{2}\s(?:AM|PM))")
    keypattern = re.compile(r"\s*(Fwd|Fw|Fvv|FVV|FW|Re|re|RE|H):\s*")
    numpattern = re.compile(r".*(\d*-\d*|\d+\b).*")
    lines = list(filter(lambda line: not datepattern.match(line) and not trashpattern.match(line) and not keypattern.match(line)
            and not timepattern.match(line) and not numpattern.match(line) and len(line) > 10,lines))
    return '\n'.join(lines)

def process_subject(elem):
    if elem == '':
        return elem
    keypattern = r's*(Fwd|Fw|Fvv|FVV|FW|Re|re|RE|H):\s*'
    subject = re.sub(keypattern, "", elem)
    subject = re.sub(keypattern, "", elem)
    return subject

data['subjectProc'] = data.subject.apply(process_subject)
data['bodyProc'] = data.body.apply(process_body)

In [5]:
print(len(data))

7945


### Биграммы из текста

In [6]:
import nltk
from nltk import word_tokenize
from nltk.util import ngrams
from nltk.corpus import stopwords
import string
stopset = list(stopwords.words('english')) + ["'s"] + ["n't"] + ['']
punctset = string.punctuation + '—' + '``' + "'" + "•"

In [7]:
tokenset = list(list(filter(lambda token: token.lower() not in stopset and token not in punctset and len(token) > 3, word_tokenize(line)))
              for index, text in data.bodyProc.iteritems() for line in text.splitlines())
tokenset = list(list("".join(l for l in list(filter(lambda c: c not in punctset, token))) for token in doc)
               for doc in tokenset)
bigrams = list(bg for elem in tokenset for bg in list(ngrams(elem,2)))

In [8]:
print(len(tokenset))
print(tokenset[516])

31217
['wait', 'exactly', 'investigation', 'finally', 'confirms', 'best', 'information']


Выведем 5 наиболее часто встречающихся биграмм

In [9]:
from collections import Counter
frequencies = Counter(bigrams)

In [10]:
print(frequencies.most_common(5))

[(('United', 'States'), 325), (('White', 'House'), 239), (('Secretary', 'State'), 147), (('State', 'Department'), 134), (('health', 'care'), 112)]


### Биграммы с помощью NLTK по PMI

In [11]:
import nltk
from nltk.collocations import *
bigram_measures = nltk.collocations.BigramAssocMeasures()
tokens = [token for tokens in tokenset for token in tokens]

In [12]:
word_fd = nltk.FreqDist(tokens)
bigram_fd = nltk.FreqDist(nltk.bigrams(tokens))
finder = BigramCollocationFinder(word_fd, bigram_fd)
sorted(finder.nbest(bigram_measures.raw_freq, 2))

[('United', 'States'), ('White', 'House')]

In [13]:
scored = finder.score_ngrams(bigram_measures.raw_freq)

In [14]:
len(finder.score_ngrams(bigram_measures.raw_freq))

166368

In [15]:
finder.apply_word_filter(lambda w: w.lower() in stopset + [''])

In [16]:
len(finder.score_ngrams(bigram_measures.raw_freq))

166201

In [17]:
sorted(finder.above_score(bigram_measures.raw_freq, 1.0 / len(tuple(nltk.bigrams(tokens)))))

[('101st', 'Airborne'),
 ('10am', 'Call'),
 ('10th', 'Dodd'),
 ('11th', 'attacks'),
 ('11th', 'hour'),
 ('12th', 'asked'),
 ('14th', 'pass'),
 ('15minutes', 'conference'),
 ('1960s', 'accuse'),
 ('1990s', 'Republicans'),
 ('19th', 'Amendment'),
 ('19th', 'century'),
 ('20th', 'century'),
 ('21st', 'century'),
 ('35th', 'anniversary'),
 ('AGNA', 'leadership'),
 ('AGREEMENT', 'SENSITIVE'),
 ('AIPAC', 'Presidents'),
 ('ANCHORS', 'APPROACH'),
 ('APPROACH', 'FARM'),
 ('APPROACH', 'identified'),
 ('ARRIVE', 'Private'),
 ('ARTS', 'LIVING'),
 ('ASEAN', 'China'),
 ('ASEAN', 'plus'),
 ('AUCA', 'president'),
 ('Abbas', 'agree'),
 ('Abdul', 'Ghani'),
 ('Abdulfatah', 'Younis'),
 ('Abedin', 'Huma'),
 ('Abedin', 'Hume'),
 ('Abedin', 'Jake'),
 ('Abedin', 'Kennedy'),
 ('Abedin', 'Lona'),
 ('Abedin', 'Sent'),
 ('Abraham', 'Lincoln'),
 ('Absolute', 'Absolute'),
 ('Abyei', 'understand'),
 ('Academy', 'Sciences'),
 ('Accomplishments', 'Date'),
 ('According', 'Hudson'),
 ('According', 'Politico'),
 ('Accord

In [18]:
sorted(finder.nbest(bigram_measures.raw_freq, 2))

[('United', 'States'), ('White', 'House')]

Получили те же наиболее часто встречающиеся биграммы.

## Кластеризация писем

Сформируем признаки.

In [13]:
texts = list(text.lower().replace('\n', '') for index, text in data.bodyProc.iteritems())
preproc_texts = list(' '.join(elem for elem in list(filter(lambda token: token not in stopset and len(token) > 3, word_tokenize(text))))
                     for text in texts)

In [14]:
preproc_texts = list(''.join(elem for elem in list(filter(lambda c: c not in punctset, list(text)))) for text in preproc_texts)

In [15]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

vectorizer = TfidfVectorizer(max_df=500, min_df=10)
object_feature = vectorizer.fit_transform(preproc_texts)

In [16]:
print(object_feature.shape)

(7945, 2673)


In [19]:
from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=1000, random_state=123)
features = svd.fit_transform(object_feature)

In [15]:
# dbscan
# Latent Dirichlet Allocation?? -> sklearn

In [17]:
from sklearn.metrics.pairwise import cosine_similarity
dist = 1 - cosine_similarity(object_feature)

### Кластеризация

Кластеризуем по токенам и сообщения, и темы

In [19]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

vectorizer = TfidfVectorizer(max_df=0.5, min_df=25)
matrix = vectorizer.fit_transform(data.subjectProc + '\n' + data.bodyProc)

print(matrix.shape)

(7945, 1596)


In [20]:
%%time
from sklearn.cluster.hierarchical import AgglomerativeClustering
from sklearn.cluster import KMeans

N_CLUSTERS = 4
model = AgglomerativeClustering(n_clusters=N_CLUSTERS, affinity='euclidean', linkage='complete')
#model = KMeans(n_clusters=N_CLUSTERS, random_state=1)
data.preds = model.fit_predict(matrix.toarray())

Wall time: 1min 12s


В качестве визуализации можно использовать тему письма, самого близкого к "центру" класса, среднего вектора кластера

In [23]:
import numpy as np
for i in range(0, N_CLUSTERS):
    pd_matrix = pd.DataFrame(matrix.toarray())[data.preds == i]
    m = pd_matrix.mean()
    xy = pd_matrix.apply(lambda x: np.multiply(x,m), axis=1).sum(axis=1)
    xx = pd_matrix.apply(lambda x: x**2, axis=1).sum(axis=1)
    yy = np.sum(m * m)
    print(data.subjectProc[np.arccos(np.divide(xy, np.multiply(xx, yy))).argmin()+1])



USGLC PRAISES SECRETARY CLINTON
RICHARD (TNR)
PUTIN TAGS POLAR BEAR IN ARCTIC
BLANK


In [24]:
print(pd.value_counts(data.preds))

0    6960
1     437
3     434
2     114
dtype: int64
