In [1]:
import pandas as pd

## Загрузка данных

In [22]:
aliases = pd.read_csv('data/Aliases.csv')
emailreceivers = pd.read_csv('data/EmailReceivers.csv')
emails = pd.read_csv('data/Emails.csv')
persons = pd.read_csv('data/Persons.csv')

In [7]:
emails.columns

Index(['Id', 'DocNumber', 'MetadataSubject', 'MetadataTo', 'MetadataFrom',
       'SenderPersonId', 'MetadataDateSent', 'MetadataDateReleased',
       'MetadataPdfLink', 'MetadataCaseNumber', 'MetadataDocumentClass',
       'ExtractedSubject', 'ExtractedTo', 'ExtractedFrom', 'ExtractedCc',
       'ExtractedDateSent', 'ExtractedCaseNumber', 'ExtractedDocNumber',
       'ExtractedDateReleased', 'ExtractedReleaseInPartOrFull',
       'ExtractedBodyText', 'RawText'],
      dtype='object')

In [138]:
data = pd.DataFrame(emails[['MetadataSubject', 'ExtractedBodyText']])
data.columns = ['subject', 'body']
data.fillna('', inplace=True)

In [139]:
data.head(20)

Unnamed: 0,subject,body
0,WOW,
1,H: LATEST: HOW SYRIA IS AIDING QADDAFI AND MOR...,"B6\nThursday, March 3, 2011 9:45 PM\nH: Latest..."
2,CHRIS STEVENS,Thx
3,CAIRO CONDEMNATION - FINAL,
4,H: LATEST: HOW SYRIA IS AIDING QADDAFI AND MOR...,"H <hrod17@clintonemail.com>\nFriday, March 11,..."
5,MEET THE RIGHT-WING EXTREMIST BEHIND ANTI-MUSL...,Pis print.\n-•-...-^\nH < hrod17@clintonernail...
6,"ANTI-MUSLIM FILM DIRECTOR IN HIDING, FOLLOWING...",
7,H: LATEST: HOW SYRIA IS AIDING QADDAFI AND MOR...,"H <hrod17@clintonemail.corn>\nFriday, March 11..."
8,SECRETARY'S REMARKS,FYI
9,MORE ON LIBYA,"B6\nWednesday, September 12, 2012 6:16 PM\nFwd..."


## Предобработка и анализ

Предобработаем сперва темы писем. В них содержатся шумовые токены вроде "FWD:" или "Re:". Избавимся от них с помощью регулярных выражений.
Затем предобработаем тексты писем, в которых также содержатся шумовые строки, начинающиеся с подстрок "Re:" или "H:". Стоит также удалить шумовые тэги, такие как "From:", штампы "U.S. Department...", отметки времени и/или даты. Также удалим строки с номерами документов вроде "Case No" и строки с числами.

In [148]:
import re
def is_date(string):
    try:
        parse(string)
        return True
    except:
        return False
    
def has_numbers(string):
    return bool(re.search(r'\d', string))

def process_body(elem):
    lines = elem.splitlines()
    trashpattern = re.compile(r"^(Doc No\\..*|Case No\\.|H <.*@.*>|.*@.*|From .|To:.|For:.*|Subject:.|U.S. Department of State$)")
    datepattern = re.compile(r"Date: \d{2}/\d{2}/\d{4}")
    timepattern = re.compile(r".*\d{1,2}:(\d{2}\s(?:AM|PM))")
    keypattern = re.compile(r"\s*(Fwd|Fw|Fvv|FVV|FW|Re|re|RE|H):\s*")
    numpattern = re.compile(r".*(\d*-\d*|\d+\b).*")
    lines = list(filter(lambda line: not datepattern.match(line) and not trashpattern.match(line) and not keypattern.match(line)
            and not timepattern.match(line) and not numpattern.match(line),lines))
    return '\n'.join(lines)

def process_subject(elem):
    if elem == '':
        return elem
    keypattern = r's*(Fwd|Fw|Fvv|FVV|FW|Re|re|RE|H):\s*'
    subject = re.sub(keypattern, "", elem)
    subject = re.sub(keypattern, "", elem)
    return subject

data['subjectProc'] = data.subject.apply(process_subject)
data['bodyProc'] = data.body.apply(process_body)

In [141]:
data.head(20)

Unnamed: 0,subject,body,subjectProc,bodyProc
0,WOW,,WOW,
1,H: LATEST: HOW SYRIA IS AIDING QADDAFI AND MOR...,"B6\nThursday, March 3, 2011 9:45 PM\nH: Latest...",LATEST: HOW SYRIA IS AIDING QADDAFI AND MORE.....,
2,CHRIS STEVENS,Thx,CHRIS STEVENS,Thx
3,CAIRO CONDEMNATION - FINAL,,CAIRO CONDEMNATION - FINAL,
4,H: LATEST: HOW SYRIA IS AIDING QADDAFI AND MOR...,"H <hrod17@clintonemail.com>\nFriday, March 11,...",LATEST: HOW SYRIA IS AIDING QADDAFI AND MORE.....,Huma Abedin\nPis print.
5,MEET THE RIGHT-WING EXTREMIST BEHIND ANTI-MUSL...,Pis print.\n-•-...-^\nH < hrod17@clintonernail...,MEET THE RIGHT-WING EXTREMIST BEHIND ANTI-MUSL...,Pis print.\nSent from my Verizon Wireless 4G L...
6,"ANTI-MUSLIM FILM DIRECTOR IN HIDING, FOLLOWING...",,"ANTI-MUSLIM FILM DIRECTOR IN HIDING, FOLLOWING...",
7,H: LATEST: HOW SYRIA IS AIDING QADDAFI AND MOR...,"H <hrod17@clintonemail.corn>\nFriday, March 11...",LATEST: HOW SYRIA IS AIDING QADDAFI AND MORE.....,Huma Abedin\nPis print.
8,SECRETARY'S REMARKS,FYI,SECRETARY'S REMARKS,FYI
9,MORE ON LIBYA,"B6\nWednesday, September 12, 2012 6:16 PM\nFwd...",MORE ON LIBYA,Sending direct. Just in.\nSent from my Verizon...


In [142]:
print(len(data))

7945


### Биграммы из текста

In [537]:
import nltk
from nltk import word_tokenize
from nltk.util import ngrams
from nltk.corpus import stopwords
import string
stopset = list(stopwords.words('english')) + ["'s"] + ["n't"] + ['']
punctset = string.punctuation + '—' + '`' + "'" + "•"

In [538]:
tokenset = list(list(filter(lambda token: token.lower() not in stopset and token not in punctset, word_tokenize(line)))
              for index, text in data.bodyProc.iteritems() for line in text.splitlines())
tokenset = list(list("".join(l for l in list(filter(lambda c: c not in punctset, token))) for token in doc)
               for doc in tokenset)
bigrams = list(bg for elem in tokenset for bg in list(ngrams(elem,2)))

In [539]:
print(len(tokenset))
print(tokenset[55])

35448
['retaken', 'least', 'besieged', 'Benghazi', 'opposition', 'stronghold']


Выведем 5 наиболее часто встречающихся биграмм

In [484]:
from collections import Counter
frequencies = Counter(bigrams)

In [485]:
print(frequencies.most_common(5))

[(('United', 'States'), 325), (('White', 'House'), 239), (('Secretary', 'State'), 147), (('New', 'York'), 140), (('State', 'Department'), 134)]


### Биграммы с помощью NLTK по PMI

In [487]:
import nltk
from nltk.collocations import *
bigram_measures = nltk.collocations.BigramAssocMeasures()
tokens = [token for tokens in tokenset for token in tokens]

In [488]:
scored = finder.score_ngrams(bigram_measures.raw_freq)

In [489]:
word_fd = nltk.FreqDist(tokens)
bigram_fd = nltk.FreqDist(nltk.bigrams(tokens))
finder = BigramCollocationFinder(word_fd, bigram_fd)
sorted(finder.nbest(bigram_measures.raw_freq, 2))

[('United', 'States'), ('White', 'House')]

In [490]:
len(finder.score_ngrams(bigram_measures.raw_freq))

183003

In [491]:
finder.apply_word_filter(lambda w: w.lower() in stopset + [''])

In [492]:
len(finder.score_ngrams(bigram_measures.raw_freq))

180591

In [None]:
sorted(finder.above_score(bigram_measures.raw_freq, 1.0 / len(tuple(nltk.bigrams(tokens)))))

In [494]:
sorted(finder.nbest(bigram_measures.raw_freq, 2))

[('United', 'States'), ('White', 'House')]

Получили те же наиболее часто встречающиеся биграммы.

In [None]:
for index, text in data.bodyProc.iteritems() for line in text.splitlines()

## Кластеризация писем

Сформируем признаки.

In [551]:
texts = list(text.lower().replace('\n', '') for index, text in data.bodyProc.iteritems())
preproc_texts = list(' '.join(elem for elem in list(filter(lambda token: token not in stopset, word_tokenize(text))))
                     for text in texts)


In [555]:
preproc_texts = list(''.join(elem for elem in list(filter(lambda c: c not in punctset, list(text)))) for text in preproc_texts)

In [556]:
texts[55]

'more info."security at risk," etc.according to politico yesterday, there was an internal argument within the romney campaign overlibya. obviously, the neocons and the rove oriented faction (ed gillespie, rove\'s surrogate is now aromney campaign adviser) beat stuart stevens.romney is careening from tactic to tactic, hoping something will catch.i wouldn\'t be surprised to see romney raise the specter of the muslim brotherhood in charge of egypt& libya as a bogeyman. i have heard him do this within the last week.'

In [557]:
preproc_texts[55]

'info   security risk   etcaccording politico yesterday  internal argument within romney campaign overlibya  obviously  neocons rove oriented faction  ed gillespie  rove surrogate aromney campaign adviser  beat stuart stevensromney careening tactic tactic  hoping something catchi would surprised see romney raise specter muslim brotherhood charge egypt  libya bogeyman  heard within last week '

In [None]:
preproc = list((text.replace(c,'') for c in punctset) for text in text)

In [None]:
list(stopset) + list(punctset)

In [554]:
res = list(texts[55].lower())
# for c in list(punctset):
#     res = res.replace(c,'')
print(res)
#print(set(res.split(' ')) - set(stopset))

['m', 'o', 'r', 'e', ' ', 'i', 'n', 'f', 'o', '.', '"', 's', 'e', 'c', 'u', 'r', 'i', 't', 'y', ' ', 'a', 't', ' ', 'r', 'i', 's', 'k', ',', '"', ' ', 'e', 't', 'c', '.', 'a', 'c', 'c', 'o', 'r', 'd', 'i', 'n', 'g', ' ', 't', 'o', ' ', 'p', 'o', 'l', 'i', 't', 'i', 'c', 'o', ' ', 'y', 'e', 's', 't', 'e', 'r', 'd', 'a', 'y', ',', ' ', 't', 'h', 'e', 'r', 'e', ' ', 'w', 'a', 's', ' ', 'a', 'n', ' ', 'i', 'n', 't', 'e', 'r', 'n', 'a', 'l', ' ', 'a', 'r', 'g', 'u', 'm', 'e', 'n', 't', ' ', 'w', 'i', 't', 'h', 'i', 'n', ' ', 't', 'h', 'e', ' ', 'r', 'o', 'm', 'n', 'e', 'y', ' ', 'c', 'a', 'm', 'p', 'a', 'i', 'g', 'n', ' ', 'o', 'v', 'e', 'r', 'l', 'i', 'b', 'y', 'a', '.', ' ', 'o', 'b', 'v', 'i', 'o', 'u', 's', 'l', 'y', ',', ' ', 't', 'h', 'e', ' ', 'n', 'e', 'o', 'c', 'o', 'n', 's', ' ', 'a', 'n', 'd', ' ', 't', 'h', 'e', ' ', 'r', 'o', 'v', 'e', ' ', 'o', 'r', 'i', 'e', 'n', 't', 'e', 'd', ' ', 'f', 'a', 'c', 't', 'i', 'o', 'n', ' ', '(', 'e', 'd', ' ', 'g', 'i', 'l', 'l', 'e', 's', 'p',

In [514]:
print(list(texts[55]))

['info', 'security risk etc', 'According Politico yesterday internal argument within Romney campaign', 'Libya Obviously neocons Rove oriented faction Ed Gillespie Rove surrogate', 'Romney campaign adviser beat Stuart Stevens', 'Romney careening tactic tactic hoping something catch', 'would surprised see Romney raise specter Muslim Brotherhood charge Egypt', 'Libya bogeyman heard within last week']


In [495]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

vectorizer = TfidfVectorizer(max_df=500, min_df=10)
object_feature = vectorizer.fit_transform(data.bodyProc)

In [496]:
print(object_feature.shape)

(7945, 3267)


In [None]:
# dbscan
# Latent Dirichlet Allocation?? -> sklearn

### Аггломеративная кластеризация (neighbour joining)

In [145]:
from sklearn.cluster.hierarchical import AgglomerativeClustering

model = AgglomerativeClustering(n_clusters=3, affinity='euclidean', linkage='complete')
preds = model.fit_predict(object_feature.toarray())

In [146]:
print(list(preds))

[1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 2, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 2, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 2, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 2, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 2, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 2, 1, 1, 0, 1, 0, 0, 0, 2, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 

In [147]:
vectorizer.get_feature_names()

['10am',
 '1990s',
 '1pm',
 '21st',
 '7pm',
 '8th',
 'abandon',
 'abandoned',
 'abbas',
 'abedin',
 'ability',
 'able',
 'above',
 'abroad',
 'absence',
 'absolutely',
 'abu',
 'abuse',
 'academic',
 'accept',
 'accepted',
 'access',
 'accomplish',
 'accord',
 'according',
 'accordingly',
 'account',
 'accountability',
 'accountable',
 'accounts',
 'accused',
 'achieve',
 'achieved',
 'achieving',
 'acknowledge',
 'acknowledged',
 'across',
 'act',
 'acting',
 'action',
 'actions',
 'active',
 'actively',
 'activist',
 'activists',
 'activities',
 'activity',
 'actors',
 'acts',
 'actual',
 'actually',
 'ad',
 'add',
 'added',
 'adding',
 'addition',
 'additional',
 'address',
 'addressed',
 'addresses',
 'addressing',
 'adds',
 'administration',
 'admit',
 'admitted',
 'adopted',
 'advance',
 'advantage',
 'advice',
 'advise',
 'advised',
 'adviser',
 'advisers',
 'advisor',
 'advocacy',
 'advocate',
 'advocates',
 'af',
 'affairs',
 'affect',
 'afghan',
 'afghanistan',
 'africa',
 'a

## KMeans

In [33]:
from sklearn.cluster import KMeans

model = KMeans(n_clusters=3, random_state=1)
preds = model.fit_predict(object_feature.toarray())
print(preds[:30])

[0 0 0 0 1 1 0 1 0 1 1 1 0 0 0 0 0 0 0 1 1 1 0 0 0 1 0 0 0 0]


In [34]:
## target value??


## SVD + KMeans

In [35]:
from sklearn.decomposition import TruncatedSVD

model = KMeans(n_clusters=3, random_state=42)
svd = TruncatedSVD(n_components=1000, random_state=123)
features = svd.fit_transform(object_feature)
preds = model.fit_predict(features)
print(preds)

[1 1 1 ..., 1 1 1]


Предобработка

In [26]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [28]:
ps = PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [None]:
for elem in word_tokenize(str(texts[1])):
    print(elem, ps.stem(elem))

In [None]:
X = []
for elem in texts:
    X.append(word_tokenize(elem))
    for j in range(len(X[i])):
        X[i][j] = PorterStemmer().stem(X[i][j])
    X[i] = " ".join(X[i])