In [1]:
import pandas as pd

In [2]:
aliases = pd.read_csv('data/Aliases.csv')
emailreceivers = pd.read_csv('data/EmailReceivers.csv')
emails = pd.read_csv('data/Emails.csv')
persons = pd.read_csv('data/Persons.csv')

In [3]:
aliases.head(2)

Unnamed: 0,Id,Alias,PersonId
0,1,111th congress,1
1,2,agna usemb kabul afghanistan,2


In [4]:
emailreceivers.head(2)

Unnamed: 0,Id,EmailId,PersonId
0,1,1,80
1,2,2,80


In [5]:
persons.head(2)

Unnamed: 0,Id,Name
0,1,111th Congress
1,2,AGNA USEMB Kabul Afghanistan


In [6]:
emails.head(2)

Unnamed: 0,Id,DocNumber,MetadataSubject,MetadataTo,MetadataFrom,SenderPersonId,MetadataDateSent,MetadataDateReleased,MetadataPdfLink,MetadataCaseNumber,...,ExtractedTo,ExtractedFrom,ExtractedCc,ExtractedDateSent,ExtractedCaseNumber,ExtractedDocNumber,ExtractedDateReleased,ExtractedReleaseInPartOrFull,ExtractedBodyText,RawText
0,1,C05739545,WOW,H,"Sullivan, Jacob J",87.0,2012-09-12T04:00:00+00:00,2015-05-22T04:00:00+00:00,DOCUMENTS/HRC_Email_1_296/HRCH2/DOC_0C05739545...,F-2015-04841,...,,"Sullivan, Jacob J <Sullivan11@state.gov>",,"Wednesday, September 12, 2012 10:16 AM",F-2015-04841,C05739545,05/13/2015,RELEASE IN FULL,,UNCLASSIFIED\nU.S. Department of State\nCase N...
1,2,C05739546,H: LATEST: HOW SYRIA IS AIDING QADDAFI AND MOR...,H,,,2011-03-03T05:00:00+00:00,2015-05-22T04:00:00+00:00,DOCUMENTS/HRC_Email_1_296/HRCH1/DOC_0C05739546...,F-2015-04841,...,,,,,F-2015-04841,C05739546,05/13/2015,RELEASE IN PART,"B6\nThursday, March 3, 2011 9:45 PM\nH: Latest...",UNCLASSIFIED\nU.S. Department of State\nCase N...


In [7]:
emails.columns

Index(['Id', 'DocNumber', 'MetadataSubject', 'MetadataTo', 'MetadataFrom',
       'SenderPersonId', 'MetadataDateSent', 'MetadataDateReleased',
       'MetadataPdfLink', 'MetadataCaseNumber', 'MetadataDocumentClass',
       'ExtractedSubject', 'ExtractedTo', 'ExtractedFrom', 'ExtractedCc',
       'ExtractedDateSent', 'ExtractedCaseNumber', 'ExtractedDocNumber',
       'ExtractedDateReleased', 'ExtractedReleaseInPartOrFull',
       'ExtractedBodyText', 'RawText'],
      dtype='object')

In [25]:
data = pd.DataFrame(emails[['MetadataSubject', 'ExtractedBodyText']])
data.columns = ['subject', 'text']
data.text.fillna('', inplace=True)

In [26]:
data.head()

Unnamed: 0,subject,text
0,WOW,
1,H: LATEST: HOW SYRIA IS AIDING QADDAFI AND MOR...,"B6\nThursday, March 3, 2011 9:45 PM\nH: Latest..."
2,CHRIS STEVENS,Thx
3,CAIRO CONDEMNATION - FINAL,
4,H: LATEST: HOW SYRIA IS AIDING QADDAFI AND MOR...,"H <hrod17@clintonemail.com>\nFriday, March 11,..."


In [23]:
print(len(data))

7945


In [27]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

vectorizer = TfidfVectorizer(max_df=500, min_df=10)
object_feature = vectorizer.fit_transform(data.text)

In [28]:
print(object_feature.shape)

(7945, 4299)


## Аггломеративная кластеризация (neighbour joining)

In [29]:
from sklearn.cluster.hierarchical import AgglomerativeClustering

model = AgglomerativeClustering(n_clusters=3, affinity='euclidean', linkage='complete')
preds = model.fit_predict(object_feature.toarray())

In [30]:
print(list(preds))

[0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 2, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [31]:
vectorizer.get_feature_names()

['00',
 '000',
 '00am',
 '00pm',
 '01',
 '02',
 '03',
 '04',
 '04841',
 '05',
 '06',
 '07',
 '08',
 '09',
 '10',
 '100',
 '10am',
 '11',
 '11th',
 '12',
 '13',
 '14',
 '14th',
 '15',
 '150',
 '15am',
 '15pm',
 '16',
 '17',
 '1709',
 '18',
 '19',
 '1967',
 '1973',
 '1979',
 '1990',
 '1990s',
 '1993',
 '1994',
 '1995',
 '1996',
 '1997',
 '1998',
 '1999',
 '19th',
 '1pm',
 '1st',
 '20',
 '200',
 '2000',
 '2001',
 '2002',
 '2003',
 '20036',
 '2004',
 '2005',
 '2006',
 '2007',
 '2008',
 '2011',
 '2012',
 '2014',
 '2015',
 '202',
 '2024',
 '2025',
 '2035',
 '20439',
 '20520',
 '21',
 '21st',
 '22',
 '2201',
 '23',
 '24',
 '25',
 '250',
 '26',
 '27',
 '28',
 '29',
 '2972',
 '2pm',
 '30',
 '300',
 '30am',
 '30pm',
 '31',
 '32',
 '33',
 '34',
 '35',
 '36',
 '37',
 '38',
 '39',
 '40',
 '41',
 '42',
 '43',
 '44',
 '45',
 '45pm',
 '46',
 '47',
 '48',
 '49',
 '4pm',
 '50',
 '500',
 '51',
 '52',
 '53',
 '54',
 '55',
 '5548',
 '56',
 '57',
 '58',
 '59',
 '60',
 '600',
 '62',
 '63',
 '647',
 '65',
 '6

## KMeans

In [33]:
from sklearn.cluster import KMeans

model = KMeans(n_clusters=3, random_state=1)
preds = model.fit_predict(object_feature.toarray())
print(preds[:30])

[0 0 0 0 1 1 0 1 0 1 1 1 0 0 0 0 0 0 0 1 1 1 0 0 0 1 0 0 0 0]


In [34]:
## target value??


## SVD + KMeans

In [35]:
from sklearn.decomposition import TruncatedSVD

model = KMeans(n_clusters=3, random_state=42)
svd = TruncatedSVD(n_components=1000, random_state=123)
features = svd.fit_transform(object_feature)
preds = model.fit_predict(features)
print(preds)

[1 1 1 ..., 1 1 1]


Предобработка

In [26]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [28]:
ps = PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [None]:
for elem in word_tokenize(str(texts[1])):
    print(elem, ps.stem(elem))

In [None]:
X = []
for elem in texts:
    X.append(word_tokenize(elem))
    for j in range(len(X[i])):
        X[i][j] = PorterStemmer().stem(X[i][j])
    X[i] = " ".join(X[i])