In [1]:
import pandas as pd
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
import regex as re
from nltk.stem import PorterStemmer
from pandarallel import pandarallel
import itertools
import collections

# Stemmer instance used by tokenize_sentence
stemmer = PorterStemmer()

# Enable Series.parallel_apply; progress bars are switched off
pandarallel.initialize(progress_bar=False)

# Folder holding the serialized (pickled) artifacts
SERIALIZATION_FOLDER = "pickle/"

# File name of the pickled DataFrame inside SERIALIZATION_FOLDER
DF_NAME = "df.pkl"

# Raw data location
DATA_PATH = "data/"
EMAIL_DATA = "Emails.csv"

# Root folder of the ASUM input files written at the end of the notebook
ASUM_PATH = "asum/"

# Emails must have MORE than this many entries in "Tokenized" to be kept
TOKENS_THLD = 15

# Tokens must occur MORE than this many times to enter the vocabulary
FREQ_THLD = 10

INFO: Pandarallel will run on 12 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [2]:
# NOTE(review): unpickling can execute arbitrary code -- only load pickle
# files produced by a trusted step of this project.
df = pd.read_pickle(SERIALIZATION_FOLDER + DF_NAME)

print(f"Length before: {len(df)}")
# Keep only the emails whose token list is longer than TOKENS_THLD.
# `.copy()` turns the filtered selection into an independent frame, so the
# later `df["asum"] = ...` column assignment does not write into a slice of
# the unpickled frame (which pandas reports as SettingWithCopyWarning).
df = df[df["Tokenized"].map(len) > TOKENS_THLD].copy()
print(f"Length after: {len(df)}")
df.head()

Length before: 6737
Length after: 1752


Unnamed: 0_level_0,SenderPersonId,MetadataDateSent,ExtractedSubject,ExtractedBodyText,DateYear,DateMonth,DateDay,ExtractedBodyTextCleaned,Tokenized
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2,,2011-03-03 05:00:00+00:00,,"B6\nThursday, March 3, 2011 9:45 PM\nH: Latest...",2011,3,3,b6\nh: latest how syria is aiding qaddafi and ...,"[h, latest, syria, aiding, qaddafi, sid, hrc, ..."
6,80.0,2012-09-12 04:00:00+00:00,Meet The Right Wing Extremist Behind Anti-Musl...,Pis print.\n-•-...-^\nH < hrod17@clintonernail...,2012,9,12,°russorv@state.gov'\nfrom [meat)\nsent: to: 11...,"[state.gov', meat, sent, subject, meet, right,..."
14,10.0,2011-03-13 05:00:00+00:00,,"Anne-Marie Slaughter\nSunday, March 13, 2011 9...",2011,3,13,"anne-marie slaughter\njacob mills, cheryl d; r...","[anne-marie, slaughter, jacob, mills, cheryl, ..."
15,32.0,2012-09-12 04:00:00+00:00,RE: Not a dry eye in NEA,"_ .....\nFrom Randolph, Lawrence M\nSent: Wedn...",2012,9,12,"_ .....\nfrom randolph, lawrence m\nsent: to: ...","[randolph, lawrence, sent, mills, cheryl, subj..."
16,77.0,2012-09-12 04:00:00+00:00,,I asked to attend your svtc today with Embassy...,2012,9,12,i asked to attend your svtc today with embassy...,"[asked, attend, svtc, today, embassy, tripoli,..."


In [3]:
def tokenize_sentence(sentence, stem=True):
    """Split one sentence into filtered (and optionally stemmed) tokens.

    A token produced by ``word_tokenize`` is kept only if, after stripping
    surrounding whitespace, it:

    * starts with at least one word character,
    * is not a substring of ``string.punctuation`` (e.g. a lone ``_``),
    * is not an English stop word, nor one of ``:``, ``.``, ``@``, ``n't``,
    * contains no digit.

    Args:
        sentence: Text of a single sentence.
        stem: When true (default), every kept token is passed through the
            module-level ``stemmer``.

    Returns:
        List of the surviving tokens, in their original order.
    """
    # A set gives O(1) membership tests instead of scanning the stop-word
    # list once per token.
    stop = set(stopwords.words('english')) | {':', '.', '@', "n't"}

    tokenized = []
    for token in word_tokenize(sentence):
        token = token.strip()
        # Strict regex rule: raw string, so `\w` reaches the regex engine
        # instead of being an invalid Python string escape.
        if not re.match(r'\w+', token):
            continue
        # Remove punctuation (`string.punctuation` is a str: substring test)
        if token in string.punctuation:
            continue
        # Remove stopwords
        if token in stop:
            continue
        # Remove anything containing a digit
        if re.search(r'\d', token):
            continue
        tokenized.append(token)

    if stem:
        tokenized = [stemmer.stem(token) for token in tokenized]
    return tokenized

def preprocess_asum(body):
    """Turn an email body into a list of token lists, one per sentence.

    The body is split with ``sent_tokenize``; each sentence goes through
    ``tokenize_sentence`` (default arguments). Sentences that end up with
    no tokens are left out of the result.
    """
    tokenized_sentences = []
    for raw_sentence in sent_tokenize(body):
        tokens = tokenize_sentence(raw_sentence)
        if tokens != []:
            tokenized_sentences.append(tokens)
    return tokenized_sentences

In [4]:
# Worked example: cleaned body of the email at position 4 of the filtered frame
text = df["ExtractedBodyTextCleaned"].iloc[4]
text

"i asked to attend your svtc today with embassy tripoli, because had first met so many of that staff when i went with\nyou from malta to tripoli for the reopening of our embassy.\ntoday's deaths hit me much harder than i would have guessed. i am always proud to serve under you, but never have\nyour words been more meaningful than on today's svtc. every day of your tenure has been extraordinary, but none\nmore so than today. thank you again for your inspirational leadership and example.\nas ever,\nharold\nu.s. department of state\ncase no. f-2015-04841\ndoc no. c05739571\ndate: 05/13/2015\nstate dept. - produced to house select benghazi comm.\nsubject to agreement on sensitive information & redactions. no foia waiver. state-scb0045269"

In [5]:
# Split the example body into sentences (same tokenizer preprocess_asum uses)
sent_text = sent_tokenize(text)
sent_text

['i asked to attend your svtc today with embassy tripoli, because had first met so many of that staff when i went with\nyou from malta to tripoli for the reopening of our embassy.',
 "today's deaths hit me much harder than i would have guessed.",
 "i am always proud to serve under you, but never have\nyour words been more meaningful than on today's svtc.",
 'every day of your tenure has been extraordinary, but none\nmore so than today.',
 'thank you again for your inspirational leadership and example.',
 'as ever,\nharold\nu.s. department of state\ncase no.',
 'f-2015-04841\ndoc no.',
 'c05739571\ndate: 05/13/2015\nstate dept.',
 '- produced to house select benghazi comm.',
 'subject to agreement on sensitive information & redactions.',
 'no foia waiver.',
 'state-scb0045269']

In [6]:
# Tokenize each example sentence; empty results are NOT dropped here
list(map(tokenize_sentence, sent_text))

[['ask',
  'attend',
  'svtc',
  'today',
  'embassi',
  'tripoli',
  'first',
  'met',
  'mani',
  'staff',
  'went',
  'malta',
  'tripoli',
  'reopen',
  'embassi'],
 ['today', 'death', 'hit', 'much', 'harder', 'would', 'guess'],
 ['alway', 'proud', 'serv', 'never', 'word', 'meaning', 'today', 'svtc'],
 ['everi', 'day', 'tenur', 'extraordinari', 'none', 'today'],
 ['thank', 'inspir', 'leadership', 'exampl'],
 ['ever', 'harold', 'u.s.', 'depart', 'state', 'case'],
 ['doc'],
 ['date', 'state', 'dept'],
 ['produc', 'hous', 'select', 'benghazi', 'comm'],
 ['subject', 'agreement', 'sensit', 'inform', 'redact'],
 ['foia', 'waiver'],
 []]

In [7]:
# Per email: list of sentences, each a list of stemmed tokens
asum_sentences = df["ExtractedBodyTextCleaned"].parallel_apply(preprocess_asum)
df["asum"] = asum_sentences
df.head()

Unnamed: 0_level_0,SenderPersonId,MetadataDateSent,ExtractedSubject,ExtractedBodyText,DateYear,DateMonth,DateDay,ExtractedBodyTextCleaned,Tokenized,asum
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2,,2011-03-03 05:00:00+00:00,,"B6\nThursday, March 3, 2011 9:45 PM\nH: Latest...",2011,3,3,b6\nh: latest how syria is aiding qaddafi and ...,"[h, latest, syria, aiding, qaddafi, sid, hrc, ...","[[h, latest, syria, aid, qaddafi, sid, hrc, me..."
6,80.0,2012-09-12 04:00:00+00:00,Meet The Right Wing Extremist Behind Anti-Musl...,Pis print.\n-•-...-^\nH < hrod17@clintonernail...,2012,9,12,°russorv@state.gov'\nfrom [meat)\nsent: to: 11...,"[state.gov', meat, sent, subject, meet, right,...","[[state.gov', meat, sent, subject, meet, right..."
14,10.0,2011-03-13 05:00:00+00:00,,"Anne-Marie Slaughter\nSunday, March 13, 2011 9...",2011,3,13,"anne-marie slaughter\njacob mills, cheryl d; r...","[anne-marie, slaughter, jacob, mills, cheryl, ...","[[anne-mari, slaughter, jacob, mill, cheryl, r..."
15,32.0,2012-09-12 04:00:00+00:00,RE: Not a dry eye in NEA,"_ .....\nFrom Randolph, Lawrence M\nSent: Wedn...",2012,9,12,"_ .....\nfrom randolph, lawrence m\nsent: to: ...","[randolph, lawrence, sent, mills, cheryl, subj...","[[randolph, lawrenc, sent, mill, cheryl, subje..."
16,77.0,2012-09-12 04:00:00+00:00,,I asked to attend your svtc today with Embassy...,2012,9,12,i asked to attend your svtc today with embassy...,"[asked, attend, svtc, today, embassy, tripoli,...","[[ask, attend, svtc, today, embassi, tripoli, ..."


In [8]:
# Flatten emails -> sentences -> tokens into one flat token list
flatten = [
    token
    for sentences in df["asum"]
    for sentence in sentences
    for token in sentence
]
# Occurrence count of every distinct token across all kept emails
tokens_freq = collections.Counter(flatten)
print("Describe tokens frequency")
frequency_values = pd.Series(tokens_freq.values())
frequency_values.describe()

Describe tokens frequency


count    20088.000000
mean        13.966697
std         57.815385
min          1.000000
25%          1.000000
50%          2.000000
75%          6.000000
max       2122.000000
dtype: float64

In [9]:
# Vocabulary candidates: tokens seen strictly more than FREQ_THLD times
top_tokens = [
    word
    for word, count in tokens_freq.items()
    if count > FREQ_THLD
]
print(f"After thld on frequency: {len(top_tokens)}")

After thld on frequency: 3419


In [10]:
# Rebind top_tokens from a list to a dict {token: index}, where index is the
# token's 0-based position in sorted order. The dict keeps that sorted
# insertion order, so iterating it yields tokens in index order.
top_tokens = {token: index for index, token in enumerate(sorted(top_tokens))}

In [11]:
def resolve_asum(sentences):
    """Replace the tokens of one email with their vocabulary indices.

    Args:
        sentences: List of sentences, each a list of tokens.

    Returns:
        List of sentences, each a list of the values found in the
        module-level ``top_tokens`` mapping for the tokens of that sentence.
        Tokens missing from ``top_tokens`` are dropped, and sentences left
        without any index are dropped as well.
    """
    resolved = []
    for sentence in sentences:
        # Compare against None explicitly: index 0 is a valid entry and
        # must not be filtered out by a truthiness test.
        indices = [
            index
            for index in map(top_tokens.get, sentence)
            if index is not None
        ]
        # Skip sentences with no in-vocabulary token
        if indices:
            resolved.append(indices)
    return resolved

In [15]:
# Per email: sentences expressed as lists of vocabulary indices
asum_column = df["asum"]
final_series = asum_column.parallel_apply(resolve_asum)
final_series


Id
2       [[1367, 1737, 3016, 89, 2447, 2795, 1462, 1919...
6       [[2915, 2743, 2969, 1915, 2628, 3363, 1110, 29...
14      [[149, 2824, 1618, 1950, 501, 6, 2281, 1787, 1...
15      [[2743, 1950, 501, 2969, 1514, 1955], [2567, 2...
16      [[212, 232, 3110, 983, 3163, 1185, 1930, 1864,...
                              ...                        
7934    [[955, 172, 343, 2136, 2379, 1912, 2414, 1514,...
7938    [[763, 1027, 2329, 1733, 2063, 3276, 1761, 144...
7939    [[2860, 481, 2728, 3306, 1404, 846, 1340, 2610...
7942    [[320, 482, 2291, 2738], [2738, 2550, 151, 182...
7944    [[2445, 2297, 1774, 1701, 1413, 798, 1385, 229...
Name: asum, Length: 1752, dtype: object

In [17]:
# Wordlist: one vocabulary token per line, in the iteration order of
# top_tokens
with open(ASUM_PATH + 'in/WordList.txt', "w+") as f:
    f.writelines(word + "\n" for word in top_tokens)

In [20]:
# Bag of sentences: for every email, one line with its number of sentences,
# followed by one line per sentence holding space-separated word indices.
# The file name previously carried a doubled ".txt.txt" extension.
# NOTE(review): confirm "BagOfSentences.txt" is the name the ASUM tool reads.
with open(ASUM_PATH + 'in/BagOfSentences.txt', "w+") as f:
    for sentences in final_series:
        f.write(str(len(sentences)) + "\n")
        for sentence in sentences:
            f.write(' '.join(map(str, sentence)) + '\n')

In [13]:
# Reference command line, kept as a plain string (it is only displayed, never
# executed by this notebook). Presumably the invocation of the external ASUM
# Java tool -- TODO confirm the flags against its documentation.
# Raw string: the shell escape `\ ` would otherwise be an invalid Python
# string escape sequence; the resulting value is unchanged.
r'''
java sto2.STO2Core -s 2 -t 30 -i 1000 -th 3 -a 0.1 -b 0.001/0.1/0 -g 1/1 -d ../../Test\ data/ -o /tmp/ciao
'''

'\njava sto2.STO2Core -s 2 -t 30 -i 1000 -th 3 -a 0.1 -b 0.001/0.1/0 -g 1/1 -d ../../Test\\ data/ -o /tmp/ciao\n'