## - Combine all data 

In [13]:
import pandas as pd
from os import listdir

path = '../data/'
files = listdir('../data/')
df = pd.DataFrame(columns=["url", "query", "text"])

for f in files:
    temp = pd.read_csv(path + f)
    if 'article-name' in temp.columns:
        temp.rename(columns={'article-name':'name','article-url':'url','content':'text','keyword':'query'}, inplace=True)
    if len(temp) < 1:
        continue
    df = df.append(temp)
df.drop(['Unnamed: 0', 'name'], inplace=True, axis=1)

## - data preprocessing
    1. stop word removal
    2. lower case letters
    3. non ascii character removal

In [14]:
from nltk.corpus import stopwords
import re
stop = stopwords.words('english')

def normalize_text(text):
    norm_text = text.lower()
    # Replace breaks with spaces
    norm_text = norm_text.replace('<br />', ' ')
    # Pad punctuation with spaces on both sides
    norm_text = re.sub(r"([\.\",\(\)!\?;:])", " \\1 ", norm_text)
    return norm_text

def remove_stop_words(text):
    return " ".join([item.lower() for item in text.split() if item not in stop])

def remove_non_ascii(text):
    return ''.join(["" if ord(i) < 32 or ord(i) > 126 else i for i in text])

df['text'] = df['text'].apply(remove_non_ascii)
df['text'] = df['text'].apply(normalize_text)
df['text'] = df['text'].apply(remove_stop_words)
df["text"] = df['text'].str.replace('[^\w\s]','')

## - a simple word2vec model
    In this section we apply simple word to vec model to tokenized data.

In [15]:
from gensim.models import Word2Vec
from nltk import word_tokenize

In [16]:
df['tokenized_text'] = df.apply(lambda row: word_tokenize(row['text']), axis=1)

In [17]:
model = Word2Vec(df['tokenized_text'], size=100)

In [18]:
for num in [1, 3, 5, 10, 12, 16, 17, 18, 19, 28, 29, 30, 32, 33, 34, 37, 38]:
    term = "apt%s"%str(num)
    if term in model.wv.vocab:
        print("Most similar words for %s"%term)
        for t in model.most_similar(term): print(t)
        print('\n')

Most similar words for apt1
('mandiant', 0.9992831349372864)
('according', 0.9988211989402771)
('china', 0.9986724257469177)
('defense', 0.9986507892608643)
('kaspersky', 0.9986412525177002)
('iranian', 0.9985784888267517)
('military', 0.9983772039413452)
('lab', 0.9978839159011841)
('detected', 0.997614860534668)
('published', 0.997364342212677)


Most similar words for apt3
('strontium', 0.9977763891220093)
('cozy', 0.9963721036911011)
('tracked', 0.9958826899528503)
('team', 0.994817852973938)
('also', 0.9941498041152954)
('menupass', 0.9935141205787659)
('linked', 0.9934953451156616)
('axiom', 0.9930843114852905)
('chinalinked', 0.9929003715515137)
('behind', 0.9923593997955322)


Most similar words for apt10
('apt37', 0.9996817111968994)
('sophisticated', 0.9994451403617859)
('naikon', 0.9994421601295471)
('overlap', 0.999294638633728)
('entities', 0.9992740154266357)
('micro', 0.9989956021308899)
('noticed', 0.9988883137702942)
('tracks', 0.9988324642181396)
('primarily', 0.99880

  """


### here we got one interesting result for apt17 as apt28
    but for all other word2vec results we observe that we are getting names like malware, attackers, groups, backdoor in the most similar items.  
    It might be the case that the names of attacker groups are ommited because they are phrases instead simple words.

## - word2vec with bigram phrases
    here we try to find bigram phrases from the dataset and apply word2vec model to it

In [19]:
from gensim.models import Phrases
from collections import Counter

In [20]:
bigram = Phrases()

In [21]:
bigram.add_vocab(df['tokenized_text'])

In [22]:
bigram_counter = Counter()
for key in bigram.vocab.keys():
    if len(key.split("_")) > 1:
        bigram_counter[key] += bigram.vocab[key]

for key, counts in bigram_counter.most_common(20):
    print '{0: <20} {1}'.format(key.encode("utf-8"), counts)

cyber_security       353
security_conference  334
ics_cyber            334
document_getelementsbytagname 163
comjsplusone_js      163
conference_singapore 163
google_comjsplusone  163
script_0             163
ciso_forum           163
forum_half           163
document_createelement 163
po_src               163
apis_google          163
textjavascript_po    163
type_textjavascript  163
po_async             163
var_po               163
parentnode_insertbefore 163
async_true           163
po_type              163


In [23]:
bigram_model = Word2Vec(bigram[df['tokenized_text']], size=100)

In [24]:
for num in [1, 3, 5, 10, 12, 16, 17, 18, 19, 28, 29, 30, 32, 33, 34, 37, 38]:
    term = "apt%s"%str(num)
    if term in bigram_model.wv.vocab:
        print("Most similar words for %s"%term)
        for t in bigram_model.most_similar(term): print(t)
        print('\n')

Most similar words for apt1
(u'different', 0.99991774559021)
(u'likely', 0.9999154806137085)
(u'well', 0.9999152421951294)
(u'says', 0.9999047517776489)
(u'multiple', 0.9999043941497803)
(u'threat_actors', 0.9998949766159058)
(u'network', 0.9998934268951416)
(u'according', 0.9998912811279297)
(u'compromised', 0.9998894929885864)
(u'related', 0.999876856803894)


Most similar words for apt3
(u'actor', 0.9998462796211243)
(u'described', 0.9998243451118469)
(u'also_known', 0.9998069405555725)
(u'actors', 0.9997928738594055)
(u'recently', 0.9997922778129578)
(u'experts', 0.999782919883728)
(u'apt29', 0.9997620582580566)
(u'identified', 0.9997564554214478)
(u'two', 0.9997557401657104)
(u'domains', 0.9997459650039673)


Most similar words for apt10
(u'time', 0.999898374080658)
(u'analysis', 0.9998810291290283)
(u'u', 0.9998781681060791)
(u'version', 0.9998765587806702)
(u'based', 0.9998717308044434)
(u'provided', 0.9998701810836792)
(u'least', 0.9998694658279419)
(u'mandiant', 0.999866664409

  """


### After applying bigram phrases still we cannot see the desired results. 

## Word2Vec model topic by topic using bigram phrases

In [25]:
df_doc = df[['query', 'text']]

In [26]:
df_doc

Unnamed: 0,query,text
0,APT30,today fireeye released report threat group cal...
1,APT30,apt30 mechanics behind decade long cyber espi...
0,APT29,sophisticated threat actor cozy bear initially...
0,APT38,today releasing details threat group believe ...
0,APT12,2202 votessymantec official blogyet another ze...
0,APT15,apt attackers target japanese word processor z...
1,APT15,chinalinked spies used new malware u k gover...
2,APT15,chinalinked spies used new malware u k gover...
0,APT34,less week microsoft issued patch cve201711882 ...
0,APT35,iranian hackers impersonate israeli security f...


In [27]:
df_doc = df_doc.groupby(['query'],as_index=False).first()

In [28]:
df_doc

Unnamed: 0,query,text
0,APT1,download kaspersky security bulletin threat p...
1,APT10,chinalinked apt10 hackers update attack techni...
2,APT12,2202 votessymantec official blogyet another ze...
3,APT15,apt attackers target japanese word processor z...
4,APT16,microsoft office flaw exploited several apt ac...
5,APT17,apt17 hiding plain sight fireeye microsoft e...
6,APT18,chinese cyberspies target european drone maker...
7,APT27,chinese cyberspies target european drone maker...
8,APT28,nationstate adversary group known fancy bear o...
9,APT29,sophisticated threat actor cozy bear initially...


In [31]:
from nltk.corpus import stopwords
import re
stop = stopwords.words('english') + ['fireeye', 'crowdstrike', 'symantec', 'rapid7', 'securityweek', 'kaspersky']

def normalize_text(text):
    norm_text = text.lower()
    # Replace breaks with spaces
    norm_text = norm_text.replace('<br />', ' ')
    # Pad punctuation with spaces on both sides
    norm_text = re.sub(r"([\.\",\(\)!\?;:])", " \\1 ", norm_text)
    return norm_text

def remove_stop_words(text):
    return " ".join([item.lower() for item in text.split() if item not in stop])

def remove_non_ascii(text):
    return ''.join(["" if ord(i) < 32 or ord(i) > 126 else i for i in text])

df_doc['text'] = df_doc['text'].apply(remove_non_ascii)
df_doc['text'] = df_doc['text'].apply(normalize_text)
df_doc['text'] = df_doc['text'].apply(remove_stop_words)
df_doc["text"] = df_doc['text'].str.replace('[^\w\s]','')

In [32]:
df_doc

Unnamed: 0,query,text
0,APT1,download security bulletin threat predictions ...
1,APT10,chinalinked apt10 hackers update attack techni...
2,APT12,2202 votessymantec official blogyet another ze...
3,APT15,apt attackers target japanese word processor z...
4,APT16,microsoft office flaw exploited several apt ac...
5,APT17,apt17 hiding plain sight microsoft expose obfu...
6,APT18,chinese cyberspies target european drone maker...
7,APT27,chinese cyberspies target european drone maker...
8,APT28,nationstate adversary group known fancy bear o...
9,APT29,sophisticated threat actor cozy bear initially...


In [33]:
df_doc['tokenized_text'] = df_doc.apply(lambda row: word_tokenize(row['text']), axis=1)

In [70]:
df_doc

Unnamed: 0,query,text,tokenized_text
0,APT1,download security bulletin threat predictions ...,"[download, security, bulletin, threat, predict..."
1,APT10,chinalinked apt10 hackers update attack techni...,"[chinalinked, apt10, hackers, update, attack, ..."
2,APT12,2202 votessymantec official blogyet another ze...,"[2202, votessymantec, official, blogyet, anoth..."
3,APT15,apt attackers target japanese word processor z...,"[apt, attackers, target, japanese, word, proce..."
4,APT16,microsoft office flaw exploited several apt ac...,"[microsoft, office, flaw, exploited, several, ..."
5,APT17,apt17 hiding plain sight microsoft expose obfu...,"[apt17, hiding, plain, sight, microsoft, expos..."
6,APT18,chinese cyberspies target european drone maker...,"[chinese, cyberspies, target, european, drone,..."
7,APT27,chinese cyberspies target european drone maker...,"[chinese, cyberspies, target, european, drone,..."
8,APT28,nationstate adversary group known fancy bear o...,"[nationstate, adversary, group, known, fancy, ..."
9,APT29,sophisticated threat actor cozy bear initially...,"[sophisticated, threat, actor, cozy, bear, ini..."


In [72]:
from gensim.models import Phrases
from collections import Counter

In [75]:
for num in ['APT1', 'APT10', 'APT12', 'APT15', 'APT16', 'APT17', 'APT18', 'APT27', 'APT28', 'APT29', 'APT3', 'APT30', 'APT32', 'APT33', 'APT34', 'APT35', 'APT37', 'APT38']:
    temp = df_doc[df_doc['query'] == num]
    print(temp.shape)
    if temp.shape[0] == 0:
        continue
    bigram = Phrases()
    
    bigram.add_vocab(temp['tokenized_text'])
    
    bigram_model = Word2Vec(bigram[temp['tokenized_text']], size=100)
    
    term = num.lower()
    if term in bigram_model.wv.vocab:
        print("Most similar words for %s"%term)
        for t in bigram_model.most_similar(term, topn=20): print(t)
        print('\n')

(1, 3)
(1, 3)
(1, 3)
(1, 3)
(1, 3)
(1, 3)
Most similar words for apt17
(u'threat', 0.1114407628774643)


(1, 3)
(1, 3)


  from ipykernel import kernelapp as app


(1, 3)
(1, 3)
(1, 3)
(1, 3)
(1, 3)
Most similar words for apt32
(u'configured', 0.35824739933013916)
(u'images', 0.2468959540128708)
(u'used', 0.24169450998306274)
(u'information', 0.22564998269081116)
(u'private_sector', 0.21900717914104462)
(u'table', 0.21895581483840942)
(u'malicious', 0.19625842571258545)
(u'contains', 0.19428619742393494)
(u'including', 0.1941174566745758)
(u'2014', 0.1893436759710312)
(u'security', 0.1759520173072815)
(u'task', 0.17439031600952148)
(u'infrastructure', 0.16248968243598938)
(u'targeted', 0.16084317862987518)
(u'mandiant', 0.1572575718164444)
(u'tools', 0.15236860513687134)
(u'corporations', 0.15172307193279266)
(u'beacon', 0.14482899010181427)
(u'vietnamese', 0.14482486248016357)
(u'fake', 0.13276609778404236)


(1, 3)
(1, 3)
Most similar words for apt34
(u'process', 0.23127448558807373)
(u'cupdatecheckers', 0.16893014311790466)
(u'mumbaim_site', 0.1618441492319107)
(u'response', 0.1537320464849472)
(u'example', 0.13982141017913818)
(u'vbs', 0.1376

In [63]:
num = 38
temp = df_doc[df_doc['query'] == 'APT%s'%num]
bigram = Phrases()

bigram.add_vocab(temp['tokenized_text'])

In [64]:
bigram_model = Word2Vec(bigram[temp['tokenized_text']], size=100)

term = 'apt%s'%num
if term in bigram_model.wv.vocab:
    print("Most similar words for %s"%term)
    for t in bigram_model.most_similar(term, topn=20): print(t)
    print('\n')

Most similar words for apt38
(u'used', 0.20620612800121307)
(u'group', 0.14802813529968262)
(u'operations', 0.135009765625)
(u'victim', 0.061795078217983246)
(u'swift', 0.05330579727888107)
(u'transactions', 0.04159824922680855)
(u'organizations', 0.04043135046958923)
(u'systems', -0.006069064140319824)
(u'access', -0.02243630588054657)
(u'malware', -0.03157751262187958)
(u'activity', -0.07716790586709976)
(u'tools', -0.08170446008443832)
(u'north_korean', -0.09251955151557922)
(u'financial', -0.11650492250919342)




  


In [65]:
temp.shape

(1, 3)