In [None]:
import os
import pandas as pd
import networkx as nx
# Load the triples spreadsheet. When an article text occurs more than once,
# only its last row is kept, and the index is renumbered from 0.
file_path = 'triples.xlsx'
df = (
    pd.read_excel(file_path)
    .drop_duplicates(subset='single_article', keep='last')
    .reset_index(drop=True)
)

In [None]:
def get_triples(input_longmemstr):
    """Parse ``[subject, relation, object]`` groups out of a memory string.

    Quotes and newlines are removed, the field labels ``SUBJECT:``,
    ``Subject:``, ``OBJECT:`` and ``RELATION:`` are stripped, and every
    bracketed group containing at least two commas is split on commas.

    Args:
        input_longmemstr: Text holding zero or more bracketed triples. Any
            non-string value (e.g. the NaN pandas yields for an empty cell)
            is treated as "no triples".

    Returns:
        A list of lists of whitespace-stripped strings, one inner list per
        bracketed group. A group with extra commas yields more than three
        parts; callers only read the first three.
    """
    import re

    # Empty spreadsheet cells arrive as NaN/None rather than text.
    if not isinstance(input_longmemstr, str):
        return []

    text = input_longmemstr
    for unwanted in ("'", '"', '\n'):
        text = text.replace(unwanted, "")

    # Strip the field labels BEFORE the generic colon clean-up below; otherwise
    # "SUBJECT: x" loses its ": " first and the label leaks into the entity
    # name as "SUBJECTx".
    text = re.sub(r"(?:SUBJECT|Subject|OBJECT|RELATION)\s*:\s*", "", text)
    for unwanted in (':  ', ': ', ' :'):
        text = text.replace(unwanted, "")

    final_triples = []
    for group in re.findall(r"\[(.*?,.*?,.*?)\]", text):
        # A plain split keeps backslashes (Windows paths, registry keys)
        # intact; evaluating the group as a Python literal would interpret
        # them as escape sequences or reject the triple altogether.
        final_triples.append([part.strip() for part in group.split(',')])

    return final_triples

def build_graph_allsentences(df):
    """Build a directed graph with one edge per triple parsed from each row.

    For every visited row, the ``longmem`` text is parsed with
    ``get_triples`` and each triple becomes an edge from its first element to
    its third, carrying the row's ``single_article`` text (``articletext``)
    and the triple's second element (``rel``) as attributes.

    Args:
        df: DataFrame with ``single_article`` and ``longmem`` columns, looked
            up by the integer labels ``1 .. len(df) - 1``.

    Returns:
        The populated ``nx.DiGraph``.
    """
    graph = nx.DiGraph()
    # NOTE(review): iteration starts at label 1, so the row labelled 0 never
    # contributes edges - confirm this is intended.
    for row_label in range(1, len(df)):
        article_text = df.loc[row_label, 'single_article']
        memory_text = df.loc[row_label, 'longmem']
        for parts in get_triples(memory_text):
            graph.add_edge(parts[0], parts[2], articletext=article_text, rel=parts[1])
    return graph

def build_graph_alltriples(dfin):
    """Build a directed graph from a table holding one triple per row.

    Each row adds (or updates) a node for ``sub`` and one for ``obj`` with an
    ``entity`` attribute taken from ``subtype`` / ``objtype``, plus an edge
    ``sub -> obj`` carrying ``article_id`` (the ``fulltext`` value), ``rel``,
    ``tactic`` and ``behave_conf`` (the ``be`` value).

    Args:
        dfin: DataFrame with columns ``fulltext``, ``sub``, ``rel``, ``obj``,
            ``subtype``, ``tactic``, ``objtype`` and ``be``, looked up by the
            integer labels ``0 .. len(dfin) - 1``.

    Returns:
        The populated ``nx.DiGraph``.
    """
    graph = nx.DiGraph()
    for row_label in range(len(dfin)):
        subject_name = dfin.loc[row_label, 'sub']
        object_name = dfin.loc[row_label, 'obj']
        graph.add_node(subject_name, entity=dfin.loc[row_label, 'subtype'])
        graph.add_node(object_name, entity=dfin.loc[row_label, 'objtype'])
        graph.add_edge(
            subject_name,
            object_name,
            article_id=dfin.loc[row_label, 'fulltext'],
            rel=dfin.loc[row_label, 'rel'],
            tactic=dfin.loc[row_label, 'tactic'],
            behave_conf=dfin.loc[row_label, 'be'],
        )
    return graph

from simpletransformers.language_representation import RepresentationModel
from simpletransformers.config.model_args import ModelArgs
from sklearn.metrics.pairwise import cosine_similarity

# Holds the sentence encoder once it has been loaded, so repeated calls reuse it.
_SIMILARITY_MODEL_CACHE = {}


def _get_similarity_model():
    """Return the shared sentence encoder, loading it from disk on first use only."""
    if "model" not in _SIMILARITY_MODEL_CACHE:
        model_args = ModelArgs(max_seq_length=156)
        _SIMILARITY_MODEL_CACHE["model"] = RepresentationModel(
            "roberta",
            "./tactic model",
            args=model_args,
        )
    return _SIMILARITY_MODEL_CACHE["model"]


def find_most_similar(given_string, string_list):
    """Return up to five strings from ``string_list`` closest to ``given_string``.

    Closeness is the cosine similarity between mean-combined sentence
    embeddings produced by the model stored in ``./tactic model``.

    Args:
        given_string: The query string.
        string_list: Candidate strings to rank.

    Returns:
        At most five candidates, most similar first. Empty when
        ``string_list`` is empty.
    """
    if not string_list:
        return []

    # The model is loaded once and reused; this function is called for every
    # triple, and rebuilding the encoder on each call dominated the run time.
    model = _get_similarity_model()

    given_embedding = model.encode_sentences([given_string], combine_strategy="mean")

    similarities = []
    for candidate in string_list:
        # Candidates are encoded one at a time, exactly as before, so the
        # resulting scores are unchanged.
        candidate_embedding = model.encode_sentences([candidate], combine_strategy="mean")
        score = cosine_similarity(given_embedding, candidate_embedding)[0][0]
        similarities.append((candidate, score))

    top_5 = sorted(similarities, key=lambda pair: pair[1], reverse=True)[:5]
    return [candidate for candidate, _score in top_5]

def ask(prompt, token, temp, model="TheBloke/Yi-34B-Chat-AWQ",streamprint=True):
    """Send a chat prompt to an OpenAI-compatible endpoint and return the reply text.

    With ``model == "gpt4"`` the request goes to ``https://api.openai.com/v1``
    using model ``gpt-4-0125-preview`` and the key from the ``OPENAI_API_KEY``
    environment variable. Any other value is used verbatim as the model name
    against the local server at ``http://localhost:8000/v1``.

    Args:
        prompt: List of chat messages (``{"role": ..., "content": ...}``).
        token: Maximum number of tokens to generate.
        temp: Sampling temperature.
        model: ``"gpt4"`` or the model name served by the local endpoint.
        streamprint: When true, echo the reply to stdout as it streams in.

    Returns:
        The concatenated text of all streamed chunks.
    """
    import os
    from openai import OpenAI

    request_kwargs = {
        "messages": prompt,
        "stream": True,
        "max_tokens": token,
        "temperature": temp,
    }
    if model == "gpt4":
        # Read the key without overwriting it: assigning "" to the variable
        # first would wipe whatever key the user had exported.
        api_key = os.getenv("OPENAI_API_KEY", "")
        api_base = 'https://api.openai.com/v1'
        setmodel = 'gpt-4-0125-preview'
    else:
        setmodel = model
        api_key = "EMPTY"
        api_base = "http://localhost:8000/v1"
        # Server-specific sampling extension; only sent to the local endpoint
        # because the hosted API is not expected to accept this field.
        request_kwargs["extra_body"] = {"stop_token_ids": [7]}

    client = OpenAI(api_key=api_key, base_url=api_base)
    print("Model:", setmodel)
    stream = client.chat.completions.create(model=setmodel, **request_kwargs)

    pieces = []
    for chunk in stream:
        # Some servers emit chunks without any choice; skip them.
        if not chunk.choices:
            continue
        piece = chunk.choices[0].delta.content
        if piece is None:
            continue
        if streamprint:
            print(piece, end="")
        pieces.append(piece)
    return "".join(pieces)
        
def generate_prompt(longmem,shortmem,sentence):
    """Build the few-shot chat prompt asking the model to align one triple with known ones.

    The conversation consists of the rule description, a canned
    acknowledgement, one worked example (request and answer) and finally the
    actual request containing ``longmem`` and ``shortmem``.

    Args:
        longmem: Already-consistent triples; inserted via ``str()`` into the
            long-term memory area.
        shortmem: The single triple to rewrite; inserted via ``str()`` into
            the short-term memory area.
        sentence: Unused; kept so existing callers keep working.

    Returns:
        A list of five ``{"role", "content"}`` message dicts.
    """
    # The long-term memory placeholder in the first message is closed with the
    # long-term end marker, so both areas are delimited consistently.
    promptmessage = [
    {
    "role": "user",
    "content": 
    '''You are a triples integration assistant. Triple is a basic data structure, which describes concepts and their relationships. A triple in long-term and short-term memory MUST has THREE elements: [Subject, Relation, Object]. You are now reading a whole article and extract all triples from it. But you can only see part of the article at a time, and the entity names in the article are not always consistent, for example, "the Formbook" and "Formbook sample" refer to the same entity. In order to make the triples consistent, you need to follow the rules to modify the triples in short-term memory and then write down new short-term memory. 
    You have a long-term memory that already contains some triples which are consistent with each other.
    -The start of the long-term memory area-
    #Triples will be added here
    -The end of the long-term memory area-
    Second, you now see a short term memory area. It contains only one triple extracted from the article. You should modify this triple in short-term memory to make them consistent with triples in long-term memory. 
    -The start of the short-term memory area-
    #Triple will be added here
    -The end of the short-term memory area-
    Third, now review your long-term memory and short-term memory. Modify the short-term memory into a new short-term memory. You should follow following rules to modify triple in short-term memory to make them consistent with triples in long-term memory. You should write down how you use the rule to modify the triple in short-term memory. 
    Rule 1. You are only allowed to do small modifications to the triples in short-term memory, including deleting, adding, or modifying some words. You are not allowed to change the meaning of the triples. When you write down the new short-term memory, you should think wheather the new short-term memory explains the same meaning as the old short-term memory.
    
    Rule 2. You notice that in these triples, some triples have subjects and objects that contain partially identical terms and refer to the same specific nouns, but these specific nouns have prefixes/suffixes/modifiers that make them not identical. You should delete the prefixes/suffixes/modifiers and unify them into the same specific nouns.
    
    Before rule: [the Formbook, is designed to run as, a deleter] [Formbook sample, is designed to run as, one-time encryptor]

    After rule: [Formbook, is designed to run as, a deleter] [Formbook, is designed to run as, one-time encryptor]

    Explanation: The words "the Formbook" and "Formbook sample" refer to the same entity, so they are unified to use the exact same subject "Formbook" for consistency.
    
    Rule 3. Be especially careful that when you meet specific names of malware,CVE, Trojans, hacker organizations, etc., always use their specific names and remove the prefixes/suffixes/modifiers.
    
    Before rule: [Malware Formbook, is, malware] 
    
    After rule: [Formbook, is, malware]
    
    Explanation: The word "Formbook" is a specific name of malware, so it should be used as the subject of the triple and the prefix "Malware" should be removed.
    
    Rule 4. Don't add any example word like 'Formbook','XLoader','Leafminer', 'FinSpy', 'Kismet' in your new short-term memory area, they are just example words not the real triples in the long term memory area or short term memory area.
    
    Rule 5. new short-term memory area must be started with \'-The start of new short-term memory area-\' and ended with \'-The end of new short-term memory area-\'. A triple in new short-term memory MUST has THREE elements: [Subject, Relation, Object]. 

    Rule 6. The new short-term memory area must only has one triple in it.
    '''
    },
    {"role": "assistant",'content': 'Yes, I understand and totally follow the rules.'},
    {"role": "user",'content': '''
        -The start of the long-term memory area-  
        [SUBJECT:AgumonHacker, RELATION:disrupts, OBJECT:network operations], [SUBJECT:HackGroup Gabumon, RELATION:targets, OBJECT:financial institutions], [SUBJECT:Malware Biyomon, RELATION:steals, OBJECT:personal data], [SUBJECT:CyberAttack 2042, RELATION:compromised, OBJECT:database systems],  [SUBJECT:A sample of Biyomon, RELATION:save, OBJECT:a payload]
        -The end of the long-term memory area-

    -The start of the short-term memory area-  
        [SUBJECT:The AgumonHacker Trojan, RELATION:created by, OBJECT:a sophisticated hacker group]
    -The end of the short-term memory area-
        '''
        },
    {"role": "assistant",'content': 
        '''
        Modifications required based on the rules:
    Apply Rule 1 and Rule 2: Unify subjects and objects with partially identical terms and remove any prefixes/suffixes/modifiers, particularly for specific names like malware and hacker groups.
    Modified short-term memory:

    [SUBJECT:The AgumonHacker Trojan, RELATION:created by, OBJECT:a sophisticated hacker group] → [SUBJECT:AgumonHacker, RELATION:created by, OBJECT:sophisticated hacker group]
    -The start of new short-term memory area- 
    [SUBJECT:AgumonHacker, RELATION:created by, OBJECT:sophisticated hacker group], 
    -The end of new short-term memory area-
        '''   },
    {"role": "user",'content': 
    '''
    Good. Now, let's swtich to another article. 
    -The start of the long-term memory area-
    '''+str(longmem)+'''
    -The end of the long-term memory area-

    -The start of the short-term memory area-
    '''+str(shortmem)+'''
    -The end of the short-term memory area-
    
    Now, follow the rules. Write down how you use the rule to modify the one triple in short-term memory. Then, write down new short-term memory which must be started with \'-The start of new short-term memory area-\' and ended with \'-The end of new short-term memory area-\'
    '''
    },      
    ]
    return promptmessage

def ask_merge(longmem,shortmem, token, temp, model,streamprint=True):
    """Ask the model to rewrite ``shortmem`` so it agrees with ``longmem``.

    The reply is accepted when it contains both new-short-term-memory markers
    and the text between the last occurrence of each marker mentions none of
    the example names used in the prompt. Up to three requests are made.

    Args:
        longmem: Text of the long-term memory area.
        shortmem: Text of the single triple to rewrite.
        token: Maximum number of tokens per request.
        temp: Sampling temperature.
        model: Model name forwarded to ``ask``.
        streamprint: Forwarded to ``ask``.

    Returns:
        The text between the markers of the first accepted reply, or
        ``shortmem`` unchanged when all attempts are rejected.
    """
    start_tag = '-The start of new short-term memory area-'
    end_tag = '-The end of new short-term memory area-'
    # Names that only occur in the prompt's examples; their presence means the
    # model echoed an example instead of answering.
    example_names = ('Formbook', 'XLoader', 'savetextfile', 'Leafminer', 'FinSpy', 'Kismet','Agumon','Gabumon','Biyomon','2042')
    max_retries = 3  # maximum number of attempts

    messages = generate_prompt(longmem, shortmem, '')
    for _attempt in range(max_retries):
        reply = ask(prompt=messages, token=token, temp=temp, model=model, streamprint=streamprint)
        # Tolerate the variant of the markers that carries an extra "the".
        reply = reply.replace('-The start of the new short-term memory area-', start_tag)
        reply = reply.replace('-The end of the new short-term memory area-', end_tag)
        if start_tag not in reply or end_tag not in reply:
            continue
        candidate = reply[reply.rindex(start_tag) + len(start_tag):reply.rindex(end_tag)]
        if any(name in candidate for name in example_names):
            continue
        return candidate
    return shortmem


import re
# Flatten every article's long-term-memory text into one row per triple.
# 'subtype', 'tactic', 'be' and 'objtype' start out empty and are filled by later cells.
# NOTE(review): the loop starts at position 1, so the first row of df never
# contributes triples - confirm this is intended.
list_triples = []
for i in range(1, len(df)):
    article_row = df.iloc[i]
    fulltext = article_row.single_article
    longmemstr = article_row.longmem
    triples = get_triples(longmemstr)
    for triple in triples:
        sub = triple[0]
        rel = triple[1]
        obj = triple[2]
        # Collapse runs of spaces so the sentence is single-spaced.
        simplesen = re.sub(' {2,}', ' ', sub+' '+rel+' '+obj)
        list_triples.append([sub, rel, obj, '', '', '', '', simplesen, fulltext])

# Each column name appears exactly once: with a repeated 'be' label,
# df_triples.loc[i, 'be'] would return a two-element Series instead of a scalar.
df_triples = pd.DataFrame(list_triples, columns=['sub', 'rel', 'obj', 'subtype', 'tactic','be','objtype','simplesen','fulltext'])

In [None]:
# Entity-name unification pass over df_triples.
# Triples of the first article seed the pool of known names. For every later
# triple, the most similar known names are looked up, triples from other
# articles that mention those names form the "long-term memory", and the LLM
# is asked to rewrite the current triple so that it agrees with them.
# NOTE(review): 'simplesen' is not refreshed when sub/rel/obj are rewritten
# below - confirm later cells are meant to see the original wording.
newdf_str=''
meet_sub_obj=[]
for i in range(0, len(df_triples)):
    if df_triples.loc[i,'fulltext'] == df_triples.loc[0,'fulltext']:
        newdf_str=newdf_str+'['+df_triples.loc[i,'sub']+','+df_triples.loc[i,'rel']+','+df_triples.loc[i,'obj']+']'+'\n'
        meet_sub_obj.append(df_triples.loc[i,'sub'])
        meet_sub_obj.append(df_triples.loc[i,'obj'])
    else:
        sub_name=df_triples.loc[i,'sub']
        obj_name=df_triples.loc[i,'obj']
        top_5_subs = find_most_similar(sub_name, meet_sub_obj)
        top_5_objs = find_most_similar(obj_name, meet_sub_obj)
        top_10_subs_objs = list(set(top_5_subs + top_5_objs))
        # Rows whose subject or object is one of the similar names ...
        df_similar=df_triples[df_triples['sub'].isin(top_10_subs_objs) | df_triples['obj'].isin(top_10_subs_objs)]
        # ... excluding rows that come from the current article.
        df_similar=df_similar[df_similar['fulltext']!=df_triples.loc[i,'fulltext']]
        # Shuffle so the rows kept per name are a random pick.
        df_similar=df_similar.sample(frac=1).reset_index(drop=True)
        # Keep at most 5 rows per similar name.
        keep_index=[]
        for name in top_10_subs_objs:
            name_rows=df_similar.index[(df_similar['sub']==name) | (df_similar['obj']==name)]
            keep_index.extend(name_rows[:5])
        # A row matching two names is listed once, not twice, so the same
        # triple is not repeated in the long-term memory.
        keep_index=list(dict.fromkeys(keep_index))
        df_similar=df_similar.loc[keep_index].reset_index(drop=True)
        # Build the long-term memory text from the selected rows.
        longmemstr=''.join(
            '['+s+','+r+','+o+']'+'\n'
            for s, r, o in zip(df_similar['sub'], df_similar['rel'], df_similar['obj'])
        )
        shortmemstr='['+df_triples.loc[i,'sub']+','+df_triples.loc[i,'rel']+','+df_triples.loc[i,'obj']+']'+'\n'
        newline=ask_merge(longmemstr,shortmemstr, 1024, 0.5, "TheBloke/Yi-34B-Chat-AWQ",streamprint=False)
        cleared=newline.replace('\n','').strip()
        # Only the text inside the brackets is inspected, so a stray comma
        # after the closing bracket no longer causes a valid rewrite to be
        # thrown away.
        open_at=cleared.find('[')
        close_at=cleared.rfind(']')
        if 0 <= open_at < close_at:
            inner=cleared[open_at+1:close_at]
            if inner.count(',')==2:
                raw_sub, raw_rel, raw_obj = inner.split(',')
                # Remove the field labels first, then trim, so no leading
                # space is left behind by "SUBJECT: name".
                newsub=raw_sub.replace('SUBJECT:',"").strip()
                newrel=raw_rel.replace('RELATION:',"").strip()
                newobj=raw_obj.replace('OBJECT:',"").strip()
                if newsub!=df_triples.loc[i,'sub'] or newobj!=df_triples.loc[i,'obj'] or newrel!=df_triples.loc[i,'rel']:
                    print('Before:',df_triples.loc[i,'sub'],df_triples.loc[i,'rel'],df_triples.loc[i,'obj'])
                    print('After:',newsub,newrel,newobj)
                    df_triples.loc[i,'sub']=newsub
                    df_triples.loc[i,'rel']=newrel
                    df_triples.loc[i,'obj']=newobj

        meet_sub_obj.append(df_triples.loc[i,'sub'])
        meet_sub_obj.append(df_triples.loc[i,'obj'])
        meet_sub_obj=list(set(meet_sub_obj))

In [None]:
import pickle

# NOTE: unpickling can execute arbitrary code; only load name_dict.pkl from a trusted source.
with open('name_dict.pkl', 'rb') as f:
    dict_subobj = pickle.load(f)

# Fill 'subtype' / 'objtype' for every triple whose lower-cased subject or
# object is a key of dict_subobj. Names that start with 'c' and contain 'cve'
# are typed as 'CVE' regardless of the dictionary value.
used_subobj=set()
for index, row in df_triples.iterrows():
    for name_col, type_col in (('sub', 'subtype'), ('obj', 'objtype')):
        value=row[name_col].lower()
        if value not in dict_subobj:
            continue
        used_subobj.add(value)
        if 'cve' in value and 'c'==value[0]:
            # The override goes to the column of the name being examined, so
            # a CVE object sets 'objtype' and leaves 'subtype' untouched.
            df_triples.at[index, type_col] = 'CVE'
        else:
            df_triples.at[index, type_col] = dict_subobj[value]

print(len(used_subobj))

In [None]:
# Collect the flattened "subject relation object" sentences, show how many
# there are together with the first one, and store them in all_sentences.pkl.
import pickle

all_sentences = df_triples['simplesen'].tolist()
print(len(all_sentences))
print(all_sentences[0])
with open('all_sentences.pkl', 'wb') as f:
    pickle.dump(all_sentences, f)

In [None]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs,MultiLabelClassificationModel  # type: ignore
import pickle

# --- Tactic predictions: MultiLabelClassificationModel stored in '/content/tactic model' ---
with open('all_sentences.pkl', 'rb') as f:
    all_sentences = pickle.load(f)
print(len(all_sentences))
print(all_sentences[0:3])
my_best_model_dir = '/content/tactic model'
n_model_args = {"threshold": 0.5}
model = MultiLabelClassificationModel("roberta", my_best_model_dir, args=n_model_args)
predictions, raw_outputs = model.predict(all_sentences)

# --- Behavior predictions: ClassificationModel stored in '/content/behavior model' ---
# The sentences are read again and the same summary is printed a second time;
# 'model' and 'raw_outputs' now refer to the behavior classifier.
with open('all_sentences.pkl', 'rb') as f:
    all_sentences = pickle.load(f)
print(len(all_sentences))
print(all_sentences[0:3])
my_best_model_dir = '/content/behavior model'
n_model_args = {"threshold": 0.5}
model = ClassificationModel("roberta", my_best_model_dir, args=n_model_args)
bepredictions, raw_outputs = model.predict(all_sentences)

# Store both prediction lists for the next cell.
with open('all_sentences_tactic.pkl', 'wb') as f:
    pickle.dump(predictions, f)
with open('all_sentences_behavior.pkl', 'wb') as f:
    pickle.dump(bepredictions, f)

In [None]:
import pickle

# Attach the stored classifier outputs to the triples table.
# NOTE: unpickling can execute arbitrary code; only load files produced by this notebook.
with open('all_sentences_tactic.pkl', 'rb') as f:
    all_sentences_tactic = pickle.load(f)
df_triples['tactic']=all_sentences_tactic

# The behavior predictions get their own name, so all_sentences_tactic keeps
# referring to the tactic predictions after this cell has run.
with open('all_sentences_behavior.pkl', 'rb') as f:
    all_sentences_behavior = pickle.load(f)
df_triples['be']=all_sentences_behavior

In [None]:
# Build the final directed graph (typed nodes, attributed edges) from the enriched triples table.
G=build_graph_alltriples(df_triples)