# Data exploration

## Load the data

Gather the file paths from a private JSON file.

In [None]:
import json

# Read the private path configuration; the bare name on the last line echoes it in the notebook.
with open('paths.json') as paths_file:
    json_data = json.load(paths_file)
json_data

In [None]:
# Location of the CSV export, taken from the 'data' -> 'file' entry of the paths configuration.
csv_file = json_data['data']['file']

Load the CSV file into a pandas DataFrame.

In [None]:
import pandas as pd

# Parse the CSV with pandas' Python parsing engine instead of the default C parser.
# NOTE(review): presumably chosen because the export trips the C parser - confirm.
df = pd.read_csv(csv_file, engine='python')

Inspect properties

In [None]:
# Echo the frame so the notebook renders a preview of its rows.
df

In [None]:
# Print the column names, dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Distinct values found in the 'Module' column.
df['Module'].unique()

In [None]:
# Number of rows per distinct 'Module' value, most frequent first.
df['Module'].value_counts()

In [None]:
# Distinct values found in the 'Type' column.
df['Type'].unique()

In [None]:
# Number of rows per distinct 'Type' value, most frequent first.
df['Type'].value_counts()

In [None]:
# Row count for every (Module, Type) pair that occurs in the data.
count_series = df.groupby(['Module', 'Type']).size()
# Turn the two index levels into ordinary columns next to an 'Entries' count column.
new_df = count_series.reset_index(name='Entries')
# Largest groups first; the sorted frame is what the cell displays.
new_df.sort_values('Entries', ascending=False)

## Display data

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

# Bar chart of the number of rows per module, drawn on an explicitly created large axes.
fig, ax = plt.subplots(figsize=(20, 15))
g = sns.countplot(x="Module", data=df, ax=ax)
# Turn the category labels vertical so long module names do not overlap.
g.set_xticklabels(g.get_xticklabels(), rotation=90)
# Ending on print() presumably keeps the list returned by set_xticklabels out of the cell output.
print('Plot it...')

In [None]:
# Bar chart of the number of rows per data type, drawn on an explicitly created large axes.
fig, ax = plt.subplots(figsize=(20, 15))
g = sns.countplot(x="Type", data=df, ax=ax)
# Turn the category labels vertical so long type names do not overlap.
g.set_xticklabels(g.get_xticklabels(), rotation=90)
# Ending on print() presumably keeps the list returned by set_xticklabels out of the cell output.
print('Plot it')

In [None]:
def cat_corr(df, x, y):
    """Build a co-occurrence count matrix for two categorical columns.

    Args:
        df: DataFrame containing the columns named by *x* and *y*.
        x: Column whose distinct values become the rows of the matrix.
        y: Column whose distinct values become the columns of the matrix.

    Returns:
        A list of lists; entry [i][j] is the number of rows holding the i-th
        distinct value of *x* together with the j-th distinct value of *y*,
        or 0 when that combination never occurs. Rows and columns follow the
        order in which the values first appear in *df*.
    """
    row_labels = df[x].unique()
    col_labels = df[y].unique()
    # Size of every (x, y) group that actually exists in the data.
    pair_counts = df.groupby([x, y]).size()

    def count_for(row_label, col_label):
        # Combinations that never occur are absent from the grouped result.
        try:
            return pair_counts[row_label][col_label]
        except KeyError:
            return 0

    return [
        [count_for(row_label, col_label) for col_label in col_labels]
        for row_label in row_labels
    ]

In [None]:
#plt.subplots(figsize=(20,15))
#sns.heatmap(cat_corr(df, 'Module', 'Type'), xticklabels=types, yticklabels=modules)

## Analysis

In [None]:
import spacy
# from spacy.lang.xx import MultiLanguage
# Load the English and German spaCy pipelines once; isEnglishName and isGermanName reuse them.
# NOTE(review): 'en' / 'de' look like shortcut-link names; spaCy v3 dropped shortcut links in
# favour of full package names (e.g. 'en_core_web_sm') - confirm against the installed version.
nlpEN = spacy.load('en')
nlpDE = spacy.load('de')

def get_module_dummy(name):
    """Return a fresh module record that only carries the module's name.

    Args:
        name: Module name stored under the 'Module' key.

    Returns:
        A new dict of the form {'Module': name}.
    """
    return {'Module': name}

def isGermanName(name):
    """Tell whether the German pipeline sees *name* as a person name and nothing else.

    Name-like entries such as "Kostenlose Service-Hotline" should return False.

    Args:
        name: Text to classify.

    Returns:
        True only when the pipeline finds at least one entity, every entity is
        labelled "PER", and the last entity reaches the end of *name*.
        False otherwise.
    """
    entities = nlpDE(name).ents

    # Nothing recognised at all: not a name.
    if not entities:
        return False

    # A single entity of any other kind disqualifies the whole text.
    if any(ent.label_ != "PER" for ent in entities):
        return False

    # Text left over after the last entity means part of the input was not a name.
    return entities[-1].end_char >= len(name)

# NOT used
def isEnglishName(name):
    """Tell whether the English pipeline sees *name* as a person name and nothing else.

    Name-like entries such as "Kostenlose Service-Hotline" should return False.

    Args:
        name: Text to classify.

    Returns:
        True only when the pipeline finds at least one entity, every entity is
        labelled "PERSON", and the last entity reaches the end of *name*.
        False otherwise.
    """
    humanName = False

    doc = nlpEN(name)
    endIndex = 0
    for ent in doc.ents:
        humanName = True
        # Track where the recognised entities end. Without this update the
        # trailing-text check below compares against 0 and rejects every
        # non-empty input.
        endIndex = ent.end_char
        if ent.label_ != "PERSON":
            humanName = False
            break

    if humanName and endIndex < len(name):
        # Text after the last entity was not recognised as part of a name.
        humanName = False

    return humanName

def filter_names(nameData):
    """Remove every entry whose index label is not recognised as a German person name.

    Name-like entries such as "Kostenlose Service-Hotline" are removed here.
    The number of entries before and after filtering is printed.

    Args:
        nameData: pandas Series indexed by candidate name; the values are not inspected.

    Returns:
        A Series holding only the entries whose label passes isGermanName.
    """
    removalList = [name for name in nameData.index if not isGermanName(name)]

    print("Count of names:", len(nameData))
    print("After filtering German Words:", len(nameData) - len(removalList))

    # Drop all rejected labels in one call; dropping them one at a time copies
    # the whole Series for every removed entry.
    return nameData.drop(labels=removalList)

def getCountsData(df, moduleName, data_type):
    """Count how often each 'Data' value occurs for one module and data type.

    Args:
        df: DataFrame with 'Module', 'Type' and 'Data' columns.
        moduleName: Value the 'Module' column must equal.
        data_type: Value the 'Type' column must equal.

    Returns:
        A Series mapping each distinct 'Data' value of the matching rows to its
        number of occurrences, most frequent first.
    """
    matches = (df['Module'] == moduleName) & (df['Type'] == data_type)
    return df.loc[matches, 'Data'].value_counts()

def get_mapped_data(data_frame, module_name, data_type, reverse=False):
    """Pair up the 'Source' and 'Data' columns for one module and data type.

    Args:
        data_frame: DataFrame with 'Module', 'Type', 'Source' and 'Data' columns.
        module_name: Value the 'Module' column must equal.
        data_type: Value the 'Type' column must equal.
        reverse: When False (default) the result is indexed by 'Source' and
            holds 'Data'; when True it is indexed by 'Data' and holds 'Source'.

    Returns:
        A Series built from the matching rows with the orientation chosen by
        *reverse*.
    """
    matches = (data_frame['Module'] == module_name) & (data_frame['Type'] == data_type)
    rows = data_frame.loc[matches]

    # Pick which column supplies the values and which one supplies the index.
    value_column, index_column = ('Source', 'Data') if reverse else ('Data', 'Source')

    return pd.Series(rows[value_column].tolist(), index=rows[index_column])

# the major method defines which data to abstract and how to abstract them
def get_data(data_frame, module_name, data_type):
    """Extract the data of one (module, type) combination as a plain dict.

    This is the central place that decides which combinations are extracted
    and in what shape:

    * sfp_names / HUMAN_NAME: collected names with their counts, after
      filter_names has removed name-like entries that are not names. No other
      type of this module is handled because none exists in the current data
      file.
    * sfp_dnsresolve / IP_ADDRESS: IP vs. hostname, keyed by 'Data'.
    * sfp_dnsresolve / AFFILIATE_INTERNET_NAME: keyed by 'Source'.
    * sfp_pastebin / LEAKSITE_CONTENT: site content found on pastebins (pure
      text storage on the internet), keyed by 'Source'.
    * sfp_pastebin / LEAKSITE_URL: site URL found on pastebins (including
      query parameters etc.), keyed by 'Data'.

    Args:
        data_frame: DataFrame holding the scan results.
        module_name: Name of the module to extract.
        data_type: Data type within that module.

    Returns:
        A dict (automatically JSON-serializable) for a supported combination,
        otherwise None.
    """
    # Supported mapped combinations -> value of get_mapped_data's `reverse` flag.
    reverse_flag_by_key = {
        ('sfp_dnsresolve', 'IP_ADDRESS'): True,
        ('sfp_dnsresolve', 'AFFILIATE_INTERNET_NAME'): False,
        ('sfp_pastebin', 'LEAKSITE_CONTENT'): False,
        ('sfp_pastebin', 'LEAKSITE_URL'): True,
    }
    key = (module_name, data_type)

    if key == ('sfp_names', 'HUMAN_NAME'):
        name_counts = getCountsData(data_frame, module_name, data_type)
        series = filter_names(name_counts)
    elif key in reverse_flag_by_key:
        series = get_mapped_data(
            data_frame, module_name, data_type, reverse_flag_by_key[key]
        )
    else:
        # Combination is not defined for extraction.
        return None

    # NOTE(review): to_dict() keeps one value per index label, so rows that
    # share a key collapse to the last one - confirm that is acceptable.
    return series.to_dict()

def get_types(data_frame, module_name):
    """Return the distinct data types recorded for one module.

    Args:
        data_frame: DataFrame with 'Module' and 'Type' columns.
        module_name: Value the 'Module' column must equal.

    Returns:
        Array of the distinct 'Type' values among the matching rows, in order
        of first appearance.
    """
    # Build the mask from the frame that was passed in, not from the notebook-level
    # `df`, so the function also works for other frames.
    matches = data_frame['Module'] == module_name
    return data_frame.loc[matches, 'Type'].unique()

def get_module(data_frame, module_name):
    """Assemble the record of one module from all of its extractable data types.

    Args:
        data_frame: DataFrame holding the scan results.
        module_name: Name of the module to assemble.

    Returns:
        A dict {'Module': module_name, 'Result': {data_type: data, ...}}, or
        None when the module has no data or no extraction is defined for any
        of its types.
    """
    extracted_by_type = (
        (data_type, get_data(data_frame, module_name, data_type))
        for data_type in get_types(data_frame, module_name)
    )
    # Keep only the types for which get_data defines an extraction.
    collected = {
        data_type: extracted
        for data_type, extracted in extracted_by_type
        if extracted is not None
    }

    if not collected:
        return None

    module = get_module_dummy(module_name)
    module['Result'] = collected
    return module


In [None]:
# Smoke test: run a German phrase that is not a person's name through both detectors.
name = "Informationsabende Medizinisch-Psychologische"
print(isEnglishName(name), isGermanName(name))

#### sfp_names

In [None]:
#sfp_names - HUMAN_NAME
# Scratch cell: ad-hoc selections made while exploring the data.
# NOTE: despite its name, `types` holds every row of the sfp_names module, not a list of types.
types = df.loc[df['Module'] == 'sfp_names']
#['Type'].value_counts()
#print (types.info())
#print("=================================================")
#results = df.loc[(df['Module'] == 'sfp_binstring')]['Source'][0]
#print(results)

# 'Source' and 'Data' columns of the sfp_dnsresolve rows typed AFFILIATE_INTERNET_NAME.
data = df.loc[(df['Module'] == 'sfp_dnsresolve') & (df['Type'] == 'AFFILIATE_INTERNET_NAME')][['Source', 'Data']]
#print(data)

print("=================================================")


In [None]:
# Modules for which an extraction is defined in get_data.
modules = ["sfp_names", "sfp_dnsresolve", "sfp_pastebin"]

# Assemble each module lazily and skip the ones that yield nothing.
assembled = (get_module(df, module_name) for module_name in modules)

# From here on `result` holds the JSON text, not the list of module records.
result = json.dumps(
    [module for module in assembled if module is not None],
    indent=4,
)
print(result)

In [None]:
#save results to json file
#my_json_string = json.dumps(results)
with open('data/data.json', 'w') as outfile:
    # `result` was already turned into a JSON string when it was printed. Dumping
    # it a second time would store one quoted string instead of the list of
    # modules, so only serialise when it is not text yet.
    payload = result if isinstance(result, str) else json.dumps(result, indent=4)
    # Mode 'w' truncates the file on open, so no explicit truncate() is needed.
    outfile.write(payload)

In [None]:
#read results from json file
# The handle gets its own name so the `json_data` paths dictionary loaded at the top
# of the notebook is not overwritten and earlier cells can still be re-run.
with open('data/data.json') as infile:
    data = json.load(infile)
    print(data)