# Data exploration

## Load the data

Gather the file paths from a private JSON file

In [None]:
import json

# Read the private path configuration. JSON text is UTF-8 by specification,
# so decode it explicitly instead of relying on the platform default encoding.
with open('paths.json', encoding='utf-8') as f:
    json_data = json.load(f)
# Keep the parsed object as the last expression so the notebook displays it.
json_data

In [None]:
# Path of the CSV file, taken from the 'data' -> 'file' entry of the paths JSON.
csv_file = json_data['data']['file']

Load the CSV file into a pandas DataFrame

In [None]:
import pandas as pd

# Read the CSV into a DataFrame, parsing the 'Last Seen' column as datetimes.
# engine='python' selects pandas' Python parser instead of the default C one.
df = pd.read_csv(csv_file, parse_dates=['Last Seen'], engine='python')

Inspect properties

In [None]:
# Display the loaded DataFrame as the cell output.
df

In [None]:
# Print a summary of the DataFrame (columns, dtypes, non-null counts).
df.info()

In [None]:
# Distinct values of the 'Module' column.
df['Module'].unique()

In [None]:
# Number of rows per 'Module' value, most frequent first.
df['Module'].value_counts()

In [None]:
# Distinct values of the 'Type' column.
df['Type'].unique()

In [None]:
# Number of rows per 'Type' value, most frequent first.
df['Type'].value_counts()

In [None]:
# Count the rows of every (Module, Type) combination that occurs in the data.
count_series = df.groupby(['Module', 'Type']).size()
# Flatten the counts into a table with 'Module', 'Type' and 'Entries' columns.
new_df = count_series.reset_index(name='Entries')
# Show the combinations ordered from most to least frequent.
new_df.sort_values(by='Entries', ascending=False)

## Display data

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

# Bar chart with the number of rows per module on a 20x15 inch figure.
plt.subplots(figsize=(20,15))
g=sns.countplot(x="Module", data=df)
# Re-apply the existing x tick labels, rotated by 90 degrees.
g.set_xticklabels(g.get_xticklabels(), rotation=90)
# NOTE(review): the trailing print presumably keeps the list returned by
# set_xticklabels from being echoed as the cell output — confirm.
print('Plot it...')

In [None]:
# Bar chart with the number of rows per data type on a 20x15 inch figure.
plt.subplots(figsize=(20,15))
g=sns.countplot(x="Type", data=df)
# Re-apply the existing x tick labels, rotated by 90 degrees.
g.set_xticklabels(g.get_xticklabels(), rotation=90)
# NOTE(review): the trailing print presumably keeps the list returned by
# set_xticklabels from being echoed as the cell output — confirm.
print('Plot it')

In [None]:
def cat_corr(df, x, y):
    """Build a table of co-occurrence counts for two categorical columns.

    Rows follow the order of ``df[x].unique()`` and columns the order of
    ``df[y].unique()``. Each cell holds the number of rows of ``df`` carrying
    that (x, y) pair; pairs that never occur stay at 0.

    Returns a list of lists (one inner list per distinct ``x`` value).
    """
    row_labels = df[x].unique()
    col_labels = df[y].unique()
    # Number of rows per (x, y) pair, indexed by both values.
    pair_counts = df.groupby([x, y]).size()

    table = []
    for row_label in row_labels:
        row = []
        for col_label in col_labels:
            try:
                cell = pair_counts[row_label][col_label]
            except KeyError:
                # This combination does not occur in the data.
                cell = 0
            row.append(cell)
        table.append(row)
    return table

In [None]:
#plt.subplots(figsize=(20,15))
#sns.heatmap(cat_corr(df, 'Module', 'Type'), xticklabels=types, yticklabels=modules)

## Analysis

In [None]:
import spacy
#from spacy.lang.xx import MultiLanguage
# Load the English and German spaCy pipelines used by the name checks below.
# NOTE(review): the 'en' / 'de' shortcut names rely on model shortcut links,
# which spaCy v3 no longer supports — confirm the installed spaCy version or
# switch to full package names (e.g. 'en_core_web_sm', 'de_core_news_sm').
nlpEN = spacy.load('en')
nlpDE = spacy.load('de')

def appendBracket(data):
    """Return the string ``data`` wrapped in a pair of curly braces."""
    opening, closing = "{", "}"
    return opening + data + closing
    
def getModuleJSONDummy(name):
    """Return the JSON skeleton for one module.

    The module name is filled in, while the literal ``moduleData`` marker is
    left in place so the caller can substitute the result payload later.
    """
    template = '{"Module": "moduleName", "Result": moduleData}'
    return template.replace("moduleName", name)

def isGermanName(name):
    """Tell whether ``name`` looks like a person's name to the German pipeline.

    Returns True only when the German spaCy pipeline finds at least one
    entity, every entity is labelled "PER", and the last entity ends at the
    end of the string. Name-like entries such as
    "Kostenlose Service-Hotline" should return False.
    """
    sawEntity = False
    lastEnd = 0
    for entity in nlpDE(name).ents:
        if entity.label_ != "PER":
            # Any non-person entity disqualifies the whole string.
            return False
        sawEntity = True
        lastEnd = entity.end_char

    # Text left over after the last entity means part of the string was not
    # recognised as a person name.
    return sawEntity and lastEnd >= len(name)

# NOT used by the extraction functions (only called in the sanity-check cell below)
def isEnglishName(name):
    """Tell whether ``name`` looks like a person's name to the English pipeline.

    Returns True only when the English spaCy pipeline finds at least one
    entity, every entity is labelled "PERSON", and the last entity ends at
    the end of the string. Name-like entries such as
    "Kostenlose Service-Hotline" should return False.
    """
    humanName = False
    endIndex = 0
    for ent in nlpEN(name).ents:
        if ent.label_ != "PERSON":
            # Any non-person entity disqualifies the whole string.
            return False
        humanName = True
        # Remember where the recognised entities end. Without this update the
        # coverage check below compared against 0 and rejected every name.
        endIndex = ent.end_char

    # Text left over after the last entity means part of the string was not
    # recognised as a person name.
    return humanName and endIndex >= len(name)

def filterNames(nameData):
    """Remove entries whose label is not recognised as a German person name.

    ``nameData`` is a pandas Series indexed by name. Every label rejected by
    ``isGermanName`` (name-like entries such as "Kostenlose Service-Hotline")
    is dropped. The counts before and after filtering are printed, and the
    filtered Series is returned.
    """
    removalList = [name for name in nameData.index if not isGermanName(name)]

    print("Count of names:", len(nameData))
    print("After filtering German Words:", len(nameData) - len(removalList))

    # Drop all rejected labels in a single call: dropping them one by one
    # copies the Series on every iteration and fails on repeated labels.
    return nameData.drop(labels=removalList)

def getCountsData(df, moduleName, dataType):
    """Count how often each 'Data' value occurs for one module and type.

    Selects the rows whose 'Module' equals ``moduleName`` and whose 'Type'
    equals ``dataType`` and returns the value counts of their 'Data' column.
    """
    matches = (df['Module'] == moduleName) & (df['Type'] == dataType)
    return df.loc[matches, 'Data'].value_counts()

def getMapData(df, moduleName, dataType, reverse=False):
    """Return a Series mapping 'Source' to 'Data' for one module and type.

    Only rows whose 'Module' equals ``moduleName`` and whose 'Type' equals
    ``dataType`` are used. By default the Series is indexed by 'Source' and
    holds the 'Data' values; with ``reverse`` set it is indexed by 'Data'
    and holds the 'Source' values.
    """
    rows = df.loc[(df['Module'] == moduleName) & (df['Type'] == dataType)]
    if reverse:
        valueColumn, keyColumn = 'Source', 'Data'
    else:
        valueColumn, keyColumn = 'Data', 'Source'
    return pd.Series(rows[valueColumn].tolist(), index=rows[keyColumn])

# The main dispatcher: defines which data to extract for each module/type and how to extract it
def getData(df, moduleName, dataType):
    """Extract the data for one module/type pair, or None if it is not handled.

    Handled combinations:
      * sfp_names / HUMAN_NAME: value counts of the names, filtered through
        ``filterNames`` (name-like entries such as
        "Kostenlose Service-Hotline" can be removed there). No other type of
        this module exists in the current data file.
      * sfp_dnsresolve / IP_ADDRESS: Series indexed by 'Source' with 'Data'.
      * sfp_dnsresolve / AFFILIATE_INTERNET_NAME: Series indexed by 'Data'
        with 'Source' (reversed mapping).
    """
    selection = (moduleName, dataType)

    if selection == ('sfp_names', 'HUMAN_NAME'):
        return filterNames(getCountsData(df, moduleName, dataType))

    if selection == ('sfp_dnsresolve', 'IP_ADDRESS'):
        return getMapData(df, moduleName, dataType)

    if selection == ('sfp_dnsresolve', 'AFFILIATE_INTERNET_NAME'):
        # TODO: values of this type look like an IP-host mapping, but why are
        # they marked as affiliated?
        return getMapData(df, moduleName, dataType, True)

    # Every other module/type combination is not defined.
    return None

def getTypes(df, moduleName):
    """Return the distinct 'Type' values of the rows belonging to ``moduleName``."""
    belongsToModule = df['Module'] == moduleName
    return df.loc[belongsToModule, 'Type'].unique()

def getModuleJSON(df, moduleName):
    """Return the JSON text describing all handled data of one module.

    For every type found for ``moduleName`` the data is fetched through
    ``getData``; types for which it returns None are skipped. The result has
    the form ``{"Module": "<name>", "Result": {"<type>":<series json>, ...}}``.
    An empty string is returned when no data exists or none is defined.
    """
    collected = {}
    for dataType in getTypes(df, moduleName):
        series = getData(df, moduleName, dataType)
        if series is not None:
            collected[dataType] = series

    if not collected:
        # no data exists or defined
        return ""

    # One '"<type>":<json>' member per data type, in insertion order.
    members = [
        '"' + dataType + '":' + series.to_json()
        for dataType, series in collected.items()
    ]
    payload = appendBracket(", ".join(members))
    return getModuleJSONDummy(moduleName).replace("moduleData", payload)


In [None]:
# Sanity check of both name detectors on a German phrase that is not a person's name.
# NOTE(review): both calls are presumably expected to print False — confirm.
name = "Informationsabende Medizinisch-Psychologische"
print(isEnglishName(name), isGermanName(name))

#### sfp_names

In [None]:
# Ad-hoc look at the raw rows behind the sfp_names / HUMAN_NAME data.
# All rows produced by the sfp_names module, regardless of type.
types = df.loc[df['Module'] == 'sfp_names']
#['Type'].value_counts()
#print (types.info())
#print("=================================================")
# Only the sfp_names rows of type HUMAN_NAME.
results = df.loc[(df['Module'] == 'sfp_names') & (df['Type'] == 'HUMAN_NAME')]
#print(results)

# 'Source' and 'Data' columns of the sfp_dnsresolve AFFILIATE_INTERNET_NAME rows.
data = df.loc[(df['Module'] == 'sfp_dnsresolve') & (df['Type'] == 'AFFILIATE_INTERNET_NAME')][['Source', 'Data']]
#print(data)

print("=================================================")


In [None]:
# Build the JSON text for the two handled modules and print both results.
resultName = getModuleJSON(df, "sfp_names")
resultDNS = getModuleJSON(df, "sfp_dnsresolve")
print(resultName)
print(resultDNS)

In [None]:
# Save the sfp_names result to a JSON file.
# resultName already is JSON text, so dumping the string itself would store a
# quoted, escaped string instead of the object. Parse it first so the file
# holds the real structure; the empty string (no data for the module) is
# written unchanged. Opening with 'w' already truncates the file.
with open('data/data.json', 'w') as outfile:
    json.dump(json.loads(resultName) if resultName else resultName, outfile)

In [None]:
# Read the results back from the JSON file.
# The file handle gets its own name: calling it json_data would overwrite the
# paths dictionary of the same name that was loaded from paths.json.
with open('data/data.json') as infile:
    data = json.load(infile)
    print(data)