# Data exploration

## Load the data

Read the data paths from a private JSON file

In [None]:
import json

# Read the private path configuration; the parsed object stays in `json_data`
# for the cells below.
# JSON text is UTF-8 by specification, so decode it explicitly instead of
# relying on the platform's default encoding.
with open('paths.json', encoding='utf-8') as f:
    json_data = json.load(f)
# Echo the parsed configuration as the cell output.
json_data

In [None]:
# Pick the CSV path out of the loaded configuration (keys 'data' -> 'file').
csv_file = json_data['data']['file']

Load into dataframe

In [None]:
import pandas as pd

# Read the CSV at `csv_file` into a dataframe, parsing the 'Last Seen' column
# as dates and using pandas' Python parsing engine.
df = pd.read_csv(csv_file, parse_dates=['Last Seen'], engine='python')

Inspect properties

In [None]:
# Show the dataframe as the cell output.
df

In [None]:
# Print the dataframe summary produced by DataFrame.info().
df.info()

In [None]:
# Distinct values found in the 'Module' column.
df['Module'].unique()

In [None]:
# Number of rows per 'Module' value.
df['Module'].value_counts()

In [None]:
# Distinct values found in the 'Type' column.
df['Type'].unique()

In [None]:
# Number of rows per 'Type' value.
df['Type'].value_counts()

In [None]:
# Count the rows for every (Module, Type) pair that occurs in the data.
count_series = df.groupby(by=['Module', 'Type']).size()
# Flatten the grouped counts into a table whose count column is 'Entries'.
new_df = count_series.reset_index(name='Entries')
# Show the table as the cell output.
new_df

## Display data

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

# Open a 20x15 figure; the countplot call below is given no explicit axes.
plt.subplots(figsize=(20,15))
# Bar chart of the number of rows per 'Module' value.
g=sns.countplot(x="Module", data=df)
# Rotate the x tick labels by 90 degrees.
g.set_xticklabels(g.get_xticklabels(), rotation=90)
# NOTE(review): presumably printed so the cell does not echo the return value
# of the call above — confirm.
print('Plot it...')

In [None]:
# Open a 20x15 figure; the countplot call below is given no explicit axes.
plt.subplots(figsize=(20,15))
# Bar chart of the number of rows per 'Type' value.
g=sns.countplot(x="Type", data=df)
# Rotate the x tick labels by 90 degrees.
g.set_xticklabels(g.get_xticklabels(), rotation=90)
# NOTE(review): presumably printed so the cell does not echo the return value
# of the call above — confirm.
print('Plot it')

In [None]:
def cat_corr(df, x, y):
    """Build a co-occurrence count matrix for two categorical columns.

    Parameters:
        df: dataframe holding both columns.
        x: name of the column whose distinct values form the rows.
        y: name of the column whose distinct values form the columns.

    Returns:
        A list of lists; entry [i][j] is the number of rows in which the
        i-th distinct value of ``x`` occurs together with the j-th distinct
        value of ``y`` (both in ``unique()`` order), or 0 when that pair has
        no entry in the grouped counts.
    """
    row_labels = df[x].unique()
    col_labels = df[y].unique()
    # Row counts per (x, y) pair; pairs that never occur have no entry.
    pair_counts = df.groupby([x, y]).size()

    matrix = []
    for row_label in row_labels:
        row = []
        for col_label in col_labels:
            try:
                cell = pair_counts[row_label][col_label]
            except KeyError:
                # Pair is missing from the grouped counts: keep it at zero.
                cell = 0
            row.append(cell)
        matrix.append(row)
    return matrix

In [None]:
# Open a 20x15 figure for the heatmap.
plt.subplots(figsize=(20,15))
# cat_corr orders its rows by df['Module'].unique() and its columns by
# df['Type'].unique(); those arrays are local to cat_corr, so the same calls
# are repeated here to get tick labels that line up with the matrix.
sns.heatmap(
    cat_corr(df, 'Module', 'Type'),
    xticklabels=df['Type'].unique(),
    yticklabels=df['Module'].unique(),
)

## Analysis

In [None]:
def appendBracket(data):
    """Return the string *data* wrapped in a pair of curly braces."""
    return "".join(("{", data, "}"))
    
def getModuleJSON(name):
    """Return the JSON template for one module's result.

    The returned text has the form
    ``{"Module": "<name>", "Result": moduleData}``, where ``moduleData`` is a
    literal placeholder that the caller replaces with the module's JSON
    payload.

    Parameters:
        name: module name to store under the "Module" key.

    Returns:
        The template string with the module name filled in.
    """
    # Serialise the name with json.dumps so quotes, backslashes and control
    # characters are escaped; ensure_ascii=False leaves other text unchanged.
    encodedName = json.dumps(name, ensure_ascii=False)
    return '{"Module": ' + encodedName + ', "Result": moduleData}'

#TODO
def isHumanName(name):
    """Tell whether *name* is a person's name (not implemented yet).

    Placeholder: every input is currently accepted. Name-like entries such
    as "Kostenlose Service-Hotline" are meant to return False once the check
    is implemented.
    """
    accepted = True
    return accepted

#TODO
def filterNames(nameData):
    """Drop every entry whose label isHumanName() rejects.

    Parameters:
        nameData: mapping-like object of name -> count; labels are removed
            with its ``drop(labels=...)`` method.

    Returns:
        ``nameData`` without the rejected labels; the input object itself
        when nothing is rejected.
    """
    # Collect the labels first so the data is not modified while iterating.
    rejected = [name for name, _ in nameData.items() if not isHumanName(name)]

    filtered = nameData
    for label in rejected:
        filtered = filtered.drop(labels = label)
    return filtered
    
def getNamesJSON(df):
    """Build the JSON report for the entries produced by 'sfp_names'.

    For every distinct 'Type' among the rows whose 'Module' is 'sfp_names',
    the 'Data' values are counted, passed through filterNames() and
    serialised with ``to_json()``. The per-type objects are combined into
    one JSON object that replaces the ``moduleData`` placeholder of the
    template returned by getModuleJSON().

    Parameters:
        df: dataframe with 'Module', 'Type' and 'Data' columns.

    Returns:
        The report as a JSON-formatted string.
    """
    sfpNames = df.loc[df['Module'] == 'sfp_names']

    entries = []
    for nameType in sfpNames['Type'].unique():
        nameData = sfpNames.loc[sfpNames['Type'] == nameType]['Data'].value_counts()
        nameData = filterNames(nameData)
        # json.dumps escapes the key so the result remains valid JSON.
        typeKey = json.dumps(nameType, ensure_ascii=False)
        entries.append(typeKey + ':' + nameData.to_json())

    # Joining the entries puts a separator between every pair of types; the
    # previous hand-rolled "first" flag was never cleared, so no separator
    # was ever written.
    dataString = ", ".join(entries)

    moduleString = getModuleJSON("sfp_names")
    return moduleString.replace("moduleData", appendBracket(dataString))
    

In [None]:
#sfp_names - HUMAN_NAME
# Number of rows per 'Type' among the entries whose 'Module' is 'sfp_names'.
types = df.loc[df['Module'] == 'sfp_names']['Type'].value_counts()
print (types)
print("=================================================")
# Occurrence count of each distinct 'Data' value among the HUMAN_NAME entries.
results = df.loc[(df['Module'] == 'sfp_names') & (df['Type'] == 'HUMAN_NAME')]['Data'].value_counts()
# NOTE(review): the string returned by to_json() is not stored, and this is
# not the last expression of the cell — confirm whether the call is needed.
results.to_json()

# Build the report string for the 'sfp_names' entries and print it.
resultString = getNamesJSON(df)
print(resultString)

In [None]:
#save results to json file
#my_json_string = json.dumps(results)
# Mode 'w' already empties the file on open, so the truncate() call below does
# no additional work.
with open('data/data.json', 'w') as outfile:
    outfile.truncate()
    # NOTE(review): resultString is the str returned by getNamesJSON, so
    # json.dump writes it as a single quoted JSON string rather than as a JSON
    # object — confirm that this is the intended file format.
    json.dump(resultString, outfile)

In [None]:
#read results from json file
# The handle gets its own name: binding it to `json_data` would overwrite the
# path configuration loaded from paths.json and break re-running the cell that
# reads json_data['data']['file'].
with open('data/data.json') as infile:
    data = json.load(infile)
    print(data)