# Data exploration

## Load the data

Gather the data paths from a private JSON file

In [None]:
import json

# Read the private path configuration. JSON is UTF-8 by specification, so the
# encoding is stated explicitly instead of relying on the platform locale.
with open('paths.json', encoding='utf-8') as f:
    json_data = json.load(f)
# Echo the parsed configuration as the cell output.
json_data

In [None]:
# Location of the CSV to explore, read from the 'data' -> 'file' entry of the
# JSON configuration loaded above.
csv_file = json_data['data']['file']

Load the CSV into a DataFrame

In [None]:
import pandas as pd

# Load the CSV into a DataFrame, converting the 'Last Seen' column to
# datetimes and using pandas' pure-Python parser engine.
# NOTE(review): the reason for engine='python' is not visible here -- confirm
# whether the default C engine fails on this file.
df = pd.read_csv(csv_file, parse_dates=['Last Seen'], engine='python')

Inspect properties

In [None]:
# Show the DataFrame as the cell output.
df

In [None]:
# Print a summary of the DataFrame: columns, dtypes and non-null counts.
df.info()

In [None]:
# List the distinct values found in the 'Module' column.
df['Module'].unique()

In [None]:
# Number of rows per distinct 'Module' value.
df['Module'].value_counts()

In [None]:
# List the distinct values found in the 'Type' column.
df['Type'].unique()

In [None]:
# Number of rows per distinct 'Type' value.
df['Type'].value_counts()

In [None]:
# Row count for every (Module, Type) combination.
count_series = df.groupby(['Module', 'Type']).size()
# Turn the two index levels back into ordinary columns and label the counts
# column 'Entries'.
new_df = count_series.reset_index(name='Entries')
new_df

## Display data

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

# Open a large (20 x 15 inch) figure for the plot below.
plt.subplots(figsize=(20,15))
# Bar chart of the number of rows per 'Module' value.
g=sns.countplot(x="Module", data=df)
# Re-apply the existing tick labels rotated by 90 degrees so long category
# names stay readable.
g.set_xticklabels(g.get_xticklabels(), rotation=90)
# Presumably a print is used as the last statement so the cell does not echo
# the return value of set_xticklabels.
print('Plot it...')

In [None]:
# Open a large (20 x 15 inch) figure for the plot below.
plt.subplots(figsize=(20,15))
# Bar chart of the number of rows per 'Type' value.
g=sns.countplot(x="Type", data=df)
# Re-apply the existing tick labels rotated by 90 degrees so long category
# names stay readable.
g.set_xticklabels(g.get_xticklabels(), rotation=90)
# Presumably a print is used as the last statement so the cell does not echo
# the return value of set_xticklabels.
print('Plot it')

In [None]:
def cat_corr(df, x, y):
    """Build a co-occurrence count matrix for two categorical columns.

    Args:
        df: DataFrame containing the columns named by ``x`` and ``y``.
        x: Name of the column whose distinct values become the rows.
        y: Name of the column whose distinct values become the columns.

    Returns:
        A list of lists of ints where ``result[i][j]`` is the number of rows
        of ``df`` that combine ``df[x].unique()[i]`` with
        ``df[y].unique()[j]``. Combinations that never occur are 0. Row and
        column order follow ``df[x].unique()`` and ``df[y].unique()``.
    """
    row_labels = df[x].unique()
    col_labels = df[y].unique()
    # One pass over the data yields the count of every observed (x, y) pair.
    pair_counts = df.groupby([x, y]).size()
    if pair_counts.empty:
        # No pair was observed at all: every cell is zero.
        return [[0 for _ in col_labels] for _ in row_labels]
    # Pivot the y level into columns, then restore the order of first
    # appearance (groupby sorts its keys); absent combinations become 0.
    table = pair_counts.unstack(fill_value=0)
    table = table.reindex(index=row_labels, columns=col_labels, fill_value=0)
    return table.to_numpy().tolist()

In [None]:
# Axis labels for the heatmap. cat_corr orders its rows by
# df['Module'].unique() and its columns by df['Type'].unique(), so the same
# calls are used here. These names were previously undefined at cell level
# (they only existed as locals inside cat_corr), which raised a NameError.
modules = df['Module'].unique()
types = df['Type'].unique()
# Open a large (20 x 15 inch) figure and draw the Module x Type count matrix.
plt.subplots(figsize=(20,15))
sns.heatmap(cat_corr(df, 'Module', 'Type'), xticklabels=types, yticklabels=modules)