<a href="https://colab.research.google.com/github/gogela/Colab-D3js/blob/main/dozzimeter_data_eval.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Data fetch
fetch json data from Dozzimeter project to pandas dataframe

In [None]:
#@title fetch data from dozzimeter
%cd /content
!git clone --quiet https://github.com/dozzimeter/gh_actions.git 2> /dev/null

%cd /content/gh_actions


import glob, json
import pandas as pd

#concat all jsons to a single df
df_arr=[]
jsonlist = glob.glob('*.json')
for fname in jsonlist:
  with open(fname) as f:
    j = json.load(f)
    for k in j.keys():
      for jj  in j[k]:
        df_arr.append(pd.DataFrame(jj, index=['1',]))

df = pd.concat(df_arr)
df = df.drop_duplicates()
df.reset_index()
df.to_parquet('/content/df_dozzimeter.pq')
df


/content
/content/gh_actions


Unnamed: 0,ip,host,url
1,194.50.240.70,www.csas.cz,/bin/erstegroup/gemesgapi/feature/gem_site_cz_...
1,194.50.240.70,www.csas.cz,/cs/vyhledavani?q=$_1
1,194.50.240.70,www.csas.cz,/cs/osobni-finance/podpora
1,13.107.213.44,mfcr.cz,/cs/vyhledavani?q=$_1
1,13.107.213.44,mfcr.cz,/cs/aktualne/vydali-jsme?p=$_2
...,...,...,...
1,40.127.132.204,www.vitec-alma.com/www.vitec-alma.com,/me/
1,193.151.91.101,www.bisbank.com.ua,/news/page/$_1/?filter=1
1,193.151.91.101,www.bisbank.com.ua,/system/ajax
1,193.151.91.101,www.bisbank.com.ua,/on-line/deposit


#Open AI setup

In [None]:
#@title Install modules
!pip install -q typing_extensions==4.7.1
!pip install -q openai
from typing_extensions import Iterator

In [None]:
#@title AI prompts
from openai import OpenAI
from google.colab import userdata


key= userdata.get('openai_key')
client = OpenAI(
    # This is the default and can be omitted
    api_key=key,
)

#Prompt to evaluate an URL
def get_gpt_url_eval(urllist_string):
  prompt = f'''
  The provided data represent several URL paths from a certain website. For each of the paths try to consider its purpose (search, authentication, password reset, api, etc...), note that the terms used in the path might not be English so consider the terms meaning in different possible languages when deciding the purpose. If any of the paths indicates a specific system (a CMS or an other platform) note that too. The output should be json formatted as follows (output the json data only, suppress any other text output):
  {{"stack":["any detected cms or platforms"],
  "paths":[
  "url_path":"copy provided data item here",
  "purpose":"purpose of the service eg. search"]
  }}

  URL path data:
  {urllist_string}
  '''
  response = client.chat.completions.create(
    model="gpt-3.5-turbo-1106",
    seed=4646,
    response_format={ "type": "json_object" },
    messages=[
      {"role": "system", "content": "You are an expert in web application analysis and development designed to output JSON."},
      {"role": "user", "content": prompt}
    ]
  )
  # print(response.choices[0].message.content)
  return response.choices[0].message.content


#prompt for consolidation of categories
def get_gpt_category(text,category_list):
  categories='\n'.join(category_list)
  prompt = f'''
  Please consider the following list of categories:
  -----------
  {categories}
  ----------
  Read the following text and decide which category from the list it fits best, if no category is a good fit consider it belongs to category "other", provide output in json form {{"category":"selected category"}}:
  "{text}"
  '''
  response = client.chat.completions.create(
    model="gpt-3.5-turbo-1106",
    seed=4646,
    response_format={ "type": "json_object" },
    messages=[
      {"role": "system", "content": "You are an expert in analyzing and categorizing short texts designed to output JSON."},
      {"role": "user", "content": prompt}
    ]
  )
  # print(response.choices[0].message.content)
  return response.choices[0].message.content



In [None]:
#@title Evaluate url groups (per host)

#group data by host IP
grouped = df.groupby(df.ip)
site_eval={}

# Iterate groups (per IP)
for group_name, group_df in grouped:
  print(group_name)
  try:
    ul = list(group_df['url'])
    ul = '\n'.join(ul)

    #run the LLM analysis of url list
    res = get_gpt_url_eval(ul)

    site_eval[group_name]=res
    for i in range(len(group_df)):
      row = group_df.iloc[i]
      j = json.loads(res)
      df.loc[((df.ip==row.ip) & (df.host==row.host) & (df.url==row.url)), 'stack']=j['stack'][0] if len(j['stack']) else 'n/a'
      df.loc[((df.ip==row.ip) & (df.host==row.host) & (df.url==row.url)),'purpose']=''.join([x['purpose'] for x in j['paths'] if x['url_path']==row.url])
  except:
    print('whatever shit happened for: ',group_name) #typically caused by invaid json from chatGPT

df.to_parquet('/content/df_dozzimeter_with_categories.pq')
df

In [None]:
# import pandas as pd
# df =pd.read_parquet('/content/df_dozzimeter_with_categories.pq')

In [None]:
#select top 20 categories
df['purpose']=df['purpose'].str.lower() #converto all to lowercase
df.loc[df.purpose=='','purpose']='other' #replace empty with other
df.loc[df.purpose=='unknown','purpose']='other' #replace unknown with other
cats = df.groupby(df.purpose, as_index=False).count().sort_values('stack',ascending=False)
categories = list(cats.iloc[0:19]['purpose'])
categories



['search',
 'other',
 'authentication',
 'password reset',
 'api',
 'contact',
 'news',
 'contact information',
 'user registration',
 'information',
 'category',
 'contact form',
 'language selection',
 'content',
 'registration',
 'service',
 'contact form submission',
 'homepage',
 'newsletter subscription']

In [None]:
#Let chatgpt find the closest category for the items out of top 20
pd.options.mode.chained_assignment = None #disable warning
for i in range(len(df)):
  row = df.iloc[i]
  if row.purpose not in categories:
    c = json.loads(get_gpt_category(row.purpose, categories))['category']
    if c in categories:
      print(row.purpose,'->',c)
      row.purpose=c
    else:
      print('NON EXISTENT CAT:',row.purpose,'->',c)

In [None]:
def consolidate_category(row):
  cat = row['purpose']
  if cat not in categories:
    c = json.loads(get_gpt_category(cat, categories))['category']
    if c in categories:
      print(cat,'->',c)
      cat=c
    else:
      print('NON EXISTENT CAT:',row.purpose,'->',c)
      cat='other'
  return cat
df['newcat']  = df.apply(lambda x: consolidate_category(x), axis=1)


In [None]:
df.to_parquet('/content/df_dozzimeter_with_cat_consolidated.pq')


In [5]:
import pandas as pd
df = pd.read_parquet('df_dozzimeter_with_cat_consolidated.pq')
df

Unnamed: 0,ip,host,url,stack,purpose,newcat
1,194.50.240.70,www.csas.cz,/bin/erstegroup/gemesgapi/feature/gem_site_cz_...,,other,other
1,194.50.240.70,www.csas.cz,/cs/vyhledavani?q=$_1,,search,search
1,194.50.240.70,www.csas.cz,/cs/osobni-finance/podpora,,support,other
1,13.107.213.44,mfcr.cz,/cs/vyhledavani?q=$_1,Umbraco,search,search
1,13.107.213.44,mfcr.cz,/cs/aktualne/vydali-jsme?p=$_2,Umbraco,other,other
...,...,...,...,...,...,...
1,40.127.132.204,www.vitec-alma.com/www.vitec-alma.com,/me/,,user profile,other
1,193.151.91.101,www.bisbank.com.ua,/news/page/$_1/?filter=1,,news page with filtering,news
1,193.151.91.101,www.bisbank.com.ua,/system/ajax,,system ajax request,api
1,193.151.91.101,www.bisbank.com.ua,/on-line/deposit,,online deposit functionality,service


In [4]:
df.groupby(df.newcat, as_index=False).count().sort_values('stack',ascending=False)

Unnamed: 0,newcat,ip,host,url,stack,purpose
13,other,1585,1585,1585,1553,1553
16,search,970,970,970,970,970
9,information,439,439,439,439,439
1,authentication,394,394,394,394,394
11,news,333,333,333,333,333
17,service,189,189,189,189,189
0,api,178,178,178,178,178
3,contact,150,150,150,150,150
14,password reset,149,149,149,149,149
7,content,110,110,110,110,110


#data visualization in D3.js

In [None]:
df.to_parquet('dozzimeter_final.pq')

In [9]:
df['tld'] = df.apply(lambda x: x['host'].split('.')[-1],axis=1)

In [11]:
#Create json for D3
import json
grouped_df = df.groupby(['tld','newcat']).count()
d3_data = {'name':'dozz data', 'children':[]}
# grouped_df.loc['ro','contact']
tlds = list(grouped_df.index.get_level_values(0).unique())
for tld in tlds:
  tld_entry = {'name':tld,'children':[]}
  for cat,row  in grouped_df.loc[tld].iterrows():
    # print(tld,cat,row['host'])
    tld_entry['children'].append({'name':cat,'value':str(row['host'])})
  d3_data['children'].append(tld_entry)

d3_data = json.dumps(d3_data)


In [13]:
#Javascript from d3js samples (slightly modified to produce html)
html = f'''
<!DOCTYPE html>
<meta charset="utf-8">

<!-- Load d3.js -->

<script src="https://cdn.jsdelivr.net/npm/d3@7"></script>

<!-- Create a div where the graph will take place -->
<div id="my_dataviz"></div>
<script>
  const data = {d3_data}
  const width = 800;
  const height = width;
  const radius = width / 6;

  // Create the color scale.
  const color = d3.scaleOrdinal(d3.quantize(d3.interpolateRainbow, data.children.length + 1));

  // Compute the layout.
  const hierarchy = d3.hierarchy(data)
      .sum(d => d.value)
      .sort((a, b) => b.value - a.value);
  const root = d3.partition()
      .size([2 * Math.PI, hierarchy.height + 1])
    (hierarchy);
  root.each(d => d.current = d);

  // Create the arc generator.
  const arc = d3.arc()
      .startAngle(d => d.x0)
      .endAngle(d => d.x1)
      .padAngle(d => Math.min((d.x1 - d.x0) / 2, 0.005))
      .padRadius(radius * 1.5)
      .innerRadius(d => d.y0 * radius)
      .outerRadius(d => Math.max(d.y0 * radius, d.y1 * radius - 1))

  // Create the SVG container.
  //const svg = d3.create("svg")
  //    .attr("viewBox", [-width / 2, -height / 2, width, width])
  //    .style("font", "10px sans-serif");
  var svg = d3.select("#my_dataviz")
  .append("svg")
    .attr("width", width)
    .attr("height", width)
  .append("g")
    .attr("transform", "translate(400, 400)")

  // Append the arcs.
  const path = svg.append("g")
    .selectAll("path")
    .data(root.descendants().slice(1))
    .join("path")
      .attr("fill", d => {{ while (d.depth > 1) d = d.parent; return color(d.data.name); }})
      .attr("fill-opacity", d => arcVisible(d.current) ? (d.children ? 0.6 : 0.4) : 0)
      .attr("pointer-events", d => arcVisible(d.current) ? "auto" : "none")

      .attr("d", d => arc(d.current));

  // Make them clickable if they have children.
  path.filter(d => d.children)
      .style("cursor", "pointer")
      .on("click", clicked);

  const format = d3.format(",d");
  path.append("title")
      .text(d => `${{d.ancestors().map(d => d.data.name).reverse().join("/")}}\n${{format(d.value)}}`);

  const label = svg.append("g")
      .attr("pointer-events", "none")
      .attr("text-anchor", "middle")
      .style("user-select", "none")
    .selectAll("text")
    .data(root.descendants().slice(1))
    .join("text")
      .attr("dy", "0.35em")
      .attr("fill-opacity", d => +labelVisible(d.current))
      .attr("transform", d => labelTransform(d.current))
      .text(d => d.data.name);

  const parent = svg.append("circle")
      .datum(root)
      .attr("r", radius)
      .attr("fill", "none")
      .attr("pointer-events", "all")
      .on("click", clicked);

  // Handle zoom on click.
  function clicked(event, p) {{
    parent.datum(p.parent || root);

    root.each(d => d.target = {{
      x0: Math.max(0, Math.min(1, (d.x0 - p.x0) / (p.x1 - p.x0))) * 2 * Math.PI,
      x1: Math.max(0, Math.min(1, (d.x1 - p.x0) / (p.x1 - p.x0))) * 2 * Math.PI,
      y0: Math.max(0, d.y0 - p.depth),
      y1: Math.max(0, d.y1 - p.depth)
    }});

    const t = svg.transition().duration(750);

    // Transition the data on all arcs, even the ones that aren’t visible,
    // so that if this transition is interrupted, entering arcs will start
    // the next transition from the desired position.
    path.transition(t)
        .tween("data", d => {{
          const i = d3.interpolate(d.current, d.target);
          return t => d.current = i(t);
        }})
      .filter(function(d) {{
        return +this.getAttribute("fill-opacity") || arcVisible(d.target);
      }})
        .attr("fill-opacity", d => arcVisible(d.target) ? (d.children ? 0.6 : 0.4) : 0)
        .attr("pointer-events", d => arcVisible(d.target) ? "auto" : "none")

        .attrTween("d", d => () => arc(d.current));

    label.filter(function(d) {{
        return +this.getAttribute("fill-opacity") || labelVisible(d.target);
      }}).transition(t)
        .attr("fill-opacity", d => +labelVisible(d.target))
        .attrTween("transform", d => () => labelTransform(d.current));
  }}

  function arcVisible(d) {{
    return d.y1 <= 3 && d.y0 >= 1 && d.x1 > d.x0;
  }}

  function labelVisible(d) {{
    return d.y1 <= 3 && d.y0 >= 1 && (d.y1 - d.y0) * (d.x1 - d.x0) > 0.03;
  }}

  function labelTransform(d) {{
    const x = (d.x0 + d.x1) / 2 * 180 / Math.PI;
    const y = (d.y0 + d.y1) / 2 * radius;
    return `rotate(${{x - 90}}) translate(${{y}},0) rotate(${{x < 180 ? 0 : 180}})`;
  }}
</script>
'''

In [None]:
from IPython.core.display import HTML
HTML(html)