In [1]:
# Load IPython's autoreload extension and switch it to mode 2 so edits to
# imported project modules are picked up without restarting the kernel.
# NOTE(review): `%reload_ext autoreload` directly after `%load_ext autoreload`
# looks redundant — confirm and keep only one of the two.
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

In [2]:
import pandas as pd
from stratosphere import Stratosphere
from stratosphere.utils.inspect_flows import InspectFlows
from stratosphere import options
from stratosphere.services.extractor import Extractor
from stratosphere.storage.models import Flow

In [3]:
# Keep rendered frames compact: at most 10 rows, 100 columns, and 100
# characters per cell before pandas truncates the display.
pd.set_option("display.max_rows", 10)
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_colwidth", 100)

## Process all samples

In [None]:
import glob

# glob.glob() returns matches in arbitrary order; sort them so every run
# processes the sample databases in the same sequence.
for db_pathname in sorted(glob.glob("/shared/data/samples/*")):
    # db_pathname is absolute, so this produces the four-slash form
    # ("sqlite:////shared/...") that the other cells in this notebook use.
    e = Extractor(f"sqlite:///{db_pathname}")
    e.process()

## Samples: vk.com

In [None]:
# Test extraction of a user page (entity 8cfdfe5c-a59f-3a55-d7ab-1cd8553033ea).
vk_person_url = "sqlite:////shared/data/samples/vk-person.db"
e = Extractor(vk_person_url)
e.process()

In [None]:
# Show the `user_ids` JSON field of every `person` entity held in the
# extractor's `s_kb` store. `e` is the Extractor created in an earlier cell.
person_user_ids_sql = "select json_extract(data, '$.user_ids') as user_ids from entities where type = 'person'"
e.s_kb.db.pandas(person_user_ids_sql)

In [None]:
# Test extraction of the friends list for entity 8cfdfe5c-a59f-3a55-d7ab-1cd8553033ea.
vk_friends_url = "sqlite:////shared/data/samples/vk-friends.db"
e = Extractor(vk_friends_url)
e.process()

# Show the `user_ids` JSON field of every `person` entity after processing.
person_user_ids_sql = "select json_extract(data, '$.user_ids') as user_ids from entities where type = 'person'"
e.s_kb.db.pandas(person_user_ids_sql)

In [None]:
# Test extraction of friends list for entity 5801c74b-1b76-35d6-dbcb-0946e2bf4726, a friend of 8cfdfe5c-a59f-3a55-d7ab-1cd8553033ea
#e = Extractor("sqlite:////shared/data/samples/vk-person_friend.db")
#e.process()

In [None]:
# Test extraction of a friendship relationship: from https://vk.com/id200 to the
# friend https://vk.com/maximka8 (reached by clicking the link in the friends list).
visiting_friend_url = "sqlite:////shared/data/samples/vk.com-visiting-friend-from-friend.db"
e = Extractor(visiting_friend_url)
e.process()



In [5]:
# Test extraction of a chain of friends:
# Andrey -> Frederik -> Valentin
chain_of_friends_url = "sqlite:////shared/data/samples/vk.com-chain-of-friends.db"
e = Extractor(chain_of_friends_url)
e.process()



[32m■[0m Processing 30 flows: start
[32m■[0m Added entity 3e459f3cbe5aa28d55643d087a9ef88b (Andrey Strelnikov)
[32m■[0m Added entity 19a8813161e2fa5571fa75cd5093a583 (Frederik Bulgakov)
[32m■[0m Added relationship 3e459f3cbe5aa28d55643d087a9ef88b -> 19a8813161e2fa5571fa75cd5093a583
[32m■[0m Added entity 72fce4e9ebc4d7c06e89571861b620a3 (Valentin Savelyev)
[32m■[0m Added relationship 19a8813161e2fa5571fa75cd5093a583 -> 72fce4e9ebc4d7c06e89571861b620a3
[32m■[0m Processing 30 flows: end


## Samples: Google searches

In [None]:
# Test extraction of Google search results.
google_search_url = "sqlite:////shared/data/samples/search-google-bellingcattools.db"
e = Extractor(google_search_url)
e.process()
#e = Extractor("sqlite:////shared/data/samples/search-google-multiple-pages.db")
#e.process()

In [None]:
# Open the store whose URL is configured under the "db.url_kb" option.
# `get_google_searches` below reads this module-level `kb`.
kb = Stratosphere(options.get("db.url_kb"))


def get_google_searches(kb_client=None):
    """Return the search strings stored in the knowledge base.

    Runs a single SELECT against the ``entities`` table, keeping the rows whose
    ``type`` is ``'search_string'`` and exposing the ``$.q`` field of the JSON
    ``data`` column under the name ``query``.

    Args:
        kb_client: Object exposing ``db.pandas(sql)``. When omitted, the
            notebook-level ``kb`` is used, so existing zero-argument calls
            behave as before.

    Returns:
        Whatever ``db.pandas`` returns for the query (presumably a pandas
        DataFrame) with the columns ``entity_id`` and ``query``.
    """
    client = kb if kb_client is None else kb_client

    # String literals use single quotes: SQL reserves double quotes for
    # identifiers, and SQLite only accepts "search_string" as a string through
    # a legacy fallback that can be disabled at build or run time.
    sql = """
        SELECT
            entity_id,
            json_extract(data, '$.q') AS query
        FROM entities
        WHERE type = 'search_string'
    """
    return client.db.pandas(sql)

# Run the query and display its result as the cell output.
get_google_searches()

In [None]:
# Show the whole `entities` table from the `s_kb` store of `e`, the Extractor
# created in an earlier cell.
e.s_kb.db.pandas("select * from entities")

In [None]:
# Run extraction with an Extractor built without an explicit database URL.
# NOTE(review): presumably this falls back to a configured default — confirm.
Extractor().process()

In [None]:
# Load the complete `flows` table from probe.db, then display the row(s)
# matching one specific flow id.
s = Stratosphere("sqlite:////shared/data/probe.db")
df = s.db.pandas("select * from flows")
wanted_flow = df["flow_id"] == "aac65a493e82441eac9dd56d781fd679"
df.loc[wanted_flow]

In [None]:
stratosphere = Stratosphere("sqlite:////shared/data/probe.db")

# Convert the ORM rows to plain dicts while the session is still open: once the
# `with` block exits the instances may be detached or expired, and reading
# their attributes can then fail (depends on how `session()` is configured).
# `rows` stays bound at notebook level because a later cell passes it to the
# extractors.
with stratosphere.db.session() as session:
    rows = session.query(Flow).all()
    df = pd.DataFrame([row.as_dict() for row in rows])

# Display the row(s) for one specific flow id.
df[df.flow_id == '78b1fa25-afa0-4013-a646-a33e73d84916']

In [None]:
# Preview the first rows of `df`, the flows frame built in an earlier cell.
df.head()

In [None]:
# Call every registered extractor with the full list of flows.
# Relies on notebook state from earlier cells: `e` (an Extractor) and `rows`
# (the Flow objects loaded from probe.db).
for run_extractor in e.extractors.values():
    run_extractor(rows)

In [None]:
from stratosphere.services.extractors.vk01 import get_uuid_from_person_id

In [None]:
# Try the vk01 helper on an arbitrary placeholder id to inspect its output.
get_uuid_from_person_id("asd")

In [None]:
import hashlib
import uuid

s = "adasdadasd"

# Derive the UUID from an MD5 digest instead of the built-in hash():
# hash() of a str is randomised per interpreter run (PYTHONHASHSEED) and can be
# negative, which makes uuid.UUID(int=...) both unstable across runs and liable
# to raise ValueError. An MD5 hex digest is always 128 bits and deterministic.
stable_id = str(uuid.UUID(hex=hashlib.md5(s.encode("utf-8")).hexdigest()))
stable_id


In [None]:
import hashlib

# Hash the UTF-8 bytes of the text with MD5 and read the 32-character hex
# digest back as a UUID. `uuid` is imported by the preceding cell.
payload = "whatever your string is".encode("utf-8")
h = hashlib.md5(payload).hexdigest()
uuid.UUID(hex=h)