In [1]:
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

In [2]:
import glob

import pandas as pd

from stratosphere import Stratosphere
from stratosphere.utils.inspect_flows import InspectFlows
from stratosphere import options
from stratosphere.services.extractor import Extractor
from stratosphere.storage.models import Flow

In [3]:
# Keep rendered DataFrames compact: at most 10 rows and 100 columns are
# shown, and each cell is truncated to 100 characters.
pd.set_option("display.max_rows", 10)
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_colwidth", 100)

## Process all samples

In [4]:
# Run the extractor over every entry found in the samples directory.
# glob returns matches in arbitrary order, so the paths are sorted to make
# the processing order identical from one run to the next.
for db_pathname in sorted(glob.glob("/shared/data/samples/*")):
    e = Extractor(f"sqlite:///{db_pathname}")
    e.process()
    

[32m■[0m Processing 30 flows: start
[32m■[0m Added google search 9087537177b531b83175eb5ce7ea7cc5 (11 results for query "bellingcat tools")
[32m■[0m Processing 30 flows: end
[32m■[0m Processing 192 flows: start
[32m■[0m Added google search da120230e6be97da73b99eb8ede44b99 (13 results for query "Bellingcat MH17")
[32m■[0m Added google search da120230e6be97da73b99eb8ede44b99 (10 results for query "Bellingcat MH17")
[32m■[0m Added google search da120230e6be97da73b99eb8ede44b99 (9 results for query "Bellingcat MH17")
[32m■[0m Added google search da120230e6be97da73b99eb8ede44b99 (11 results for query "Bellingcat MH17")
[32m■[0m Added google search da120230e6be97da73b99eb8ede44b99 (10 results for query "Bellingcat MH17")
[32m■[0m Added google search da120230e6be97da73b99eb8ede44b99 (10 results for query "Bellingcat MH17")
[32m■[0m Added google search da120230e6be97da73b99eb8ede44b99 (10 results for query "Bellingcat MH17")
[32m■[0m Added google search da120230e6be97da

## Samples: vk.com

In [5]:
# Test extraction of a user page (entity 8cfdfe5c-a59f-3a55-d7ab-1cd8553033ea).
db_url = "sqlite:////shared/data/samples/vk-person.db"
e = Extractor(db_url)
e.process()

[32m■[0m Processing 142 flows: start
[32m■[0m Added entity e6e1fa295df40fa68aafcd62917e77f6 (Andrey Kramodanov)
[32m■[0m Processing 142 flows: end


In [6]:
# List the user_ids field from the JSON `data` column of every entity whose
# type is 'person'; the resulting frame is displayed as the cell output.
person_user_ids_sql = "select json_extract(data, '$.user_ids') as user_ids from entities where type = 'person'"
e.s_kb.db.pandas(person_user_ids_sql)

Unnamed: 0,user_ids
0,"[""68870871""]"
1,"[""2083437""]"
2,"[""2460541""]"
3,"[""3190606""]"
4,"[""3522545""]"
...,...
11534,"[""794704806""]"
11535,"[""200""]"
11536,"[""235""]"
11537,"[""35474809""]"


In [7]:
# Test extraction of the friends list of entity
# 8cfdfe5c-a59f-3a55-d7ab-1cd8553033ea.
db_url = "sqlite:////shared/data/samples/vk-friends.db"
e = Extractor(db_url)
e.process()

# Display the user_ids of all 'person' entities once processing is done.
person_user_ids_sql = "select json_extract(data, '$.user_ids') as user_ids from entities where type = 'person'"
e.s_kb.db.pandas(person_user_ids_sql)

[32m■[0m Processing 4 flows: start
[32m■[0m Added entity e6e1fa295df40fa68aafcd62917e77f6
[32m■[0m Added 249 friends of entity e6e1fa295df40fa68aafcd62917e77f6
[32m■[0m Processing 4 flows: end


Unnamed: 0,user_ids
0,"[""68870871""]"
1,"[""2083437""]"
2,"[""2460541""]"
3,"[""3190606""]"
4,"[""3522545""]"
...,...
11534,"[""794704806""]"
11535,"[""200""]"
11536,"[""235""]"
11537,"[""35474809""]"


In [12]:
# Test extraction of the user page of entity
# 5801c74b-1b76-35d6-dbcb-0946e2bf4726, who is a friend of
# 8cfdfe5c-a59f-3a55-d7ab-1cd8553033ea.
# NOTE(review): this sample appears to cover a person page rather than a
# friends list -- confirm against the recorded flows.
db_url = "sqlite:////shared/data/samples/vk-person_friend.db"
e = Extractor(db_url)
e.process()

[32m■[0m Processing 26 flows: start
[32m■[0m Added entity 40f845c2dc7e67434cb59f67dd4c6317 (Oleg Kvitsinia)
[32m■[0m Processing 26 flows: end


In [13]:
# Test extraction of a friendship relationship: navigating from
# https://vk.com/id200 to the friend https://vk.com/maximka8 by clicking the
# link in the friends list.
db_url = "sqlite:////shared/data/samples/vk.com-visiting-friend-from-friend.db"
e = Extractor(db_url)
e.process()


[32m■[0m Processing 21 flows: start
[32m■[0m Added entity 3644a684f98ea8fe223c713b77189a77 (Andrey Strelnikov)
[32m■[0m Added entity fcdf25d6e191893e705819b177cddea0 (Maxim Sterlyadkin)
[32m■[0m Added relationship 3644a684f98ea8fe223c713b77189a77 -> fcdf25d6e191893e705819b177cddea0
[32m■[0m Processing 21 flows: end


In [14]:
# Test extraction of a chain of friends:
#   Andrey -> Frederik -> Valentin
db_url = "sqlite:////shared/data/samples/vk.com-chain-of-friends.db"
e = Extractor(db_url)
e.process()


[32m■[0m Processing 30 flows: start
[32m■[0m Added entity 3644a684f98ea8fe223c713b77189a77 (Andrey Strelnikov)
[32m■[0m Added entity 577ef1154f3240ad5b9b413aa7346a1e (Frederik Bulgakov)
[32m■[0m Added relationship 3644a684f98ea8fe223c713b77189a77 -> 577ef1154f3240ad5b9b413aa7346a1e
[32m■[0m Added entity c07ad36b132e424c3b3be7b8ee7a6b01 (Valentin Savelyev)
[32m■[0m Added relationship 577ef1154f3240ad5b9b413aa7346a1e -> c07ad36b132e424c3b3be7b8ee7a6b01
[32m■[0m Processing 30 flows: end


## Samples: Google searches

In [15]:
# Test extraction of Google search results: first a single results page,
# then a search spanning multiple result pages. The samples are processed
# in this order, and `e` ends up bound to the last extractor.
for db_url in (
    "sqlite:////shared/data/samples/search-google-bellingcattools.db",
    "sqlite:////shared/data/samples/search-google-multiple-pages.db",
):
    e = Extractor(db_url)
    e.process()

[32m■[0m Processing 30 flows: start
[32m■[0m Added google search 9087537177b531b83175eb5ce7ea7cc5 (11 results for query "bellingcat tools")
[32m■[0m Processing 30 flows: end
[32m■[0m Processing 192 flows: start
[32m■[0m Added google search da120230e6be97da73b99eb8ede44b99 (13 results for query "Bellingcat MH17")
[32m■[0m Added google search da120230e6be97da73b99eb8ede44b99 (10 results for query "Bellingcat MH17")
[32m■[0m Added google search da120230e6be97da73b99eb8ede44b99 (9 results for query "Bellingcat MH17")
[32m■[0m Added google search da120230e6be97da73b99eb8ede44b99 (11 results for query "Bellingcat MH17")
[32m■[0m Added google search da120230e6be97da73b99eb8ede44b99 (10 results for query "Bellingcat MH17")
[32m■[0m Added google search da120230e6be97da73b99eb8ede44b99 (10 results for query "Bellingcat MH17")
[32m■[0m Added google search da120230e6be97da73b99eb8ede44b99 (10 results for query "Bellingcat MH17")
[32m■[0m Added google search da120230e6be97da