In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
! tree ../../data/longeval/parquet/ -L 3

[01;34m../../data/longeval/parquet/[0m
├── [01;34mtest[0m
│   ├── [01;34m2023_06[0m
│   │   ├── [01;34mEnglish[0m
│   │   └── [01;34mFrench[0m
│   └── [01;34m2023_08[0m
│       ├── [01;34mEnglish[0m
│       └── [01;34mFrench[0m
└── [01;34mtrain[0m
    └── [01;34m2023_01[0m
        ├── [01;34mEnglish[0m
        └── [01;34mFrench[0m

11 directories, 0 files


In [3]:
# now lets use the opensearch python client to query the data
from opensearchpy import OpenSearch

client = OpenSearch(
    hosts="http://localhost:9200",
)
client.info()

{'name': '2f8e4a1ae462',
 'cluster_name': 'docker-cluster',
 'cluster_uuid': '_S2DVky8STOsUY_Fszn8qQ',
 'version': {'distribution': 'opensearch',
  'number': '2.18.0',
  'build_type': 'tar',
  'build_hash': '99a9a81da366173b0c2b963b26ea92e15ef34547',
  'build_date': '2024-10-31T19:08:39.157471098Z',
  'build_snapshot': False,
  'lucene_version': '9.12.0',
  'minimum_wire_compatibility_version': '7.10.0',
  'minimum_index_compatibility_version': '7.0.0'},
 'tagline': 'The OpenSearch Project: https://opensearch.org/'}

In [15]:
# add a default index template that sets replicas to 0
# also make an indexable contents and docid fields
client.indices.put_template(
    name="default",
    body={
        "index_patterns": ["*"],
        "settings": {
            "number_of_replicas": 0,
            "number_of_shards": 4,
        },
        "mappings": {
            "properties": {
                "contents": {
                    "type": "text",
                },
                "docid": {
                    "type": "keyword",
                },
            },
        },
    },
)

{'acknowledged': True}

In [21]:
# list all indices
print(client.cat.indices())

green open .opensearch-observability LLtJ98aVQVyG-PQgbAzMaQ 1 0     0     0    208b    208b
green open .plugins-ml-config        2PtKr88ZQpu8vS0ShuBOEw 1 0     1     0     4kb     4kb
green open test00                    _NlNEUFFQaenVOLPob5KnA 1 0 20000 20209 166.3mb 166.3mb



In [22]:
# delete test00 if it exists
if client.indices.exists(index="test00"):
    client.indices.delete(index="test00")

In [5]:
from longeval.collection import ParquetCollection
from longeval.spark import get_spark

spark = get_spark()

root = "../../data/longeval/parquet/"
collection = ParquetCollection(spark, f"{root}/train/2023_01/English")
collection.documents.show(n=5)

24/12/24 12:18:48 WARN Utils: Your hostname, daphne-major resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
24/12/24 12:18:48 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/12/24 12:18:48 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/12/24 12:18:48 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).
                                                                                

+--------------------+---------------+
|            contents|          docid|
+--------------------+---------------+
|WordPress WooComm...|doc012309400001|
|WHOIS Lookup\nCur...|doc012309400002|
|Bookstore page\nT...|doc012309400003|
|How to remove Get...|doc012309400004|
|Unreal Software\n...|doc012309400005|
+--------------------+---------------+
only showing top 5 rows



In [7]:
docs = collection.documents.limit(20_000).repartition(10).cache()
docs.count()

24/12/24 12:20:00 WARN CacheManager: Asked to cache already cached data.


20000

In [23]:
(
    docs.write.format("opensearch")
    .option("opensearch.nodes.wan.only", "true")
    .mode("overwrite")
    .save("test00")
)

                                                                                

In [24]:
(
    spark.read.format("opensearch")
    .option("opensearch.nodes.wan.only", "true")
    .load("test00")
).show(n=5)

+--------------------+---------------+
|            contents|          docid|
+--------------------+---------------+
|PHARMACIE D'ALMEI...|doc012309403358|
|News\n| eco.ted.f...|doc012309406574|
|Anime World Fanta...|doc012309903594|
|Editions ZOE / Ko...|doc012309408052|
|Eneco\nNatural Ga...|doc012309403637|
+--------------------+---------------+
only showing top 5 rows



In [25]:
# field capabilities
client.field_caps(index="test00", fields=["contents", "docid"])

{'indices': ['test00'],
 'fields': {'contents': {'text': {'type': 'text',
    'searchable': True,
    'aggregatable': False}},
  'docid': {'keyword': {'type': 'keyword',
    'searchable': True,
    'aggregatable': True}}}}

In [26]:
# show detaails of the test00 index
client.indices.get_mapping(index="test00")

{'test00': {'mappings': {'properties': {'contents': {'type': 'text'},
    'docid': {'type': 'keyword'}}}}}

In [27]:
# count rows
client.count(index="test00")

{'count': 20000,
 '_shards': {'total': 4, 'successful': 4, 'skipped': 0, 'failed': 0}}

In [31]:
# term query in contet for the word "game"
client.search(index="test00", body={"query": {"match": {"contents": "game"}}})

{'took': 45,
 'timed_out': False,
 '_shards': {'total': 4, 'successful': 4, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 1860, 'relation': 'eq'},
  'max_score': 5.2551217,
  'hits': [{'_index': 'test00',
    '_id': '9sFX-pMBwzD4aYsGvC1U',
    '_score': 5.2551217,
    '_source': {'contents': 'Flash Games\n- Panda Free Games Play\nFree Online Games\nat PandaFreeGames.net\nThe Best Games of the Galaxy\nShots in\nthe\nWar\nScenario Click\nto play\nFlash\nGame Play\nFree\nGame Category:\nFlash Games Name: Shots in the\nWar\nScenario Catch Birds Click to play\nFlash\nGame Play\nFree\nGame Category:\nFlash Games Name: Catch Birds Penalty Duel Click to play\nFlash\nGame Play\nFree\nGame Category:\nFlash Games Name: Penalty Duel Training to Fight\nClick\nto play\nFlash\nGame Play\nFree\nGame Category:\nFlash Games Name: Training to Fight Shooting on the Truck Click to play\nFlash\nGame Play\nFree\nGame Category:\nFlash Games Name: Shooting on the Truck Coloring 3 Drawings Click to p