In [42]:
import bz2
import json
import requests
from io import BytesIO
import csv

test_range = range(5)
set_range = range(51,91)


# Shard URLs
shards = [
    f"https://huggingface.co/datasets/HaifaCLGroup/KnessetCorpus/resolve/main/protocols_sentences/committee_full_sentences_shards_bzip2_files/committee_full_sentences_shard_{i:02d}.jsonl.bz2"
    for i in set_range
]

# Keywords to search for
refugee_keywords = ["פליטים", "מסתננים","פליט ","מסתנן","מבקשי מקלט","מבקש מקלט","אריתריאה","אריתראה","סודן","סודאן","סודני","סודאני","אריתראי","דרפור"]
violence_keywords = ["מסומם", "מסוממ","תקיפה","לאנוס","אנס","אונס","רצח","סמים","פשיעה","פשע","אלימות"]
exclude = ["חממה","גזים","גזי","ג'נין","שועפאט","שועפט","מכונית","דיזל","בנזין","כלי רכב","מחנה פליטים","מחנה הפליטים"]


# Output CSV file
output_file = "knesset_filtered_sentences_3.csv"

# Open CSV   file for writing
with open(output_file, "w", newline="", encoding="utf-8") as csvfile:
    fieldnames = ["shard", "sentence_id", "speaker_name", "sentence_text", "committee"]
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    # Iterate over all shards
    for shard_idx, url in enumerate(shards):
        shard_name = f"shard_{shard_idx:02d}"
        print(f"Processing {shard_name}: {url}")

        try:
            resp = requests.get(url, stream=True)
            resp.raise_for_status()
        except requests.RequestException as e:
            print(f"Failed to download shard {url}: {e}")
            continue

        # Open bz2 stream
        with bz2.open(BytesIO(resp.content), "rt", encoding="utf-8") as f:
            for line_idx, line in enumerate(f, 1):
                line = line.strip()
                if not line:
                    continue
                try:
                    example = json.loads(line)
                except json.JSONDecodeError:
                    continue  # skip malformed lines

                text = example.get("sentence_text", "")
                if any(k in text for k in refugee_keywords) and any(k in text for k in violence_keywords):
                    if not any(k in text for k in exclude):
                        writer.writerow({
                            "shard": shard_name,
                            "sentence_id": example.get("sentence_id", ""),
                            "speaker_name": example.get("speaker_name", ""),
                            "sentence_text": text
                            #"committee": example.get("committee_name", "")
                        })

        print(f"Finished {shard_name}")

Processing shard_00: https://huggingface.co/datasets/HaifaCLGroup/KnessetCorpus/resolve/main/protocols_sentences/committee_full_sentences_shards_bzip2_files/committee_full_sentences_shard_51.jsonl.bz2
Finished shard_00
Processing shard_01: https://huggingface.co/datasets/HaifaCLGroup/KnessetCorpus/resolve/main/protocols_sentences/committee_full_sentences_shards_bzip2_files/committee_full_sentences_shard_52.jsonl.bz2
Finished shard_01
Processing shard_02: https://huggingface.co/datasets/HaifaCLGroup/KnessetCorpus/resolve/main/protocols_sentences/committee_full_sentences_shards_bzip2_files/committee_full_sentences_shard_53.jsonl.bz2
Finished shard_02
Processing shard_03: https://huggingface.co/datasets/HaifaCLGroup/KnessetCorpus/resolve/main/protocols_sentences/committee_full_sentences_shards_bzip2_files/committee_full_sentences_shard_54.jsonl.bz2
Finished shard_03
Processing shard_04: https://huggingface.co/datasets/HaifaCLGroup/KnessetCorpus/resolve/main/protocols_sentences/committee_f