# Creating a new dataset
### By slicing Wikipedia articles into sentences

In [7]:
import json
from nltk.tokenize import sent_tokenize
import nltk

# Download the Punkt tokenizer data used by sent_tokenize (only once;
# NLTK reports "already up-to-date" on later runs).
nltk.download('punkt')

# --- Load JSON file ---
with open("datasets/new dataset/new_data.json", "r", encoding="utf-8") as file:
    data = json.load(file)

# --- Extract sentences with IDs ---
# Only the first 10 records are processed. Every sentence receives a
# sequential, 1-based ID that keeps counting across records.
# The counter is called `sentence_id` (not `id`) so the built-in `id()`
# is not shadowed for the rest of the kernel session.
# NOTE(review): the text is tokenized as-is, so wiki section headings such
# as "==Route description==" stay glued to the following sentence.
sentences = []   # list of dicts: [{id, sentence}]
sentence_id = 0
for record in data[:10]:
    # Records without a "text" field contribute no sentences.
    text = record.get("text", "")
    for s in sent_tokenize(text):
        sentence_id += 1
        sentences.append({
            "id": sentence_id,
            "sentence": s
        })

# --- Print results ---
for item in sentences:
    print(item)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\gulji\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


{'id': 1, 'sentence': 'M-137 was a state trunkline highway in the US state of Michigan that served as a spur route to the Interlochen Center for the Arts and Interlochen State Park.'}
{'id': 2, 'sentence': 'It started south of the park and ran north between two lakes in the area and through the community of Interlochen to US Highway 31 (US 31) in Grand Traverse County.'}
{'id': 3, 'sentence': 'The highway was first shown without a number label on maps in 1930 and labeled after an extension the next year.'}
{'id': 4, 'sentence': "The highway's current routing was established in the 1950s."}
{'id': 5, 'sentence': 'Jurisdiction of the roadway was transferred from the Michigan Department of Transportation (MDOT) to the Grand Traverse County Road Commission in June 2020, and the highway designation was decommissioned in the process; signage was removed by August 2020 to reflect the changeover.'}
{'id': 6, 'sentence': '==Route description== M-137 began at the southern end of Interlochen Stat

### Saving sentences to file (unlabeled)

In [18]:
# Persist the extracted sentences as pretty-printed UTF-8 JSON
# (non-ASCII characters are written verbatim rather than escaped).
sentences_path = "datasets/new dataset/sentences.json"
serialized = json.dumps(sentences, indent=2, ensure_ascii=False)
with open(sentences_path, "w", encoding="utf-8") as out:
    out.write(serialized)


### Applying a Bayesian Gaussian mixture model to create labels for the sentences

In [19]:
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.mixture import BayesianGaussianMixture
import nltk

# No tokenizer download is needed in this cell: it only reads sentences that
# were already split and saved, so the redundant nltk.download call was removed.

# --- Step 1: Load JSON file ---
with open("datasets/new dataset/sentences.json", "r", encoding="utf-8") as file:
    data = json.load(file)

# --- Step 2: Extract text and IDs ---
texts = [item["sentence"] for item in data]
ids = [item["id"] for item in data]

# --- Step 3: Convert text to TF-IDF vectors ---
# English stop words are dropped; the sparse TF-IDF matrix is densified
# before being handed to the mixture model.
vectorizer = TfidfVectorizer(stop_words="english")
X = vectorizer.fit_transform(texts).toarray()

# --- Step 4: Apply Bayesian Gaussian Mixture Model ---
# The cluster index assigned to each sentence is used as its label.
bgmm = BayesianGaussianMixture(
    n_components=5,        # max clusters (model prunes unused ones)
    covariance_type="full",
    random_state=42        # fixed seed so the labels are reproducible
)
bgmm.fit(X)
labels = bgmm.predict(X)

# --- Step 5: Combine results ---
# ids, texts and labels are parallel sequences (one entry per sentence).
# int(...) converts the predicted labels to plain Python ints so that
# json.dump can serialize them.
results = [
    {"id": int(sentence_id), "text": text, "label": int(label)}
    for sentence_id, text, label in zip(ids, texts, labels)
]

# --- Step 6: Save to JSON file ---
with open("datasets/new dataset/labeled_sentences.json", "w", encoding="utf-8") as out:
    json.dump(results, out, indent=2, ensure_ascii=False)

# --- Step 7: Print a sample ---
for item in results[:10]:
    print(item)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\gulji\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


{'id': 1, 'text': 'M-137 was a state trunkline highway in the US state of Michigan that served as a spur route to the Interlochen Center for the Arts and Interlochen State Park.', 'label': 3}
{'id': 2, 'text': 'It started south of the park and ran north between two lakes in the area and through the community of Interlochen to US Highway 31 (US 31) in Grand Traverse County.', 'label': 3}
{'id': 3, 'text': 'The highway was first shown without a number label on maps in 1930 and labeled after an extension the next year.', 'label': 3}
{'id': 4, 'text': "The highway's current routing was established in the 1950s.", 'label': 3}
{'id': 5, 'text': 'Jurisdiction of the roadway was transferred from the Michigan Department of Transportation (MDOT) to the Grand Traverse County Road Commission in June 2020, and the highway designation was decommissioned in the process; signage was removed by August 2020 to reflect the changeover.', 'label': 3}
{'id': 6, 'text': '==Route description== M-137 began at 

In [8]:
from pathlib import Path

file_path = "datasets/new dataset/labeled_sentences.json"

# Reload the labeled sentences from disk.
with open(file_path, "r", encoding="utf-8") as file:
    data = json.load(file)

# Sanity check: print the IDs of the first 10 labeled records
# (None is printed for a record that has no "id" key).
for record in data[:10]:
    record_id = record.get("id", None)
    print(record_id)

1
2
3
4
5
6
7
8
9
10
