# Step 1 - Loading JSON data, creating a data class in Pydantic, and data cleaning: removing stopwords and punctuation.

In [None]:
import json
import string
import unicodedata
from typing import Optional

import nltk
import pandas as pd
from nltk.corpus import stopwords as nltk_stopwords
from nltk.tokenize import word_tokenize
from pydantic import BaseModel

# Making sure the NLTK stopwords corpus (used below for the Kazakh list) is
# available. The download is only attempted when the corpus is not installed
# yet, so reruns and offline sessions do not need to reach the NLTK server.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')


In [None]:
# Defining Pydantic data class for token information
class TokenData(BaseModel):
    """Validated record describing one token kept after cleaning.

    Attributes are filled from the parsed-sentence JSON: ``lemma`` and
    ``pos`` from the keys of the same name, ``morph_features`` from the
    ``feats`` key and ``wordform`` from the ``text`` key.
    """

    lemma: str
    pos: str
    # An explicit default is required for the field to be truly optional:
    # under Pydantic v2, ``Optional[str]`` without a default only allows
    # ``None`` as a value but still makes the field mandatory.
    morph_features: Optional[str] = None
    wordform: str


In [None]:
# Reading the parsed-sentences JSON file from disk into `data`
input_path = r'C:\Users\monik\OneDrive\Desktop\job\sample_parsed_sentences.json'
with open(input_path, mode='r', encoding='utf-8') as source:
    data = json.loads(source.read())


In [None]:
# Lookup sets used for filtering: ASCII punctuation characters and the
# Kazakh stopword list shipped with NLTK
punctuation = {*string.punctuation}
stopwords_kz = {*nltk_stopwords.words('kazakh')}

# Empty container that will collect the TokenData instances
token_instances = []


In [None]:
# Initializing the list that stores the TokenData instances
token_instances = []


def _is_punctuation(text):
    """Return True when ``text`` is non-empty and consists only of punctuation.

    A character counts as punctuation when it is in the ASCII ``punctuation``
    set or when its Unicode general category starts with "P". The second test
    catches marks outside ASCII (for example « » — …), and checking every
    character also catches multi-character tokens such as "...".
    """
    return bool(text) and all(
        char in punctuation or unicodedata.category(char).startswith('P')
        for char in text
    )


# Iterating through sentences and tokens to create instances
for sentence in data['sentences']:
    for token in sentence['tokens']:
        # Skip Kazakh stopwords (matched on the lower-cased lemma) and
        # tokens made up purely of punctuation
        if token['lemma'].lower() in stopwords_kz or _is_punctuation(token['text']):
            continue
        token_instances.append(
            TokenData(
                lemma=token['lemma'],
                pos=token.get('pos', ""),
                morph_features=token.get('feats', ""),
                wordform=token['text'],
            )
        )

# Printing the generated token instances
for token_instance in token_instances:
    print(token_instance.json())


In [None]:
# Converting the list of token instances to a DataFrame.
# The columns are named explicitly (in TokenData field order) so that the
# frame still has a 'lemma' and a 'wordform' column when no token survived
# the filtering; without them the later groupby calls raise KeyError.
token_columns = ['lemma', 'pos', 'morph_features', 'wordform']
df = pd.DataFrame(
    [token_instance.dict() for token_instance in token_instances],
    columns=token_columns,
)


# Step 2 - Data aggregation and creating JSON file output.

In [None]:
# Counting how many tokens share each lemma
lemma_counts = df.groupby('lemma').size()
lemma_freq = lemma_counts.rename('total_frequency').reset_index()

# Counting how many tokens share each (lemma, wordform) pair
pair_counts = df.groupby(['lemma', 'wordform']).size()
wordform_freq = pair_counts.rename('wordform_frequency').reset_index()


In [None]:
# Joining the per-lemma totals onto the per-wordform counts (inner join on 'lemma')
output_df = pd.merge(lemma_freq, wordform_freq, how='inner', on='lemma')


In [None]:
# Building one JSON-ready record per lemma: its wordforms with their counts,
# followed by the lemma's total frequency
output_json = []
for lemma_value, lemma_rows in output_df.groupby('lemma'):
    # int() turns the pandas/NumPy integers into plain Python ints for json
    forms = [
        {"wordform": form, "wordform_frequency": int(count)}
        for form, count in zip(lemma_rows['wordform'], lemma_rows['wordform_frequency'])
    ]
    # total_frequency is repeated on every row of the group, so the first one suffices
    output_json.append({
        "lemma": lemma_value,
        "wordforms": forms,
        "total_frequency": int(lemma_rows['total_frequency'].iloc[0]),
    })


In [None]:
# Showing the aggregated structure in the notebook as indented, non-ASCII-escaped JSON
rendered_json = json.dumps(output_json, indent=4, ensure_ascii=False)
print(rendered_json)


In [None]:
# Saving the aggregated structure to output.json (UTF-8, indented)
with open('output.json', mode='w', encoding='utf-8') as sink:
    sink.write(json.dumps(output_json, indent=4, ensure_ascii=False))
