In [1]:
import os
import glob
import re
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

def clean_text(text):
    """Remove punctuation while keeping only Devanagari words and spaces."""
    text = re.sub(r'[^\u0900-\u097F\s]', '', text)  # Keep only Devanagari characters and spaces
    text = re.sub(r'[।॥]', '', text)  # Remove Devanagari punctuation marks
    return text


def custom_tokenizer(text):
    """Tokenize text into words while preserving full words."""
    return text.split()  # Simple space-based splitting to keep full words intact

def build_word_count_dictionary(folder_path):
    documents = []
    file_names = glob.glob(os.path.join(folder_path, "*.txt"))[:20]  # Limit to first 20 files

    for file_path in file_names:
        with open(file_path, "r", encoding="utf-8") as f:
            text = f.read()
            cleaned_text = clean_text(text)  # Remove punctuation before tokenization
            documents.append(cleaned_text)

    if not documents:
        return {}

    vectorizer = CountVectorizer(analyzer='word', tokenizer=custom_tokenizer)  # Use custom tokenizer
    word_count_matrix = vectorizer.fit_transform(documents)

    word_counts = np.asarray(word_count_matrix.sum(axis=0)).flatten()
    word_freq_dict = dict(zip(vectorizer.get_feature_names_out(), word_counts))
    # print('word_freq_dict', word_freq_dict)

    sorted_word_freq = dict(sorted(word_freq_dict.items(), key=lambda item: item[1], reverse=True))
    # print('sorted_word_freq', sorted_word_freq)

    # print("Word count dictionary generated for first 20 files (excluding punctuation and preserving full words).")

    return sorted_word_freq



CORPUS_FOLDER = "/Users/chandra/Documents/4th sem/sentiment_analysis/archive/train/train"
WORD_COUNT_DICTIONARY = build_word_count_dictionary(CORPUS_FOLDER)
WORD_COUNT_DICTIONARY



{'र': np.int64(49),
 'थियो': np.int64(22),
 'हो': np.int64(22),
 'पनि': np.int64(20),
 'यो': np.int64(17),
 'भएको': np.int64(15),
 'एक': np.int64(14),
 'छन्': np.int64(13),
 'तथा': np.int64(13),
 'थिए': np.int64(13),
 'छ': np.int64(11),
 'रूपमा': np.int64(11),
 'नै': np.int64(10),
 'द्वारा': np.int64(9),
 'मा': np.int64(9),
 'लागि': np.int64(9),
 'हात्ती': np.int64(9),
 'उनका': np.int64(8),
 'पाश्चात्य': np.int64(8),
 'प्रधानमन्त्री': np.int64(8),
 'राणा': np.int64(8),
 'विसं': np.int64(8),
 'सबैभन्दा': np.int64(8),
 'उनको': np.int64(7),
 'उनले': np.int64(7),
 'उनी': np.int64(7),
 'नेपालमा': np.int64(7),
 'शिक्षाको': np.int64(7),
 'सन्': np.int64(7),
 'हुन्': np.int64(7),
 'आफ्नो': np.int64(6),
 'गोल': np.int64(6),
 'पहिलो': np.int64(6),
 'रहेको': np.int64(6),
 'हात्तीहरू': np.int64(6),
 'हुन्छ': np.int64(6),
 'होइन': np.int64(6),
 'अधिकार': np.int64(5),
 'ऐन': np.int64(5),
 'गरे': np.int64(5),
 'जंगबहादुर': np.int64(5),
 'ठूलो': np.int64(5),
 'दुई': np.int64(5),
 'नेपाल': np.int64(5),

In [1]:
import os
import glob
import re
import numpy as np
import pickle
from sklearn.feature_extraction.text import CountVectorizer

PICKLE_FILE = "/Users/chandra/Documents/4th sem/sentiment_analysis/dictionary/word_count.pkl"  # Path to store the frequency dictionary

def clean_text(text):
    """Remove punctuation while keeping only Devanagari words and spaces."""
    text = re.sub(r'[^\u0900-\u097F\s]', '', text)  # Keep only Devanagari characters and spaces
    text = re.sub(r'[।॥]', '', text)  # Remove Devanagari punctuation marks
    return text


def custom_tokenizer(text):
    """Tokenize text into words while preserving full words."""
    return text.split()  # Simple space-based splitting to keep full words intact

def load_existing_data():
    """Load word frequency dictionary from a pickle file if it exists."""
    if os.path.exists(PICKLE_FILE):
        with open(PICKLE_FILE, "rb") as f:
            return pickle.load(f)
    return {}  # Return empty dictionary if no existing data

def save_data(data):
    """Save the updated word frequency dictionary to a pickle file."""
    with open(PICKLE_FILE, "wb") as f:
        pickle.dump(data, f)

def build_word_count_dictionary(folder_path):
    existing_word_count = load_existing_data()  # Load existing data

    documents = []
    file_names = glob.glob(os.path.join(folder_path, "*.txt"))  # Limit to first 20 files

    for file_path in file_names:
        with open(file_path, "r", encoding="utf-8") as f:
            text = f.read()
            cleaned_text = clean_text(text)  # Remove punctuation before tokenization
            documents.append(cleaned_text)

    # if not documents:
    #     return existing_word_count  # Return existing data if no new documents found

    vectorizer = CountVectorizer(analyzer='word', tokenizer=custom_tokenizer)  # Use custom tokenizer
    word_count_matrix = vectorizer.fit_transform(documents)

    word_counts = np.asarray(word_count_matrix.sum(axis=0)).flatten()
    new_word_freq_dict = dict(zip(vectorizer.get_feature_names_out(), word_counts))

    # Merge existing and new word counts
    for word, count in new_word_freq_dict.items():
        existing_word_count[word] = existing_word_count.get(word, 0) + count

    # Sort by frequency (descending order)
    sorted_word_freq = dict(sorted(existing_word_count.items(), key=lambda item: item[1], reverse=True))

    # Save the updated dictionary
    save_data(sorted_word_freq)

    print("Word count dictionary updated and saved.")

    return sorted_word_freq

# Usage
CORPUS_FOLDER = "/Users/chandra/Documents/4th sem/sentiment_analysis/archive/train/train"
WORD_COUNT_DICTIONARY = build_word_count_dictionary(CORPUS_FOLDER)
len(WORD_COUNT_DICTIONARY)
WORD_COUNT_DICTIONARY



Word count dictionary updated and saved.


{'र': np.int64(71188),
 'छ': np.int64(40288),
 'हो': np.int64(38885),
 'यो': np.int64(29268),
 'पनि': np.int64(28314),
 'एक': np.int64(23869),
 'थियो': np.int64(19582),
 'यस': np.int64(18421),
 'मा': np.int64(17689),
 'तथा': np.int64(17604),
 'भएको': np.int64(17383),
 'थिए': np.int64(14992),
 'छन्': np.int64(14750),
 'रहेको': np.int64(13554),
 'को': np.int64(12720),
 'लागि': np.int64(12437),
 'भने': np.int64(11554),
 'हुन्छ': np.int64(11381),
 'गर्ने': np.int64(9930),
 'आफ्नो': np.int64(9446),
 'रूपमा': np.int64(9303),
 'गर्न': np.int64(9063),
 'वा': np.int64(8942),
 'हुन्': np.int64(8546),
 'नेपालको': np.int64(8316),
 'नेपाली': np.int64(7924),
 'सन्': np.int64(7880),
 'नै': np.int64(7675),
 'विकास': np.int64(7671),
 'गरेको': np.int64(7297),
 'उनले': np.int64(7209),
 'यसको': np.int64(6936),
 'गरिएको': np.int64(6804),
 'तर': np.int64(6703),
 'गरेका': np.int64(6584),
 'नेपाल': np.int64(6422),
 'उनी': np.int64(6122),
 'अनुसार': np.int64(5989),
 'गरे': np.int64(5935),
 'भन्ने': np.int64(59

In [19]:
len(WORD_COUNT_DICTIONARY)

240683

In [21]:
WORD_COUNT_DICTIONARY

{'र': np.int64(71188),
 'छ': np.int64(40288),
 'हो': np.int64(38885),
 'यो': np.int64(29268),
 'पनि': np.int64(28314),
 'एक': np.int64(23869),
 'थियो': np.int64(19582),
 'यस': np.int64(18421),
 'मा': np.int64(17689),
 'तथा': np.int64(17604),
 'भएको': np.int64(17383),
 'थिए': np.int64(14992),
 'छन्': np.int64(14750),
 'रहेको': np.int64(13554),
 'को': np.int64(12720),
 'लागि': np.int64(12437),
 'भने': np.int64(11554),
 'हुन्छ': np.int64(11381),
 'गर्ने': np.int64(9930),
 'आफ्नो': np.int64(9446),
 'रूपमा': np.int64(9303),
 'गर्न': np.int64(9063),
 'वा': np.int64(8942),
 'हुन्': np.int64(8546),
 'नेपालको': np.int64(8316),
 'नेपाली': np.int64(7924),
 'सन्': np.int64(7880),
 'नै': np.int64(7675),
 'विकास': np.int64(7671),
 'गरेको': np.int64(7297),
 'उनले': np.int64(7209),
 'यसको': np.int64(6936),
 'गरिएको': np.int64(6804),
 'तर': np.int64(6703),
 'गरेका': np.int64(6584),
 'नेपाल': np.int64(6422),
 'उनी': np.int64(6122),
 'अनुसार': np.int64(5989),
 'गरे': np.int64(5935),
 'भन्ने': np.int64(59