In [1]:
import re
import string
import pandas as pd
from tqdm import tqdm
tqdm.pandas()
import multiprocessing
import json
from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline
from optimum.onnxruntime import ORTModelForTokenClassification
import concurrent.futures
import os

2024-08-03 21:23:35.818563: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-03 21:23:35.818625: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-03 21:23:35.818637: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-08-03 21:23:35.824531: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Reading the CSV file

In [2]:
df = pd.read_csv("train_daily.csv")
df.shape

(8433, 1)

In [3]:
df.columns

Index(['correct_sentence'], dtype='object')

### Preprocessing the datframe

In [4]:
import re
import string

def preprocess_hindi(data, column_name):
    new_column = 'pre_process_' + str(column_name)
    
    # Create a copy of the original column
    data[new_column] = data[column_name].copy()
    
    # Remove URLs
    data[new_column] = data[new_column].apply(lambda x: re.sub(r'(https|http)?:\/\/\S+|www\.\S+', ' ', str(x)))
    print("Removed URLs")
    
    # Remove HTML tags
    data[new_column] = data[new_column].apply(lambda x: re.sub(r'<.*?>', ' ', str(x)))
    print("Removed HTML Tags")
    
    # Remove \n\t by space
    data[new_column] = data[new_column].apply(lambda x: re.sub(r'[\n\r\t]', ' ', str(x)))
    print("Removed tabs and new lines")
    
    # Define Hindi characters range and additional characters to keep
    hindi_pattern = r'[\u0900-\u097F\u0020\u0964\u0965\u0966-\u096F]'
    
    # Keep only Hindi characters and specified punctuation
    data[new_column] = data[new_column].apply(lambda x: ''.join(re.findall(hindi_pattern, str(x))))
    print("Kept only Hindi characters and specified punctuation")
    
    # Remove extra spaces
    data[new_column] = data[new_column].apply(lambda x: re.sub(r'\s+', ' ', str(x)).strip())
    print("Removed extra spaces")
    
    data.dropna(subset=[new_column], inplace=True)
    data.drop_duplicates(subset=[new_column], inplace=True)
    data.reset_index(inplace=True, drop=True)
    
    return data

In [5]:
preprocess_hindi(df, 'correct_sentence')

Removed URLs
Removed HTML Tags
Removed tabs and new lines
Kept only Hindi characters and specified punctuation
Removed extra spaces


Unnamed: 0,correct_sentence,pre_process_correct_sentence
0,जगदीश के नए ईमेल का जवाब दें,जगदीश के नए ईमेल का जवाब दें
1,मुझे एक चरनी चाहिए,मुझे एक चरनी चाहिए
2,walmart को tweet भेजो,को भेजो
3,फ़ोन नंबर 6180028469 और MPIN 8519 का उपयोग करक...,फ़ोन नंबर और का उपयोग करके पोर्टल पर लॉग इन करें
4,olly ओवन में आलू को किस टेंपरेचर पर पकाएं,ओवन में आलू को किस टेंपरेचर पर पकाएं
...,...,...
7856,बाद के लिए ईमेल सहेजें,बाद के लिए ईमेल सहेजें
7857,बसना का क्या मतलब है,बसना का क्या मतलब है
7858,राज्य तमिल नाडु में मेडिकल दूकान खोजने के लिए ...,राज्य तमिल नाडु में मेडिकल दूकान खोजने के लिए ...
7859,मुझे कल सुबह साढ़े पांच बजे का अलार्म चाहिए,मुझे कल सुबह साढ़े पांच बजे का अलार्म चाहिए


In [6]:
# Droping the rows where the length is less than 3
df = df[df['pre_process_correct_sentence'].apply(lambda x: len(x.split()) >= 3)]

### Masking the sentence

In [7]:
# Load the ONNX model for NER Classification
tokenizer = AutoTokenizer.from_pretrained("NER_model")
model = ORTModelForTokenClassification.from_pretrained('NER_model')

In [8]:
ner_pipeline = pipeline('token-classification', model=model, tokenizer=tokenizer, aggregation_strategy='max')

In [9]:
# Define the function to get the predictions
def get_predictions(sentence):
    """
    Process a single sentence and replace named entities with 'MASK'.
    
    :param sentence: A string containing the sentence to process
    :return: A string with named entities replaced by 'MASK'
    """
    # Process the sentence using the NER pipeline
    entities = ner_pipeline(sentence)
    
    # Create masked sentence
    masked_sentence = sentence
    for item in sorted(entities, key=lambda x: x['start'], reverse=True):
        start, end = item['start'], item['end']
        masked_sentence = masked_sentence[:start] + "MASK" + masked_sentence[end:]
    
    return masked_sentence


In [10]:
def process_row(row):
    return get_predictions(row['pre_process_correct_sentence'])

In [11]:
def main():
    # Read the DataFrame
    # df = pd.read_csv("hindi_wiki.csv")

    # Get the number of CPUs and set the number of workers to use
    num_cpus = os.cpu_count()
    num_workers = int(num_cpus * 0.8)
    
    # Apply the function to the DataFrame using multiprocessing
    with concurrent.futures.ProcessPoolExecutor(max_workers=num_workers) as executor:
        # Create a progress bar
        tqdm.pandas(desc="Processing")
        
        # Apply the function to each row in parallel
        df['Masked_sentence'] = list(tqdm(executor.map(process_row, [row for _, row in df.iterrows()]), total=len(df)))

    # Save the updated DataFrame
    # df.to_csv("hindi_wiki.csv", index=False)

if __name__ == "__main__":
    main()

100%|██████████| 7568/7568 [00:23<00:00, 326.02it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Masked_sentence'] = list(tqdm(executor.map(process_row, [row for _, row in df.iterrows()]), total=len(df)))


### Generate trigrams

In [12]:
def generate_trigrams(text, word_to_combine):
    try:
        words = text.split()
        output = []
        for i in range(len(words)- word_to_combine+1):
            output.append(tuple(words[i:i+word_to_combine]))
        return output
    except TypeError:
        return None

In [13]:
df['trigrams'] = df['Masked_sentence'].progress_apply(lambda x: generate_trigrams(x, 3))

Processing: 100%|██████████| 7568/7568 [00:00<00:00, 244135.46it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['trigrams'] = df['Masked_sentence'].progress_apply(lambda x: generate_trigrams(x, 3))


In [14]:
trigram_list = df['trigrams'].tolist()
flattened_list = list({item for sublist in trigram_list for item in sublist})

## Saving the JSON file

In [15]:
# # Convert list of tuples to list of lists
# data_list_of_lists = [list(item) for item in flattened_list]

# # Define the file path
# file_path = 'data.json'

# # Write the list of lists to a JSON file
# with open(file_path, 'w') as json_file:
#     json.dump(data_list_of_lists, json_file)

# print(f'Data has been saved to {file_path}')

In [16]:
# # Function to read the JSON file
# def read_json(file_path):
#     try:
#         with open(file_path, 'r') as json_file:
#             data_list_of_lists = json.load(json_file)
#         return [tuple(item) for item in data_list_of_lists]
#     except FileNotFoundError:
#         return []

# # Function to write data to the JSON file
# def write_json(file_path, data):
#     data_list_of_lists = [list(item) for item in data]
#     with open(file_path, 'w') as json_file:
#         json.dump(data_list_of_lists, json_file)

# # Function to append new data to the JSON file
# def append_to_json(file_path, new_data):
#     # Read existing data
#     data = read_json(file_path)
    
#     # Append new data (if it's not already present)
#     for item in tqdm(new_data):
#         if item not in data:
#             data.append(item)
    
#     # Write the updated data back to the JSON file
#     write_json(file_path, data)

In [17]:
# file_path = 'data.json'

In [18]:
# # Append new data to the JSON file
# append_to_json(file_path, flattened_list)

# print(f'New data has been appended to {file_path}')

In [19]:
def read_json(file_path):
    try:
        with open(file_path, 'r') as json_file:
            data_list_of_lists = json.load(json_file)
        return [tuple(item) for item in data_list_of_lists]
    except FileNotFoundError:
        return []

def write_json(file_path, data):
    data_list_of_lists = [list(item) for item in data]
    with open(file_path, 'w') as json_file:
        json.dump(data_list_of_lists, json_file)

def process_chunk(args):
    chunk, existing_data = args
    return [item for item in chunk if item not in existing_data]

def append_to_json(file_path, new_data):
    # Read existing data
    existing_data = set(read_json(file_path))
    
    # Determine the number of CPU cores to use (80% of available cores)
    num_cores = max(1, int(multiprocessing.cpu_count() * 0.8))
    
    # Split new_data into chunks
    chunk_size = max(1, len(new_data) // num_cores)
    chunks = [new_data[i:i + chunk_size] for i in range(0, len(new_data), chunk_size)]
    
    # Prepare arguments for process_chunk
    args = [(chunk, existing_data) for chunk in chunks]
    
    # Process chunks in parallel
    with multiprocessing.Pool(num_cores) as pool:
        results = list(tqdm(pool.imap(process_chunk, args), 
                            total=len(chunks), desc="Processing chunks"))
    
    # Combine results
    new_items = [item for sublist in results for item in sublist]
    
    # Append new items to existing data
    updated_data = list(existing_data) + new_items
    
    # Write the updated data back to the JSON file
    write_json(file_path, updated_data)

In [20]:
# Example usage
if __name__ == '__main__':
    file_path = 'data1.json'
    # new_data = [(1, 2, 3), (4, 5, 6), (7, 8, 9)]  # Example new data
    append_to_json(file_path, flattened_list)

Processing chunks: 100%|██████████| 39/39 [00:00<00:00, 806.87it/s]
