In [1]:
import re
from collections import defaultdict

def _dedupe_vocabulary(content):
    """Remove repeated vocabulary entries from ``content`` and renumber the rest.

    The text is scanned line by line:

    * a line consisting solely of ``**text**`` starts a new section whose
      header is ``text``;
    * a line shaped like ``<digits>. **word** - translation`` is an entry;
    * every other line (blank lines, free text) is dropped.

    Two entries are duplicates when both word and translation are equal after
    lower-casing and stripping surrounding whitespace; the first occurrence
    wins. Sections left without any entry are omitted together with their
    header.

    Returns:
        tuple: ``(cleaned_text, duplicates, total_entries)`` where
        ``duplicates`` is a list of ``(word, translation, original_number)``
        tuples in the order they were encountered.
    """
    header_pattern = re.compile(r'\*\*([^*]+)\*\*')
    entry_pattern = re.compile(r'(\d+)\. \*\*([^*]+)\*\* - (.+)')

    seen_entries = set()
    duplicates = []

    # Each section is (header_or_None, [(word, translation), ...]). The first
    # one collects entries that appear before any header.
    current_entries = []
    sections = [(None, current_entries)]

    for raw_line in content.splitlines():
        line = raw_line.strip()
        if not line:
            continue

        # Headers are recognised per line, so the bold word inside an entry
        # can never be mistaken for a section header.
        header_match = header_pattern.fullmatch(line)
        if header_match:
            current_entries = []
            sections.append((header_match.group(1), current_entries))
            continue

        entry_match = entry_pattern.match(line)
        if entry_match is None:
            continue

        number, word, translation = entry_match.groups()
        entry_key = f"{word.lower().strip()} - {translation.lower().strip()}"
        if entry_key in seen_entries:
            duplicates.append((word, translation, number))
            continue

        seen_entries.add(entry_key)
        current_entries.append((word, translation))

    # Rebuild the document with one running counter across all sections.
    blocks = []
    total_entries = 0
    for header, entries in sections:
        if not entries:
            continue
        block_lines = [] if header is None else [f"**{header}**"]
        for word, translation in entries:
            total_entries += 1
            block_lines.append(f"{total_entries}. **{word}** - {translation}")
        blocks.append('\n'.join(block_lines))

    cleaned_text = '\n\n'.join(blocks)
    if cleaned_text:
        cleaned_text += '\n'
    return cleaned_text, duplicates, total_entries


def clean_vocabulary_file(input_file, output_file):
    """De-duplicate a numbered vocabulary list and write the renumbered result.

    The input is a UTF-8 text file containing optional section headers
    (a line of the form ``**Header**``) and entries of the form
    ``N. **word** - translation``. Entries whose word and translation both
    repeat an earlier entry (case-insensitively) are removed, the remaining
    entries are renumbered consecutively from 1 across all sections, and each
    kept section is written as its header followed by its entries, with a
    blank line between sections.

    Args:
        input_file: Path of the vocabulary file to read.
        output_file: Path of the cleaned file to write (overwritten).

    Returns:
        None. A summary (removed duplicates, output path, number of entries)
        is printed. If a file cannot be read, decoded or written, the error
        message is printed instead and the function returns.
    """
    try:
        with open(input_file, 'r', encoding='utf-8') as f:
            content = f.read()

        cleaned_text, duplicates_found, total_entries = _dedupe_vocabulary(content)

        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(cleaned_text)
    except (OSError, UnicodeError) as e:
        # Only I/O and decoding problems are reported this way; programming
        # errors are allowed to surface instead of being hidden.
        print(f"An error occurred: {str(e)}")
        return

    # Print summary
    print("Processing complete!")
    print(f"Found {len(duplicates_found)} duplicates:")
    for word, translation, number in duplicates_found:
        print(f"Removed duplicate: {number}. **{word}** - {translation}")
    print(f"\nCleaned file saved as: {output_file}")
    print(f"Total entries in cleaned file: {total_entries}")

# Example usage: read the raw vocabulary list and write a de-duplicated copy.
input_file, output_file = "Thanglish.txt", "cleaned_vocabulary.txt"
clean_vocabulary_file(input_file=input_file, output_file=output_file)

Processing complete!
Found 1 duplicates:
Removed duplicate: 352. **police** - Police

Cleaned file saved as: cleaned_vocabulary.txt
Total entries in cleaned file: 424


In [3]:
import pandas as pd 
# Load the vocabulary CSV; the next cell reads its 'tanglish' column.
df = pd.read_csv(filepath_or_buffer="tanglish_vocabulary.csv")

In [8]:
# Distinct values of the 'tanglish' column, in order of first appearance.
uni_words = pd.unique(df.loc[:, 'tanglish'])

# Write them out as a one-column CSV without the index.
pd.DataFrame(data={'tanglish': uni_words}).to_csv(
    path_or_buf='thanglish_words.csv',
    index=False,
)

In [14]:
import pandas as pd

# Read phrases.csv without a header row so that every cell is treated as data.
df = pd.read_csv("phrases.csv", header=None)

# Collapse the grid into one 1-D array, then drop the empty (NaN) cells and
# renumber the remaining values from zero.
df_flat = df.to_numpy().flatten()
df_flat = pd.Series(df_flat).dropna().reset_index(drop=True)

# Single-column frame named 'phrases'.
df_phrases = df_flat.to_frame(name='phrases')

# Persist the result without the index column.
df_phrases.to_csv(path_or_buf='phrases_cleaned.csv', index=False)

In [16]:
# Summary statistics for the 'phrases' column, shown as the cell output.
df_phrases.describe()

Unnamed: 0,phrases
count,100
unique,100
top,Naan kadaikku vengayam vaangaren
freq,1


In [17]:
# Read phrases_cleaned.csv back in; its first row is used as the header.
df_1 = pd.read_csv(filepath_or_buffer="phrases_cleaned.csv")

In [2]:
import pandas as pd 
# Read both files as header-less, single-column tables.
ground_truth_df = pd.read_csv("ground_truth.csv", header=None, names=['phrases'])
tamil_dict_df = pd.read_csv("tamil_dictionary.csv", header=None, names=['tanglish'])

# Every distinct whitespace-separated token found in the ground-truth phrases.
ground_truth_words = set(ground_truth_df['phrases'].str.split(expand=True).stack().unique())

# Words already present in the dictionary.
tamil_dict_words = set(tamil_dict_df['tanglish'].unique())

# Words in ground_truth that are not in tamil_dictionary.
missing_words = ground_truth_words - tamil_dict_words

# Add all missing words in one concatenation. DataFrame.append no longer
# exists in pandas 2.x, and growing a frame row by row is quadratic.
# Sorting makes the order of the new rows reproducible between runs.
if missing_words:
    missing_df = pd.DataFrame({'tanglish': sorted(missing_words)})
    tamil_dict_df = pd.concat([tamil_dict_df, missing_df], ignore_index=True)

# Overwrite tamil_dictionary.csv in the same header-less format it was read in.
tamil_dict_df.to_csv("tamil_dictionary.csv", index=False, header=False)
