#### Import modules

In [7]:
import sys, os, json, re, zipfile, csv
import pandas as pd

In [None]:
class Util():
    def __init__(self) -> None:
        self.emoji_pattern = re.compile(r"[\U0001F000-\U0001F9FF\U0001FA00-\U0001FFFF\U00020000-\U0002FFFF\U00030000-\U0003FFFF]+", flags=re.UNICODE)

        
        self.symbols = re.compile("["
                                  "\""
                                  "\“"
                                  "\""
                                  "\'"
                                  "\-"
                                  "\*"
                                  "\•"
                                  "\ℹ"
                                  "\﻿"
                                  "\_"
                                  "]+")
        self.url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
        self.mention_pattern = r'@(\w+)'

    def read_file(self, file_path: str) -> dict:
        # Open the file for reading
        with open(file_path, 'r') as file:
            # Load the JSON data from the file
            data = json.load(file)
            return data

    def write_file(self, file_path: str, data: dict) -> None:
        # Open the file for writing
        with open(file_path, 'w') as file:
            # Dump the JSON data to the file
            json.dump(data, file, indent=2)

    def parse_text(self, text: any) -> str:
        if isinstance(text, str):
            return text
        elif isinstance(text, list):
            contents = []
            for item in text:
                if isinstance(item, str):
                    contents.append(item)
                elif isinstance(item, dict):
                    contents.append(item['text'])
            return "".join(contents)
        else:
            return ""

    def parse_messages(self, messages: list) -> dict:
        parsed_messages = {
            'id': [],
            'text': [],
            'date': []
        }
        for message in messages:
            if message['type'] != 'message' or len(message['text']) == 0:
                continue
            parsed_messages['id'].append(message['id'])
            message_content = self.parse_text(message['text'])
            parsed_messages['text'].append(message_content)
            parsed_messages['date'].append(message['date'])
        return parsed_messages

    def extract_hashtags(self, text: str) -> list:
        return [word for word in text.split() if word.startswith('#')]

    def extract_emojis(self, text):
        return ''.join(self.emoji_pattern.findall(text))

    def remove_emojis(self, text):
        return self.emoji_pattern.sub('', text)

    def extract_symbols(self, text):
        return ''.join(self.symbols.findall(text))

    def remove_symbols(self, text):
        return self.symbols.sub(' ', text)

    def extract_urls(self, text):
        return re.findall(self.url_pattern, text)

    def extract_mentions(self, text):
        return re.findall(self.mention_pattern, text)
    
    def extract_fields(self, message):
        """
        Extracts relevant fields from the message.
        Returns a tuple containing (channel_id, text, date, labels).
        """
        text = ' '.join(item['text'] for item in message['text_entities'] if item['type'] in 'plain')
        date = message['date']
        labels = "LABEL"  # Replace 'your_label' with the actual label(s) relevant to your use case
        return text, date, labels

    def process_json_file(self, json_file, csv_writer):
        """
        Processes a JSON file, extracts relevant fields, and writes to CSV.
        """
        data = json.load(json_file)

        channel_id = data['id']
        for message in data['messages']:
            text, date, labels = self.extract_fields(message)
            csv_writer.writerow([channel_id, text, date, labels])

    def process_zip(self, zip_file_path, output_csv_path):
        """
        Processes a zip file, extracts data from JSON files, and writes to a CSV file.
        """
        with zipfile.ZipFile(zip_file_path, 'r') as zip_file:
            with open(output_csv_path, 'w', newline='', encoding='utf-8') as csv_file:
                csv_writer = csv.writer(csv_file)
                csv_writer.writerow(['id', 'text', 'date', 'label'])

                for file_info in zip_file.infolist():
                    with zip_file.open(file_info.filename) as json_file:
                        print(json_file)
                        self.process_json_file(json_file, csv_writer)

    def process_zip_files(self, zip_file_path, output_directory):
        with zipfile.ZipFile(zip_file_path, 'r') as zip_file:
            # Iterate through each file in the zip archive
            for file_info in zip_file.infolist():
                with zip_file.open(file_info.filename) as json_file:
                    # Process the JSON file and create a DataFrame
                    data = json.load(json_file)
                    parsed_data = self.parse_json_data(data)

                    # Create a DataFrame from the parsed data
                    df = pd.DataFrame(parsed_data)

                    # Save the DataFrame to a CSV file
                    output_file_name = os.path.splitext(os.path.basename(file_info.filename))[0]
                    output_csv_path = os.path.join(output_directory, f"{output_file_name}_parsed.csv")
                    df.to_csv(output_csv_path, index=False)

    def parse_json_data(self, data):
        # Implement your JSON parsing logic here
        # Modify this method according to how you want to extract data from the JSON
        parsed_data = {
            'id': [],
            'text': [],
            'date': [],
            'label': []
        }

        for message in data['messages']:
            # Extract relevant fields from the message
            text, date, labels = self.extract_fields(message)
            parsed_data['id'].append(data['id'])
            parsed_data['text'].append(text)
            parsed_data['date'].append(date)
            parsed_data['label'].append(labels)
            
        return parsed_data
                        
    def file_reader(self, path: str, ) -> str:
        fname = os.path.join(path)
        with open(fname, 'r') as f:
            system_message = f.read()
        return system_message


In [10]:
util = Util()

## Parsing the zip data file

We will create a csv file with the following files from all json files found in the zip file containing telegram channel infos in json format. The parsed file will have the following cols.


- id: Telegram Channel ID
- text: message content
- date: message broadcast datetime.
- label (s): one or more data labels relevant to your supervised training 

In [9]:
# zip_file_path = os.path.join("../data/raw/raw.zip") 
# output_csv_path = os.path.join("../data/parsed/parsed.csv") 
zip_file_path = "../data/raw/raw.zip"
output_csv_path = "../data/parsed/parsed.csv"

util.process_zip(zip_file_path, output_csv_path)
print("Parsing completed. Output saved to", output_csv_path)

<zipfile.ZipExtFile name='4-3-3 FAST SPORT™.json' mode='r' compress_type=deflate>
<zipfile.ZipExtFile name='90 ደቂቃ ስፖርት™.json' mode='r' compress_type=deflate>
<zipfile.ZipExtFile name='DID U KNOW️⁉️.json' mode='r' compress_type=deflate>
<zipfile.ZipExtFile name='DREAM APP™.json' mode='r' compress_type=deflate>
<zipfile.ZipExtFile name='DREAM SPORT ™.json' mode='r' compress_type=deflate>
<zipfile.ZipExtFile name='ETHIO ARSENAL.json' mode='r' compress_type=deflate>
<zipfile.ZipExtFile name='ETHIO-MEREJA®.json' mode='r' compress_type=deflate>
<zipfile.ZipExtFile name='Ethio University News®.json' mode='r' compress_type=deflate>
<zipfile.ZipExtFile name='History 📚.json' mode='r' compress_type=deflate>
<zipfile.ZipExtFile name='Manchester United Fans™.json' mode='r' compress_type=deflate>
<zipfile.ZipExtFile name='QUBEE ACADEMY.json' mode='r' compress_type=deflate>
<zipfile.ZipExtFile name='Sheger Press️️.json' mode='r' compress_type=deflate>
<zipfile.ZipExtFile name='THE GOAT LM♾ 🐐.json' m

## Cleaning the parsed data

We will cleaning the text column of the parsed file. We will be cleaning the following things:

1. **Clean Null Values:** Remove null or empty values in the 'text' column.
   
2. **Clean New Lines:** Remove extra line breaks or new lines from the 'text' column.

3. **Remove Hashtags:** Remove hashtags from the 'text' column.

4. **Remove Emojis:** Remove emojis from the 'text' column.

5. **Remove Symbols:** Remove special symbols from the 'text' column.

6. **Remove Links:** Remove hyperlinks or URLs from the 'text' column.

7. **Remove Mentions:** Remove mentions or references (e.g., @username) from the 'text' column.

8. **Remove Extra Spaces:** Remove extra spaces, multiple spaces, or leading/trailing spaces from the 'text' column.

9. **Remove Non-Amharic Characters:** Remove characters that are not part of the Amharic script from the 'text' column.



In [10]:
# Specify the paths
parsed_csv_path = "../data/parsed/parsed.csv"
output_cleaned_csv_path = "../data/parsed/cleaned_parsed.csv"

# Read the parsed CSV into a DataFrame
df = pd.read_csv(parsed_csv_path)

### 1. Clean Null Values

In [11]:
# Apply method to clean null or empty values in the 'text' column
df = df.dropna()
df.head()

Unnamed: 0,id,text,date,label
9,1292390819,አርሰናል የታሚ አብርሀምን ፊርማ ስፐርስ እና ዌስትሀምን በልጦ ለማግኘት ...,2021-07-21T11:15:04,LABEL
41,1292390819,ዶኒ ቫንዴቢክ ከቤቱ አይለቅም!\n\nየማንቸስተር ዩናይትዱ ኮከብ ዶኒ ቫን...,2021-07-22T07:29:56,LABEL
42,1292390819,"1982 ""ኬን ቤትስ ቼልሲን በ 1 ሚ ፓ ገዛ\n\n2003 ኬን ቤትስ ቼል...",2021-07-22T07:29:56,LABEL
43,1292390819,ሉይ ቫን ሃል የ ኔዘርላንድ ብሄራዊ ቡድን አሰልጣኝ ተደርጎ ተሹሙዋል የ ...,2021-07-22T07:29:56,LABEL
44,1292390819,የፓውሎ ዲባላ ወኪል በአጥቂው የውል ማራዘሚያ ላይ ለመወያየት በወሩ መጨረ...,2021-07-22T07:29:56,LABEL


### 2. Clean New Lines

In [12]:
# Remove newlines from the 'text' column
df['text'] = df['text'].replace('\n', ' ', regex=True)
df.head()

Unnamed: 0,id,text,date,label
9,1292390819,አርሰናል የታሚ አብርሀምን ፊርማ ስፐርስ እና ዌስትሀምን በልጦ ለማግኘት ...,2021-07-21T11:15:04,LABEL
41,1292390819,ዶኒ ቫንዴቢክ ከቤቱ አይለቅም! የማንቸስተር ዩናይትዱ ኮከብ ዶኒ ቫንዴቢ...,2021-07-22T07:29:56,LABEL
42,1292390819,"1982 ""ኬን ቤትስ ቼልሲን በ 1 ሚ ፓ ገዛ 2003 ኬን ቤትስ ቼልሲን...",2021-07-22T07:29:56,LABEL
43,1292390819,ሉይ ቫን ሃል የ ኔዘርላንድ ብሄራዊ ቡድን አሰልጣኝ ተደርጎ ተሹሙዋል የ ...,2021-07-22T07:29:56,LABEL
44,1292390819,የፓውሎ ዲባላ ወኪል በአጥቂው የውል ማራዘሚያ ላይ ለመወያየት በወሩ መጨረ...,2021-07-22T07:29:56,LABEL


### 3. Remove Hashtags

In [13]:
# Remove hashtags from the 'text' column
df['text'] = df['text'].str.replace(r'\#\w+', '', regex=True)
df.head()


Unnamed: 0,id,text,date,label
9,1292390819,አርሰናል የታሚ አብርሀምን ፊርማ ስፐርስ እና ዌስትሀምን በልጦ ለማግኘት ...,2021-07-21T11:15:04,LABEL
41,1292390819,ዶኒ ቫንዴቢክ ከቤቱ አይለቅም! የማንቸስተር ዩናይትዱ ኮከብ ዶኒ ቫንዴቢ...,2021-07-22T07:29:56,LABEL
42,1292390819,"1982 ""ኬን ቤትስ ቼልሲን በ 1 ሚ ፓ ገዛ 2003 ኬን ቤትስ ቼልሲን...",2021-07-22T07:29:56,LABEL
43,1292390819,ሉይ ቫን ሃል የ ኔዘርላንድ ብሄራዊ ቡድን አሰልጣኝ ተደርጎ ተሹሙዋል የ ...,2021-07-22T07:29:56,LABEL
44,1292390819,የፓውሎ ዲባላ ወኪል በአጥቂው የውል ማራዘሚያ ላይ ለመወያየት በወሩ መጨረ...,2021-07-22T07:29:56,LABEL


### 4. Remove Emojis

In [14]:
# Apply method to remove emojis from the 'text' column
df['text'] = df['text'].apply(util.remove_emojis)
df.head()

Unnamed: 0,id,text,date,label
9,1292390819,አርሰናል የታሚ አብርሀምን ፊርማ ስፐርስ እና ዌስትሀምን በልጦ ለማግኘት ...,2021-07-21T11:15:04,LABEL
41,1292390819,ዶኒ ቫንዴቢክ ከቤቱ አይለቅም! የማንቸስተር ዩናይትዱ ኮከብ ዶኒ ቫንዴቢ...,2021-07-22T07:29:56,LABEL
42,1292390819,"1982 ""ኬን ቤትስ ቼልሲን በ 1 ሚ ፓ ገዛ 2003 ኬን ቤትስ ቼልሲን...",2021-07-22T07:29:56,LABEL
43,1292390819,ሉይ ቫን ሃል የ ኔዘርላንድ ብሄራዊ ቡድን አሰልጣኝ ተደርጎ ተሹሙዋል የ ...,2021-07-22T07:29:56,LABEL
44,1292390819,የፓውሎ ዲባላ ወኪል በአጥቂው የውል ማራዘሚያ ላይ ለመወያየት በወሩ መጨረ...,2021-07-22T07:29:56,LABEL


### 5. Remove Symbols 

In [15]:
# Apply method to remove special symbols from the 'text' column
df['text'] = df['text'].apply(util.remove_symbols)
df.head()

Unnamed: 0,id,text,date,label
9,1292390819,አርሰናል የታሚ አብርሀምን ፊርማ ስፐርስ እና ዌስትሀምን በልጦ ለማግኘት ...,2021-07-21T11:15:04,LABEL
41,1292390819,ዶኒ ቫንዴቢክ ከቤቱ አይለቅም! የማንቸስተር ዩናይትዱ ኮከብ ዶኒ ቫንዴቢ...,2021-07-22T07:29:56,LABEL
42,1292390819,1982 ኬን ቤትስ ቼልሲን በ 1 ሚ ፓ ገዛ 2003 ኬን ቤትስ ቼልሲን...,2021-07-22T07:29:56,LABEL
43,1292390819,ሉይ ቫን ሃል የ ኔዘርላንድ ብሄራዊ ቡድን አሰልጣኝ ተደርጎ ተሹሙዋል የ ...,2021-07-22T07:29:56,LABEL
44,1292390819,የፓውሎ ዲባላ ወኪል በአጥቂው የውል ማራዘሚያ ላይ ለመወያየት በወሩ መጨረ...,2021-07-22T07:29:56,LABEL


### 6. Remove Links 

In [16]:
# Apply method to remove hyperlinks or URLs from the 'text' column
df['text'] = df['text'].str.replace(util.url_pattern, '', regex=True).str.strip()
df.head()

Unnamed: 0,id,text,date,label
9,1292390819,አርሰናል የታሚ አብርሀምን ፊርማ ስፐርስ እና ዌስትሀምን በልጦ ለማግኘት ...,2021-07-21T11:15:04,LABEL
41,1292390819,ዶኒ ቫንዴቢክ ከቤቱ አይለቅም! የማንቸስተር ዩናይትዱ ኮከብ ዶኒ ቫንዴቢ...,2021-07-22T07:29:56,LABEL
42,1292390819,1982 ኬን ቤትስ ቼልሲን በ 1 ሚ ፓ ገዛ 2003 ኬን ቤትስ ቼልሲን...,2021-07-22T07:29:56,LABEL
43,1292390819,ሉይ ቫን ሃል የ ኔዘርላንድ ብሄራዊ ቡድን አሰልጣኝ ተደርጎ ተሹሙዋል የ ...,2021-07-22T07:29:56,LABEL
44,1292390819,የፓውሎ ዲባላ ወኪል በአጥቂው የውል ማራዘሚያ ላይ ለመወያየት በወሩ መጨረ...,2021-07-22T07:29:56,LABEL


### 7. Remove Mentions 

In [17]:
df['text'] = df['text'].str.replace(util.mention_pattern, '', regex=True).str.strip()
df.head()

Unnamed: 0,id,text,date,label
9,1292390819,አርሰናል የታሚ አብርሀምን ፊርማ ስፐርስ እና ዌስትሀምን በልጦ ለማግኘት ...,2021-07-21T11:15:04,LABEL
41,1292390819,ዶኒ ቫንዴቢክ ከቤቱ አይለቅም! የማንቸስተር ዩናይትዱ ኮከብ ዶኒ ቫንዴቢ...,2021-07-22T07:29:56,LABEL
42,1292390819,1982 ኬን ቤትስ ቼልሲን በ 1 ሚ ፓ ገዛ 2003 ኬን ቤትስ ቼልሲን...,2021-07-22T07:29:56,LABEL
43,1292390819,ሉይ ቫን ሃል የ ኔዘርላንድ ብሄራዊ ቡድን አሰልጣኝ ተደርጎ ተሹሙዋል የ ...,2021-07-22T07:29:56,LABEL
44,1292390819,የፓውሎ ዲባላ ወኪል በአጥቂው የውል ማራዘሚያ ላይ ለመወያየት በወሩ መጨረ...,2021-07-22T07:29:56,LABEL


### 8. Remove Extra Spaces 

In [18]:
# Apply method to remove extra spaces, multiple spaces, or leading/trailing spaces from the 'text' column
df['text'] = df['text'].str.replace('\s+', ' ', regex=True).str.strip()
df['text'] = df['text'].replace(r'!+', '!', regex=True)
df['text'] = df['text'].replace(r'\.+', '', regex=True)
df.head()

  df['text'] = df['text'].str.replace('\s+', ' ', regex=True).str.strip()


Unnamed: 0,id,text,date,label
9,1292390819,አርሰናል የታሚ አብርሀምን ፊርማ ስፐርስ እና ዌስትሀምን በልጦ ለማግኘት ...,2021-07-21T11:15:04,LABEL
41,1292390819,ዶኒ ቫንዴቢክ ከቤቱ አይለቅም! የማንቸስተር ዩናይትዱ ኮከብ ዶኒ ቫንዴቢክ...,2021-07-22T07:29:56,LABEL
42,1292390819,1982 ኬን ቤትስ ቼልሲን በ 1 ሚ ፓ ገዛ 2003 ኬን ቤትስ ቼልሲን በ...,2021-07-22T07:29:56,LABEL
43,1292390819,ሉይ ቫን ሃል የ ኔዘርላንድ ብሄራዊ ቡድን አሰልጣኝ ተደርጎ ተሹሙዋል የ ...,2021-07-22T07:29:56,LABEL
44,1292390819,የፓውሎ ዲባላ ወኪል በአጥቂው የውል ማራዘሚያ ላይ ለመወያየት በወሩ መጨረ...,2021-07-22T07:29:56,LABEL



### 9. Replace
* ['ሐ', 'ሑ', 'ሒ', 'ሓ', 'ሔ', 'ሖ'] with ['ሀ', 'ሁ', 'ሂ', 'ሃ', 'ሄ', 'ህ', 'ሆ']
* ['ኀ', 'ኁ', 'ኂ', 'ኃ', 'ኄ', 'ኅ', 'ኆ'] with ['ሀ', 'ሁ', 'ሂ', 'ሃ', 'ሄ', 'ህ', 'ሆ']
* ['ሠ', 'ሡ', 'ሢ', 'ሣ', 'ሤ', 'ሦ', 'ሦ', 'ሧ'] with ['ሰ, 'ሱ', 'ሲ', 'ሳ', 'ሴ', 'ስ', 'ሶ', 'ሷ']
* ['ዐ', 'ዑ', 'ዒ', 'ዓ', 'ዔ', 'ዕ', 'ዖ'] with ['አ', 'ኡ', 'ኢ', 'ኣ', 'ኤ', 'እ', 'ኦ']
* ['ጸ', 'ጹ', 'ጺ', 'ጻ', 'ጼ', 'ጽ', 'ጾ'] with ['ፀ', 'ፁ', 'ፂ', 'ፃ', 'ፄ', 'ፅ', 'ፆ']


In [19]:
letters = [
  [['ሐ', 'ሑ', 'ሒ', 'ሓ', 'ሔ', 'ሖ'], ['ሀ', 'ሁ', 'ሂ', 'ሃ', 'ሄ', 'ህ', 'ሆ']],
  [['ኀ', 'ኁ', 'ኂ', 'ኃ', 'ኄ', 'ኅ', 'ኆ'], ['ሀ', 'ሁ', 'ሂ', 'ሃ', 'ሄ', 'ህ', 'ሆ']],
  [['ሠ', 'ሡ', 'ሢ', 'ሣ', 'ሤ', 'ሦ', 'ሦ', 'ሧ'], ['ሰ', 'ሱ', 'ሲ', 'ሳ', 'ሴ', 'ስ', 'ሶ', 'ሷ']],
  [['ዐ', 'ዑ', 'ዒ', 'ዓ', 'ዔ', 'ዕ', 'ዖ'], ['አ', 'ኡ', 'ኢ', 'ኣ', 'ኤ', 'እ', 'ኦ']],
  [['ጸ', 'ጹ', 'ጺ', 'ጻ', 'ጼ', 'ጽ', 'ጾ'], ['ፀ', 'ፁ', 'ፂ', 'ፃ', 'ፄ', 'ፅ', 'ፆ']]
]

for letter in letters:
  for i in range(len(letter[0])):
    df['text'] = df['text'].str.replace(letter[0][i], letter[1][i])

df.head()  


Unnamed: 0,id,text,date,label
9,1292390819,አርሰናል የታሚ አብርሀምን ፊርማ ስፐርስ እና ዌስትሀምን በልጦ ለማግኘት ...,2021-07-21T11:15:04,LABEL
41,1292390819,ዶኒ ቫንዴቢክ ከቤቱ አይለቅም! የማንቸስተር ዩናይትዱ ኮከብ ዶኒ ቫንዴቢክ...,2021-07-22T07:29:56,LABEL
42,1292390819,1982 ኬን ቤትስ ቼልሲን በ 1 ሚ ፓ ገዛ 2003 ኬን ቤትስ ቼልሲን በ...,2021-07-22T07:29:56,LABEL
43,1292390819,ሉይ ቫን ሃል የ ኔዘርላንድ ብሄራዊ ቡድን አሰልጣኝ ተደርጎ ተሹሙዋል የ ...,2021-07-22T07:29:56,LABEL
44,1292390819,የፓውሎ ዲባላ ወኪል በአጥቂው የውል ማራዘሚያ ላይ ለመወያየት በወሩ መጨረ...,2021-07-22T07:29:56,LABEL


### 10. Remove Non Amharic Characters 

In [20]:
# Clean English characters from the 'text' column
df['text'] = df['text'].str.replace(r'[A-Za-z]+', '', regex=True)
df.head()

Unnamed: 0,id,text,date,label
9,1292390819,አርሰናል የታሚ አብርሀምን ፊርማ ስፐርስ እና ዌስትሀምን በልጦ ለማግኘት ...,2021-07-21T11:15:04,LABEL
41,1292390819,ዶኒ ቫንዴቢክ ከቤቱ አይለቅም! የማንቸስተር ዩናይትዱ ኮከብ ዶኒ ቫንዴቢክ...,2021-07-22T07:29:56,LABEL
42,1292390819,1982 ኬን ቤትስ ቼልሲን በ 1 ሚ ፓ ገዛ 2003 ኬን ቤትስ ቼልሲን በ...,2021-07-22T07:29:56,LABEL
43,1292390819,ሉይ ቫን ሃል የ ኔዘርላንድ ብሄራዊ ቡድን አሰልጣኝ ተደርጎ ተሹሙዋል የ ...,2021-07-22T07:29:56,LABEL
44,1292390819,የፓውሎ ዲባላ ወኪል በአጥቂው የውል ማራዘሚያ ላይ ለመወያየት በወሩ መጨረ...,2021-07-22T07:29:56,LABEL


## Save the cleaned DataFrame to a new CSV file

In [21]:

cleaned_output_path = "../data/parsed/cleaned_parsed_data.csv"
df.to_csv(cleaned_output_path, index=False)

## Save 'text' column to a text file

In [22]:
output_text_path = "../data/cleaned/cleaned.txt"
df['text'] = df['text'].apply(util.remove_emojis)
df['text'].to_csv(output_text_path, index=False, header=False, sep='\t')

## Parse and save individual json files

In [25]:
zip_file_path = "../data/raw/raw.zip"
output_directory = "../data/parsed/"
util.process_zip_files(zip_file_path, output_directory)

## Clean and save individual json files

In [30]:
import os
import pandas as pd

# Set the directory path where parsed CSV files are stored
parsed_files_directory = "../data/parsed/"
cleaned_files_directory = "../data/cleaned/"

# Iterate through each parsed file
for filename in os.listdir(parsed_files_directory):
    if filename.endswith("_parsed.csv"):
        # Read the parsed CSV file into a DataFrame
        filepath = os.path.join(parsed_files_directory, filename)
        df = pd.read_csv(filepath)

        # Apply method to clean null or empty values in the 'text' column
        df = df.dropna()

        # Remove newlines from the 'text' column
        df['text'] = df['text'].replace('\n', ' ', regex=True)

        # Remove hashtags from the 'text' column
        df['text'] = df['text'].str.replace(r'\#\w+', '', regex=True)

        # Apply method to remove emojis from the 'text' column
        df['text'] = df['text'].apply(util.remove_emojis)

        # Apply method to remove special symbols from the 'text' column
        df['text'] = df['text'].apply(util.remove_symbols)

        # Apply method to remove hyperlinks or URLs from the 'text' column
        df['text'] = df['text'].str.replace(util.url_pattern, '', regex=True).str.strip()
        df['text'] = df['text'].str.replace(util.mention_pattern, '', regex=True).str.strip()

        # Apply method to remove extra spaces, multiple spaces, or leading/trailing spaces from the 'text' column
        df['text'] = df['text'].str.replace('\s+', ' ', regex=True).str.strip()
        df['text'] = df['text'].replace(r'!+', '!', regex=True)
        df['text'] = df['text'].replace(r'\.+', '', regex=True)

        # Clean specific Amharic letters
        letters = [
            [['ሐ', 'ሑ', 'ሒ', 'ሓ', 'ሔ', 'ሖ'], ['ሀ', 'ሁ', 'ሂ', 'ሃ', 'ሄ', 'ህ', 'ሆ']],
            [['ኀ', 'ኁ', 'ኂ', 'ኃ', 'ኄ', 'ኅ', 'ኆ'], ['ሀ', 'ሁ', 'ሂ', 'ሃ', 'ሄ', 'ህ', 'ሆ']],
            [['ሠ', 'ሡ', 'ሢ', 'ሣ', 'ሤ', 'ሦ', 'ሦ', 'ሧ'], ['ሰ', 'ሱ', 'ሲ', 'ሳ', 'ሴ', 'ስ', 'ሶ', 'ሷ']],
            [['ዐ', 'ዑ', 'ዒ', 'ዓ', 'ዔ', 'ዕ', 'ዖ'], ['አ', 'ኡ', 'ኢ', 'ኣ', 'ኤ', 'እ', 'ኦ']],
            [['ጸ', 'ጹ', 'ጺ', 'ጻ', 'ጼ', 'ጽ', 'ጾ'], ['ፀ', 'ፁ', 'ፂ', 'ፃ', 'ፄ', 'ፅ', 'ፆ']]
        ]

        for letter in letters:
            for i in range(len(letter[0])):
                df['text'] = df['text'].str.replace(letter[0][i], letter[1][i])

        # Clean English characters from the 'text' column
        df['text'] = df['text'].str.replace(r'[A-Za-z]+', '', regex=True)

        # Save the cleaned DataFrame back to the same CSV file
        df.to_csv(filepath, index=False)

        # Save the cleaned text to a separate text file
        cleaned_text_path = os.path.join(cleaned_files_directory, f"{os.path.splitext(filename)[0]}.txt")
        df['text'].to_csv(cleaned_text_path, index=False, header=False)

  df['text'] = df['text'].str.replace('\s+', ' ', regex=True).str.strip()
