### Data cleaning and transformation

In [1]:
# Import required modules
import os
import sys
import asyncio
import nest_asyncio
import logging
import pandas as pd
from dotenv import load_dotenv
from telethon import TelegramClient

# Put the parent of the working directory on sys.path; the `database.*` and
# `scripts.*` imports in the next cell presumably rely on this (assumes the
# notebook is run from a sub-directory of the project root — TODO confirm).
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
# Guard the append so re-running this cell does not stack duplicate entries.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

# Ignore all warnings for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation warnings from the
# libraries used below — consider narrowing it by category.
import warnings
warnings.filterwarnings("ignore")

In [2]:
from database.database_conn import insert_dataframe_to_db
from scripts.data_cleaner import DataCleaner

### Read extracted Telegram data for cleaning and transformation

In [3]:
# Load the scraped Telegram messages; the path is relative to the working directory.
raw_csv_path = 'scraped_telegram_messages.csv'
df = pd.read_csv(raw_csv_path)

In [4]:
DataCleaner = DataCleaner(df)

In [5]:
cleand_df = DataCleaner.remove_duplicates()

2024-10-10 23:08:56,597 - INFO - Removed 1809 duplicate rows.


In [6]:
DataCleaner.handle_missing_values()

2024-10-10 23:08:56,877 - INFO - Removed 581 rows due to missing values.


In [7]:
DataCleaner.standardize_formats()

2024-10-10 23:08:57,316 - INFO - Standardized date formats and cleaned text fields.


In [8]:
DataCleaner.validate_data()

2024-10-10 23:08:57,636 - INFO - Data validation completed.


In [9]:
DataCleaner.store_cleaned_data('../data/cleaned_data.csv')

2024-10-10 23:08:58,116 - INFO - Cleaned data stored in ../data/cleaned_data.csv


In [10]:
final_df = DataCleaner.get_cleaned_data()

In [11]:
# Bare expression: the notebook renders the cleaned data as this cell's output.
final_df

Unnamed: 0,id,text,sender,channel,date
0,864,https://youtu.be/5DBoEm-8kmA?si=LDLuEecNfULJVD...,-1001102021238,DoctorsET,2023-12-18 17:04:02+00:00
1,863,ዶክተርስ ኢትዮጵያ በ አዲስ አቀራረብ በ ቴሌቪዥን ፕሮግራሙን ለመጀመር ከ...,-1001102021238,DoctorsET,2023-11-03 16:14:39+00:00
2,862,ሞት በስኳር \n\nለልጆቻችን የምናሲዘው ምሳቃ ሳናቀው እድሚያቸውን ይቀን...,-1001102021238,DoctorsET,2023-10-02 16:37:39+00:00
3,861,ከ HIV የተፈወሰ ሰው አጋጥሟችሁ ያቃል ? ፈውስ እና ህክምና ?\n\nሙ...,-1001102021238,DoctorsET,2023-09-16 07:54:32+00:00
4,860,በቅርብ ጊዜ በሃገራችን ላይ እየተስተዋለ ያለ የተመሳሳይ ፆታ ( Homos...,-1001102021238,DoctorsET,2023-09-01 16:16:15+00:00
...,...,...,...,...,...
5589,858,🧿 ለመሆኑ November 14/ ህዳር 4 የአለም የስኳር በሽታ ቀን መሆኑ...,-1001447066276,yetenaweg,2023-11-14 10:11:50+00:00
5593,854,💥የጤና ወግ መረጃ ሰጪ የሆነ #ጤናንበጃዝ የክለብሀውስ ቆይታችንን የል...,-1001447066276,yetenaweg,2023-11-07 13:25:38+00:00
5594,853,🚨 ማስታወሻ \nይህን አስተማሪ ውይይት ዛሬ ከምሽቱ 12:00 ይቀላቀሉ!!!!,-1001447066276,yetenaweg,2023-11-05 11:26:26+00:00
6073,328,🎙Yetenaweg &\n🎶 Ethiopia Jazz Hour \n🎧👋 On clu...,-1001447066276,yetenaweg,2021-06-24 05:11:06+00:00


In [12]:
# Persist the cleaned data through the project's insert_dataframe_to_db helper.
# The log output below reports a PostgreSQL insert and the connection being closed.
insert_dataframe_to_db(final_df)

2024-10-10 23:09:01,212 - INFO - Database connection established successfully.
2024-10-10 23:09:05,855 - INFO - Data successfully inserted into the PostgreSQL database.
2024-10-10 23:09:05,858 - INFO - Database connection closed.
