## Data Preprocessing and Tokenization

In [1]:
import sys
import os
import pandas as pd
from glob import glob

In [2]:
from pathlib import Path
from importlib import reload
# Make the repository root (the parent of the notebook directory) importable,
# so that `src.*` modules can be imported from this notebook.
project_root = Path("..").resolve()
if sys.path.count(str(project_root)) == 0:
    # Prepend so the project's own packages win over same-named installed ones.
    sys.path.insert(0, str(project_root))

In [3]:
# Reload the module FIRST and bind the class name afterwards.
# A `from src.data_loader import DataLoader` executed before `reload(...)` keeps
# `DataLoader` pointing at the class object created by the previous load, so edits
# made to src/data_loader.py would not be picked up when this cell is re-run.
import src.data_loader
reload(src.data_loader)
from src.data_loader import DataLoader

<module 'src.data_loader' from 'D:\\Research & Project\\10academy\\week 4\\challenge\\EthioMart-E-commerce-NER\\src\\data_loader.py'>

In [4]:
# Load every raw CSV export and stack them into a single dataframe.
raw_data_path = os.path.join('../data', 'raw')
# glob returns matches in filesystem-dependent order; sorting keeps the row order
# (and therefore which duplicate survives a later keep-first de-duplication)
# reproducible across machines.
raw_data_files = sorted(glob(os.path.join(raw_data_path, '*.csv')))

# Collect the per-file frames and concatenate ONCE at the end: calling pd.concat
# inside the loop re-copies all previously loaded rows on every iteration.
frames = []
for file in raw_data_files:
    print(f"Loading data from {file}")
    data_loader = DataLoader(file)
    data_loader.load_data()  # Load data into the loader's internal variable
    df = data_loader.get_data()  # Retrieve the loaded data
    # NOTE(review): get_data() may return None when loading failed — confirm in
    # src/data_loader.py. Skipping None mirrors what pd.concat did with it before.
    if df is not None:
        frames.append(df)

# pd.concat raises on an empty list, so keep the previous behaviour of ending up
# with an empty dataframe when no CSV file was found.
combined_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

# Report the final size and preview the first rows.
print(f"Total combined shape: {combined_df.shape}")
combined_df.head()

Data loaded successfully from ../data\raw\telegram_data.csv.
Loading data from ../data\raw\telegram_data.csv
Data loaded successfully from ../data\raw\telegram_data_20250622_170157.csv.
Loading data from ../data\raw\telegram_data_20250622_170157.csv
Data loaded successfully from ../data\raw\telegram_data_20250622_220106.csv.
Loading data from ../data\raw\telegram_data_20250622_220106.csv
Total combined shape: (39366, 6)


Unnamed: 0,Channel Title,Channel Username,ID,Message,Date,Media Path
0,Sheger online-store,@Shageronlinestore,7383,💥Miralux Hot plate\n ባለሁለት ምድጃ ስቶቭ\n\n 💯o...,2025-06-19 06:31:31+00:00,data/photos/@Shageronlinestore_7383.jpg
1,Sheger online-store,@Shageronlinestore,7382,💥7pcs glass water set\n\n✔️ አንድ ማራኪ ጆግና 6 መጠጫ ...,2025-06-18 11:19:11+00:00,data/photos/@Shageronlinestore_7382.jpg
2,Sheger online-store,@Shageronlinestore,7381,,2025-06-18 11:19:11+00:00,data/photos/@Shageronlinestore_7381.jpg
3,Sheger online-store,@Shageronlinestore,7380,,2025-06-18 11:19:11+00:00,data/photos/@Shageronlinestore_7380.jpg
4,Sheger online-store,@Shageronlinestore,7379,,2025-06-18 11:19:11+00:00,data/photos/@Shageronlinestore_7379.jpg


In [16]:
# Summary statistics for the numeric columns of the combined frame.
# NOTE(review): the recorded output (count = 21847) matches the row count after the
# NaN/duplicate filtering cells, not the 39366 rows loaded above — this cell appears
# to have been executed out of order (In [16]); re-run top to bottom to refresh it.
combined_df.describe()

Unnamed: 0,ID
count,21847.0
mean,4777.052776
std,3216.868765
min,3.0
25%,2431.5
50%,4322.0
75%,6322.0
max,16428.0


In [19]:
# Number of rows contributed by each channel; value_counts lists the most
# frequent channel title first.
channel_counts = combined_df.loc[:, 'Channel Title'].value_counts()
channel_counts

Channel Title
ምርጥ ዕቃ                 4935
ማራኪ ცЯﾑŋの™             4751
Sheger online-store    4092
qnash.com - ቅናሽ ®️     3511
ልዩ እቃ                  1785
SINA KIDS/ሲና ኪድስⓇ      1694
EthioBrand®             999
Zemen Express®           80
Name: count, dtype: int64

In [5]:
# Load the preprocessing module.
# Reload the module FIRST and bind the class name afterwards: importing
# `Preprocessor` before `reload(...)` would leave the name pointing at the class
# object from the previous load, so edits to src/preprocessor.py would be ignored.
import src.preprocessor
reload(src.preprocessor)
from src.preprocessor import Preprocessor

<module 'src.preprocessor' from 'D:\\Research & Project\\10academy\\week 4\\challenge\\EthioMart-E-commerce-NER\\src\\preprocessor.py'>

In [6]:
# Initialize the preprocessor with the combined dataframe.
# NOTE(review): at this point combined_df still contains rows with a missing
# 'Message' and duplicate rows. Later cells rebind combined_df to new, filtered
# frames, so this instance was constructed from the pre-cleaning frame — confirm
# that is intended.
preprocessor = Preprocessor(combined_df)

In [7]:
# Peek at the raw 'Message' column before any cleaning; rows without text show up as NaN.
combined_df['Message']

0        💥Miralux Hot plate\n ባለሁለት ምድጃ ስቶቭ\n\n      💯o...
1        💥7pcs glass water set\n\n✔️ አንድ ማራኪ ጆግና 6 መጠጫ ...
2                                                      NaN
3                                                      NaN
4                                                      NaN
                               ...                        
39361    Air Jordan 4  \nSize 39,40,41,42,43,44\nPrice ...
39362    Skechers Spring lite \nSize 40,41,42,43\nPrice...
39363    Skechers lace ultra go\nSize - 40,41,42,43\nPr...
39364    Nike Air Max ACG\nSize - 40,41,42,43\nPrice 32...
39365    Nike air max bliss \nSize 40,41,42,43,44\nPric...
Name: Message, Length: 39366, dtype: object

In [8]:
# Discard the rows whose 'Message' is missing (NaN); the column itself is kept.
combined_df = combined_df[combined_df['Message'].notna()]

In [9]:
# Remove duplicate rows: rows are duplicates when they share the same ID,
# message text and timestamp; the first occurrence is kept.
# The explicit .copy() makes combined_df an independent frame, so the column
# assignments in later cells (combined_df['Message'] = ..., combined_df['Tokens'] = ...)
# do not raise SettingWithCopyWarning on a boolean-filtered result.
combined_df = combined_df.drop_duplicates(subset=['ID', 'Message', 'Date']).copy()

In [10]:
# Inspect 'Message' after dropping the rows with missing text and the duplicate rows.
combined_df['Message']

0        💥Miralux Hot plate\n ባለሁለት ምድጃ ስቶቭ\n\n      💯o...
1        💥7pcs glass water set\n\n✔️ አንድ ማራኪ ጆግና 6 መጠጫ ...
6        🎯 Universal water-saving dishwasher head\n\n🔰I...
7        💥 special base for refrigerator and washing ma...
8        💥 special base for refrigerator and washing ma...
                               ...                        
37362    Air Jordan 4  \nSize 39,40,41,42,43,44\nPrice ...
37363    Skechers Spring lite \nSize 40,41,42,43\nPrice...
37364    Skechers lace ultra go\nSize - 40,41,42,43\nPr...
37365    Nike Air Max ACG\nSize - 40,41,42,43\nPrice 32...
37366    Nike air max bliss \nSize 40,41,42,43,44\nPric...
Name: Message, Length: 21847, dtype: object

In [11]:
# Clean up the message text (the recorded output shows emoji, punctuation and
# line breaks being removed).
# Re-create the preprocessor on the CURRENT combined_df first: the instance built
# earlier was constructed before the NaN/duplicate filtering, so it would still
# operate on the unfiltered frame (including NaN messages, which are not strings).
# NOTE(review): assumes Preprocessor works on the frame passed to its constructor —
# confirm in src/preprocessor.py.
preprocessor = Preprocessor(combined_df)
combined_df['Message'] = preprocessor.clean_text('Message')


Column 'Message' must be of string type for text cleaning.


In [12]:
# Inspect 'Message' after text cleaning (emoji removed).
combined_df['Message']

0        Miralux Hot plate ባለሁለት ምድጃ ስቶቭ orginal 2000 ዋ...
1        7pcs glass water set አንድ ማራኪ ጆግና 6 መጠጫ ብርጭቆዎች ...
6        Universal watersaving dishwasher head Increase...
7        special base for refrigerator and washing mach...
8        special base for refrigerator and washing mach...
                               ...                        
37362    Air Jordan 4 Size 394041424344 Price 3200 birr...
37363    Skechers Spring lite Size 40414243 Price 2700 ...
37364    Skechers lace ultra go Size 40414243 Price 290...
37365    Nike Air Max ACG Size 40414243 Price 3200 birr...
37366    Nike air max bliss Size 4041424344 Price 3400 ...
Name: Message, Length: 21847, dtype: object

In [13]:
# Normalize the text of the 'Message' column and store the result back.
# NOTE(review): normalize_text lives in src/preprocessor.py (not visible here);
# presumably it reads the frame held by `preprocessor` rather than this cell's
# combined_df — confirm it sees the already-cleaned text.
combined_df['Message'] = preprocessor.normalize_text('Message')

In [14]:
# Tokenize every message and keep the result next to the text in a 'Tokens' column
# (per the recorded output, each entry is a list of word tokens).
combined_df['Tokens'] = preprocessor.tokenize_sentences('Message')
# Plain-text preview of the first 20 token lists.
print("Generated tokens:", combined_df['Tokens'].head(20), sep="\n")

Generated tokens:
0     [Miralux, Hot, plate, ባለሁለት, ምድጃ, ስቶቭ, orginal...
1     [7pcs, glass, water, set, አንድ, ማራኪ, ጆግና, 6, መጠ...
6     [Universal, watersaving, dishwasher, head, Inc...
7     [special, base, for, refrigerator, and, washin...
8     [special, base, for, refrigerator, and, washin...
13    [Vintage, Mason, Glass, Drinking, Jar, with, S...
14    [HOBBY, LOBBY, 3, in, 1, Chopper, and, Garlic,...
15    [HOBBY, LOBBY, 3, in, 1, Chopper, and, Garlic,...
17    [HOBBY, LOBBY, 3, in, 1, Chopper, and, Garlic,...
23    [አልቆልለተባላችሁበድጋሚአስገብተናል, የመስታዎት, ፓትራዎች, glass, ...
24    [portable, foldable, mosquito, net, for, kids,...
25    [3in1, delux, multifunctional, bassinet, ከግራ, ...
26    [3in1, delux, multifunctional, bassinet, ከግራ, ...
35    [Electric, Hot, plate, በኤሌክትሪክ, የሚሰራ, የራሱ, ሙቀት...
40    [Portable, Folding, Stool, ተጣጣፊ, ወንበር, እስከ, 10...
41    [Multifunction, Juice, Extractor, ለአፕል, ብርቱካን,...
44    [Multifunction, Juice, Extractor, ለአፕል, ብርቱካን,...
47    [1pc, stainless, steel, 

In [15]:
# Save the cleaned data to a new CSV file.
output_path = os.path.join('../data', 'processed', 'cleaned_data.csv')
# to_csv does not create missing directories; make sure ../data/processed exists
# so a fresh checkout does not fail at the very last step.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
# index=False: the row labels are not written to the file.
# Note: the 'Tokens' column holds Python lists, which CSV stores as their string
# representation — parse them back (e.g. ast.literal_eval) when reloading.
combined_df.to_csv(output_path, index=False)