# **POLARS**

#### **Data Processing using Polars library (Benchmark)**

In [None]:
!pip install fastexcel




In [None]:
import polars as pl
import re
import time
import psutil


> Standalone function to calculate and return processing metrics.
- Parameters:
    - df: Polars DataFrame being processed
    
- Returns:
    - A dictionary with metrics such as CPU usage, memory usage, processing time, etc.



In [None]:
def calculate_processing_metrics(df):

    # Record the start time
    start_time = time.time()

    # Get initial system metrics
    initial_cpu = psutil.cpu_percent(interval=1)
    initial_memory = psutil.virtual_memory().percent


    # Get row count (handle both Polars and Pandas/Dask DataFrames)
    if isinstance(df, pl.DataFrame):
        row_count = df.height
    else:
        row_count = len(df)

    # Record the end time of the operation
    end_time = time.time()

    # Get final system metrics
    final_cpu = psutil.cpu_percent(interval=1)
    final_memory = psutil.virtual_memory().percent

    # Calculate processing time
    processing_time = end_time - start_time

    # Calculate throughput (assuming rows processed are equal to the DataFrame rows)
    throughput = row_count / processing_time if processing_time > 0 else 0

    # Return the metrics as a dictionary
    # Return the metrics in a nicely formatted way
    metrics = (
        f"Total Rows Processed: {row_count:,} records\n"
        f"Total Processing Time: {processing_time:.4f} seconds\n"
        f"Initial CPU Usage: {initial_cpu:.2f}%\n"
        f"Final CPU Usage: {final_cpu:.2f}%\n"
        f"Memory Usage: {final_memory:.2f}%\n"
        f"Throughput (Records per Second): {throughput:.2f} records/sec"
    )

    return metrics

> **1. Loading Data**

We load the raw dataset from the `NST_News_Articles.csv` file. The dataset contains information such as the article's title, teaser, URL, and category.

In [None]:
rd = pl.read_excel("NST_News_Articles.xlsx")

print(calculate_processing_metrics(rd))

Total Rows Processed: 110,641 records
Total Processing Time: 1.0011 seconds
Initial CPU Usage: 61.00%
Final CPU Usage: 25.90%
Memory Usage: 21.60%
Throughput (Records per Second): 110522.87 records/sec


> **2. Handle Duplicated Data**

We remove any duplicate rows from the dataset to avoid redundant data.



In [None]:
df_cleaned = rd.unique()
print(calculate_processing_metrics(df_cleaned))


Total Rows Processed: 106,473 records
Total Processing Time: 1.0006 seconds
Initial CPU Usage: 5.10%
Final CPU Usage: 5.10%
Memory Usage: 21.60%
Throughput (Records per Second): 106406.18 records/sec


> **3. Handle Missing Data**

We drop rows with missing values in key columns to maintain data quality.


In [None]:
df_cleaned = df_cleaned.drop_nulls()
print(calculate_processing_metrics(df_cleaned))

Total Rows Processed: 105,392 records
Total Processing Time: 1.0005 seconds
Initial CPU Usage: 5.00%
Final CPU Usage: 9.00%
Memory Usage: 21.60%
Throughput (Records per Second): 105340.71 records/sec


> **4. Clean the Teaser Column**

We clean the `Teaser` column by removing unwanted characters and ensuring that the teaser follows a standard format (e.g., extracting place and content from the teaser).

In [None]:
df_cleaned = df_cleaned.with_columns(
    pl.col('Teaser').str.replace_all(r'[^a-zA-Z0-9: ,]', '')
)
print(calculate_processing_metrics(df_cleaned))

Total Rows Processed: 105,392 records
Total Processing Time: 1.0006 seconds
Initial CPU Usage: 4.50%
Final CPU Usage: 5.10%
Memory Usage: 21.60%
Throughput (Records per Second): 105331.90 records/sec


In [None]:
df_cleaned = df_cleaned.filter(pl.col('Teaser').str.contains(':'))
print(calculate_processing_metrics(df_cleaned))

Total Rows Processed: 103,070 records
Total Processing Time: 1.0006 seconds
Initial CPU Usage: 5.50%
Final CPU Usage: 5.60%
Memory Usage: 21.60%
Throughput (Records per Second): 103003.91 records/sec


> **5. Splitting the place from 'Teaser' column**


In [None]:
# Split Teaser into Place and Teaser - corrected version
df_cleaned = df_cleaned.with_columns([
    pl.col('Teaser').str.split(':').list.get(0).alias('Place'),
    pl.col('Teaser').str.split(':').list.get(1).alias('Teaser_New')
])

# Replace the old Teaser with the new one
df_cleaned = df_cleaned.with_columns([
    pl.col('Teaser_New').alias('Teaser')
]).drop('Teaser_New')

print(calculate_processing_metrics(df_cleaned))

#KUALA LUMPUR: The Central Database Hub (PADU) system has recorded a total of 2.38 million individual information updates

Total Rows Processed: 103,070 records
Total Processing Time: 1.0005 seconds
Initial CPU Usage: 5.10%
Final CPU Usage: 51.30%
Memory Usage: 21.60%
Throughput (Records per Second): 103016.97 records/sec


> **6. Extract and Standardize Place Names**

We standardize the place names, convert them to uppercase, and remove any country names or other non-relevant information.

In [None]:
place_corrections = {
    'ALOR STAR': 'ALOR SETAR', 'AOR SETAR': 'ALOR SETAR','LOR STAR':'ALOR SETAR','ASTANA KAZAKHSTAN':'ASTANA',
    'BALIK PULAI':'BALIK PULAU','BATANG AI': 'BATANG KALI', 'BAGAN DATOH':'BAGAN DATUK',
    'CAMERON HIGHLAND': 'CAMERON HIGHLANDS','CHIANGMAI': 'CHIANG MAI','COLOMBO SRI LANKA': 'COLOMBO',
    'FRANK': 'FRANKFURT',
    'GUAMUSANG': 'GUA MUSANG','GUA MUSANG POS SIMPOR': 'GUA MUSANG',
    'DANANG': 'DA NANG',
    'GEOGE TOWN': 'GEORGE TOWN','GEORGETOWN': 'GEORGE TOWN','JERTIH':'JERTEH',
    'JOHOR BARU': 'JOHOR BAHRU', 'JOHOR BAHU': 'JOHOR BAHRU','JOHOR BHARU': 'JOHOR BAHRU','JOHOR BARY': 'JOHOR BAHRU','JOHOR BAHARU': 'JOHOR BAHRU',
    'JOHOR BARU KUALA LUMPUR':'JOHOR BAHRU','JOHOR BARUSINGAPORE':'JOHOR BAHRU',
    'KUALA KUBU BARU':'KUALA KUBU BAHRU','KUALA KUBU BAHARU':'KUALA KUBU BAHRU',
    'UALA LUMPUR': 'KUALA LUMPUR','KUALKUALA LUMPUR':'KUALA LUMPUR','SEPT  KUALA LUMPUR': 'KUALA LUMPR','KKUALA LUMUR': 'KUALA LUMPUR',
    'KIALA LUMUPUR': 'KUALA LUMPUR', 'IKUALA LUMPUR': 'KUALA LUMPUR','KUALAA LUMPUR':'KUALA LUMPUR','KUALALUMPUR':'KUALA LUMPUR',
    'KUALA LUMUR':'KUALA LUMPUR','KUALA LUMPU':'KUALA LUMPUR','KUALA LUMPURHONG KONG':'KUALA LUMPUR','KUALA LIMPUR':'KUALA LUMPUR',
    'KUALA LUMPURJAKARTA':'KUALA LUMPUR','KUALA KUMPUR':'KUALA LUMPUR','KUALA NERUS TERENGGANU':'KUALA NERUS',
    'KUALATERENGGANU':'KUALA TERENGGANU','KUALA TERENGANU':'KUALA TERENGGANU','KUALA TERENGAGNU':'KUALA TERENGGANU','KUALA TENGGANU':'KUALA TERENGGANU',
    'KULA LUMPUR':'KUALA LUMPUR','KUCHINGL':'KUCHING','KUANG':'KLUANG',
    'KUAL LUMPUR':'KUALA LUMPUR','KUALA  LUMPUR':'KUALA LUMPUR',
    'UALA TERENGGANU': 'KUALA TERENGGANU','KKOTA KINABALU':'KOTA KINABALU','KOTA KINABAU':'KOTA KINABALU','KOTA KINBALU':'KOTA KINABALU','KOTA  KINABALU':'KOTA KINABALU',
    'KOTA  BARU':'KOTA BAHRU', 'KOTA BAHARU':'KOTA BAHRU','KOTA BARU':'KOTA BAHRU','KOTA BARUGEORGE TOWN':'KOTA BAHRU',
    'LABUAN BAJO INDONESIA':'LABUAN BAJO','LONDONKUALA LUMPUR':'LONDON','LONDON TUES':'LONDON','LENGONG':'LENGGONG','LAMGKAWI':'LANGKAWI',
    'MARNG':'MARANG','MELAKA': 'MALACCA','MEKALA':'MALACCA','MANAMA BAHRAIN': 'MANAMA',
    'NIBONG TEBA':'NIBONG TEBAL','NEW DELHI INDIA':'NEW DELHI','NEW DELH':'NEW DELHI','NARATHIWAT SOUTHERN THAILAND':'NARATHIWAT','MUNDOK SOUTHERN THAILAND':'MUNDOK',
    'PARISBEIJING':'PARIS',
    'PUTRAJAYAS': 'PUTRAJAYA','PUTRAYAJA': 'PUTRAJAYA','PUTRJAYA': 'PUTRAJAYA','PPUTRAJAYA': 'PUTRAJAYA','PATTANI THAILAND':'PATTANI','PASIR PUTIH':'PASIR PUTEH',
    'PORT MORESBY PAPUA NEW GUINEA': 'PORT MORESBY','PANGKOR ISLAND':'PANGKOR','PULAU PERHENTIAN KECIL TERENGGANU':'PULAU PERHENTIAN',
    'SEBERANG PERAI': 'SEBERANG PRAI','SUNNYLANDS CALIFORNIA': 'SUNNYLANDS','SUNGAI GOLOK THAILAND':'SUNGAI GOLOK',
    'SUBANG': 'SUBANG JAYA','SONGKLA': 'SONGKHLA','SHAH  ALAM': 'SHAH ALAM','SEMENYEH': 'SEMENYIH','SELANGAU': 'SELANGOR','SARI': 'SARIKEI',
    'SAMARAHAN': 'SAMARKAND','SADAO THAILAND': 'SADAO',   'ALSHAH ALAM': 'SHAH ALAM',
    'THE HAGUE NETHERLANDS': 'THE HAGUE','TASHKENTL': 'TASHKENT','TAKBAI SOUTHERN THAILAND': 'TAKBAI','TAK': 'TAK THAILAND',
    'VALLETTA MALTA': 'VALLETTA','VIENTIANE LAOS': 'VIENTIANE','VLADIVOSTOK RUSSIA': 'VLADIVOSTOK','VALETTA':'VALLETTA',
    'ULAANBAATAR  MONGOLIA': 'ULAANBAATAR','ULAANBAATAR MONGOLIA': 'ULAANBAATAR','ULAANBAATAAR': 'ULAANBAATAR',
    'WASHINGTON DC': 'WASHINGTON', 'KKUALA LUMPURR': 'KUALA LUMPUR',

}

df_cleaned = df_cleaned.with_columns(
    pl.col('Place').str.to_uppercase()
)
for old, new in place_corrections.items():
    df_cleaned = df_cleaned.with_columns(
        pl.col('Place').str.replace_all(old, new)
    )
df_cleaned = df_cleaned.with_columns(
    pl.col('Place').str.split(',').list.first()
)
df_cleaned = df_cleaned.with_columns(
    pl.col('Place').str.replace_all(r'[^a-zA-Z\s]+', '')
)

print(calculate_processing_metrics(df_cleaned))


Total Rows Processed: 103,070 records
Total Processing Time: 1.0008 seconds
Initial CPU Usage: 14.80%
Final CPU Usage: 4.00%
Memory Usage: 21.70%
Throughput (Records per Second): 102986.27 records/sec


In [None]:
# Count the number of articles per city
city_counts = (
    df_cleaned
    .group_by("Place")
    .agg(pl.len().alias("count"))
    .filter(pl.col("count") >= 2)
)

# Extract valid cities
valid_cities = city_counts["Place"]

# Save the valid cities to a CSV file
pl.DataFrame({"Place": valid_cities}).write_csv("valid_cities.csv")

# Filter the original DataFrame
df_cleaned = df_cleaned.filter(pl.col("Place").is_in(valid_cities))


print(calculate_processing_metrics(df_cleaned))


Total Rows Processed: 102,756 records
Total Processing Time: 1.0006 seconds
Initial CPU Usage: 5.10%
Final CPU Usage: 4.50%
Memory Usage: 21.70%
Throughput (Records per Second): 102697.99 records/sec


In [None]:
# Filter the DataFrame to keep only rows where 'Place' is in valid_cities
df_cleaned = df_cleaned.filter(pl.col("Place").is_in(valid_cities))

# If your calculate_processing_metrics function is pandas-based, convert to pandas
print(calculate_processing_metrics(df_cleaned))


Total Rows Processed: 102,756 records
Total Processing Time: 1.0005 seconds
Initial CPU Usage: 5.10%
Final CPU Usage: 5.00%
Memory Usage: 21.70%
Throughput (Records per Second): 102705.31 records/sec


> **7. Extract Date from URL**

We extract the date in `YYYY/MM` format from the URL and add it as a separate column in the dataset.

In [None]:
df_cleaned = df_cleaned.with_columns(
    pl.col("URL").str.extract(r"(\d{4}/\d{2})").alias("Date")
)

print("Date column extracted from the URL.")

print(calculate_processing_metrics(df_cleaned))


Date column extracted from the URL.
Total Rows Processed: 102,756 records
Total Processing Time: 1.0008 seconds
Initial CPU Usage: 5.60%
Final CPU Usage: 4.50%
Memory Usage: 21.70%
Throughput (Records per Second): 102675.12 records/sec


> **8. Final Dataset**

After cleaning and transforming the data, we earrange dataframe and export the cleaned dataset to a new CSV file (`finalData.csv`).

In [None]:
# Rearrange the columns to the desired order
df_cleaned = df_cleaned[['Place', 'Date', 'Category', 'Title','Teaser']]
print(calculate_processing_metrics(df_cleaned))

Total Rows Processed: 102,756 records
Total Processing Time: 1.0015 seconds
Initial CPU Usage: 6.50%
Final CPU Usage: 5.50%
Memory Usage: 21.70%
Throughput (Records per Second): 102605.21 records/sec


In [None]:
sorted_df = df_cleaned.sort("Place")

sorted_df.write_csv('finalData.csv')


# **MODIN**

#### **Data Processing using Modin library (Benchmark)**

In [None]:
!pip install modin[ray] -q


In [None]:
import modin.pandas as pd
import re
import time
import psutil


> Standalone function to calculate and return processing metrics.
- Parameters:
    - df: modin pandas DataFrame being processed
    
- Returns:
    - A dictionary with metrics such as CPU usage, memory usage, processing time, etc.



In [None]:
def calculate_processing_metrics(df):

    # Record the start time
    start_time = time.time()

    # Get initial system metrics
    initial_cpu = psutil.cpu_percent(interval=1)
    initial_memory = psutil.virtual_memory().percent

    row_count = len(df)

    # Record the end time of the operation
    end_time = time.time()

    # Get final system metrics
    final_cpu = psutil.cpu_percent(interval=1)
    final_memory = psutil.virtual_memory().percent

    # Calculate processing time
    processing_time = end_time - start_time

    # Calculate throughput (assuming rows processed are equal to the DataFrame rows)
    throughput = row_count / processing_time if processing_time > 0 else 0

    # Return the metrics as a dictionary
    # Return the metrics in a nicely formatted way
    metrics = (
        f"Total Rows Processed: {row_count:,} records\n"
        f"Total Processing Time: {processing_time:.4f} seconds\n"
        f"Initial CPU Usage: {initial_cpu:.2f}%\n"
        f"Final CPU Usage: {final_cpu:.2f}%\n"
        f"Memory Usage: {final_memory:.2f}%\n"
        f"Throughput (Records per Second): {throughput:.2f} records/sec"
    )

    return metrics

> **1. Loading Data**

We load the raw dataset from the `NST_News_Articles.csv` file. The dataset contains information such as the article's title, teaser, URL, and category.

In [None]:
rd = pd.read_excel("NST_News_Articles.xlsx")

print(calculate_processing_metrics(rd))



Total Rows Processed: 110,641 records
Total Processing Time: 1.0011 seconds
Initial CPU Usage: 56.60%
Final CPU Usage: 54.60%
Memory Usage: 22.30%
Throughput (Records per Second): 110520.42 records/sec


> **2. Handle Duplicated Data**

We remove any duplicate rows from the dataset to avoid redundant data.



In [None]:
df_cleaned = rd.drop_duplicates()
print(calculate_processing_metrics(df_cleaned))


Total Rows Processed: 106,473 records
Total Processing Time: 1.0742 seconds
Initial CPU Usage: 50.00%
Final CPU Usage: 6.00%
Memory Usage: 22.40%
Throughput (Records per Second): 99120.34 records/sec


> **3. Handle Missing Data**

We drop rows with missing values in key columns to maintain data quality.


In [None]:
df_cleaned = df_cleaned.dropna()
print(calculate_processing_metrics(df_cleaned))

Total Rows Processed: 105,393 records
Total Processing Time: 1.0721 seconds
Initial CPU Usage: 25.10%
Final CPU Usage: 4.00%
Memory Usage: 22.40%
Throughput (Records per Second): 98303.39 records/sec


> **4. Clean the Teaser Column**

We clean the `Teaser` column by removing unwanted characters and ensuring that the teaser follows a standard format (e.g., extracting place and content from the teaser).

In [None]:
df_cleaned['Teaser'] = df_cleaned['Teaser'].astype(str).str.replace(r'[^a-zA-Z0-9: ,]', '', regex=True)
print(calculate_processing_metrics(df_cleaned))

Total Rows Processed: 105,393 records
Total Processing Time: 1.0007 seconds
Initial CPU Usage: 48.00%
Final CPU Usage: 5.00%
Memory Usage: 22.40%
Throughput (Records per Second): 105318.57 records/sec


In [None]:
df_cleaned = df_cleaned[df_cleaned['Teaser'].str.contains(':')]
print(calculate_processing_metrics(df_cleaned))

Total Rows Processed: 103,070 records
Total Processing Time: 1.0449 seconds
Initial CPU Usage: 46.80%
Final CPU Usage: 4.10%
Memory Usage: 22.40%
Throughput (Records per Second): 98638.17 records/sec


> **5. Splitting the place from 'Teaser' column**


In [None]:
# Split "Teaser" into "Place" and "Teaser_New"
df_cleaned[['Place', 'Teaser']] = df_cleaned['Teaser'].str.split(':', n=1, expand=True)

print(calculate_processing_metrics(df_cleaned))

#KUALA LUMPUR: The Central Database Hub (PADU) system has recorded a total of 2.38 million individual information updates



Total Rows Processed: 103,070 records
Total Processing Time: 1.0007 seconds
Initial CPU Usage: 19.80%
Final CPU Usage: 55.30%
Memory Usage: 22.30%
Throughput (Records per Second): 102998.93 records/sec


> **6. Extract and Standardize Place Names**

We standardize the place names, convert them to uppercase, and remove any country names or other non-relevant information.

In [None]:
place_corrections = {
    'ALOR STAR': 'ALOR SETAR', 'AOR SETAR': 'ALOR SETAR','LOR STAR':'ALOR SETAR','ASTANA KAZAKHSTAN':'ASTANA',
    'BALIK PULAI':'BALIK PULAU','BATANG AI': 'BATANG KALI', 'BAGAN DATOH':'BAGAN DATUK',
    'CAMERON HIGHLAND': 'CAMERON HIGHLANDS','CHIANGMAI': 'CHIANG MAI','COLOMBO SRI LANKA': 'COLOMBO',
    'FRANK': 'FRANKFURT',
    'GUAMUSANG': 'GUA MUSANG','GUA MUSANG POS SIMPOR': 'GUA MUSANG',
    'DANANG': 'DA NANG',
    'GEOGE TOWN': 'GEORGE TOWN','GEORGETOWN': 'GEORGE TOWN','JERTIH':'JERTEH',
    'JOHOR BARU': 'JOHOR BAHRU', 'JOHOR BAHU': 'JOHOR BAHRU','JOHOR BHARU': 'JOHOR BAHRU','JOHOR BARY': 'JOHOR BAHRU','JOHOR BAHARU': 'JOHOR BAHRU',
    'JOHOR BARU KUALA LUMPUR':'JOHOR BAHRU','JOHOR BARUSINGAPORE':'JOHOR BAHRU',
    'KUALA KUBU BARU':'KUALA KUBU BAHRU','KUALA KUBU BAHARU':'KUALA KUBU BAHRU',
    'UALA LUMPUR': 'KUALA LUMPUR','KUALKUALA LUMPUR':'KUALA LUMPUR','SEPT  KUALA LUMPUR': 'KUALA LUMPR','KKUALA LUMUR': 'KUALA LUMPUR',
    'KIALA LUMUPUR': 'KUALA LUMPUR', 'IKUALA LUMPUR': 'KUALA LUMPUR','KUALAA LUMPUR':'KUALA LUMPUR','KUALALUMPUR':'KUALA LUMPUR',
    'KUALA LUMUR':'KUALA LUMPUR','KUALA LUMPU':'KUALA LUMPUR','KUALA LUMPURHONG KONG':'KUALA LUMPUR','KUALA LIMPUR':'KUALA LUMPUR',
    'KUALA LUMPURJAKARTA':'KUALA LUMPUR','KUALA KUMPUR':'KUALA LUMPUR','KUALA NERUS TERENGGANU':'KUALA NERUS',
    'KUALATERENGGANU':'KUALA TERENGGANU','KUALA TERENGANU':'KUALA TERENGGANU','KUALA TERENGAGNU':'KUALA TERENGGANU','KUALA TENGGANU':'KUALA TERENGGANU',
    'KULA LUMPUR':'KUALA LUMPUR','KUCHINGL':'KUCHING','KUANG':'KLUANG',
    'KUAL LUMPUR':'KUALA LUMPUR','KUALA  LUMPUR':'KUALA LUMPUR',
    'UALA TERENGGANU': 'KUALA TERENGGANU','KKOTA KINABALU':'KOTA KINABALU','KOTA KINABAU':'KOTA KINABALU','KOTA KINBALU':'KOTA KINABALU','KOTA  KINABALU':'KOTA KINABALU',
    'KOTA  BARU':'KOTA BAHRU', 'KOTA BAHARU':'KOTA BAHRU','KOTA BARU':'KOTA BAHRU','KOTA BARUGEORGE TOWN':'KOTA BAHRU',
    'LABUAN BAJO INDONESIA':'LABUAN BAJO','LONDONKUALA LUMPUR':'LONDON','LONDON TUES':'LONDON','LENGONG':'LENGGONG','LAMGKAWI':'LANGKAWI',
    'MARNG':'MARANG','MELAKA': 'MALACCA','MEKALA':'MALACCA','MANAMA BAHRAIN': 'MANAMA',
    'NIBONG TEBA':'NIBONG TEBAL','NEW DELHI INDIA':'NEW DELHI','NEW DELH':'NEW DELHI','NARATHIWAT SOUTHERN THAILAND':'NARATHIWAT','MUNDOK SOUTHERN THAILAND':'MUNDOK',
    'PARISBEIJING':'PARIS',
    'PUTRAJAYAS': 'PUTRAJAYA','PUTRAYAJA': 'PUTRAJAYA','PUTRJAYA': 'PUTRAJAYA','PPUTRAJAYA': 'PUTRAJAYA','PATTANI THAILAND':'PATTANI','PASIR PUTIH':'PASIR PUTEH',
    'PORT MORESBY PAPUA NEW GUINEA': 'PORT MORESBY','PANGKOR ISLAND':'PANGKOR','PULAU PERHENTIAN KECIL TERENGGANU':'PULAU PERHENTIAN',
    'SEBERANG PERAI': 'SEBERANG PRAI','SUNNYLANDS CALIFORNIA': 'SUNNYLANDS','SUNGAI GOLOK THAILAND':'SUNGAI GOLOK',
    'SUBANG': 'SUBANG JAYA','SONGKLA': 'SONGKHLA','SHAH  ALAM': 'SHAH ALAM','SEMENYEH': 'SEMENYIH','SELANGAU': 'SELANGOR','SARI': 'SARIKEI',
    'SAMARAHAN': 'SAMARKAND','SADAO THAILAND': 'SADAO',   'ALSHAH ALAM': 'SHAH ALAM',
    'THE HAGUE NETHERLANDS': 'THE HAGUE','TASHKENTL': 'TASHKENT','TAKBAI SOUTHERN THAILAND': 'TAKBAI','TAK': 'TAK THAILAND',
    'VALLETTA MALTA': 'VALLETTA','VIENTIANE LAOS': 'VIENTIANE','VLADIVOSTOK RUSSIA': 'VLADIVOSTOK','VALETTA':'VALLETTA',
    'ULAANBAATAR  MONGOLIA': 'ULAANBAATAR','ULAANBAATAR MONGOLIA': 'ULAANBAATAR','ULAANBAATAAR': 'ULAANBAATAR',
    'WASHINGTON DC': 'WASHINGTON', 'KKUALA LUMPURR':'KUALA LUMPUR',

}

df_cleaned['Place'] = df_cleaned['Place'].str.upper()
for old, new in place_corrections.items():
    df_cleaned['Place'] = df_cleaned['Place'].str.replace(old, new, regex=False)
df_cleaned['Place'] = df_cleaned['Place'].str.split(',').str[0]
df_cleaned['Place'] = df_cleaned['Place'].str.replace(r'[^a-zA-Z\s]+', '', regex=True)

print(calculate_processing_metrics(df_cleaned))


Total Rows Processed: 103,070 records
Total Processing Time: 1.0008 seconds
Initial CPU Usage: 100.00%
Final CPU Usage: 100.00%
Memory Usage: 22.40%
Throughput (Records per Second): 102990.74 records/sec


In [None]:
# Count the number of articles per city
city_counts = df_cleaned['Place'].value_counts()

# Set a threshold: keep only cities with at least N articles
threshold = 2
valid_cities = city_counts[city_counts >= threshold].index

# Save the valid cities to a CSV file
pd.Series(valid_cities).to_csv('valid_cities.csv', index=False, header=False)

# Filter the DataFrame to keep only valid cities
df_cleaned = df_cleaned[df_cleaned['Place'].isin(valid_cities)]

# Print processing metrics
print(calculate_processing_metrics(df_cleaned))


Total Rows Processed: 102,756 records
Total Processing Time: 1.0608 seconds
Initial CPU Usage: 37.20%
Final CPU Usage: 48.30%
Memory Usage: 21.70%
Throughput (Records per Second): 96862.92 records/sec


In [None]:
# Filter the DataFrame to keep only rows where 'Place' is in valid_cities
df_cleaned = df_cleaned[df_cleaned['Place'].isin(valid_cities)]
print(calculate_processing_metrics(df_cleaned))

Total Rows Processed: 102,756 records
Total Processing Time: 1.0965 seconds
Initial CPU Usage: 77.90%
Final CPU Usage: 43.40%
Memory Usage: 22.00%
Throughput (Records per Second): 93712.48 records/sec


> **7. Extract Date from URL**

We extract the date in `YYYY/MM` format from the URL and add it as a separate column in the dataset.

In [None]:
df_cleaned['Date'] = df_cleaned['URL'].str.extract(r'(\d{4}/\d{2})')
print("Date column extracted from the URL.")
print(calculate_processing_metrics(df_cleaned))  # Metrics after extracting the Date column
print("")

Date column extracted from the URL.
Total Rows Processed: 102,756 records
Total Processing Time: 1.0009 seconds
Initial CPU Usage: 57.80%
Final CPU Usage: 4.10%
Memory Usage: 21.00%
Throughput (Records per Second): 102662.52 records/sec



> **8. Final Dataset**

After cleaning and transforming the data, we earrange dataframe and export the cleaned dataset to a new CSV file (`finalData.csv`).

In [None]:
# Rearrange the columns to the desired order
df_cleaned = df_cleaned[['Place', 'Date', 'Category', 'Title','Teaser']]
print(calculate_processing_metrics(df_cleaned))

Total Rows Processed: 102,756 records
Total Processing Time: 1.0007 seconds
Initial CPU Usage: 41.20%
Final CPU Usage: 4.60%
Memory Usage: 21.10%
Throughput (Records per Second): 102684.34 records/sec


In [None]:
sorted_df = df_cleaned.sort_values(by=['Place'])
sorted_df.to_csv('finalData.csv',index=False)