In [1]:
import pandas as pd
import re

def remove_nan(df: pd.DataFrame) -> pd.DataFrame:
    """Return a new frame without the rows that contain a missing value.

    Uses ``DataFrame.dropna`` with its defaults, so a row is discarded as
    soon as *any* of its columns is missing. The frame passed in is left
    untouched.

    Args:
        df: Frame to filter.

    Returns:
        The rows of ``df`` that have no missing values, with their original
        index labels.
    """
    return df.dropna()

def clean_non_ascii(text):
    """Replace every run of non-ASCII characters in ``text`` with one space.

    A "run" is one or more consecutive characters outside the range
    ``\\x00``-``\\x7F``; the whole run collapses to a single ``' '``, so the
    result may contain leading, trailing or doubled spaces.

    Args:
        text: String to clean.

    Returns:
        The cleaned string.
    """
    non_ascii_run = re.compile(r'[^\x00-\x7F]+')
    return non_ascii_run.sub(' ', text)

def replace_non_ascii_in_dataframe(df, columns=['title', 'tags', 'description']):
    """Return a copy of ``df`` with non-ASCII runs replaced in ``columns``.

    Each string cell in the listed columns is passed through
    ``clean_non_ascii``; cells that are not ``str`` instances (numbers,
    missing values, ...) are kept exactly as they are.

    The input frame is never modified: the replacement is done on a copy,
    so callers that keep a reference to the original data still see it
    unchanged, and passing a slice of another frame does not write through
    to (or warn about) the parent.

    Args:
        df: Frame holding the text columns.
        columns: Names of the columns to clean. The default list is only
            iterated, never mutated.

    Returns:
        A new DataFrame with the same index and columns as ``df``.

    Raises:
        KeyError: If one of ``columns`` is not present in ``df``.
    """
    # Copy first so the assignments below cannot leak into the caller's frame.
    cleaned = df.copy()
    for column in columns:
        cleaned[column] = cleaned[column].apply(
            lambda x: clean_non_ascii(x) if isinstance(x, str) else x
        )
    return cleaned

def remove_rows_with_empty_strings(df, columns=['title', 'tags', 'description']):
    """Drop the rows in which any of ``columns`` holds an empty string.

    Only an exact ``""`` counts as empty; whitespace-only strings, missing
    values and non-string cells are kept. The comparison is done column-wise
    in one vectorised pass rather than row by row in Python, and it also
    works on a frame that has no rows.

    Args:
        df: Frame to filter.
        columns: List of column names to inspect. The default list is only
            read, never mutated.

    Returns:
        The rows of ``df`` (all columns, original index labels) where none
        of the inspected columns equals ``""``.

    Raises:
        KeyError: If one of ``columns`` is not present in ``df``.
    """
    # True for every row that has at least one "" among the inspected columns.
    has_empty_string = df[columns].eq("").any(axis=1)
    return df[~has_empty_string]

def clean_data(df: pd.DataFrame) -> pd.DataFrame:
    """Run the full cleaning pipeline on a raw frame.

    The stages are applied in this order, each with its default columns:

    1. ``remove_nan`` - drop rows that contain a missing value.
    2. ``replace_non_ascii_in_dataframe`` - replace non-ASCII runs in the
       text columns with a space.
    3. ``remove_rows_with_empty_strings`` - drop rows where a text column
       is the empty string.

    Args:
        df: Raw frame; expected to contain the ``title``, ``tags`` and
            ``description`` columns used by stages 2 and 3.

    Returns:
        The cleaned frame.
    """
    # NOTE(review): stage 2 turns a fully non-ASCII string into ' ' rather
    # than '', so such rows are not removed by stage 3 - confirm that this
    # is the intended outcome.
    return (
        df.pipe(remove_nan)
        .pipe(replace_non_ascii_in_dataframe)
        .pipe(remove_rows_with_empty_strings)
    )

In [2]:
from pathlib import Path

# Folder that holds the raw exports; the cleaned files are written next to them.
directory_path = './data/raw'

# The raw exports are numbered 0 through 7.
file_paths = [
    f"{directory_path}/Education_videos_{i}.csv"
    for i in range(8)
]

for file_path in file_paths:
    print(f'Processing {Path(file_path).name}...')

    # Load one raw file and run it through the cleaning pipeline.
    df = pd.read_csv(file_path)
    df_cleaned = clean_data(df)

    # "<name>.csv" -> "<name>_cleaned.csv", saved in the same directory.
    cleaned_name = Path(file_path).stem + '_cleaned.csv'
    # NOTE(review): to_csv writes the index as an extra unnamed first column
    # by default - confirm downstream readers expect it.
    df_cleaned.to_csv(directory_path + '/' + cleaned_name)


Processing Education_videos_0.csv...
Processing Education_videos_1.csv...
Processing Education_videos_2.csv...
Processing Education_videos_3.csv...
Processing Education_videos_4.csv...
Processing Education_videos_5.csv...
Processing Education_videos_6.csv...
Processing Education_videos_7.csv...
