<a href="https://colab.research.google.com/github/dayel1/AI/blob/main/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np


def handle_outliers(df, column, upper_q=0.99):
    """Cap the values of ``column`` at its ``upper_q`` quantile.

    Values above the quantile are replaced by the quantile itself; values at
    or below it are left as they are.

    Args:
        df: DataFrame holding the column. The column is reassigned on this
            same object (no copy is made here).
        column: Name of the column to cap.
        upper_q: Quantile, passed to ``Series.quantile``, used as the
            upper bound.

    Returns:
        The same ``df`` object, with ``column`` capped.
    """
    values = df[column]
    cap = values.quantile(upper_q)
    df[column] = values.clip(upper=cap)
    return df

def clean_strings(df, column):
    """Normalize the text in ``column``: lowercase, drop special characters, trim.

    Every character other than ``a-z``, ``0-9`` and whitespace is removed
    after lowercasing. Note that this also removes non-ASCII letters.

    Missing values (NaN/None) stay missing instead of being turned into the
    literal strings ``"nan"`` / ``"none"`` by the string conversion.

    Args:
        df: DataFrame holding the column. The column is reassigned on this
            same object (no copy is made here).
        column: Name of the column to normalize.

    Returns:
        The same ``df`` object, with ``column`` normalized.
    """
    raw = df[column]
    # Remember which rows hold real values before astype(str) stringifies
    # the missing ones.
    present = raw.notna()
    normalized = (
        raw.astype(str)
        .str.lower()
        .str.replace(r"[^a-z0-9\s]", "", regex=True)
        # Strip last: removing punctuation can expose new leading/trailing
        # whitespace (e.g. "abha !" -> "abha ").
        .str.strip()
    )
    # Rows that were missing on input are set back to missing.
    df[column] = normalized.where(present)
    return df

def clean_dates(df, column):
    """Parse ``column`` into timezone-aware UTC timestamps.

    Values that cannot be parsed become ``NaT`` (``errors='coerce'``).
    Timezone-naive values are interpreted as UTC; timezone-aware values are
    converted to UTC.

    Args:
        df: DataFrame holding the column. The column is reassigned on this
            same object (no copy is made here).
        column: Name of the column to parse.

    Returns:
        The same ``df`` object, with ``column`` as UTC datetimes.
    """
    # utc=True localizes naive input and converts aware input in one step.
    # It also copes with values carrying different UTC offsets, where a
    # plain to_datetime() does not yield a datetime64 column and a later
    # ``.dt`` access would fail.
    df[column] = pd.to_datetime(df[column], errors='coerce', utc=True)
    return df


def clean_data_project(df_raw):
    """Run the full cleaning pipeline on a copy of ``df_raw``.

    Steps, in order:
      1. Coerce ``age`` and ``income`` (when present) to numeric; values
         that cannot be converted become NaN.
      2. Fill NaN in every numeric column with that column's median.
      3. Cap ``income`` outliers via ``handle_outliers``.
      4. Normalize ``city`` text via ``clean_strings``.
      5. Parse ``signup_date`` via ``clean_dates``.
    Steps 3-5 are skipped for columns that are absent.

    Args:
        df_raw: Input DataFrame; it is copied first and not modified.

    Returns:
        The cleaned DataFrame. A status line with the row count is printed
        as a side effect.
    """
    df = df_raw.copy()
    present = df.columns

    # Step 1: force the known numeric fields to a numeric dtype.
    for name in ('age', 'income'):
        if name in present:
            df[name] = pd.to_numeric(df[name], errors='coerce')

    # Step 2: median imputation over every numeric column.
    for name in df.select_dtypes(include=[np.number]).columns:
        series = df[name]
        df[name] = series.fillna(series.median())

    # Steps 3-5: column-specific cleaners, applied only when the column exists.
    if 'income' in df.columns:
        df = handle_outliers(df, 'income')
    if 'city' in df.columns:
        df = clean_strings(df, 'city')
    if 'signup_date' in df.columns:
        df = clean_dates(df, 'signup_date')

    print(f"تمت المعالجة بنجاح. عدد الصفوف: {len(df)}")
    return df


if __name__ == "__main__":
    # Demo input with one problem per column: a non-numeric age, an extreme
    # and a missing income, inconsistently formatted city names, and an
    # unparseable date.
    data = {
        'age': ['25', '30', 'not_recorded', '45'],
        'income': [5000, 7000, 1000000, np.nan],
        'city': [' Riyadh ', 'JEDDAH!', 'dammam ', 'Riyadh'],
        'signup_date': ['2023-01-01', '2023-02-15', 'invalid_date', '2023-05-20'],
    }
    df_test = pd.DataFrame(data)

    # Clean the demo frame, then write the result to CSV without the index.
    df_cleaned = clean_data_project(df_test)
    df_cleaned.to_csv('cleaned_dataset.csv', index=False)
    print("تم حفظ البيانات المنظفة في 'cleaned_dataset.csv'")

تمت المعالجة بنجاح. عدد الصفوف: 4
تم حفظ البيانات المنظفة في 'cleaned_dataset.csv'
