<a href="https://colab.research.google.com/github/fatymazahrae/DataCleanOptimizer/blob/main/COptimizerpart2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Project Guide  
------------  
- [Project Overview](#project-overview)  
- [Part 1: Reading Data - Exploratory Data Analysis](#I)
- [Part 2: Visual Data Analysis](#II)
- [Part 3: Data Pre-processing & Preparation](#III)
- [Part 4: Process Automation](#IV)


<a id="project-overview"></a>

# Project Overview

![image.png](attachment:image.png)

##### This project aims to optimize data cleaning and preprocessing algorithms to prepare high-quality datasets for subsequent analyses. The optimization focuses not only on the efficiency of the algorithms but also on their ability to handle large volumes of data while maintaining information quality.

### About the dataset



In [None]:
# Mount Google Drive at /content/drive so that files stored there can be read
# through ordinary file paths under that directory.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import dask.dataframe as dd
import warnings
# Silence every warning for the rest of the session. This also hides
# deprecation notices emitted by the libraries imported above.
warnings.filterwarnings('ignore')

In [None]:
# NOTE(review): Drive is already mounted at the same path by an earlier cell of
# this notebook, so this repeated mount looks redundant; confirm before removing.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import dask.dataframe as dd

# Location of adult.csv inside the mounted Google Drive
file_path = '/content/drive/My Drive/adult.csv'

# Open the CSV as a Dask DataFrame (the call mirrors pd.read_csv). The result is
# processed in partitions rather than as one in-memory pandas frame.
df = dd.read_csv(file_path)

# head() only evaluates the first few rows, which gives a cheap preview
print(df.head())


In [None]:
# Let Dask reduce the summary statistics and collect only the small result
# table, instead of first loading the entire dataset into one pandas DataFrame.
# Note: Dask estimates the percentile rows, so they may differ slightly from
# the exact pandas values.
df.describe().compute()

In [None]:
# (rows, columns). len(df) counts the rows partition by partition, so the full
# dataset never has to be materialised as a single pandas DataFrame just to
# read its shape.
(len(df), len(df.columns))

In [None]:
import hashlib

In [None]:
# Define the columns used for near-duplicate detection
NEAR_DUPLICATES_COLUMNS = ['workclass', 'fnlwgt', 'education', 'education-num',
                           'marital-status', 'occupation', 'relationship', 'race', 'sex',
                           'capital-gain', 'capital-loss', 'hours-per-week', 'native-country']

def compute_hash(row):
    """Return a SHA-256 hex digest that identifies a row for near-duplicate detection.

    Only the columns listed in ``NEAR_DUPLICATES_COLUMNS`` that are present in
    ``row`` contribute to the digest; any other column is ignored, so two rows
    that differ only in ignored columns receive the same digest.

    Args:
        row: A pandas Series whose index holds the column names.

    Returns:
        str: The 64-character hexadecimal SHA-256 digest.
    """
    available_cols = [col for col in NEAR_DUPLICATES_COLUMNS if col in row.index]
    # Separate the fields with the ASCII unit separator. Joining them with no
    # delimiter makes different rows collide (e.g. '11' + '0' and '1' + '10'
    # both give '110'), which would wrongly drop distinct rows as duplicates.
    row_str = '\x1f'.join(str(row[col]) for col in available_cols)
    return hashlib.sha256(row_str.encode('utf-8')).hexdigest()

def process_partition(partition):
    """Hash every row of one partition with ``compute_hash``.

    Args:
        partition: A pandas DataFrame holding one partition of the dataset.

    Returns:
        The result of applying ``compute_hash`` to each row (``axis=1``),
        indexed like ``partition``.
    """
    row_hashes = partition.apply(compute_hash, axis=1)
    return row_hashes

# Describe what process_partition returns (a string Series named 'hash_value')
# so that map_partitions does not have to infer the output type itself.
meta = pd.Series(name='hash_value', dtype=str)
df = df.assign(hash_value=df.map_partitions(process_partition, meta=meta))

# Rows that share a hash are treated as duplicates; keep one row per hash.
df = df.drop_duplicates(subset=['hash_value'])

In [None]:
# (rows, columns) after de-duplication. len(df) counts the rows without
# materialising the whole dataset as a single pandas DataFrame.
(len(df), len(df.columns))

In [None]:
# The helper hash column has served its purpose; remove it again.
df = df.drop(columns=['hash_value'])

In [None]:
def detect_dtype(data):
    """Split the columns of ``data`` into numerical and categorical feature names.

    A column counts as numerical when its dtype kind is one of ``b`` (boolean),
    ``i``/``u`` (signed/unsigned integer), ``f`` (float) or ``c`` (complex).
    Every other column is reported as categorical.

    Args:
        data: A DataFrame-like object exposing ``columns`` and a ``dtype`` on
            each selected column.

    Returns:
        tuple: ``(num_features, cat_features)``, two lists of column names,
        each in the original column order.
    """
    numeric_kinds = 'biufc'
    is_numeric = {
        col: data[col].dtype.kind in numeric_kinds
        for col in data.columns
    }
    num_features = [col for col, numeric in is_numeric.items() if numeric]
    cat_features = [col for col, numeric in is_numeric.items() if not numeric]
    return num_features, cat_features

# Column names of df split by dtype kind: numerical names first, every other
# column second.
num_features, cat_features = detect_dtype(df)

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder

# --- 1. Missing values ------------------------------------------------------
# Categorical gaps become an explicit 'missing' category; numerical gaps are
# replaced by the column mean computed over the whole dataset.
for ftr in cat_features:
    df[ftr] = df[ftr].fillna('missing')

mean_values = df[num_features].mean().compute()
df[num_features] = df[num_features].fillna(mean_values)

# --- 2. Min-max scaling of the numerical features ---------------------------
# Fit the scaler on the exact per-column minimum and maximum of the full
# dataset. Fitting on a random 10% sample is non-deterministic and can miss
# the true extremes, which lets scaled values fall outside [0, 1].
# `data_sample` keeps its original name; it now holds exactly two rows
# (the column minima and the column maxima).
min_values = df[num_features].min().compute()
max_values = df[num_features].max().compute()
data_sample = pd.DataFrame([min_values, max_values], columns=num_features)
scaler = MinMaxScaler()
scaler.fit(data_sample)


def scale_partition(partition):
    """Return the numerical columns of one partition scaled by the fitted ``scaler``.

    The result reuses ``partition.index`` so that assigning it back to the
    Dask DataFrame lines every scaled row up with the row it came from. A
    frame with a fresh 0..n-1 index would be aligned by label and yield
    shifted or NaN values once rows have been removed or reordered.

    Args:
        partition: A pandas DataFrame holding one partition of ``df``.

    Returns:
        pandas.DataFrame: Columns ``num_features`` as floats, indexed like
        ``partition``.
    """
    if len(partition) == 0:
        # MinMaxScaler.transform rejects zero-row input; pass empty
        # partitions through with the expected dtype instead.
        return partition[num_features].astype('float64')
    scaled = scaler.transform(partition[num_features])
    return pd.DataFrame(scaled, columns=num_features, index=partition.index)


df[num_features] = df.map_partitions(
    scale_partition, meta={col: 'float64' for col in num_features})

# --- 3. Label encoding of the categorical features --------------------------

def label_encode_partition(partition, encoders):
    """Replace categorical columns of one partition by their integer codes.

    Args:
        partition: A pandas DataFrame holding one partition of ``df``.
        encoders: Mapping of column name to an already fitted ``LabelEncoder``.
            The encoders are only used to transform and are never re-fitted
            here, so a category receives the same code in every partition.

    Returns:
        pandas.DataFrame: A copy of ``partition`` with each column named in
        ``encoders`` replaced by its encoded values.
    """
    # Work on a copy so the caller's partition is not modified in place.
    partition = partition.copy()
    for col, encoder in encoders.items():
        partition[col] = encoder.transform(partition[col])
    return partition


# Fit one LabelEncoder per categorical column on the distinct values of the
# whole dataset, so that the value -> code mapping is global and consistent.
label_encoders = {}
for col in cat_features:
    categories = df[col].unique().compute()
    label_encoders[col] = LabelEncoder().fit(categories)

# Encoded columns hold integers, so the metadata must say so. Reusing `df` as
# meta would keep advertising their previous dtype.
encoded_meta = {
    col: ('int64' if col in label_encoders else dtype)
    for col, dtype in df.dtypes.items()
}

# Apply the label encoding partition by partition
df = df.map_partitions(label_encode_partition, encoders=label_encoders, meta=encoded_meta)

In [None]:
from keras.layers import Input, Embedding, Flatten, Dense, concatenate
from keras.models import Model
from keras.optimizers import Adam

# Model input for the numerical features: one value per entry of num_features
num_input = Input(shape=(len(num_features),), name='num_input')

def embed_ftr(ftr):
    """Build the input and embedding branch for one categorical feature.

    Args:
        ftr: Name of a column of the global ``df``.

    Returns:
        tuple: ``(cat_input, cat_flatten)``. The first item is the Keras
        ``Input`` of shape ``(1,)`` named ``cat_input_<ftr>``; the second is
        the flattened output of a 2-dimensional embedding applied to it.
    """
    cat_input = Input(shape=(1,), name=f'cat_input_{ftr}')
    # The embedding gets one row per distinct value found in the column.
    n_categories = len(df[ftr].unique())
    embedding_layer = Embedding(input_dim=n_categories, output_dim=2, input_length=1)
    cat_flatten = Flatten()(embedding_layer(cat_input))
    return cat_input, cat_flatten

# Build one (input, flattened embedding) pair per categorical feature and
# collect the two halves in parallel lists, in cat_features order.
cat_inputs, embedded_cats = [], []

for ftr in cat_features:
    cat_input, embedded_cat = embed_ftr(ftr)
    cat_inputs.append(cat_input)
    embedded_cats.append(embedded_cat)

# # Combine numerical input and embedded categorical features
# combined = concatenate([num_input] + embedded_cats)