# HateSQL
#### made by Andrian Hyriak and Victor Golomovzuy

## Library example usage

In [1]:
import pandas as pd
import numpy as np
from dataclean.src.data_valydator import DataValidator
from dataclean.src.feature_engineering import FeatureEngineering
from dataclean.src.missing_data_handler import MissingDataHandler
from dataclean.src.outliner_handler import OutlierHandler
from dataclean.src.scaling_encoding_handler import ScalingEncodingHandler
from dataclean.src.transformation_normalization_handler import TransformationNormalizationHandler



ModuleNotFoundError: No module named 'dataclean'

In [2]:
# Load the sleep-health CSV (path is relative to the notebook's working directory)
# and print the column/dtype summary.
# NOTE(review): the recorded output lists only 155 non-null "Sleep Disorder" values;
# presumably pandas' default NA parsing turns the literal string "None" into NaN —
# confirm against the CSV before treating those rows as genuinely missing.
DATASET_PATH = 'Sleep_health_and_lifestyle_dataset.csv'
df = pd.read_csv(DATASET_PATH)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 374 entries, 0 to 373
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Person ID                374 non-null    int64  
 1   Gender                   374 non-null    object 
 2   Age                      374 non-null    int64  
 3   Occupation               374 non-null    object 
 4   Sleep Duration           374 non-null    float64
 5   Quality of Sleep         374 non-null    int64  
 6   Physical Activity Level  374 non-null    int64  
 7   Stress Level             374 non-null    int64  
 8   BMI Category             374 non-null    object 
 9   Blood Pressure           374 non-null    object 
 10  Heart Rate               374 non-null    int64  
 11  Daily Steps              374 non-null    int64  
 12  Sleep Disorder           155 non-null    object 
dtypes: float64(1), int64(7), object(5)
memory usage: 38.1+ KB


In [3]:
# Wrap the loaded DataFrame in the library's DataValidator.
engine = DataValidator(df)
# Print the validator's overview; the recorded output shows the dataset shape,
# the per-column dtypes and the first few rows.
engine.describe_data()


Basic Dataset Information:
--------------------------
Dataset Shape: (374, 13)

Data Types:
Person ID                    int64
Gender                      object
Age                          int64
Occupation                  object
Sleep Duration             float64
Quality of Sleep             int64
Physical Activity Level      int64
Stress Level                 int64
BMI Category                object
Blood Pressure              object
Heart Rate                   int64
Daily Steps                  int64
Sleep Disorder              object
dtype: object

First few rows of data:
   Person ID Gender  Age            Occupation  Sleep Duration  \
0          1   Male   27     Software Engineer             6.1   
1          2   Male   28                Doctor             6.2   
2          3   Male   28                Doctor             6.2   
3          4   Male   28  Sales Representative             5.9   
4          5   Male   28  Sales Representative             5.9   

   Quality of Sle

## Additional features

In [4]:

# Feature-engineering walkthrough on a five-row toy frame.
# Each step receives the frame produced by the previous one, so the steps are
# written as a single .pipe chain (this assumes every FeatureEngineering method
# returns a DataFrame, as the printed result indicates for the final step).
fe = FeatureEngineering()

data = (
    pd.DataFrame({
        'age': [10, 20, 15, 25, 30],
        'income': [100, 150, 120, 200, 180],
        'category': ['A', 'A', 'B', 'B', 'C'],
        'target': [1, 0, 1, 1, 0]
    })
    # Interaction terms between 'age' and 'income' (degree 2).
    .pipe(fe.generate_interaction_terms, columns=['age', 'income'], degree=2)
    # Quantile-based binning of 'age' into three labelled buckets.
    .pipe(fe.binning, column='age', bins=3, labels=['Low', 'Medium', 'High'], strategy='quantile')
    # Polynomial features of 'age' up to degree 3.
    .pipe(fe.create_polynomial_features, columns=['age'], degree=3)
    # Dummy columns for 'category', using a 0.2 frequency threshold.
    .pipe(fe.expand_categorical, column='category', threshold=0.2)
    # Mean of 'target' within each 'category' group.
    .pipe(fe.group_and_aggregate, column='category', target='target', agg_func='mean')
    # Two lagged copies of 'age'.
    .pipe(fe.generate_lagged_features, column='age', lags=2)
    # Rolling mean and standard deviation of 'income' over a window of 2.
    .pipe(fe.rolling_statistics, column='income', window=2, stats=['mean', 'std'])
)

# Show the fully engineered frame.
print(data)


   age  income category  target  age_x_income age_binned  age_pow_2  \
0   10     100        A       1          1000        Low        100   
1   20     150        A       0          3000     Medium        400   
2   15     120        B       1          1800        Low        225   
3   25     200        B       1          5000       High        625   
4   30     180        C       0          5400       High        900   

   age_pow_3  category_A  category_B  category_other  category_mean_target  \
0       1000           1           0               0                   0.5   
1       8000           1           0               0                   0.5   
2       3375           0           1               0                   1.0   
3      15625           0           1               0                   1.0   
4      27000           0           0               1                   0.0   

   age_lag_1  age_lag_2  income_rolling_mean_2  income_rolling_std_2  
0        NaN        NaN          

## missing values handling

In [5]:
# Toy table with gaps in every column; both None and np.nan mark missing entries.
df = pd.DataFrame({
    "Name": ["Alice", "Bob", "Charlie", None, "Eve", "Frank", None],
    "Age": [25, np.nan, 30, 22, None, 35, 40],
    "Gender": ["Female", "Male", None, "Female", "Female", None, "Male"],
    "Sleep Disorder": [None, "Insomnia", None, "Sleep Apnea", "Insomnia", None, None],
    "Hours Worked": [40, 50, 45, np.nan, 38, None, 60],
    "Salary": [50000, 60000, 55000, None, 48000, 62000, None],
})

# Show the raw table before any handling.
print(df)

# Hand the frame to the library's missing-data helper.
missing_handler = MissingDataHandler(df)

# Per-column fill recommendations (the recorded output lists the most correlated
# feature and a recommended fill method for each column).
missing_handler.summarize_missing_impact()

# Detailed missing-data report.
missing_handler.detect_missing()

# Patterns in where values are missing.
missing_handler.identify_missing_patterns()

# Fill the gaps in 'Sleep Disorder' with a fixed label (constant strategy, not a mean).
# NOTE(review): "healthy sleap" looks like a typo for "healthy sleep"; it is left
# unchanged here because it is the value written into the data.
missing_handler.fill_missing(
    strategy="constant",
    value="healthy sleap",
    columns=['Sleep Disorder'],
)



      Name   Age  Gender Sleep Disorder  Hours Worked   Salary
0    Alice  25.0  Female           None          40.0  50000.0
1      Bob   NaN    Male       Insomnia          50.0  60000.0
2  Charlie  30.0    None           None          45.0  55000.0
3     None  22.0  Female    Sleep Apnea           NaN      NaN
4      Eve   NaN  Female       Insomnia          38.0  48000.0
5    Frank  35.0    None           None           NaN  62000.0
6     None  40.0    Male           None          60.0      NaN
Missing Data Impact Analysis and Fill Recommendations:
               Most Correlated Feature Recommended Fill Method
Name                              None                    mode
Age                               None                    mean
Gender                            None                    mode
Sleep Disorder                    None                    mode
Hours Worked                      None                    mean
Salary                            None                    mean


## Outlier handling

In [6]:

# Outlier walkthrough: detect, drop, then cap extreme values in two numeric columns.
handler = OutlierHandler()
numeric_columns = ['price', 'quantity']

# Ten rows; positions 4 and 9 hold values far above the rest in both columns.
data = pd.DataFrame({
    'price': [10, 12, 14, 15, 100, 15, 12, 14, 15, 100],
    'quantity': [1, 2, 2, 2, 200, 2, 1, 2, 2, 200]
})

# 1) Report which row indices are flagged per column.
outliers = handler.detect_outliers(data, columns=numeric_columns)
print("Outliers detected:", outliers)

# 2) Drop the flagged rows.
cleaned_data = handler.remove_outliers(data, columns=numeric_columns)
print("\nData after removing outliers:\n", cleaned_data)

# 3) Alternatively keep every row and clip the values to the handler's bounds.
capped_data = handler.cap_outliers(data, columns=numeric_columns)
print("\nData after capping outliers:\n", capped_data)


Outliers detected: {'price': [4, 9], 'quantity': [0, 4, 6, 9]}

Data after removing outliers:
    price  quantity
1     12         2
2     14         2
3     15         2
5     15         2
7     14         2
8     15         2

Data after capping outliers:
    price  quantity
0  10.00       2.0
1  12.00       2.0
2  14.00       2.0
3  15.00       2.0
4  18.75       2.0
5  15.00       2.0
6  12.00       2.0
7  14.00       2.0
8  15.00       2.0
9  18.75       2.0


## scaling encoding handler

In [8]:

# Demo frame: every column except "Employee ID" contains one missing entry (None).
# Note that "Sleep Disorder" also holds the literal string "None", which is a value,
# not a missing entry.
df = pd.DataFrame({
    "Employee ID": [101, 102, 103, 104, 105],
    "Age": [25, 30, None, 35, 40],
    "Department": ["HR", "IT", "Finance", None, "IT"],
    "Hours Worked": [40, None, 45, 38, 60],
    "Salary": [50000, 60000, 55000, 48000, None],
    "Sleep Disorder": ["None", "Insomnia", None, "Sleep Apnea", "None"]
})
print("Original DataFrame:", df, sep="\n")

handler = ScalingEncodingHandler()

# Step 1: standard scaling of the numeric columns; gaps are filled with the mean first.
# A copy is passed so the original frame stays available for step 2.
print("\nScaling numerical features:")
scaled_df = handler.scale_features(data=df.copy(), method='standard', fillna_strategy='mean')
print(scaled_df)

# Step 2: one-hot encoding of the categorical columns; gaps are filled with the mode,
# and the first dummy of each column is dropped (useful for linear models).
print("\nEncoding categorical features:")
encoded_df = handler.encode_categorical(data=df.copy(), method='onehot', drop_first=True, fillna_strategy='mode')
print(encoded_df)

# Step 3: min-max scaling applied to the already encoded frame; remaining gaps become zero.
print("\nScaling and encoding combined:")
scaled_encoded_df = handler.scale_features(data=encoded_df.copy(), method='minmax', fillna_strategy='zero')
print(scaled_encoded_df)

Original DataFrame:
   Employee ID   Age Department  Hours Worked   Salary Sleep Disorder
0          101  25.0         HR          40.0  50000.0           None
1          102  30.0         IT           NaN  60000.0       Insomnia
2          103   NaN    Finance          45.0  55000.0           None
3          104  35.0       None          38.0  48000.0    Sleep Apnea
4          105  40.0         IT          60.0      NaN           None

Scaling numerical features:
   Employee ID  Age Department  Hours Worked    Salary Sleep Disorder
0    -1.414214 -1.5         HR     -0.746376 -0.780250           None
1    -0.707107 -0.5         IT      0.000000  1.620519       Insomnia
2     0.000000  0.0    Finance     -0.097353  0.420134           None
3     0.707107  0.5       None     -1.005985 -1.260403    Sleep Apnea
4     1.414214  1.5         IT      1.849714  0.000000           None

Encoding categorical features:
   Employee ID   Age  Hours Worked   Salary  Department_HR  Department_IT  \
0 

## Transformation Normalization Handler

In [None]:
import pandas as pd
import numpy as np

# Four numeric features; Feature_B includes a negative value (-0.1).
data = {
    "Feature_A": [10, 20, 15, 25, 30],
    "Feature_B": [0.1, 0.2, -0.1, 0.3, 0.4],
    "Feature_C": [100, 200, 150, 250, 300],
    "Feature_D": [1, 4, 9, 16, 25],
}
df = pd.DataFrame(data)
print("Original Dataset:", df, sep="\n")

handler = TransformationNormalizationHandler()

# Log transform of Feature_B; handle_negatives=True because the column contains -0.1.
df = handler.log_transform(data=df, columns=["Feature_B"], handle_negatives=True)
print("\nDataset after Log Transformation (Feature_B):", df, sep="\n")

# Rescale Feature_A and Feature_C into the range (0, 1).
df = handler.normalize(data=df, columns=["Feature_A", "Feature_C"], target_range=(0, 1))
print("\nDataset after Normalization (Feature_A, Feature_C):", df, sep="\n")

# Box-Cox is applied to Feature_C next. If the column holds any value <= 0
# (the recorded output shows a minimum of 0.00 after normalization), shift the
# whole column up so that every value becomes positive.
feature_c_min = df["Feature_C"].min()
if feature_c_min <= 0:
    offset = abs(feature_c_min) + 1
    df["Feature_C"] += offset

df = handler.boxcox_transform(data=df, columns=["Feature_C"])
print("\nDataset after Box-Cox Transformation (Feature_C):", df, sep="\n")

# Square-root transform of Feature_D; no negative handling is requested.
df = handler.sqrt_transform(data=df, columns=["Feature_D"], handle_negatives=False)
print("\nDataset after Square Root Transformation (Feature_D):", df, sep="\n")


Original Dataset:
   Feature_A  Feature_B  Feature_C  Feature_D
0         10        0.1        100          1
1         20        0.2        200          4
2         15       -0.1        150          9
3         25        0.3        250         16
4         30        0.4        300         25

Dataset after Log Transformation (Feature_B):
   Feature_A  Feature_B  Feature_C  Feature_D
0         10   0.182322        100          1
1         20   0.262364        200          4
2         15   0.000000        150          9
3         25   0.336472        250         16
4         30   0.405465        300         25

Dataset after Normalization (Feature_A, Feature_C):
   Feature_A  Feature_B  Feature_C  Feature_D
0       0.00   0.182322       0.00          1
1       0.50   0.262364       0.50          4
2       0.25   0.000000       0.25          9
3       0.75   0.336472       0.75         16
4       1.00   0.405465       1.00         25

Dataset after Box-Cox Transformation (Feature_C):
   