In [1]:
import pandas as pd
import numpy as np
import os
from data_handler import DataHandler

In [None]:
# Three-row fixture: an id column, one column per feature kind (free text,
# categorical, int, float, bool, two date-like string columns with gaps),
# plus `label` and `target` columns.
data = {
    'id': ['01', '02', '03'],
    'text_col': [
        'How can I stay ahead in the rapidly evolving AI landscape, especially with foundation models and AutoML?',
        'What does it mean for me to be an impactful and scalable engineer beyond just writing high-performance models?',
        'Where do I have critical gaps in business/product thinking that impact my ability to drive real-world AI adoption?',
    ],
    'categorical_col': ['A', 'B', 'A'],
    'int_col': [1, 2, 3],
    'float_col': [1.1, 2.2, 3.3],
    'bool_col': [True, False, True],
    # One missing entry out of three.
    'datetime_col': ['2023-01-01', None, '2023-01-03'],
    # Two missing entries out of three.
    'datetime_col2': [None, None, '2013-01-03'],
    'label': [0, 1, 0],
    'target': [10.5, 20.5, 30.5],
}

df = pd.DataFrame(data)
df.head()

Unnamed: 0,id,text_col,categorical_col,int_col,float_col,bool_col,datetime_col,datetime_col2,label,target
0,1,How can I stay ahead in the rapidly evolving A...,A,1,1.1,True,2023-01-01,,0,10.5
1,2,What does it mean for me to be an impactful an...,B,2,2.2,False,,,1,20.5
2,3,Where do I have critical gaps in business/prod...,A,3,3.3,True,2023-01-03,2013-01-03,0,30.5


In [3]:
# Location of the CSV fixture that the read_file cell below loads back.
csv_path = "tests/test.csv"
# DataFrame.to_csv does not create missing parent directories, so make sure the
# target folder exists first (a no-op when it is already there).
os.makedirs(os.path.dirname(csv_path) or ".", exist_ok=True)
# index=False keeps the positional index out of the written file.
df.to_csv(csv_path, index=False)

In [4]:
# DataHandler instance used by the cells below.
# Instantiated directly rather than through a factory function that is also
# named `handler`: rebinding the name to the function's own result made one
# name mean two different things and left the function unreachable.
handler = DataHandler()

In [None]:
# Test CSV reading: load the file written to csv_path back through the handler.
# The result of read_file is unpacked into two values; the first is rebound to
# df, the second is printed (presumably the delimiter the handler detected —
# confirm against DataHandler.read_file).
df, sep = handler.read_file(str(csv_path))
print(sep)
df.head()

,


Unnamed: 0,id,text_col,categorical_col,int_col,float_col,bool_col,datetime_col,datetime_col2,label,target
0,1,How can I stay ahead in the rapidly evolving A...,A,1,1.1,True,2023-01-01,,0,10.5
1,2,What does it mean for me to be an impactful an...,B,2,2.2,False,,,1,20.5
2,3,Where do I have critical gaps in business/prod...,A,3,3.3,True,2023-01-03,2013-01-03,0,30.5


In [6]:
# Test analyze_dataset: run the handler's dataset analysis on the frame read
# back from CSV. The call is the cell's last expression; its return value is
# not stored.
handler.analyze_dataset(df)


=== Dataset Overview ===
Number of samples: 3
Number of features: 10
Memory usage: 0.00 MB

=== Feature Types Summary ===
object: 4 columns
int64: 3 columns
float64: 2 columns
bool: 1 columns

=== Detailed Feature Analysis ===

Column: id
Type: int64
Unique values: 3
Missing values: 0 (0.00%)
Min: 1
Max: 3
Mean: 2.00
Memory usage: 0.00 MB

Column: text_col
Type: object
Unique values: 3
Missing values: 0 (0.00%)
Sample values: ['What does it mean for me to be an impactful and scalable engineer beyond just writing high-performance models?', 'How can I stay ahead in the rapidly evolving AI landscape, especially with foundation models and AutoML?', 'Where do I have critical gaps in business/product thinking that impact my ability to drive real-world AI adoption?']
Memory usage: 0.00 MB

Column: categorical_col
Type: object
Unique values: 2
Missing values: 0 (0.00%)
Sample values: ['A', 'B']
Memory usage: 0.00 MB

Column: int_col
Type: int64
Unique values: 3
Missing values: 0 (0.00%)
Min: 

In [7]:
# Display the current contents of df (three rows) before the column-dropping step.
df

Unnamed: 0,id,text_col,categorical_col,int_col,float_col,bool_col,datetime_col,datetime_col2,label,target
0,1,How can I stay ahead in the rapidly evolving A...,A,1,1.1,True,2023-01-01,,0,10.5
1,2,What does it mean for me to be an impactful an...,B,2,2.2,False,,,1,20.5
2,3,Where do I have critical gaps in business/prod...,A,3,3.3,True,2023-01-03,2013-01-03,0,30.5


In [8]:
# Test drop_low_quality_columns with missing_threshold=0.5.
# The call's result is unpacked into two values: the first is rebound to df,
# the second (dropped_cols) is printed.
# NOTE(review): df is overwritten here, so re-running this cell feeds the
# already-filtered frame back in instead of the frame read from CSV.
df, dropped_cols = handler.drop_low_quality_columns(df, missing_threshold=0.5)
print(dropped_cols)

Dropping 1 low quality columns:

Columns with > 50% missing values:
- datetime_col2: 66.7% missing

Columns with only one unique value:
- datetime_col2: value = nan
['datetime_col2']


In [9]:
# Test clean_data, passing ['id'] as drop_cols and 'label' as label.
# NOTE(review): df is rebound to the result, so re-running this cell passes the
# cleaned frame back in; how clean_data reacts when 'id' is already absent is
# not visible from this notebook — confirm before relying on re-runs.
df = handler.clean_data(df, drop_cols=['id'], label='label')
df.head()

Unnamed: 0,text_col,categorical_col,int_col,float_col,bool_col,datetime_col,label,target
0,How can I stay ahead in the rapidly evolving A...,A,1,1.1,True,2023-01-01,0,10.5
1,What does it mean for me to be an impactful an...,B,2,2.2,False,,1,20.5
2,Where do I have critical gaps in business/prod...,A,3,3.3,True,2023-01-03,0,30.5


In [10]:
# Test create_metadata: request metadata for the cleaned frame with
# output_type='classes' and 'label' as the only output label.
# presumably this also writes a metadata file under output_dir ('tests'),
# since a later cell loads tests/metadata.json — TODO confirm in DataHandler.
metadata = handler.create_metadata(df, output_dir='tests', output_type='classes', output_label=['label'])
metadata

{'output_type': 'classes',
 'input_features': ['text_col',
  'categorical_col',
  'int_col',
  'float_col',
  'bool_col',
  'datetime_col',
  'target'],
 'output_label': ['label'],
 'input_text': ['text_col'],
 'input_float': ['float_col', 'target'],
 'input_int': ['int_col'],
 'input_categorical': ['categorical_col'],
 'input_datetime': ['datetime_col'],
 'input_bool': ['bool_col']}

In [11]:
def is_categorical_column(series, threshold=0.05):
    """Heuristically decide whether a column should be treated as categorical.

    Decision rules:
      * ``category`` dtype: always categorical.
      * ``object`` or pandas ``string`` dtype: categorical when the column has
        few distinct, short values:
          - fewer than 100 rows: at most 5 distinct values and a mean value
            length below 20 characters;
          - 100 rows or more: fewer than 50 distinct values, a unique/total
            ratio below ``threshold`` and a mean value length below 20.
      * any other dtype, or an empty column: not categorical.

    Args:
        series: pandas Series to check.
        threshold: maximum ratio of unique values to total values for a
            column of 100+ rows to be considered categorical.

    Returns:
        bool: True if the column looks categorical, False otherwise.
    """
    if isinstance(series.dtype, pd.CategoricalDtype):
        return True

    # Text may be stored as plain object dtype or as a dedicated string dtype.
    is_text_like = (pd.api.types.is_object_dtype(series.dtype)
                    or isinstance(series.dtype, pd.StringDtype))
    n_total = len(series)
    # An empty column carries no evidence (and would divide by zero below).
    if not is_text_like or n_total == 0:
        return False

    n_unique = series.nunique()
    if n_total < 100:
        # Small datasets: a ratio is meaningless, so look at the absolute
        # number of distinct values instead.
        if n_unique > 5:
            return False
    else:
        # Larger datasets: require both a low distinct count and a low ratio.
        if n_unique >= 50 or n_unique / n_total >= threshold:
            return False

    # Measure lengths on the text form of the non-missing values so object
    # columns holding non-string values do not break the .str accessor.
    avg_length = series.dropna().astype(str).str.len().mean()
    # A NaN mean (all values missing) compares False, i.e. not categorical.
    return bool(avg_length < 20)

In [12]:
is_categorical_column(df['text_col'])

False

In [13]:
is_categorical_column(df['categorical_col'])

True

In [14]:
# Test load_metadata: read the metadata back from tests/metadata.json.
metadata = handler.load_metadata(metadata_path='tests/metadata.json')

# Test fill_missing_values using the metadata loaded above.
# No bare `metadata` expression in between: only the last expression of a cell
# is displayed, so such a line would have no effect.
# NOTE(review): df is rebound to the result, so re-running this cell operates
# on the already-filled frame.
df = handler.fill_missing_values(df, metadata)
df.head()


Unnamed: 0,text_col,categorical_col,int_col,float_col,bool_col,datetime_col,label,target
0,How can I stay ahead in the rapidly evolving A...,A,1,1.1,True,2023-01-01 00:00:00.000000000,0,10.5
1,What does it mean for me to be an impactful an...,B,2,2.2,False,1677-09-21 00:12:43.145224193,1,20.5
2,Where do I have critical gaps in business/prod...,A,3,3.3,True,2023-01-03 00:00:00.000000000,0,30.5
