In [3]:
import pandas as pd
import numpy as np

# Simulate a sample DataFrame
df = pd.DataFrame({
    "age": [25, 30, 22, 40, 35, 50, 28, 33, 27, 45],
    "income": [3000, 4500, 2800, 5200, 4100, 6000, 3900, 4800, 3100, 5500],
    "score": [0.8, 0.6, 0.75, 0.9, 0.65, 0.92, 0.7, 0.85, 0.77, 0.88]
})

# Define info dictionary for generate()
info = {
    "rate": 0.2,  # 20% missingness
    "type": 1     # Use type_two generator
}

def type_two(data, missing_rate=0.1, seed=1):
    verify_missing_rate(missing_rate)
    data = data.astype(float)
    total_elements = data.size
    missing_elements = int(total_elements * missing_rate)

    data_with_missing = data.copy()
    np.random.seed(seed)
    mask_indices = np.random.choice(total_elements, missing_elements, replace=False)
    multi_indices = np.unravel_index(mask_indices, data.shape)
    data_with_missing[multi_indices] = np.nan

    return data_with_missing


# Run the test using generate
data_with_missing = type_two(df, missing_rate=0.1)



TypeError: unhashable type: 'numpy.ndarray'

In [1]:
def verify_missing_rate(rate, var_name="missing_rate"):
    """
    Verify that the missing rate is between 0 and 1 (inclusive).

    Parameters
    ----------
    rate : float
        The missing rate to check.
    var_name : str, optional
        Variable name to show in the error message.

    Raises
    ------
    ValueError
        If the rate is not in the range [0, 1].
    """
    if not isinstance(rate, (float, int)):
        raise TypeError(f"{var_name} must be a float or int.")
    if not (0 <= rate <= 1):
        raise ValueError(f"{var_name} must be between 0 and 1 (got {rate}).")


In [1]:
import pandas as pd
import numpy as np
from missmecha.analysis import report
df = pd.DataFrame({
    "age": [25, 30, np.nan, 40],
    "income": [3000, np.nan, 2800, 5200],
    "gender": ["M", "F", np.nan, "F"]
})

result = report.missing_rate(df)




Overall missing rate: 25.00% (3 / 12 values are missed)


In [2]:
result['report']  # per-column summary


Unnamed: 0_level_0,n_missing,missing_rate (%),n_unique,dtype,n_total
column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
age,1,25.0,3,float64,4
income,1,25.0,3,float64,4
gender,1,25.0,2,object,4


In [3]:
result['overall_missing_rate']  # overall %

25.0

In [None]:
from missmecha.analysis import visual
import numpy as np
import pandas as pd

collisions = pd.read_csv("https://raw.githubusercontent.com/ResidentMario/missingno-data/master/nyc_collision_factors.csv")
data = collisions.sample(250)
df = pd.read_csv("_data.txt", delimiter=",", header=None)

# Simulated larger dataset with some missing values
df = pd.DataFrame({
    "DATE": ["09/10/2016", "03/31/2016", "03/16/2016", "04/01/2016", np.nan, "04/03/2016"],
    "TIME": ["12:09:00", "22:10:00", "14:58:00", np.nan, "08:30:00", "19:00:00"],
    "BOROUGH": ["QUEENS", "BROOKLYN", "MANHATTAN", "QUEENS", "BRONX", np.nan],
    "ZIP CODE": ["11427", "11223", "10001", "11434", np.nan, "10010"],
    "LATITUDE": [40.724692, 40.598761, 40.712776, np.nan, 40.850000, 40.755000],
    "LONGITUDE": [-73.874245, -73.987843, -74.006058, -73.900000, -73.880000, np.nan],
    "VEHICLE TYPE": ["BICYCLE", "PASSENGER VEHICLE", "TAXI", "SUV", np.nan, "BICYCLE"]
})

def type_convert(df):
    for col in df.columns:
        if df[col].dtype == 'object':
            try:
                df[col] = pd.to_datetime(df[col])
            except ValueError:
                 df[col] = df[col].astype('category').cat.codes

        else:
            df[col].to_numpy(dtype=float) 

    return df
# Convert using your helper
converted_array = type_convert(df)

null_pattern =(np.random.random(10000).reshape((500, 20))> 0.5).astype(bool)
null_pattern = pd.DataFrame(null_pattern).replace({False: None})
ts = null_pattern.set_index(pd.period_range('1/1/2011', '5/14/2012', freq='D'))
# visual.matrix(ts,cmap="GnBu",ts=True)
# visual.matrix(df,cmap="GnBu")
# visual.matrix(data,cmap="GnBu",color=False)
# visual.heatmap(ts)
# visual.heatmap(df)
# visual.heatmap(data)

In [None]:
import pandas as pd
import numpy as np
from missmecha.evaluation import imp_eval

# Step 1: Create complete test data
np.random.seed(0)
df_true = pd.DataFrame({
    "A": np.random.normal(loc=10, scale=2, size=100),
    "B": np.random.normal(loc=0, scale=1, size=100),
    "C": np.random.normal(loc=50, scale=10, size=100)
})

# Step 2: Inject missing values (20%)
mask = np.random.rand(*df_true.shape) < 0.2
df_incomplete = df_true.mask(mask)

# Step 3: Simple imputation with column means
df_filled = df_incomplete.fillna(df_incomplete.mean().round())

results = imp_eval(df_true, df_filled, df_incomplete, method="rmse")
print("Per-column RMSE:", results["column_scores"])
print("Overall RMSE:", results["overall_score"])

status = {
    "A": "num",
    "B": "cat",
    "C": "disc"
}

# Even though method="mae", it's only used for numeric columns
# categorical columns will be evaluated by accuracy
scores = imp_eval(df_true, df_filled, df_incomplete, method="mae", status=status)
print("Per-column RMSE:", scores["column_scores"])
print("Overall RMSE:", scores["overall_score"])


Per-column RMSE: {'A': 1.4817116029992137, 'B': 0.9307352818998821, 'C': 8.79733248775861}
Overall RMSE: 3.736593124219235


{'A': 1.133288651991824}
0.0
{'A': 1.133288651991824, 'B': 0.0}
0.0
{'A': 1.133288651991824, 'B': 0.0, 'C': 0.0}
{'A': 1.133288651991824, 'B': 0.0, 'C': 0.0}
Per-column RMSE: {'A': 1.133288651991824, 'B': 0.0, 'C': 0.0}
Overall RMSE: 0.37776288399727465
