In [22]:
import pandas as pd 
import geopandas as gpd

import sys 
sys.path.append('../')
from src import diffusion 
from src.spatial_graph import spatial_neighbor_graph
# using the env : xarr 

random_seed = 42 

from sklearn import metrics
import numpy as np


def load_pc_shp(pcs_to_load):
    ll = []
    for pc in pcs_to_load:
        if len(pc)==1:
            path = f'/Volumes/T9/2024_Data_downloads/codepoint_polygons_edina/Download_all_postcodes_2378998/codepoint-poly_5267291/one_letter_pc_code/{pc}/{pc}.shp'
        else:
            path = f'/Volumes/T9/2024_Data_downloads/codepoint_polygons_edina/Download_all_postcodes_2378998/codepoint-poly_5267291/two_letter_pc_code/{pc}.shp'
        sd = gpd.read_file(path)    
        ll.append(sd) 
    pc_shp = pd.concat(ll)
    return pc_shp 


def calculate_mape(y_true, y_pred):
    # Remove pairs where y_true is zero
    mask = y_true != 0
    if not mask.any():
        return np.nan
    return np.mean(np.abs((y_true[mask] - y_pred[mask]) / y_true[mask])) * 100

def calculate_metrics(y_true, y_pred):
    # Input validation
    if y_true.shape != y_pred.shape:
        raise ValueError("Arrays must have the same shape")
    if not (np.isfinite(y_true).all() and np.isfinite(y_pred).all()):
        raise ValueError("Arrays contain NaN or infinite values")
    
    # Calculate metrics
    rmse = np.sqrt(metrics.mean_squared_error(y_true, y_pred))
    mae = metrics.mean_absolute_error(y_true, y_pred)
    r2 = metrics.r2_score(y_true, y_pred)
    
    # MAPE with zero handling
    mape = calculate_mape(y_true, y_pred)
    
    # Format and print results
    print('RMSE: {:,.1f}'.format(rmse),
          'MAE: {:,.1f}'.format(mae),
          'MAPE: {:.1f}'.format(mape),
          'R2: {:.2f}'.format(r2))
    
    return rmse, mae, mape, r2



In [None]:

import os 
import scipy.sparse as sp
import os 
import scipy.sparse as sp

def load_graph(dataset_name, distance_metric, nrows, k=10):
    return sp.load_npz(f"graphs//{dataset_name}_dm{distance_metric}_nr{nrows}_k{k}_graph.npz" ) 

def graph_exists(dataset_name, distance_metric, nrows, k=10):
    return os.path.isfile(f"graphs//{dataset_name}_dm{distance_metric}_nr{nrows}_k{k}_graph.npz" ) 


def get_graph(dataset_name, geo_df, distance, k, nrows, graph_fn):
    os.makedirs('graphs', exist_ok=True)

    # check if graph exists 
    if not graph_exists(dataset_name, distance, nrows, k):
        print('Creating graph')
        graph = spatial_neighbor_graph(geo_df, k=5, distance_metric=distance)
        sp.save_npz(f"graphs/{dataset_name}_dm{distance}_nr{nrows}_k{k}_graph.npz", graph)
        return graph 
    else:
        
        adj = load_graph(dataset_name, distance,nrows,  k)
        return adj


In [23]:
# take first 1000 rows  
def load_geodf(nrows):
    df = pd.read_csv('/Users/gracecolverd/NebulaDataset/final_dataset/NEBULA_englandwales_domestic_filtered.csv', nrows=nrows)
    pcs_load = df.postcode.str[0:2].unique().tolist()
    pc_shp = load_pc_shp(pcs_load)

    geo_df = pc_shp.merge(df, left_on='POSTCODE', right_on='postcode', how='inner')
    geo_df = geo_df.to_crs('EPSG:4326')
    geo_df['latitude'] = geo_df.geometry.centroid.y 
    geo_df['longitude'] = geo_df.geometry.centroid.x
    return geo_df

def create_geo_df(df):
    pcs_load = df.postcode.str[0:2].unique().tolist()
    pc_shp = load_pc_shp(pcs_load)

    geo_df = pc_shp.merge(df, left_on='POSTCODE', right_on='postcode', how='inner')
    geo_df = geo_df.to_crs('EPSG:4326')
    geo_df['latitude'] = geo_df.geometry.centroid.y 
    geo_df['longitude'] = geo_df.geometry.centroid.x
    return geo_df


In [24]:
import pandas as pd 

df = pd.read_csv('/Users/gracecolverd/NebulaDataset/final_dataset/NEBULA_englandwales_domestic_filtered.csv')

  df = pd.read_csv('/Users/gracecolverd/NebulaDataset/final_dataset/NEBULA_englandwales_domestic_filtered.csv')


In [132]:
ld_cd

'E06000060'

In [25]:
ld_cd = df.ladcd.unique().tolist()[2]

test_df = df[df['ladcd']==ld_cd].copy() 
test_df.shape

(5951, 258)

In [130]:


def run_graph_prop(target_col, geo_df, percent_missing, adj, distance, random_seed=42):
    # set the random seed 
    np.random.seed(random_seed)
    og_data =  geo_df[target_col].values 
    incomplete_postcode_data= og_data.copy()    
    # generate maks using percent_missing 
    missing_mask = np.random.choice([0, 1], size=og_data.shape[0], p=[percent_missing, 1-percent_missing])
    # validate that mask is correct number 
    if not (1 - missing_mask.sum() / missing_mask.shape[0]) -0.1 < 0.1:
        print('Error with mask ')
    
    incomplete_postcode_data[missing_mask==0] = np.nan  

    if incomplete_postcode_data.ndim != 1:
        print('error expecting diff dimesnions')
    

    completed_pc_df = diffusion.graph_prop(adj, incomplete_postcode_data, missing_mask )

    # check if all above 0  for completed_pc_df 
    if not (completed_pc_df > 0).all():
        print('error in completed_pc_df')



    missing_data_subset = og_data[missing_mask==0]
    filled_subset = completed_pc_df[missing_mask==0]


    y_true = missing_data_subset
    y_pred = filled_subset 
    print(f'The errors for filling target col: {target_col} with spatial graph on {distance} distance are:')
    rmse, mae, mape, r2 = calculate_metrics(y_true, y_pred)
    return rmse, mae, mape, r2




In [27]:
# retool to take the geo df load it all, create lat lons and and then subset  

In [29]:
# 1k rows, takes 11s 
# 10k rows takes 46s 
# nrows=10000
# geo_df = load_geodf(nrows=nrows)
geo_df = create_geo_df(test_df)


  geo_df['latitude'] = geo_df.geometry.centroid.y

  geo_df['longitude'] = geo_df.geometry.centroid.x


In [30]:
dataset_name = 'NEBULA_englandwales_domestic_filtered'
target_col  = 'total_gas'
percent_missing = 0.1
k=5 


In [32]:
# print(f'nrows: {nrows}')
print(f'using test df for ladc " {ld_cd}')
nrows = ld_cd
for distance in ['haversine', 'euclidean']:
    print(f'DISTANCE: {distance}')
    for target_col in ['total_gas', 'avg_gas', 'total_elec', 'HDD_winter', 'CDD_summer',]:
        adj = get_graph(dataset_name, geo_df, distance, k, nrows)
        run_graph_prop(target_col, geo_df, percent_missing, adj, distance)

using test df for ladc " E06000060
DISTANCE: haversine
Creating graph
The errors for filling target col: total_gas with spatial graph on haversine distance are:
RMSE: 161,645.4 MAE: 124,577.4 MAPE: 85.1 R2: 0.05
The errors for filling target col: avg_gas with spatial graph on haversine distance are:
RMSE: 4,476.7 MAE: 3,221.4 MAPE: 26.7 R2: 0.50
The errors for filling target col: total_elec with spatial graph on haversine distance are:
RMSE: 41,847.6 MAE: 32,473.0 MAPE: 83.4 R2: 0.04
The errors for filling target col: HDD_winter with spatial graph on haversine distance are:
RMSE: 0.3 MAE: 0.1 MAPE: 0.2 R2: 0.97
The errors for filling target col: CDD_summer with spatial graph on haversine distance are:
RMSE: 0.1 MAE: 0.0 MAPE: 1.7 R2: 0.97
DISTANCE: euclidean
Creating graph
The errors for filling target col: total_gas with spatial graph on euclidean distance are:
RMSE: 151,232.7 MAE: 115,167.6 MAPE: 75.3 R2: 0.05
The errors for filling target col: avg_gas with spatial graph on euclidean

# try using feature distance 

In [52]:
from src.feature_graph import haversine_distance, spatial_feature_neighbor_graph, calculate_edge_weights

from src. get_graph import get_graph

Generating new graph with spatial_feature_neighbor_graph


In [57]:
geo_df.columns.tolist()

['POSTCODE',
 'UPP',
 'PC_AREA',
 'geometry',
 'postcode',
 'all_types_total_buildings',
 'all_types_premise_area_total',
 'all_types_total_fl_area_H_total',
 'all_types_total_fl_area_FC_total',
 'all_types_uprn_count_total',
 'all_types_premise_area_null_count',
 'all_types_total_fl_area_H_null_count',
 'all_types_total_fl_area_FC_null_count',
 'mixed_alltypes_count',
 'comm_alltypes_count',
 'unknown_alltypes_count',
 'all_residential_types_count',
 'clean_res_total_buildings',
 'clean_res_premise_area_total',
 'clean_res_total_fl_area_H_total',
 'clean_res_total_fl_area_FC_total',
 'clean_res_base_floor_total',
 'clean_res_basement_heated_vol_total',
 'clean_res_listed_bool_total',
 'clean_res_uprn_count_total',
 'clean_res_premise_area_null_count',
 'clean_res_total_fl_area_H_null_count',
 'clean_res_total_fl_area_FC_null_count',
 'outb_res_total_buildings',
 'outb_res_premise_area_total',
 'outb_res_total_fl_area_H_total',
 'outb_res_total_fl_area_FC_total',
 'outb_res_uprn_count_

In [60]:


feature_params = {
    'k': 5,
    'distance_metric': 'haversine',
    'feature_cols': ['avg_gas', 'HDD_winter', 'CDD_summer'],
    'spatial_weight': 0.7,

}

feature_graph = get_graph(
    dataset_name='NEBULA_englandwales_domestic_filtered',
    geo_df=geo_df,
    graph_fn=spatial_feature_neighbor_graph,
    graph_params=feature_params
)
target_col = 'total_gas'
percent_missing= 0.1 
distance = feature_params['distance_metric']
print('inputs are {}'.format(feature_params['feature_cols']))
run_graph_prop(target_col, geo_df, percent_missing, feature_graph, distance)

Loading cached graph: graphs/07654e7995.npz
inputs are ['avg_gas', 'HDD_winter', 'CDD_summer']
The errors for filling target col: total_gas with spatial graph on haversine distance are:
RMSE: 145,566.0 MAE: 112,975.8 MAPE: 68.9 R2: 0.06


In [78]:


feature_params = {
    'k': 5,
    'distance_metric': 'haversine',
    'feature_cols': ['avg_gas', 'all_types_total_buildings', 'HDD_winter', 'CDD_summer'],
    'spatial_weight': 0.7,   
}

feature_graph = get_graph(
    dataset_name=ld_cd,
    geo_df=geo_df,
    graph_fn=spatial_feature_neighbor_graph,
    graph_params=feature_params
)
target_col = 'total_gas'
percent_missing= 0.1 
distance = feature_params['distance_metric']
print('inputs are {}'.format(feature_params['feature_cols']))
run_graph_prop(target_col, geo_df, percent_missing, feature_graph, distance)

Generating new graph with spatial_feature_neighbor_graph
inputs are ['avg_gas', 'all_types_total_buildings', 'HDD_winter', 'CDD_summer']
The errors for filling target col: total_gas with spatial graph on haversine distance are:
RMSE: 77,620.6 MAE: 50,798.0 MAPE: 29.2 R2: 0.75


In [61]:
from src.column_settings import settings_dict

In [None]:

settings_dict[52][1] ,  

fi = ['all_res_total_fl_area_H_total',
 'Pre 1919_pct',
 'Standard size detached_pct',
 'postcode_area',
 'HDD_winter',
 'economic_activity_perc_Economically active (excluding full-time students): In employment: Employee: Full-time',
 'ethnic_group_perc_White: English, Welsh, Scottish, Northern Irish or British',
 'socio_class_perc_L1, L2 and L3: Higher managerial, administrative and professional occupations',
 'household_comp_perc_One-person household',
 'Domestic outbuilding_pct',
 '3-4 storey and smaller flats_pct',
]

In [71]:
geo_df['Pre 1919_pct'] =  geo_df['Pre 1919_pct'].fillna(0)
geo_df['Standard size detached_pct'] =  geo_df['Standard size detached_pct'].fillna(0)
geo_df['Domestic outbuilding_pct'] =  geo_df['Domestic outbuilding_pct'].fillna(0)
geo_df['3-4 storey and smaller flats_pct'] =  geo_df['3-4 storey and smaller flats_pct'].fillna(0)


In [None]:
feature_params = {
    'k': 5,
    'distance_metric': 'haversine',
    'feature_cols': fi, 
    'spatial_weight': 0.7,
}

feature_graph = get_graph(
    dataset_name=ld_cd,
    geo_df=geo_df,
    graph_fn=spatial_feature_neighbor_graph,
    graph_params=feature_params
)
target_col = 'total_gas'
percent_missing= 0.1 
distance = feature_params['distance_metric']
print('inputs are FI setting 52 ')
print(fi[0:5])
print(fi[5:])
run_graph_prop(target_col, geo_df, percent_missing, feature_graph, distance)

Generating new graph with spatial_feature_neighbor_graph
inputs are FI setting 52 
['all_res_total_fl_area_H_total', 'Pre 1919_pct', 'Standard size detached_pct', 'postcode_area', 'HDD_winter']
['economic_activity_perc_Economically active (excluding full-time students): In employment: Employee: Full-time', 'ethnic_group_perc_White: English, Welsh, Scottish, Northern Irish or British', 'socio_class_perc_L1, L2 and L3: Higher managerial, administrative and professional occupations', 'household_comp_perc_One-person household', 'Domestic outbuilding_pct', '3-4 storey and smaller flats_pct']
The errors for filling target col: total_gas with spatial graph on haversine distance are:
RMSE: 81,168.0 MAE: 61,720.2 MAPE: 35.4 R2: 0.75


In [105]:
# try all numeric cols 

numeric_cols = [col for col in settings_dict[18][1] if geo_df[col].dtype.kind in 'fc']

numeric_data = geo_df[numeric_cols+['latitude', 'longitude', 'total_gas']].copy()

# Get columns with any NaN values
cols_with_nans = [col for col in test_df[numeric_cols].columns if test_df[col].isna().any()]

# Print them along with their NaN count
for col in cols_with_nans:
    nan_count = test_df[col].isna().sum()
    print(f"{col}: {nan_count} NaN values")

all_types_total_fl_area_FC_total: 1 NaN values
2 storeys terraces with t rear extension_pct: 5886 NaN values
3-4 storey and smaller flats_pct: 5192 NaN values
Domestic outbuilding_pct: 3261 NaN values
Large detached_pct: 5370 NaN values
Large semi detached_pct: 5889 NaN values
Linked and step linked premises_pct: 5470 NaN values
Medium height flats 5-6 storeys_pct: 5947 NaN values
Planned balanced mixed estates_pct: 5847 NaN values
Semi type house in multiples_pct: 5088 NaN values
Small low terraces_pct: 4402 NaN values
Standard size detached_pct: 2342 NaN values
Standard size semi detached_pct: 2747 NaN values
Tall flats 6-15 storeys_pct: 5950 NaN values
Tall terraces 3-4 storeys_pct: 5871 NaN values
Very large detached_pct: 5946 NaN values
Very tall point block flats_pct: 5951 NaN values
1919-1944_pct: 4530 NaN values
1945-1959_pct: 3706 NaN values
1960-1979_pct: 2800 NaN values
1980-1989_pct: 4535 NaN values
1990-1999_pct: 4942 NaN values
Post 1999_pct: 4462 NaN values
Pre 1919_pct:

In [106]:
pct_cols = [
    '2 storeys terraces with t rear extension_pct',
    '3-4 storey and smaller flats_pct',
    'Domestic outbuilding_pct',
    'Large detached_pct',
    'Large semi detached_pct',
    'Linked and step linked premises_pct',
    'Medium height flats 5-6 storeys_pct',
    'Planned balanced mixed estates_pct',
    'Semi type house in multiples_pct',
    'Small low terraces_pct',
    'Standard size detached_pct',
    'Standard size semi detached_pct',
    'Tall flats 6-15 storeys_pct',
    'Tall terraces 3-4 storeys_pct',
    'Very large detached_pct',
    'Very tall point block flats_pct',
    '1919-1944_pct',
    '1945-1959_pct',
    '1960-1979_pct',
    '1980-1989_pct',
    '1990-1999_pct',
    'Post 1999_pct',
    'Pre 1919_pct'
]

numeric_data[pct_cols] = numeric_data[pct_cols].fillna(0)

In [107]:
numeric_data.dropna().shape, numeric_data.shape

((5950, 185), (5951, 185))

In [108]:


feature_params = {
    'k': 5,
    'distance_metric': 'haversine',
    'feature_cols': numeric_cols, 
    'spatial_weight': 0.7,
}

input_data = numeric_data.dropna()

feature_graph = get_graph(
    dataset_name=ld_cd,
    geo_df=input_data , 
    graph_fn=spatial_feature_neighbor_graph,
    graph_params=feature_params
)
target_col = 'total_gas'
percent_missing= 0.1 
distance = feature_params['distance_metric']
print('inputs are all numeric cols') 
run_graph_prop(target_col, input_data, percent_missing, feature_graph, distance)

Loading cached graph: graphs/e92aa35454.npz
inputs are all numeric cols
The errors for filling target col: total_gas with spatial graph on haversine distance are:
RMSE: 73,149.6 MAE: 55,356.0 MAPE: 33.8 R2: 0.78


# trying to tune avg gas and total building to get the total gas 

In [131]:

rs = 100

feature_params = {
    'k': 5,
    'distance_metric': 'haversine',
    'feature_cols': ['avg_gas', 'all_types_total_buildings'], 
    'spatial_weight': 0.7,
}

input_data = geo_df 

feature_graph = get_graph(
    dataset_name=ld_cd,
    geo_df=input_data , 
    graph_fn=spatial_feature_neighbor_graph,
    graph_params=feature_params
)
target_col = 'total_gas'
percent_missing= 0.1 
distance = feature_params['distance_metric']
print('inputs are all numeric cols') 
rmse, mae, mape, r2 = run_graph_prop(target_col, input_data, percent_missing, feature_graph, distance, random_seed= rs)

Loading cached graph: graphs/910a8e15f4.npz
inputs are all numeric cols
The errors for filling target col: total_gas with spatial graph on haversine distance are:
RMSE: 88,112.3 MAE: 59,060.7 MAPE: 34.4 R2: 0.69


In [None]:
# experiments 
# - need to run 10 times for CV across 10 random seeds 
# - need to test across ranges of missing data, startin at 10 increasing to 90 in 10 increments 
# - need to test havershine vs euclidean distance 
# - need to test spatial weights rangin from 0 to 1 in 0.1 increments 

# The method is quick to run so we can do a full feature spac sweep to test which parameters are important to tune for the large dataset 

In [113]:


feature_params = {
    'k': 5,
    'distance_metric': 'haversine',
    'feature_cols': ['avg_gas', 'all_types_total_buildings', 'HDD_winter', 'CDD_summer'], 
    'spatial_weight': 0.7,
}

input_data = geo_df 

feature_graph = get_graph(
    dataset_name=ld_cd,
    geo_df=input_data , 
    graph_fn=spatial_feature_neighbor_graph,
    graph_params=feature_params
)
target_col = 'total_gas'
percent_missing= 0.1 
distance = feature_params['distance_metric']
print('inputs are all numeric cols') 
run_graph_prop(target_col, input_data, percent_missing, feature_graph, distance)

Loading cached graph: graphs/42019733b6.npz
inputs are all numeric cols
The errors for filling target col: total_gas with spatial graph on haversine distance are:
RMSE: 78,882.0 MAE: 53,343.4 MAPE: 27.6 R2: 0.77


In [None]:

print(f'using test df for ladc " {ld_cd}')
nrows = ld_cd
for distance in ['haversine', 'euclidean']:
    print(f'DISTANCE: {distance}')
    for target_col in ['total_gas', 'avg_gas', 'total_elec', 'HDD_winter', 'CDD_summer',]:
        adj = get_graph(dataset_name, geo_df, distance, k, nrows)
        run_graph_prop(target_col, geo_df, percent_missing, adj, distance)

In [38]:
test_df.columns.tolist()

['postcode',
 'all_types_total_buildings',
 'all_types_premise_area_total',
 'all_types_total_fl_area_H_total',
 'all_types_total_fl_area_FC_total',
 'all_types_uprn_count_total',
 'all_types_premise_area_null_count',
 'all_types_total_fl_area_H_null_count',
 'all_types_total_fl_area_FC_null_count',
 'mixed_alltypes_count',
 'comm_alltypes_count',
 'unknown_alltypes_count',
 'all_residential_types_count',
 'clean_res_total_buildings',
 'clean_res_premise_area_total',
 'clean_res_total_fl_area_H_total',
 'clean_res_total_fl_area_FC_total',
 'clean_res_base_floor_total',
 'clean_res_basement_heated_vol_total',
 'clean_res_listed_bool_total',
 'clean_res_uprn_count_total',
 'clean_res_premise_area_null_count',
 'clean_res_total_fl_area_H_null_count',
 'clean_res_total_fl_area_FC_null_count',
 'outb_res_total_buildings',
 'outb_res_premise_area_total',
 'outb_res_total_fl_area_H_total',
 'outb_res_total_fl_area_FC_total',
 'outb_res_uprn_count_total',
 'outb_res_premise_area_null_count',
 

In [47]:
geo_df.columns.tolist()

['POSTCODE',
 'UPP',
 'PC_AREA',
 'geometry',
 'postcode',
 'all_types_total_buildings',
 'all_types_premise_area_total',
 'all_types_total_fl_area_H_total',
 'all_types_total_fl_area_FC_total',
 'all_types_uprn_count_total',
 'all_types_premise_area_null_count',
 'all_types_total_fl_area_H_null_count',
 'all_types_total_fl_area_FC_null_count',
 'mixed_alltypes_count',
 'comm_alltypes_count',
 'unknown_alltypes_count',
 'all_residential_types_count',
 'clean_res_total_buildings',
 'clean_res_premise_area_total',
 'clean_res_total_fl_area_H_total',
 'clean_res_total_fl_area_FC_total',
 'clean_res_base_floor_total',
 'clean_res_basement_heated_vol_total',
 'clean_res_listed_bool_total',
 'clean_res_uprn_count_total',
 'clean_res_premise_area_null_count',
 'clean_res_total_fl_area_H_null_count',
 'clean_res_total_fl_area_FC_null_count',
 'outb_res_total_buildings',
 'outb_res_premise_area_total',
 'outb_res_total_fl_area_H_total',
 'outb_res_total_fl_area_FC_total',
 'outb_res_uprn_count_

In [None]:

# If you want weighted edges
weighted_graph = calculate_edge_weights(
    geo_df,
    graph,
    feature_cols=feature_columns,
    spatial_weight=0.7
)

RMSE: 133,457.5 MAE: 106,755.2 MAPE: 76.5 R2: 0.10


In [42]:
print(graph.shape)  # Will output: (1000, 1000)
print(graph.nnz)  

(1000, 1000)
6190


In [33]:
len(components)

12

In [92]:
# import numpy as np
# import pandas as pd
# from typing import List

# def missing_col_only(df: pd.DataFrame, missing_col_name: str, list_complete_cols: List[str]):
#     """
#     Handle scenario where only one column has missing values
    
#     Args:
#         df: Input DataFrame
#         missing_col_name: Name of column with missing values
#         list_complete_cols: List of columns that are complete (no missing values)
#     """
#     # Load the complete columns data
#     complete_data = {col: df[col] for col in list_complete_cols}
    
#     # Get the partial column data
#     return compute_with_partial_data(
#         complete_data,
#         partial_column(df, missing_col_name)
#     )

# def partial_column(df: pd.DataFrame, col_name: str, missing_percent: float = 0.2, specific_indices: List[int] = None):
#     """
#     Create partial data for a column by marking some values as missing
    
#     Args:
#         df: Input DataFrame
#         col_name: Name of column to make partial
#         missing_percent: Percentage of data to mark as missing
#         specific_indices: Specific indices to mark as missing (if None, random selection)
#     """
#     column = df[col_name].copy()
    
#     # Get mask for missing values
#     if specific_indices is not None:
#         missing_mask = np.zeros(len(df), dtype=bool)
#         missing_mask[specific_indices] = True
#     else:
#         n_missing = int(len(df) * missing_percent)
#         missing_mask = np.zeros(len(df), dtype=bool)
#         missing_indices = np.random.choice(len(df), size=n_missing, replace=False)
#         missing_mask[missing_indices] = True
    
#     # Mark values as missing
#     column[missing_mask] = np.nan
#     return column

# def compute_with_partial_data(complete_data: dict, partial_column: pd.Series):
#     """
#     Compute final result using complete and partial data
    
#     Args:
#         complete_data: Dictionary of complete columns
#         partial_column: Series containing partial data with missing values
#     """
#     # Here you would implement your specific computation logic
#     # This is a placeholder for whatever computation you need to do
#     result = partial_column.copy()
#     return result



In [None]:
# Example usage:
if __name__ == "__main__":
    # Create sample DataFrame
    df = pd.DataFrame({
        'col1': [1, 2, 3, 4, 5],
        'col2': [2, 4, 6, 8, 10],
        'target': [1, 2, 3, 4, 5]
    })
    
    # Example: Make target column partial and compute using complete cols
    result = missing_col_only(
        df=df,
        missing_col_name='target',
        list_complete_cols=['col1', 'col2']
    )