# Creating testing data

In [1]:
import pandas as pd
import numpy as np

import jupyter_black

jupyter_black.load()

In [2]:
"""Creating a test dataset with a lot of mixed variable types and wierd column names to help build our functions."""

norm_0_1 = pd.DataFrame(
    np.random.normal(0, 1, size=(1000, 1)), columns=["A"]
)  # Creating contineuos data, column A is normally distributed.

norm_10_20 = pd.DataFrame(
    np.random.normal(0, 1, size=(1000, 1)), columns=[1]
)  # Creating contineuos data, column 1 is normally distributed.

expo_5 = pd.DataFrame(
    np.random.exponential(5, size=(1000, 1)), columns=["B"]
)  # Column B is exponentially distributed.

random_float = pd.DataFrame(
    np.random.rand(1000, 1) * 10, columns=["C"]
)  # Column C is random data.

discrete = pd.DataFrame(
    np.random.randint(0, 50, size=(1000, 1)), columns=["D"]
)  # Column D is random but discrete data.

categorical = pd.DataFrame(
    np.random.choice(["a", "b", "c", "d"], size=(1000, 1)), columns=["E"]
)  # Column E is categorical data.

categorical_2 = pd.DataFrame(
    np.random.choice(["v", "y", "z", "w"], size=(1000, 1)), columns=[2]
)  # Column 2 is categorical data.

df = pd.concat(
    (norm_0_1, norm_10_20, expo_5, random_float, discrete, categorical, categorical_2),
    axis=1,
)
df.index.name = "ID"  # We set the index name to ID because it is a general name for the index of a dataset.

np.random.seed(42)  # We set the seed to 42 so that we can reproduce the results.
for c in range(len(df.columns)):
    df.iloc[
        [x for x in range(1, 999) if ((x % np.random.randint(1, 100) == 0))], c
    ] = np.nan
df.head()

Unnamed: 0_level_0,A,1,B,C,D,E,2
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,1.239968,0.248315,11.128861,4.184911,14.0,b,w
1,0.636761,-0.218202,18.210613,4.597633,1.0,d,w
2,0.847832,1.589175,,7.955868,11.0,c,y
3,1.669577,-1.004183,7.909009,1.35309,27.0,b,w
4,-0.875562,0.524021,1.477171,,0.0,b,w


In [3]:
## Creating a multi-indexed dataframe
second_idx = pd.Series([x for x in range(1, 1001)])
df_multi_idx = df.copy()
df_multi_idx["2nd ID"] = second_idx
df_multi_idx.set_index(["2nd ID"], append=True, inplace=True)
df_multi_idx.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,A,1,B,C,D,E,2
ID,2nd ID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,1,1.239968,0.248315,11.128861,4.184911,14.0,b,w
1,2,0.636761,-0.218202,18.210613,4.597633,1.0,d,w
2,3,0.847832,1.589175,,7.955868,11.0,c,y
3,4,1.669577,-1.004183,7.909009,1.35309,27.0,b,w
4,5,-0.875562,0.524021,1.477171,,0.0,b,w


In [4]:
# Creating the csv and xlsx files
df.to_csv("imputation_example.csv")
df.to_excel("imputation_example.xlsx")

df_multi_idx.to_csv("imputation_example_multi_idx.csv")
df_multi_idx.to_excel("imputation_example_multi_idx.xlsx")

# Creating and upload function
### Stuff we need to keep as variables - 
1. continuous variables
2. discrete variables
3. categorical variables

In [5]:
def feature_type_extraction(
    continuous=[], discrete=[], categorical=[], index_columns=[0]
):
    """Given a list of column names for each feature type, this function returns a dictionary with the feature type as the key and the list of column names as the value."""

    feature_type_dict = {
        "continuous continuous": continuous,
        "discrete columns": discrete,
        "categorical columns": categorical,
        "index column": index_columns,
    }

    return feature_type_dict

In [6]:
feature_type_dict = feature_type_extraction(
    continuous=["A", 1, "B"],
    discrete=["D"],
    categorical=["E", 2],
    index_columns=[0],
)
feature_type_dict

{'continuous continuous': ['A', 1, 'B'],
 'discrete columns': ['D'],
 'categorical columns': ['E', 2],
 'index column': [0]}

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A       945 non-null    float64
 1   1       937 non-null    float64
 2   B       945 non-null    float64
 3   C       950 non-null    float64
 4   D       959 non-null    float64
 5   E       952 non-null    object 
 6   2       952 non-null    object 
dtypes: float64(5), object(2)
memory usage: 54.8+ KB


In [8]:
df

Unnamed: 0_level_0,A,1,B,C,D,E,2
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,1.239968,0.248315,11.128861,4.184911,14.0,b,w
1,0.636761,-0.218202,18.210613,4.597633,1.0,d,w
2,0.847832,1.589175,,7.955868,11.0,c,y
3,1.669577,-1.004183,7.909009,1.353090,27.0,b,w
4,-0.875562,0.524021,1.477171,,0.0,b,w
...,...,...,...,...,...,...,...
995,0.773697,0.480230,2.611412,7.607091,41.0,d,z
996,-1.690989,-0.455491,2.755111,6.111638,17.0,b,w
997,0.165602,0.088020,5.812832,6.868825,25.0,a,w
998,-0.212716,-0.873440,2.399876,2.884358,36.0,a,w
