# Creating testing data

In [65]:
import pandas as pd
import numpy as np

import jupyter_black

jupyter_black.load()

np.random.seed(42)  # We set the seed to 42 so that we can reproduce the results.

In [75]:
N_0_1 = pd.DataFrame(
    np.random.normal(0, 1, size=(1000, 1)), columns=["A"]
)  # Creating contineuos data, column A is normally distributed.

N_0_50 = pd.DataFrame(
    np.random.exponential(5, size=(1000, 1)), columns=["B"]
)  # Column B is exponentially distributed.

random_float = pd.DataFrame(
    np.random.rand(1000, 1) * 10, columns=["C"]
)  # Column C is random data.

discrete = pd.DataFrame(
    np.random.randint(0, 50, size=(1000, 1)), columns=["D"]
)  # Column D is random but discrete data.

df = pd.concat((N_0_1, N_0_50, random_float, discrete), axis=1)
df.index.name = "ID"  # We set the index name to ID because it is a general name for the index of a dataset.
for c in range(len(df.columns)):
    df.iloc[
        [x for x in range(1, 999) if ((x % np.random.randint(1, 50) == 0))], c
    ] = np.nan
df.head()

Unnamed: 0_level_0,A,B,C,D
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,-0.082903,1.059678,5.731641,20.0
1,-0.729317,2.658149,5.19293,49.0
2,0.950576,2.633232,2.170884,49.0
3,-0.567931,5.005153,5.291587,26.0
4,-0.903401,0.547059,8.971045,35.0


In [76]:
## Creating a multi-indexed dataframe
second_idx = pd.Series([x for x in range(1, 1001)])
df_multi_idx = df.copy()
df_multi_idx["2nd ID"] = second_idx
df_multi_idx.set_index(["2nd ID"], append=True, inplace=True)
df_multi_idx.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C,D
ID,2nd ID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,1,-0.082903,1.059678,5.731641,20.0
1,2,-0.729317,2.658149,5.19293,49.0
2,3,0.950576,2.633232,2.170884,49.0
3,4,-0.567931,5.005153,5.291587,26.0
4,5,-0.903401,0.547059,8.971045,35.0


In [77]:
# # Creating the csv and xlsx files
# df.to_csv("imputation_example.csv")
# df.to_excel("imputation_example.xlsx")

# df_multi_idx.to_csv("imputation_example_multi_idx.csv")
# df_multi_idx.to_excel("imputation_example_multi_idx.xlsx")

In [79]:
def upload_data(path, index_rows=[0]):
    if ".csv" in path:
        return pd.read_csv(path, index_col=index_rows)
    elif ".xlsx" in path:
        return pd.read_excel(path, index_col=index_rows)