# Synthetic data generator for anomaly detection technical challenge problem

### Objective

- Create a 4-column dataframe with three explanatory feature columns and a binary output/response variable column.
- Each column should have null values missing at random.
- Each explanatory feature column should consist of random values ranging from -500 to 200.
- The dataframe should have 10000 rows, so the resulting dataframe should be a 10000 x 4 matrix.
- Save the dataframe to a CSV called `raw_data.csv`  

### Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import make_classification

### Create a class `DataGenerator`

In [14]:
### Ask the user how many feature columns the synthetic dataset should have
n_features = int(
    input("Please enter number of features for the dataset \n")
)

Please enter number of features for the dataset 
 40


In [15]:
### Ask the user how many of those features should be informative
n_informative = int(
    input("Please enter number of informative features for the dataset \n")
)

Please enter number of informative features for the dataset 
 3


In [84]:
class DataGenerator:
    ''' Build a synthetic classification dataframe whose feature columns contain random nulls.

    The raw data comes from sklearn's ``make_classification``; afterwards each
    feature cell (never the response column ``y``) is independently replaced
    by NaN with probability ``null_prob``.
    '''

    def __init__(self, n_samples, n_features, n_informative, n_redundant, n_repeated, n_classes, 
                 n_clusters_per_class, flip_y, null_prob):
        ''' store the settings used to generate data with make_classification

        n_samples, n_features, n_informative, n_redundant, n_repeated,
        n_classes, n_clusters_per_class and flip_y are forwarded unchanged to
        ``make_classification``. ``null_prob`` is the probability that any
        single feature value is replaced by NaN in ``create_dataframe``.
        '''
        self.n_samples = n_samples
        self.n_features = n_features
        self.n_informative = n_informative
        self.n_redundant = n_redundant
        self.n_repeated = n_repeated
        self.n_classes = n_classes
        self.n_clusters_per_class = n_clusters_per_class
        self.flip_y = flip_y
        self.null_prob = null_prob

    def generate_random_data(self):
        ''' call make_classification with the stored settings and return ``(X, y)`` '''
        X, y = make_classification(
            n_samples=self.n_samples,
            n_features=self.n_features,
            n_informative=self.n_informative,
            n_redundant=self.n_redundant,
            n_repeated=self.n_repeated,
            n_classes=self.n_classes,
            n_clusters_per_class=self.n_clusters_per_class,
            weights=None,
            flip_y=self.flip_y,
            class_sep=1.0,
            hypercube=True,
            shift=0.0,
            scale=1.0,
            shuffle=True,
            random_state=None,
        )
        return X, y

    def create_dataframe(self):
        ''' return a dataframe with columns ``x0 .. x{n_features-1}`` and ``y``

        Feature values are set to NaN at random with probability
        ``self.null_prob``; the ``y`` column is left untouched.
        '''
        X, y = self.generate_random_data()

        # Name the columns from the instance's own feature count. Reading a
        # notebook-level `n_features` here would break on a fresh kernel or
        # whenever the global disagrees with the constructor argument.
        featurenames = [f"x{i}" for i in range(self.n_features)]
        df = pd.concat([pd.DataFrame(X, columns=featurenames),
                        pd.DataFrame(y, columns=['y'])], axis=1)

        ### generate random null values in the feature columns only
        for col in featurenames:
            # True marks a cell to blank out; drawn independently per column
            is_null = np.random.choice([True, False], size=df.shape[0],
                                       p=[self.null_prob, 1 - self.null_prob])
            df[col] = df[col].mask(is_null)
        return df

In [85]:
# Configure the generator with the interactively supplied feature counts,
# then build the dataframe with randomly nulled feature values.
generator = DataGenerator(
    n_samples=10000,
    n_features=n_features,
    n_informative=n_informative,
    n_redundant=2,
    n_repeated=0,
    n_classes=2,
    n_clusters_per_class=2,
    flip_y=0.01,
    null_prob=.001,
)
df = generator.create_dataframe()

In [86]:
# Summarise dtypes and non-null counts per column to confirm nulls landed in the features
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 41 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   x0      9989 non-null   float64
 1   x1      9995 non-null   float64
 2   x2      9988 non-null   float64
 3   x3      9990 non-null   float64
 4   x4      9989 non-null   float64
 5   x5      9988 non-null   float64
 6   x6      9987 non-null   float64
 7   x7      9990 non-null   float64
 8   x8      9990 non-null   float64
 9   x9      9991 non-null   float64
 10  x10     9989 non-null   float64
 11  x11     9992 non-null   float64
 12  x12     9986 non-null   float64
 13  x13     9990 non-null   float64
 14  x14     9987 non-null   float64
 15  x15     9991 non-null   float64
 16  x16     9994 non-null   float64
 17  x17     9990 non-null   float64
 18  x18     9995 non-null   float64
 19  x19     9990 non-null   float64
 20  x20     9995 non-null   float64
 21  x21     9993 non-null   float64
 22 

In [87]:
# Preview the first five rows of the generated dataframe
df.head()

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x31,x32,x33,x34,x35,x36,x37,x38,x39,y
0,0.821615,0.005438,1.405877,-1.671051,0.673243,2.569206,-1.358099,0.600167,-0.271563,-0.713674,...,0.903795,-1.23174,1.055078,-0.996066,0.397088,-0.276625,-1.232512,0.40347,-0.892124,0
1,-0.157628,0.488277,1.032593,-0.65146,0.293222,-0.692851,0.476781,-2.036175,-0.154158,-0.427911,...,-2.039804,-2.483759,-1.348559,-0.965881,-0.811055,-0.961314,0.069055,0.87428,-0.710027,1
2,2.335337,-1.259756,-1.409324,-2.243465,-1.005477,0.552449,0.205637,0.423192,1.086872,-0.367398,...,-0.815861,1.358779,0.375093,-0.847342,0.863651,-1.159884,-0.573352,-0.310774,-0.529847,1
3,0.444714,-1.155224,-0.498338,-0.681449,-0.807188,-0.223881,-1.203262,0.386788,-0.227016,-1.448302,...,0.423448,0.340145,-0.625418,-3.447103,-0.195254,-0.044352,1.275212,-0.944736,0.255152,0
4,-0.182744,-1.710438,0.24353,0.693694,-2.635318,0.619008,-0.172413,-0.038792,0.050902,-0.46907,...,0.570074,-0.14803,1.794412,-0.784764,0.180351,0.757999,0.307451,0.927743,0.623604,0


In [7]:
# Helper that draws uniformly distributed random integers
def generate_random_data(size, min_value, max_value):
    """Return `size` random integers drawn from the inclusive range [min_value, max_value]."""
    # randint excludes its upper bound, so extend it by one to include max_value
    upper_exclusive = max_value + 1
    return np.random.randint(low=min_value, high=upper_exclusive, size=size)

# Draw the four feature columns in order (x3 is the additional column);
# each one holds 10000 integers between -500 and 200 inclusive.
x0, x1, x2, x3 = (generate_random_data(10000, -500, 200) for _ in range(4))

# Binary output column: 0 with probability 0.999, 1 with probability 0.001
y = np.random.choice(
    [0, 1],
    size=10000,
    p=[0.999, 0.001],
)

In [8]:
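# NOTE: commented-out earlier draft of `DataGenerator`. As written it would not run:
# `np.random.randint` returns an integer array, and `data[null_mask] = np.nan`
# cannot store NaN in an integer array.
# class DataGenerator: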
#     def __init__(self, num_rows, min_value, max_value, null_probs):
#         self.num_rows = num_rows
#         self.min_value = min_value
#         self.max_value = max_value
#         self.null_probs = null_probs

#     def generate_random_data(self, null_prob):
#         data = np.random.randint(self.min_value, self.max_value + 1, size=self.num_rows)
#         ### generate null values at random
#         null_mask = np.random.choice([True, False], size=self.num_rows, p=[null_prob, 1 - null_prob])
#         data[null_mask] = np.nan
#         return data

#     def create_dataframe(self):
#         columns = []
#         for null_prob in self.null_probs:
#             columns.append(self.generate_random_data(null_prob))
#         columns.append(np.random.choice([0, 1], size=self.num_rows, p=[0.7, 0.3]))  # Binary output column

#         df = pd.DataFrame(columns, index=['x0', 'x1', 'x2', 'x3', 'y']).T
#         return df


### Create dataframe

In [9]:
# Assemble the generated columns into a single frame
df = pd.DataFrame(dict(x0=x0, x1=x1, x2=x2, x3=x3, y=y))

# Blank out feature values at random; the response column y is never touched
null_prob = .001
X = df.drop(columns='y')
for col in X.columns:
    # Each entry is NaN with probability null_prob, otherwise 1.0;
    # multiplying keeps the value or turns it into NaN.
    mask_values = np.random.choice(
        [np.nan, True],
        size=df.shape[0],
        p=[null_prob, 1 - null_prob],
    )
    df[col] = df[col].mul(mask_values)


In [10]:
# # Generate a small portion of null values at random
# generator = DataGenerator(10000, -500, 200, [0.0001, 0.0001, 0.000015])
# df = generator.create_dataframe()

### Save to csv

In [11]:
# Persist the frame without the index column
df.to_csv(path_or_buf='raw_data.csv', index=False)

In [13]:
# Summarise dtypes and non-null counts of the frame written to raw_data.csv
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   x0      9989 non-null   float64
 1   x1      9991 non-null   float64
 2   x2      9987 non-null   float64
 3   x3      9990 non-null   float64
 4   y       10000 non-null  int64  
dtypes: float64(4), int64(1)
memory usage: 390.8 KB


In [14]:
# Preview the first five rows of the saved frame
df.head()

Unnamed: 0,x0,x1,x2,x3,y
0,-77.0,157.0,-226.0,114.0,0
1,-253.0,-213.0,-321.0,147.0,0
2,-21.0,165.0,-86.0,-390.0,0
3,182.0,-348.0,-233.0,137.0,0
4,-421.0,177.0,-217.0,-384.0,0
