In [10]:
import csv
import json
import os
import random
import math

If you're using a JSON file, you can point to it in the code block below. Otherwise, use the second code block to fill out a dict that describes the dataset you're generating. Your data should look something like this:
``` 
data_descriptor_json = {
  "defaultLabelColumn": "recommendation", #the default label to be predicted
  "fields": [
    {
      "id": "age", #whatever the field contains
      "type": "continuous", #or categorical, or binary
      "values": {
        #if categorical or binary
        "categories": [],
        #if continuous:
        "min": 0,
        "max": 100
      }
    }
   ]
}
```

In [1]:
# Load the dataset descriptor from a JSON file; replace 'your_file.json' with
# the path to your own descriptor.
# json.load parses an open file object (the json module has no `read`
# function), and the with-block closes the file once parsing is done.
with open('your_file.json') as data_json_file:
    data_descriptor_json = json.load(data_json_file)

FileNotFoundError: [Errno 2] No such file or directory: 'your_file.json'

In [26]:
# Describe the dataset: which column is the label to predict, plus the type
# and value range of every field (the label itself is listed as a field too).
data_descriptor_json = {
    "defaultLabelColumn": "Alert",
    "fields": [
        {"id": "Alert", "type": "continuous", "values": {"min": 0, "max": 100}},
        {"id": "Cars", "type": "continuous", "values": {"min": 0, "max": 50}},
    ],
}

print(data_descriptor_json)

# Pull out the two pieces that later cells refer to by name.
label = data_descriptor_json['defaultLabelColumn']
fields = data_descriptor_json['fields']

# Blank separator line, then one field id per line.
print()
for field in fields:
    print(field["id"])

{'defaultLabelColumn': 'Alert', 'fields': [{'id': 'Alert', 'type': 'continuous', 'values': {'min': 0, 'max': 100}}, {'id': 'Cars', 'type': 'continuous', 'values': {'min': 0, 'max': 50}}]}

Alert
Cars


In [27]:
# Correlation model for each non-label field, keyed by field id. The value is
# the name of the model ("gauss", "linearPositive", "linearNegative" or
# "noCorrelation") that the generation cell selects for that field.
correlation_values = dict(Cars="linearPositive")

# Mapping that assigns "noCorrelation" to every feature name listed here.
# It is not referenced by the other cells shown in this notebook.
correlation_valuesNone = dict.fromkeys(
    ["temperature", "people", "dayOfMonth", "antellope", "rhino", "elephant"],
    "noCorrelation",
)

Here we define the relationship each feature should have with the label, and the percentage of predictions that should *not* follow that relationship (i.e., randomly generated predictions).

In [28]:
import random
import math

##In all of these: x is the randomly generated numerical label we're trying to predict
##Then we're reverse-engineering a possible feature input that would correlate to that numerical value
def gauss(lmin, lmax, fmin, fmax, x):
    """Invert a bell curve: turn a label value into a matching feature value.

    The curve is centred on the midpoint of the feature range. Because a bell
    curve has two feature values for every label value, one of the two sides
    is picked at random on each call.

    Args:
        lmin, lmax: bounds of the label range.
        fmin, fmax: bounds of the feature range.
        x: the label value to find a feature value for.

    Returns:
        A float feature value located on either side of the feature midpoint.
    """
    # The lower label bound is shifted down by one, so x == lmin still gives
    # a ratio above zero (presumably to keep the logarithm below defined).
    label_low = float(lmin - 1)
    label_high = float(lmax)
    feat_low = float(fmin)
    feat_high = float(fmax)
    x = float(x)

    # Peak of the curve sits in the middle of the feature range.
    center = (feat_low + feat_high) / 2
    # Distance from the peak to either end of the feature range.
    half_span = feat_high - center
    # One standard deviation: a quarter of that distance, widened slightly so
    # the curve does not fully approach 0 at the endpoints.
    width = half_span / 4 + math.sqrt(half_span / 4)

    # Position of x within the (shifted) label range, then solve the Gaussian
    # for the squared distance from the centre.
    ratio = (x - label_low) / (label_high - label_low)
    squared_offset = math.log(ratio) * -2 * width * width

    # The inverse of a bell curve is not a function: choose a side at random.
    sign = 1 if random.randint(1, 2) == 1 else -1
    return sign * math.sqrt(squared_offset) + center

def linearPositive(lmin, lmax, fmin, fmax, x):
    """Map a label value to a feature value along an increasing straight line.

    The line runs from (fmin, lmin) to (fmax, lmax), so x == lmin gives fmin
    and x == lmax gives fmax.

    Args:
        lmin, lmax: bounds of the label range.
        fmin, fmax: bounds of the feature range.
        x: the label value to find a feature value for.

    Returns:
        The feature value as a float. When the feature range has zero width
        (fmin == fmax) that single value is returned.

    Raises:
        ZeroDivisionError: if the label range has zero width (lmin == lmax)
            while the feature range does not.
    """
    lmin = float(lmin)
    lmax = float(lmax)
    fmin = float(fmin)
    fmax = float(fmax)
    x = float(x)

    # A constant feature has only one possible value; returning it avoids
    # dividing by a zero-width feature range below.
    if fmax == fmin:
        return fmin

    slope = (lmax - lmin) / (fmax - fmin)

    return ((x - lmin) / slope) + fmin

def linearNegative(lmin, lmax, fmin, fmax, x):
    """Map a label value to a feature value along a decreasing straight line.

    The line runs from (fmax, lmin) to (fmin, lmax), so x == lmin gives fmax
    and x == lmax gives fmin.

    Args:
        lmin, lmax: bounds of the label range.
        fmin, fmax: bounds of the feature range.
        x: the label value to find a feature value for.

    Returns:
        The feature value as a float. When the feature range has zero width
        (fmin == fmax) that single value is returned.

    Raises:
        ZeroDivisionError: if the label range has zero width (lmin == lmax)
            while the feature range does not.
    """
    lmin = float(lmin)
    lmax = float(lmax)
    fmin = float(fmin)
    fmax = float(fmax)
    x = float(x)

    # A constant feature has only one possible value; returning it avoids
    # dividing by a zero-width feature range below.
    if fmax == fmin:
        return fmax

    slope = -1 * (lmax - lmin) / (fmax - fmin)

    return ((x - lmin) / slope) + fmax

def noCorrelation(lmin, lmax, fmin, fmax, x):
    """Return a random feature value that ignores the label entirely.

    The label bounds and x are accepted only so the signature matches the
    other correlation functions; they are not used.

    Args:
        lmin, lmax: bounds of the label range (unused).
        fmin, fmax: bounds of the feature range.
        x: the label value (unused).

    Returns:
        A float drawn with random.uniform between fmin and fmax.
    """
    low, high = float(fmin), float(fmax)
    return random.uniform(low, high)
    

    
#print(linearNegative(0, 50, 60, 80, 25))
#print(linearPositive(0, 50, 60, 80, 25))
#print(gauss(12, 84, 47, 104, 12))

In [29]:
# How many data rows to generate, and the CSV file they are written to.
numRows, outputFile = 1000, "driving_cars.csv"

# Fraction of each feature's values meant to be purely random instead of
# following the correlation model. Not currently applied by the generation
# code, which never reads this setting.
percentTotallyRandom = 0.01

# How far a generated point may sit from its ideal position, as a fraction of
# the label's range. Example: with a range of 10 and variance = 0.1, a point
# can be at most 1 unit away from its ideal position.
variance = 0.01

# Decimal places kept when float-typed feature values are rounded.
decimalPoints = 1

In [30]:
import random
import math
import csv
import json

## Column order of the CSV: one column per field, in descriptor order
headers = [field['id'] for field in data_descriptor_json['fields']]
print(headers)

print(fields[0])

## Get metadata about label: its Python type (taken from the declared min) and its range
t = ""
labelMin = 0
labelMax = 0

for f in fields:
    if f['id'] == label:
        t = type(f['values']['min'])
        labelMin = f['values']['min']
        labelMax = f['values']['max']
        break

print(t)

## Stop right away when the label is missing from the fields or its min is
## neither int nor float; without label values no row can be generated.
if t is not int and t is not float:
    raise TypeError(
        "Error w/continuous types: label {!r} must be listed in fields with an "
        "int or float 'min' (found {})".format(label, t)
    )

## Generate all the output values for my label
if t is int:
    labelOut = [random.randint(labelMin, labelMax) for _ in range(numRows)]
else:
    labelOut = [random.uniform(labelMin, labelMax) for _ in range(numRows)]

## Correlation model name (as used in correlation_values) -> function that
## turns a label value into a feature value
correlationFunctions = {
    "gauss": gauss,
    "linearPositive": linearPositive,
    "linearNegative": linearNegative,
    "noCorrelation": noCorrelation,
}

## Largest distance a written label value may sit from its ideal value.
## The variance is applied to the LABEL value, because it should represent
## variance in the vertical / label output, not variance in the horizontal /
## feature output.
varianceRange = (labelMax - labelMin) * variance

rows = []
## One row per generated label value
for curLabelVal in labelOut:
    tempRow = []
    ## For each field
    for f in fields:
        if f['id'] == label:
            ## The label column gets the ideal value plus/minus the variance
            if t is int:
                tempVal = random.randint(int(curLabelVal - varianceRange), int(curLabelVal + varianceRange))
            else:
                tempVal = random.uniform(curLabelVal - varianceRange, curLabelVal + varianceRange)
            tempRow.append(tempVal)
        else:
            ## get feature metadata to generate
            featureMin = f['values']['min']
            featureMax = f['values']['max']
            fType = type(featureMin)
            corType = correlation_values[f['id']]
            ## A misspelled model name must not silently produce a column of zeros
            if corType not in correlationFunctions:
                raise ValueError(
                    "Unknown correlation type {!r} for field {!r}".format(corType, f['id'])
                )
            ## NOTE: percentTotallyRandom is currently not applied here; every
            ## feature value comes from its correlation model, computed from the
            ## ideal (variance-free) label value.
            featureVal = correlationFunctions[corType](labelMin, labelMax, featureMin, featureMax, curLabelVal)

            ## Match the declared type of the feature: truncate for int, round for float
            if fType is int:
                featureVal = int(featureVal)
            elif fType is float:
                featureVal = round(featureVal, decimalPoints)
            tempRow.append(featureVal)
    rows.append(tempRow)

## Write the header line followed by all generated rows.
## newline='' lets the csv module control line endings itself.
with open(outputFile, 'w', newline='') as outfile:
    writer = csv.writer(outfile, delimiter=',')
    writer.writerow(headers)
    writer.writerows(rows)
    


['Alert', 'Cars']
{'id': 'Alert', 'type': 'continuous', 'values': {'min': 0, 'max': 100}}
<class 'int'>


In [8]:
# DON'T USE!!!!
# NOTE(review): this cell relies on outfile_name, predict_label, num_rows,
# decimal_points, formula and error, none of which are defined in the cells
# shown in this notebook; run as-is it stops with a NameError.

#write the file
# newline='' lets the csv module control line endings itself.
with open(outfile_name, 'w', newline='') as outfile:
    writer = csv.writer(outfile, delimiter=',')
    headers = [field['id'] for field in data_descriptor_json['fields']]
    fields = data_descriptor_json['fields']
    writer.writerow(headers)
    # Position of the column to predict, and the declared range of its values.
    result_index = headers.index(predict_label)
    label_range = fields[result_index]['values']
    for _ in range(num_rows):
        output = {}
        rand_num = random.random()
        # Every non-label field gets an independent uniform random value.
        for field in fields:
            if field['id'] != predict_label:
                random_val = random.uniform(field['values']['min'], field['values']['max'])
                output[field['id']] = round(random_val, decimal_points)
        # quick pass at code for adding some degree of error to the "correct" predictions
        # error_pos_neg = 1 if random.random() < 0.5 else -1
        prediction = formula(output)
        # predict_error = prediction * error_pos_neg * error
        # output[predict_label] = round(prediction + predict_error, decimal_points)
        if rand_num < error:
            # With probability `error` the label is random over its whole
            # declared range (min to max) instead of following the formula.
            print(fields[result_index])
            output[predict_label] = round(random.uniform(label_range['min'], label_range['max']), decimal_points)
        else:
            output[predict_label] = round(prediction, decimal_points)
        # Emit the values in header order.
        writer.writerow([output[name] for name in headers])
                    
                                                
                                                              
                
        
                
                
        
        


NameError: name 'outfile_name' is not defined