# Generate Fake Data
Concept:
* Input 1,000 fake entries

Data Structure:
1. Timestamp
2. Name: Use 50 different names
3. Callsign/Alias: Use 50 different aliases
4. Which group are you from?
5. Which activity have you chosen to clock the mileage?
6. How far did you clock? (km)
7. Share with us a picture/screenshot of your accomplishment!
8. Actual Mileage

In [1]:
import datetime
import numpy as np
import pandas as pd

from faker import Faker

## Configure Form Options

### Names & Aliases

In [2]:
Faker.seed(123)
fake = Faker()

names = []

for _ in range(100):
    names.append(fake.name())

In [3]:
def get_alias(name):
    full_name = name.split(' ')
    return full_name[0][0] + '. ' + full_name[1]
    
aliases = [get_alias(name) for name in names]

### Groups

In [4]:
groups = [
    'ABM', 'ADSS/ADWS/ADWO', 'ADW', 'AFE', 'AOSX/AOS', 'ATC'
]

### Combine Names, Aliases, and Groups

In [5]:
people = pd.DataFrame()
people['Name'] = names
people['Alias'] = aliases

np.random.seed(123)
people['Group'] = np.array(groups)[list(np.random.randint(0,6, 100))]

### Activities

In [6]:
activities = [
    'Walk', 'Run', 'Cycle', 'Swim'
]

## Generate Data

In [7]:
fake_data = pd.DataFrame()

# Generate timestamps
Faker.seed(456)
fake = Faker()
fake_data['timestamp'] = [fake.date_time_between(datetime.datetime(2020, 10, 20), '+50d') for _ in range(1000)]

# Input names
np.random.seed(456)
people_id = np.random.randint(0, 100, 1000)
fake_data['name'] = people.Name.loc[people_id].tolist()
fake_data['alias'] = people.Alias.loc[people_id].tolist()
fake_data['group'] = people.Group.loc[people_id].tolist()

# Convert names to FB IDs
fake_data['name'] = fake_data['name'].str.lower().str.replace(' ', '_') + '123'

# Generate activity
np.random.seed(789)
fake_data['activity'] = np.array(activities)[np.random.randint(0,4, 1000)]

# Generate raw mileage clocked
np.random.seed(123)
fake_data['raw_mileage'] = np.random.randint(1,11, 1000)

# Create empty column for picture
fake_data['share_img'] = ''

# Sort by date and time
fake_data = fake_data.sort_values('timestamp').reset_index(drop=True)

In [8]:
# Compute mileage
fake_data['actual_mileage'] = np.where(
    fake_data.activity=='Swim', fake_data.raw_mileage * 6,
    np.where(
        fake_data.activity=='Cycle', fake_data.raw_mileage / 4,
        fake_data.raw_mileage
    )
)

In [9]:
# Output
fake_data.to_csv('fake_data.csv', index=False)

In [10]:
fake_data

Unnamed: 0,timestamp,name,alias,group,activity,raw_mileage,share_img,actual_mileage
0,2020-10-20 00:12:30,anthony_warren123,A. Warren,ADSS/ADWS/ADWO,Walk,4,,4.00
1,2020-10-20 00:14:41,evelyn_christian123,E. Christian,AOSX/AOS,Swim,2,,12.00
2,2020-10-20 00:44:20,zachary_thomas123,Z. Thomas,ADSS/ADWS/ADWO,Cycle,9,,2.25
3,2020-10-20 01:16:27,erica_raymond123,E. Raymond,ADSS/ADWS/ADWO,Walk,9,,9.00
4,2020-10-20 01:30:09,brandon_russell123,B. Russell,ATC,Cycle,5,,1.25
5,2020-10-20 03:48:42,jeffrey_smith123,J. Smith,ABM,Cycle,8,,2.00
6,2020-10-20 04:23:09,angel_ho123,A. Ho,AFE,Walk,7,,7.00
7,2020-10-20 09:59:42,tanya_lamb123,T. Lamb,AFE,Walk,1,,1.00
8,2020-10-20 10:21:19,arthur_kelly123,A. Kelly,ATC,Swim,8,,48.00
9,2020-10-20 10:37:06,catherine_smith123,C. Smith,ADW,Walk,10,,10.00
