## Student CSV Generator

In [19]:
# install faker package if not yet installed
!pip install faker

[31mtwisted 18.7.0 requires PyHamcrest>=1.9.0, which is not installed.[0m
[31mindeed 0.0.4 has requirement requests==2.0.0, but you'll have requests 2.20.1 which is incompatible.[0m
[33mYou are using pip version 10.0.1, however version 19.2.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [20]:
# import faker, numpy and pandas
from faker import Faker
import numpy as np
import pandas as pd

In [21]:
# Majors and their specializations, encoded as '<major>_<specialization>':
# a single underscore separates the two parts, hyphens join words inside a part.
# Every entry must appear exactly once -- the list is sampled uniformly with
# np.random.choice, so a duplicate would silently double that entry's odds.
major_spec = [
    'engineering_civil',
    'engineering_mechanical',
    'engineering_chemical',
    'engineering_industrial',
    'engineering_mechatronics',
    'engineering_computer',
    'engineering_electronics',
    'engineering_electrical',
    'artificial-intelligence_natural-language-processing',
    'artificial-intelligence_robotics',
    'artificial-intelligence_expert-systems',
    'artificial-intelligence_machine-learning',
    'artificial-intelligence_computer-vision'
]
# create a Faker object (no locale argument is passed, so the library default applies);
# it is used further down to generate student names (fake.name())
# and street addresses (fake.address())
fake = Faker()

In [145]:
# create a student dataframe
# number of fake students; every column below is sized from this single value
N_STUDENTS = 100
# inclusive bounds of the quarterly grades Q1..Q4
GRADE_MIN, GRADE_MAX = 70, 100

student_df = pd.DataFrame({
    # ids run from '2019-1' to '2019-<N_STUDENTS>'
    'student_id': ['2019-' + str(number) for number in range(1, N_STUDENTS + 1)],
    # one fake full name per student
    'name': [fake.name() for _ in range(N_STUDENTS)],
    # uniformly sampled (with replacement) major_specialization per student
    'major_spec': np.random.choice(major_spec, size=N_STUDENTS),
    # np.random.randint excludes its upper bound, so GRADE_MAX + 1 is passed
    # to make a grade of exactly GRADE_MAX attainable
    'Q1': np.random.randint(GRADE_MIN, GRADE_MAX + 1, N_STUDENTS),
    'Q2': np.random.randint(GRADE_MIN, GRADE_MAX + 1, N_STUDENTS),
    'Q3': np.random.randint(GRADE_MIN, GRADE_MAX + 1, N_STUDENTS),
    'Q4': np.random.randint(GRADE_MIN, GRADE_MAX + 1, N_STUDENTS),
})
# display the dataframe (rich display only works in a jupyter notebook)
student_df

Unnamed: 0,student_id,name,major_spec,Q1,Q2,Q3,Q4
0,2019-1,Jeremy Morgan,engineering_mechanical,77,96,72,80
1,2019-2,David Hoover,engineering_civil,87,70,82,88
2,2019-3,Vincent Lynch,artificial-intelligence_natural-language-proce...,79,91,88,81
3,2019-4,Frank Simpson,engineering_electrical,87,70,74,81
4,2019-5,John Mason,engineering_mechanical,85,86,79,75
5,2019-6,Joseph Smith,artificial-intelligence_expert-systems,78,99,97,78
6,2019-7,Melissa Faulkner,engineering_electronics,72,83,79,97
7,2019-8,Emily Nguyen,engineering_civil,76,81,85,74
8,2019-9,Tony Scott,engineering_mechatronics,98,99,99,70
9,2019-10,Donna Moore,engineering_industrial,86,71,93,78


In [143]:
# write the generated students to students.csv in the working directory;
# index=False keeps the dataframe's row index out of the file
student_df.to_csv(path_or_buf='students.csv', index=False)

In [144]:
# load students.csv into a new dataframe and actually verify that all rows and
# columns were saved, rather than relying on a visual comparison of the output
test_student_df = pd.read_csv('students.csv')
assert test_student_df.shape == student_df.shape, 'row/column count changed after saving'
assert list(test_student_df.columns) == list(student_df.columns), 'column names changed after saving'
test_student_df

Unnamed: 0,student_id,name,major_spec,Q1,Q2,Q3,Q4
0,2019-1,John Smith,engineering_civil,70,96,98,73
1,2019-2,Amanda Anderson,engineering_chemical,93,75,84,85
2,2019-3,Dominique Schneider,engineering_industrial,72,88,82,94
3,2019-4,Kyle Campbell,engineering_mechatronics,73,76,80,82
4,2019-5,Stephanie Gutierrez,engineering_mechatronics,71,90,86,70
5,2019-6,Gerald Jacobs,artificial-intelligence_natural-language-proce...,77,75,88,92
6,2019-7,Kevin Marshall,engineering_mechanical,76,99,92,72
7,2019-8,Paul Williams,engineering_chemical,88,96,86,74
8,2019-9,Gabriel Vaughan,engineering_electrical,87,81,76,79
9,2019-10,Jonathan Cantrell,engineering_civil,98,81,82,71


## Address Traffic Data Generator

In [118]:
# Generate fake traffic readings: for every address and every week there is one
# 'peak' row and one 'off-peak' row, each filling only that week's W<n> column
# (the remaining week columns stay NaN until the table is melted later on).
N_ADDRESSES = 50
N_WEEKS = 4
columns = ['id', 'address', 'type'] + ['W' + str(week) for week in range(1, N_WEEKS + 1)]

# Collect plain dicts and build the frame once at the end: growing a DataFrame
# row by row copies it on every step, and DataFrame.append no longer exists in
# pandas 2.0+.
records = []
for x in range(1, N_ADDRESSES + 1):
    # faker returns multi-line addresses; flatten them onto a single line
    address = fake.address().replace('\n', ',')
    for week in range(1, N_WEEKS + 1):
        for traffic_type in ('peak', 'off-peak'):
            records.append({
                'id': x,
                'address': address,
                'type': traffic_type,
                # hour of day, 0..23 (randint's upper bound is exclusive)
                'W' + str(week): np.random.randint(0, 24),
            })

# columns= fixes the column order and fills the weeks a record lacks with NaN
address_df = pd.DataFrame(records, columns=columns)
address_df

Unnamed: 0,id,address,type,W1,W2,W3,W4
0,1,"56560 Joyce Canyon,Lake Ronaldmouth, IL 75050",peak,0,,,
1,1,"56560 Joyce Canyon,Lake Ronaldmouth, IL 75050",off-peak,23,,,
2,1,"56560 Joyce Canyon,Lake Ronaldmouth, IL 75050",peak,,4,,
3,1,"56560 Joyce Canyon,Lake Ronaldmouth, IL 75050",off-peak,,4,,
4,1,"56560 Joyce Canyon,Lake Ronaldmouth, IL 75050",peak,,,13,
5,1,"56560 Joyce Canyon,Lake Ronaldmouth, IL 75050",off-peak,,,12,
6,1,"56560 Joyce Canyon,Lake Ronaldmouth, IL 75050",peak,,,,21
7,1,"56560 Joyce Canyon,Lake Ronaldmouth, IL 75050",off-peak,,,,5
8,2,"9384 Kane Fort Suite 285,South Kayla, CO 48729",peak,15,,,
9,2,"9384 Kane Fort Suite 285,South Kayla, CO 48729",off-peak,21,,,


In [119]:
# reshape the one-column-per-week layout (W1..W4) into one row per
# (id, address, type, week), with the reading stored in 'hour'
address_melt = pd.melt(
    address_df,
    id_vars=['id', 'address', 'type'],
    var_name='week',
    value_name='hour',
)
# each source row fills a single W<n> column, so the other weeks melt to NaN;
# drop those placeholder rows and store the remaining hours as integers so
# that later aggregations operate on a numeric column
address_melt_clean = address_melt.dropna().astype({'hour': 'int64'})
address_melt_clean.head()

Unnamed: 0,id,address,type,week,hour
0,1,"56560 Joyce Canyon,Lake Ronaldmouth, IL 75050",peak,W1,0
1,1,"56560 Joyce Canyon,Lake Ronaldmouth, IL 75050",off-peak,W1,23
8,2,"9384 Kane Fort Suite 285,South Kayla, CO 48729",peak,W1,15
9,2,"9384 Kane Fort Suite 285,South Kayla, CO 48729",off-peak,W1,21
16,3,"02857 Kathleen Divide Suite 570,Wandaside, VT ...",peak,W1,9


# put the 'peak' and 'off-peak' hours side by side, one row per (id, address, week);
# the generator emits one reading per (id, week, type), so 'mean' simply returns it.
# The string alias is used instead of np.mean, which newer pandas versions warn about.
address_melt_clean.pivot_table(
    index=['id', 'address', 'week'],
    columns='type',
    values='hour',
    aggfunc='mean',
)