# **Import necessary library, especially Faker**
We will try to randomize/generate/dummy transaction data for Transjakarta based on publicly accessed Master Data.

In [88]:
import pandas as pd
import numpy as np
import random as rd
import calendar
import datetime as dt
from datetime import datetime, timedelta

from faker import Faker
from faker.providers import BaseProvider

In [170]:
pd.set_option('display.max_columns',None)

# **First, we load master data**

In [89]:
dfCorridor = pd.read_csv('routes.csv')
dfCorridor

Unnamed: 0,route_id,agency_id,route_short_name,route_long_name,route_desc,route_type,route_url,route_color,route_text_color
0,1,Tije,1,Blok M - Kota,,3,,D62126,FFFFFF
1,2,Tije,2,Pulo Gadung - Monas,,3,,2F489C,FFFFFF
2,3,Tije,3,Kalideres - Bundaran HI via Veteran,,3,,FDCB1C,000000
3,4,Tije,4,Pulo Gadung 2 - Tosari,,3,,512C62,FFFFFF
4,5,Tije,5,Matraman Baru - Ancol,,3,,D46425,000000
...,...,...,...,...,...,...,...,...,...
227,BW9,Tije,BW9,Monas - Pantai Indah Kapuk,,3,,FFB6DB,000000
228,T21,Tije,T21,Palem Semi - Bundaran Senayan,,3,,9C4782,FFFFFF
229,1T,Tije,1T,Cibubur - Balai Kota,,3,,9C4782,FFFFFF
230,9N,Tije,9N,Pinang Ranti - Pramuka,,3,,45A49E,FFFFFF


In [143]:
dfTrips = pd.read_csv('trips.csv')
dfTrips

Unnamed: 0,route_id,service_id,trip_id,trip_headsign,direction_id,block_id,shape_id
0,1,SH,1-R05,Blok M - Kali Besar Barat,0,,1-R05_shp
1,1,SH,1-R06,Kali Besar Barat - Blok M,1,,1-R06_shp
2,10,SH,10-R01,Tanjung Priok - PGC 2,0,,10-R01_shp
3,10,SH,10-R02,PGC 2 - Tanjung Priok,1,,10-R02_shp
4,10A,HK,10A-R01,Rusun Marunda - Tanjung Priok,0,,10A-R01_shp
...,...,...,...,...,...,...,...
477,9N,SH_TBA,9N-R02,BKN - Pinang Ranti,1,,9N-R02_shp
478,7D,SH_TBA,7D-R14,Kampung Rambutan - Tegal Parang,0,,7D-R14_shp
479,7D,SH_TBA,7D-R15,Tegal Parang - Kampung Rambutan,1,,7D-R15_shp
480,8A,HM,8A-R04,Juanda - Jelambar,1,,8A-R04_shp


In [144]:
series = dfTrips['route_id'].value_counts()
selectIndex = series[series == 1].index
selectIndex

Index(['5ST', 'JAK.10B', 'JAK.10A', '7ST', '3ST', 'GR5', 'GR4', 'BW2', '2ST',
       'JAK.09', 'BW4'],
      dtype='object')

In [145]:
dfTrips.drop(dfTrips[dfTrips['route_id'].isin(selectIndex)].index, inplace=True)
dfTrips


Unnamed: 0,route_id,service_id,trip_id,trip_headsign,direction_id,block_id,shape_id
0,1,SH,1-R05,Blok M - Kali Besar Barat,0,,1-R05_shp
1,1,SH,1-R06,Kali Besar Barat - Blok M,1,,1-R06_shp
2,10,SH,10-R01,Tanjung Priok - PGC 2,0,,10-R01_shp
3,10,SH,10-R02,PGC 2 - Tanjung Priok,1,,10-R02_shp
4,10A,HK,10A-R01,Rusun Marunda - Tanjung Priok,0,,10A-R01_shp
...,...,...,...,...,...,...,...
477,9N,SH_TBA,9N-R02,BKN - Pinang Ranti,1,,9N-R02_shp
478,7D,SH_TBA,7D-R14,Kampung Rambutan - Tegal Parang,0,,7D-R14_shp
479,7D,SH_TBA,7D-R15,Tegal Parang - Kampung Rambutan,1,,7D-R15_shp
480,8A,HM,8A-R04,Juanda - Jelambar,1,,8A-R04_shp


In [146]:
# Group the DataFrame by 'route_id' and filter groups that don't have both 0 and 1 in 'direction_id'
filteredGroups = dfTrips.groupby('route_id').filter(lambda x: set(x['direction_id']) != {0, 1})

# Get the unique 'route_id' values from the filtered groups
selectRoute = filteredGroups['route_id'].unique()
selectRoute


array(['M7', 'M8'], dtype=object)

In [147]:
dfTrips[dfTrips['route_id'].isin(selectRoute)]

Unnamed: 0,route_id,service_id,trip_id,trip_headsign,direction_id,block_id,shape_id
445,M7,SH,M7-R03,Kampung Rambutan - Monas,1,,M7-R03_shp
446,M7,SH,M7-R04,Monas - Kampung Rambutan,1,,M7-R04_shp
447,M8,SH,M8-R03,Lebak Bulus - Pasar Baru,0,,M8-R03_shp
448,M8,SH,M8-R04,Pasar Baru - Lebak Bulus,0,,M8-R04_shp


In [148]:
dfTrips.loc[dfTrips['trip_id'] == 'M7-R03', 'direction_id'] = 0
dfTrips.loc[dfTrips['trip_id'] == 'M8-R03', 'direction_id'] = 1
display(dfTrips[dfTrips['route_id'].isin(selectRoute)],
        dfTrips)


Unnamed: 0,route_id,service_id,trip_id,trip_headsign,direction_id,block_id,shape_id
445,M7,SH,M7-R03,Kampung Rambutan - Monas,0,,M7-R03_shp
446,M7,SH,M7-R04,Monas - Kampung Rambutan,1,,M7-R04_shp
447,M8,SH,M8-R03,Lebak Bulus - Pasar Baru,1,,M8-R03_shp
448,M8,SH,M8-R04,Pasar Baru - Lebak Bulus,0,,M8-R04_shp


Unnamed: 0,route_id,service_id,trip_id,trip_headsign,direction_id,block_id,shape_id
0,1,SH,1-R05,Blok M - Kali Besar Barat,0,,1-R05_shp
1,1,SH,1-R06,Kali Besar Barat - Blok M,1,,1-R06_shp
2,10,SH,10-R01,Tanjung Priok - PGC 2,0,,10-R01_shp
3,10,SH,10-R02,PGC 2 - Tanjung Priok,1,,10-R02_shp
4,10A,HK,10A-R01,Rusun Marunda - Tanjung Priok,0,,10A-R01_shp
...,...,...,...,...,...,...,...
477,9N,SH_TBA,9N-R02,BKN - Pinang Ranti,1,,9N-R02_shp
478,7D,SH_TBA,7D-R14,Kampung Rambutan - Tegal Parang,0,,7D-R14_shp
479,7D,SH_TBA,7D-R15,Tegal Parang - Kampung Rambutan,1,,7D-R15_shp
480,8A,HM,8A-R04,Juanda - Jelambar,1,,8A-R04_shp


In [94]:
dfStopTimes = pd.read_csv('stop_times.csv')
dfStopTimes

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,stop_headsign,pickup_type,shape_dist_traveled
0,10A-R01,05:00:00,05:00:10,B03282P,0,,0,
1,10A-R01,05:00:44,05:00:54,B03283P,1,,0,
2,10A-R01,05:02:31,05:02:41,B05078P,2,,0,
3,10A-R01,05:05:34,05:05:44,B06039P,3,,0,
4,10A-R01,05:06:29,05:06:39,B00136P,4,,0,
...,...,...,...,...,...,...,...,...
13360,JAK.59-R06,05:42:55,05:43:05,B03922P,24,,0,
13361,JAK.59-R06,05:44:41,05:44:51,B04125P,25,,0,
13362,JAK.59-R06,05:54:42,05:54:52,B05477P,26,,0,
13363,JAK.59-R06,05:56:16,05:56:26,B04706P,27,,0,


In [95]:
dfStops = pd.read_csv('stops.csv')
dfStops

Unnamed: 0,stop_id,stop_name,stop_desc,stop_lat,stop_lon,zone_id,stop_url,location_type,wheelchair_boarding,parent_station,platform_code
0,B00001P,18 Office Park,,-6.299146,106.832100,1,,0,2.0,,
1,B00002P,ABA,,-6.194149,106.839390,1,,0,2.0,,
2,B00003P,Acacia Residence,,-6.263358,106.756750,1,,0,2.0,,
3,B00004P,ACC Simatupang,,-6.304475,106.848580,1,,0,2.0,,
4,B00005P,ACE Hardware,,-6.387532,106.827380,1,,0,2.0,,
...,...,...,...,...,...,...,...,...,...,...,...
7921,B06908P,Slipi Petamburan 2,,-6.201683,106.800020,1,,0,2.0,,
7922,B06909P,Slipi Petamburan 3,,-6.200982,106.799123,1,,0,2.0,,
7923,B06910P,Perpustakaan Riset BPK,,-6.206333,106.802700,1,,0,2.0,,
7924,B06911P,Senayan JCC 3,,-6.208123,106.804200,1,,0,2.0,,


In [96]:
dfFareRules = pd.read_csv('fare_rules.csv')
dfFareRules

Unnamed: 0,fare_id,route_id,origin_id,destination_id,contains_id
0,GR,10A,,,
1,GR,10B,,,
2,GR,11B,,,
3,GR,11C,,,
4,GR,11K,,,
...,...,...,...,...,...
222,GR,BW9,,,
223,PP,T21,,,
224,PP,1T,,,
225,FP,9N,,,


In [97]:
dfFareAttr = pd.read_csv('fare_attributes.csv')
dfFareAttr

Unnamed: 0,fare_id,price,currency_type,payment_method,transfers,agency_id,transfer_duration
0,FP,3500,IDR,0,,Tije,10800
1,PP,20000,IDR,0,,Tije,10800
2,GR,0,IDR,0,,Tije,10800
3,FP2,3500,IDR,0,1.0,Tije,10800


# **Merging all tables with necessary columns as complete relational data**

In [230]:
dfRoutes = dfStopTimes[['trip_id','stop_id','stop_sequence']].merge(dfStops[['stop_id','stop_name','stop_lat','stop_lon']],
                                                                   'left',
                                                                   'stop_id').merge(dfTrips[['trip_id','trip_headsign','direction_id','route_id']],
                                                                                    'left',
                                                                                    'trip_id').merge(dfCorridor[['route_id','route_long_name']],
                                                                                                     'left',
                                                                                                     'route_id').merge(dfFareRules[['route_id','fare_id']],
                                                                                                                       'left',
                                                                                                                       'route_id').merge(dfFareAttr[['fare_id','price']],
                                                                                                                                         'left',
                                                                                                                                         'fare_id')
dfRoutes = dfRoutes[['route_id','fare_id','price','route_long_name','direction_id','trip_id','trip_headsign','stop_id','stop_sequence','stop_name','stop_lat','stop_lon']].sort_values(['route_id','trip_id','stop_sequence','direction_id'])
dfRoutes

Unnamed: 0,route_id,fare_id,price,route_long_name,direction_id,trip_id,trip_headsign,stop_id,stop_sequence,stop_name,stop_lat,stop_lon
6979,1,FP,3500.0,Blok M - Kota,0.0,1-R05,Blok M - Kali Besar Barat,P00017,0,Blok M,-6.243312,106.80175
6980,1,FP,3500.0,Blok M - Kota,0.0,1-R05,Blok M - Kali Besar Barat,P00004,1,ASEAN,-6.239889,106.79897
6981,1,FP,3500.0,Blok M - Kota,0.0,1-R05,Blok M - Kali Besar Barat,P00133,2,Masjid Agung,-6.236483,106.79845
6982,1,FP,3500.0,Blok M - Kota,0.0,1-R05,Blok M - Kali Besar Barat,P00026,3,Bundaran Senayan,-6.227869,106.80094
6983,1,FP,3500.0,Blok M - Kota,0.0,1-R05,Blok M - Kali Besar Barat,P00071,4,Gelora Bung Karno,-6.224217,106.80580
...,...,...,...,...,...,...,...,...,...,...,...,...
2455,,,,,,JAK.10B-L01,,B00346P,27,Farmasi RP Soeroso,-6.187939,106.83608
2456,,,,,,JAK.10B-L01,,B00116P,28,Bapindo Menteng,-6.187095,106.83566
2457,,,,,,JAK.10B-L01,,B02292P,29,Masjid Cut Meutia 2,-6.187451,106.83381
2458,,,,,,JAK.10B-L01,,B00651P,30,Goethe Institute,-6.189522,106.83138


In [231]:
dfRoutes[dfRoutes['price'].isna()]

Unnamed: 0,route_id,fare_id,price,route_long_name,direction_id,trip_id,trip_headsign,stop_id,stop_sequence,stop_name,stop_lat,stop_lon
7210,3H,,,Jelambar - Kota,0.0,3H-R01,Jelambar - Kota,P00087,0,Jelambar,-6.166548,106.786510
7211,3H,,,Jelambar - Kota,0.0,3H-R01,Jelambar - Kota,P00208,1,RS Sumber Waras,-6.166294,106.796750
7212,3H,,,Jelambar - Kota,0.0,3H-R01,Jelambar - Kota,P00176,2,Petojo,-6.169984,106.816980
7213,3H,,,Jelambar - Kota,0.0,3H-R01,Jelambar - Kota,P00291,3,Harmoni Arah Utara,-6.162678,106.819663
7214,3H,,,Jelambar - Kota,0.0,3H-R01,Jelambar - Kota,P00293,4,Sawah Besar Arah Utara,-6.157418,106.818460
...,...,...,...,...,...,...,...,...,...,...,...,...
2455,,,,,,JAK.10B-L01,,B00346P,27,Farmasi RP Soeroso,-6.187939,106.836080
2456,,,,,,JAK.10B-L01,,B00116P,28,Bapindo Menteng,-6.187095,106.835660
2457,,,,,,JAK.10B-L01,,B02292P,29,Masjid Cut Meutia 2,-6.187451,106.833810
2458,,,,,,JAK.10B-L01,,B00651P,30,Goethe Institute,-6.189522,106.831380


In [233]:
dfRoutes.dropna(subset='route_id',inplace=True)
dfRoutes

Unnamed: 0,route_id,fare_id,price,route_long_name,direction_id,trip_id,trip_headsign,stop_id,stop_sequence,stop_name,stop_lat,stop_lon
6979,1,FP,3500.0,Blok M - Kota,0.0,1-R05,Blok M - Kali Besar Barat,P00017,0,Blok M,-6.243312,106.80175
6980,1,FP,3500.0,Blok M - Kota,0.0,1-R05,Blok M - Kali Besar Barat,P00004,1,ASEAN,-6.239889,106.79897
6981,1,FP,3500.0,Blok M - Kota,0.0,1-R05,Blok M - Kali Besar Barat,P00133,2,Masjid Agung,-6.236483,106.79845
6982,1,FP,3500.0,Blok M - Kota,0.0,1-R05,Blok M - Kali Besar Barat,P00026,3,Bundaran Senayan,-6.227869,106.80094
6983,1,FP,3500.0,Blok M - Kota,0.0,1-R05,Blok M - Kali Besar Barat,P00071,4,Gelora Bung Karno,-6.224217,106.80580
...,...,...,...,...,...,...,...,...,...,...,...,...
13225,T21,PP,20000.0,Palem Semi - Bundaran Senayan,1.0,T21-R02,Bundaran Senayan - Palem Semi,B00808P,10,Islamic,-6.228521,106.61473
13226,T21,PP,20000.0,Palem Semi - Bundaran Senayan,1.0,T21-R02,Bundaran Senayan - Palem Semi,B02861P,11,Plaza Europa,-6.221188,106.61667
13227,T21,PP,20000.0,Palem Semi - Bundaran Senayan,1.0,T21-R02,Bundaran Senayan - Palem Semi,B02557P,12,Palem Semi 2,-6.220361,106.61649
13228,T21,PP,20000.0,Palem Semi - Bundaran Senayan,1.0,T21-R02,Bundaran Senayan - Palem Semi,B01440P,13,Jln. Palem Jepang Baru,-6.218391,106.61589


In [234]:
dfRoutes.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13143 entries, 6979 to 13229
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   route_id         13143 non-null  object 
 1   fare_id          12958 non-null  object 
 2   price            12958 non-null  float64
 3   route_long_name  13143 non-null  object 
 4   direction_id     13143 non-null  float64
 5   trip_id          13143 non-null  object 
 6   trip_headsign    13143 non-null  object 
 7   stop_id          13143 non-null  object 
 8   stop_sequence    13143 non-null  int64  
 9   stop_name        13143 non-null  object 
 10  stop_lat         13143 non-null  float64
 11  stop_lon         13143 non-null  float64
dtypes: float64(4), int64(1), object(7)
memory usage: 1.3+ MB


In [566]:
# dfRoute.to_csv('dfRoute.csv')

# **Creating Master Customers with FAKER library**

In [235]:
fake = Faker(locale='id_ID')
fake

<faker.proxy.Faker at 0x12dde4bd0>

## **Use .simple_profile() to generate fake/dummy profile**
And shuffle it so the profile composition is randomize when generating the transaction.

In [289]:
payCardIDTable = []
payCardNameTable = []
payCardSexTable = []
payCardBirthDate = []

for _ in range(1,2001,1) :
    payCardIDTable.append(fake.credit_card_number())
    if (_ >= 1 and _ <= 392) or (_ >= 951 and _ <= 1197) or (_ >= 1301 and _ <= 1342) or (_ >= 1351 and _ <= 1602) :
        profile = fake.simple_profile(sex='M')
        payCardNameTable.append(profile['name'])
        payCardSexTable.append(profile['sex'])
    elif (_ >= 393 and _ <= 950) or (_ >= 1198 and _ <= 1300) or (_ >= 1343 and _ <= 1350) or (_ >= 1603 and _ <= 2000) :
        profile = fake.simple_profile(sex='F')
        payCardNameTable.append(profile['name'])
        payCardSexTable.append(profile['sex'])
    if _ >= 1 and _ <= 950 : # 950 first profile is gen Y
        payCardBirthDate.append(fake.date_between(start_date = dt.date(1981,1,1),
                                                  end_date = dt.date(1996,12,31)).year)
    elif _ >= 951 and _ <= 1300 : # 350 next profile is Gen X
        payCardBirthDate.append(fake.date_between(start_date = dt.date(1965,1,1),
                                                  end_date = dt.date(1980,12,31)).year)
    elif _ >= 1301 and _ <= 1350 : # 50 next profile is Baby Boomers
        payCardBirthDate.append(fake.date_between(start_date = dt.date(1946,1,1),
                                                  end_date = dt.date(1964,12,31)).year)
    elif _ >= 1351 and _ <= 2000 : # 650 next profile is Gen Z
        payCardBirthDate.append(fake.date_between(start_date = dt.date(1997,1,1),
                                                  end_date = dt.date(2012,12,31)).year)

dfPayCards = pd.DataFrame(zip(payCardIDTable,payCardNameTable,payCardSexTable,payCardBirthDate),
                         columns=['payCardIDTable','payCardNameTable','payCardSexTable','payCardBirthDate'])
dfPayCards = dfPayCards.sample(frac=1,
                               random_state=99).reset_index(drop=True)
dfPayCards

Unnamed: 0,payCardIDTable,payCardNameTable,payCardSexTable,payCardBirthDate
0,180062659848800,Bajragin Usada,M,2008
1,4885331907664776,Gandi Widodo,F,1997
2,4996225095064169,Emong Wastuti,F,1992
3,639099174703,Surya Wacana,F,1978
4,570928206772,Embuh Mardhiyah,M,1982
...,...,...,...,...
1995,4685818286724028395,Kamila Mahendra,F,2004
1996,6502902290603767,Titi Siregar,M,1974
1997,213159426675861,drg. Zahra Nashiruddin,F,1976
1998,377840859133591,Ana Agustina,M,1976


In [290]:
dfPayCards['payCardIDTable'].apply(lambda x: len(str(x))).value_counts()

16    984
15    372
12    171
19    165
14    155
13    153
Name: payCardIDTable, dtype: int64

In [293]:
dfPayCards.insert(1,'payCardBank',np.where(dfPayCards['payCardIDTable'].apply(lambda x: len(str(x))) == 16, 'dki', ''))
dfPayCards['payCardBank'] = np.where(dfPayCards['payCardIDTable'].apply(lambda x: len(str(x))) == 15, 'emoney', dfPayCards['payCardBank'])
dfPayCards['payCardBank'] = np.where(dfPayCards['payCardIDTable'].apply(lambda x: len(str(x))) == 12, 'flazz', dfPayCards['payCardBank'])
dfPayCards['payCardBank'] = np.where(dfPayCards['payCardIDTable'].apply(lambda x: len(str(x))) == 19, 'brizzi', dfPayCards['payCardBank'])
dfPayCards['payCardBank'] = np.where(dfPayCards['payCardIDTable'].apply(lambda x: len(str(x))) == 14, 'bni', dfPayCards['payCardBank'])
dfPayCards['payCardBank'] = np.where(dfPayCards['payCardIDTable'].apply(lambda x: len(str(x))) == 13, 'online', dfPayCards['payCardBank'])
dfPayCards


Unnamed: 0,payCardIDTable,payCardBank,payCardNameTable,payCardSexTable,payCardBirthDate
0,180062659848800,emoney,Bajragin Usada,M,2008
1,4885331907664776,dki,Gandi Widodo,F,1997
2,4996225095064169,dki,Emong Wastuti,F,1992
3,639099174703,flazz,Surya Wacana,F,1978
4,570928206772,flazz,Embuh Mardhiyah,M,1982
...,...,...,...,...,...
1995,4685818286724028395,brizzi,Kamila Mahendra,F,2004
1996,6502902290603767,dki,Titi Siregar,M,1974
1997,213159426675861,emoney,drg. Zahra Nashiruddin,F,1976
1998,377840859133591,emoney,Ana Agustina,M,1976


# **Create Master for route choice variance**
So we can define usual go and back route for each person

## **Create Master for Go Variance 1**

In [294]:
var1GoCorridorID = []
var1GoDirection = []
var1GoTrip = []
var1GoTapInStops = []
var1GoStopStartSeqList = []
var1GoTapOutStops = []
var1GoStopEndSeqList = []

routeOptions = list(dfRoutes['route_id'].unique())

for _ in range(2000) :
    routeID = rd.choice(routeOptions)
    var1GoCorridorID.append(routeID)
    tripID = rd.choice(dfRoutes[dfRoutes['route_id'] == routeID]['trip_id'].to_list())
    var1GoTrip.append(tripID)
    directionID = dfRoutes[dfRoutes['trip_id'] == tripID]['direction_id'].values[0]
    var1GoDirection.append(directionID)
    stopSeq = dfRoutes[(dfRoutes['trip_id'] == tripID)]['stop_sequence'].sort_values().to_list()
    stopStartSeq = rd.choice(stopSeq[:-1])
    var1GoStopStartSeqList.append(stopStartSeq)
    stopStartID = dfRoutes[(dfRoutes['trip_id'] == tripID) & (dfRoutes['stop_sequence'] == stopStartSeq)]['stop_id'].to_list()[0]
    var1GoTapInStops.append(stopStartID)
    stopEndSeq = rd.choice([seq for seq in stopSeq if seq > stopStartSeq])
    var1GoStopEndSeqList.append(stopEndSeq)
    stopEndID = dfRoutes[(dfRoutes['trip_id'] == tripID) & (dfRoutes['stop_sequence'] == stopEndSeq)]['stop_id'].to_list()[0]
    var1GoTapOutStops.append(stopEndID)

dfPayCards['var1GoCorridorID'] = var1GoCorridorID
dfPayCards['var1GoDirection'] = var1GoDirection
dfPayCards['var1GoTrip'] = var1GoTrip
dfPayCards['var1GoTapInStops'] = var1GoTapInStops
dfPayCards['var1GoStopStartSeqList'] = var1GoStopStartSeqList
dfPayCards['var1GoTapOutStops'] = var1GoTapOutStops
dfPayCards['var1GoStopEndSeqList'] = var1GoStopEndSeqList
dfPayCards

Unnamed: 0,payCardIDTable,payCardBank,payCardNameTable,payCardSexTable,payCardBirthDate,var1GoCorridorID,var1GoDirection,var1GoTrip,var1GoTapInStops,var1GoStopStartSeqList,var1GoTapOutStops,var1GoStopEndSeqList
0,180062659848800,emoney,Bajragin Usada,M,2008,5,1.0,5-R04,P00142,7,P00253,12
1,4885331907664776,dki,Gandi Widodo,F,1997,6C,0.0,6C-R01,B01963P,13,B03307P,21
2,4996225095064169,dki,Emong Wastuti,F,1992,R1A,0.0,R1A-R01,B00499P,38,B04962P,39
3,639099174703,flazz,Surya Wacana,F,1978,11D,0.0,11D-R01,B05587P,23,B03090P,29
4,570928206772,flazz,Embuh Mardhiyah,M,1982,12,0.0,12-R03,P00239,5,P00098,15
...,...,...,...,...,...,...,...,...,...,...,...,...
1995,4685818286724028395,brizzi,Kamila Mahendra,F,2004,JAK.75,1.0,JAK.75-R02,B05153P,11,B01247P,18
1996,6502902290603767,dki,Titi Siregar,M,1974,JAK.118,1.0,JAK.118-R02,B06081P,30,B06681P,33
1997,213159426675861,emoney,drg. Zahra Nashiruddin,F,1976,6T,0.0,6T-R01,B00245P,31,B06735P,37
1998,377840859133591,emoney,Ana Agustina,M,1976,8E,0.0,8E-R01,B02008P,11,B05454P,22


## **Create Master for Back Variance 1**

In [295]:
var1BackCorridorID = []
var1BackDirection = []
var1BackTrip = []
var1BackTapInStops = []
var1BackStopStartSeqList = []
var1BackTapOutStops = []
var1BackStopEndSeqList = []

routeOptions = list(dfRoutes['route_id'].unique())

for i in range(2000) :
    routeID = dfPayCards.iloc[i,5]
    var1BackCorridorID.append(routeID)
    directionID = 0 if dfPayCards.iloc[i,6] == 1 else 1
    var1BackDirection.append(directionID)
    tripID = rd.choice(dfRoutes[(dfRoutes['route_id'] == routeID) & (dfRoutes['direction_id'] == directionID)]['trip_id'].to_list())
    var1BackTrip.append(tripID)
    stopSeq = dfRoutes[(dfRoutes['trip_id'] == tripID)]['stop_sequence'].sort_values().to_list()
    stopStartSeq = rd.choice(stopSeq[:-1])
    var1BackStopStartSeqList.append(stopStartSeq)
    stopStartID = dfRoutes[(dfRoutes['trip_id'] == tripID) & (dfRoutes['stop_sequence'] == stopStartSeq)]['stop_id'].to_list()[0]
    var1BackTapInStops.append(stopStartID)
    stopEndSeq = rd.choice([seq for seq in stopSeq if seq > stopStartSeq])
    var1BackStopEndSeqList.append(stopEndSeq)
    stopEndID = dfRoutes[(dfRoutes['trip_id'] == tripID) & (dfRoutes['stop_sequence'] == stopEndSeq)]['stop_id'].to_list()[0]
    var1BackTapOutStops.append(stopEndID)

dfPayCards['var1BackCorridorID'] = var1BackCorridorID
dfPayCards['var1BackDirection'] = var1BackDirection
dfPayCards['var1BackTrip'] = var1BackTrip
dfPayCards['var1BackTapInStops'] = var1BackTapInStops
dfPayCards['var1BackStopStartSeqList'] = var1BackStopStartSeqList
dfPayCards['var1BackTapOutStops'] = var1BackTapOutStops
dfPayCards['var1BackStopEndSeqList'] = var1BackStopEndSeqList
dfPayCards

Unnamed: 0,payCardIDTable,payCardBank,payCardNameTable,payCardSexTable,payCardBirthDate,var1GoCorridorID,var1GoDirection,var1GoTrip,var1GoTapInStops,var1GoStopStartSeqList,var1GoTapOutStops,var1GoStopEndSeqList,var1BackCorridorID,var1BackDirection,var1BackTrip,var1BackTapInStops,var1BackStopStartSeqList,var1BackTapOutStops,var1BackStopEndSeqList
0,180062659848800,emoney,Bajragin Usada,M,2008,5,1.0,5-R04,P00142,7,P00253,12,5,0,5-R03,P00021,9,P00140,13
1,4885331907664776,dki,Gandi Widodo,F,1997,6C,0.0,6C-R01,B01963P,13,B03307P,21,6C,1,6C-R02,B03318P,10,P00235,11
2,4996225095064169,dki,Emong Wastuti,F,1992,R1A,0.0,R1A-R01,B00499P,38,B04962P,39,R1A,1,R1A-R02,B02389P,30,B04990P,31
3,639099174703,flazz,Surya Wacana,F,1978,11D,0.0,11D-R01,B05587P,23,B03090P,29,11D,1,11D-R02,B02947P,7,B02548P,18
4,570928206772,flazz,Embuh Mardhiyah,M,1982,12,0.0,12-R03,P00239,5,P00098,15,12,1,12-R04,P00241,15,P00272,17
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,4685818286724028395,brizzi,Kamila Mahendra,F,2004,JAK.75,1.0,JAK.75-R02,B05153P,11,B01247P,18,JAK.75,0,JAK.75-R01,B00236P,9,B01090P,28
1996,6502902290603767,dki,Titi Siregar,M,1974,JAK.118,1.0,JAK.118-R02,B06081P,30,B06681P,33,JAK.118,0,JAK.118-R01,B06692P,20,B06696P,25
1997,213159426675861,emoney,drg. Zahra Nashiruddin,F,1976,6T,0.0,6T-R01,B00245P,31,B06735P,37,6T,1,6T-R02,B00035P,23,B06749P,26
1998,377840859133591,emoney,Ana Agustina,M,1976,8E,0.0,8E-R01,B02008P,11,B05454P,22,8E,1,8E-R02,B02940P,18,B00146P,22


# **Create a class or function for generating the dummy data transaction**
*in a sequential ways with certain rules*

In [298]:
class TransportProvider(BaseProvider) :
    def __init__(self, generator):
        super().__init__(generator)
        self.dfRoute = self.load_dfRoute()
        self.dfPayCard = self.load_dfPayCard()

    def load_dfRoute(self):
        dfRouter = dfRoutes
        return dfRouter

    def load_dfPayCard(self):
        dfPayCardr = dfPayCards
        return dfPayCardr
    
    def routeDefiner(self,dfcust=None,tanggal=[2023,5,1],waktu=[7,9,2,0],notNA=0,goBack='random') :
        dummy = Faker(locale='id_ID')
        dfRoute = self.dfRoute
        if dfcust == None :
            dfPayCard = self.dfPayCard
        else :
            dfPayCard = self.dfPayCard.iloc[dfcust[0]:dfcust[1]]

        transID = []
        payCardID = []
        payCardBank = []
        payCardName = []
        payCardSex = []
        payCardBirthDate = []
        corridorID = []
        corridorName = []
        direction = []
        tapInStops = []
        tapInStopsName = []
        tapInStopsLat = []
        tapInStopsLon = []
        tapInTime = []
        tapOutStops = []
        tapOutStopsName = []
        tapOutStopsLat = []
        tapOutStopsLon = []
        tapOutTime = []
        payAmount = []
        stopStartSeqList = []
        stopEndSeqList = []
        
        for i in range(len(dfPayCard)) :
            # TRANSid
            transID.append(dummy.pystr_format(string_format='????###?#?##??').upper())

            # PROFILE
            payCardID.append(dfPayCard.iloc[i,0])
            payCardBank.append(dfPayCard.iloc[i,1])
            payCardName.append(dfPayCard.iloc[i,2])
            payCardSex.append(dfPayCard.iloc[i,3])
            payCardBirthDate.append(dfPayCard.iloc[i,4])

            # ROUTE
            if notNA == 0 :
                routeOptions = list(dfRoute['route_id'].unique())
            elif notNA == 1 :
                routeOptions = list(dfRoute[dfRoute['fare_id'].notna()]['route_id'].unique())
            elif notNA == 2 :
                routeOptions = list(dfRoute[dfRoute['fare_id'].isna()]['route_id'].unique())
            elif isinstance(notNA ,str) :
                routeOptions = [notNA]
            if goBack == 'random' :
                routeID = rd.choice(routeOptions)
                tripID = rd.choice(dfRoute[dfRoute['route_id'] == routeID]['trip_id'].to_list())
                directionID = dfRoute[dfRoute['trip_id'] == tripID]['direction_id'].values[0]
                stopSeq = dfRoute[(dfRoute['trip_id'] == tripID)]['stop_sequence'].sort_values().to_list()
                stopStartSeq = rd.choice(stopSeq[:-1])
                stopStartID = dfRoute[(dfRoute['trip_id'] == tripID) & (dfRoute['stop_sequence'] == stopStartSeq)]['stop_id'].to_list()[0]
                stopEndSeq = rd.choice([seq for seq in stopSeq if seq > stopStartSeq])
                stopEndID = dfRoute[(dfRoute['trip_id'] == tripID) & (dfRoute['stop_sequence'] == stopEndSeq)]['stop_id'].to_list()[0]
            elif goBack == 'go' :
                routeID = dfPayCard.iloc[i,5]
                tripID = dfPayCard.iloc[i,7]
                directionID = dfPayCard.iloc[i,6]
                stopStartSeq = dfPayCard.iloc[i,9]
                stopStartID = dfPayCard.iloc[i,8]
                stopEndSeq = dfPayCard.iloc[i,11]
                stopEndID = dfPayCard.iloc[i,10]   
            elif goBack == 'back' :
                routeID = dfPayCard.iloc[i,12]
                tripID = dfPayCard.iloc[i,14]
                directionID = dfPayCard.iloc[i,13]
                stopStartSeq = dfPayCard.iloc[i,-3]
                stopStartID = dfPayCard.iloc[i,-4]
                stopEndSeq = dfPayCard.iloc[i,-1]
                stopEndID = dfPayCard.iloc[i,-2]
            corridorID.append(routeID)
            corridorName.append(dfRoute[dfRoute['route_id'] == routeID]['route_long_name'].values[0])
            direction.append(directionID)
            stopStartSeqList.append(stopStartSeq)
            tapInStops.append(stopStartID)
            tapInStopsName.append(dfRoute[dfRoute['stop_id'] == stopStartID]['stop_name'].values[0])
            tapInStopsLat.append(dfRoute[dfRoute['stop_id'] == stopStartID]['stop_lat'].values[0])
            tapInStopsLon.append(dfRoute[dfRoute['stop_id'] == stopStartID]['stop_lon'].values[0])
            stopEndSeqList.append(stopEndSeq)
            tapOutStops.append(stopEndID)
            tapOutStopsName.append(dfRoute[dfRoute['stop_id'] == stopEndID]['stop_name'].values[0])
            tapOutStopsLat.append(dfRoute[dfRoute['stop_id'] == stopEndID]['stop_lat'].values[0])
            tapOutStopsLon.append(dfRoute[dfRoute['stop_id'] == stopEndID]['stop_lon'].values[0])

            # TIME
            # lastDay = calendar.monthrange(tanggal[0], tanggal[1])[1]
            # day = rd.randint(1, lastDay)
            startDatetime = datetime(tanggal[0], tanggal[1], tanggal[2], waktu[0], 0, 0)
            endDatetime = datetime(tanggal[0], tanggal[1], tanggal[2], waktu[1], 0, 0) - timedelta(seconds=1)
            stopStartTime = dummy.date_time_between_dates(startDatetime, endDatetime)
            tapInTime.append(stopStartTime)
            maxDuration = timedelta(hours=waktu[2])
            minDuration = timedelta(minutes=waktu[3])
            stopEndTime = dummy.date_time_between_dates(stopStartTime + minDuration, stopStartTime + maxDuration)
            tapOutTime.append(stopEndTime)

            # PAYamount
            payAmount.append(dfRoute[dfRoute['route_id'] == routeID]['price'].values[0])

        dfRouteDefiner = pd.DataFrame(zip(transID,payCardID,payCardBank,payCardName,payCardSex,payCardBirthDate,
                                          corridorID,corridorName,direction,
                                          tapInStops,tapInStopsName,tapInStopsLat,tapInStopsLon,stopStartSeqList,tapInTime,
                                          tapOutStops,tapOutStopsName,tapOutStopsLat,tapOutStopsLon,stopEndSeqList,tapOutTime,payAmount),
                                      columns = ['transID','payCardID','payCardBank','payCardName','payCardSex','payCardBirthDate',
                                                 'corridorID','corridorName','direction',
                                                 'tapInStops','tapInStopsName','tapInStopsLat','tapInStopsLon','stopStartSeq','tapInTime',
                                                 'tapOutStops','tapOutStopsName','tapOutStopsLat','tapOutStopsLon','stopEndSeq','tapOutTime','payAmount'])

        return dfRouteDefiner

fake.add_provider(TransportProvider)

In [299]:
# TEST RUN
test = fake.routeDefiner(dfcust=[0,100],tanggal=[2023,4,1],waktu=[5,7,2,60],goBack='back')
display(test,
        test[test['payCardID'] == '6011454241671356'])

Unnamed: 0,transID,payCardID,payCardBank,payCardName,payCardSex,payCardBirthDate,corridorID,corridorName,direction,tapInStops,tapInStopsName,tapInStopsLat,tapInStopsLon,stopStartSeq,tapInTime,tapOutStops,tapOutStopsName,tapOutStopsLat,tapOutStopsLon,stopEndSeq,tapOutTime,payAmount
0,AQLI340G0Q13MN,180062659848800,emoney,Bajragin Usada,M,2008,5,Matraman Baru - Ancol,0,P00021,Budi Utomo,-6.166063,106.839060,9,2023-04-01 06:20:57,P00140,Pademangan,-6.133704,106.83164,13,2023-04-01 07:46:24,3500.0
1,ZUGV329H9C68JD,4885331907664776,dki,Gandi Widodo,F,1997,6C,Stasiun Tebet - Karet via Patra Kuningan,1,B03318P,Sawo Kecik Raya,-6.224852,106.854890,10,2023-04-01 06:45:31,P00235,Stasiun Tebet,-6.225503,106.85814,11,2023-04-01 08:03:39,3500.0
2,CPRP329G5P74KI,4996225095064169,dki,Emong Wastuti,F,1992,R1A,Pantai Maju - Kota,1,B02389P,Mayang Permai,-6.116421,106.759630,30,2023-04-01 05:54:07,B04990P,Simpang Mandara Permai 2,-6.116512,106.75622,31,2023-04-01 07:31:21,3500.0
3,EOCY385E5C12HR,639099174703,flazz,Surya Wacana,F,1978,11D,Pulo Gebang - Pulo Gadung 2 via PIK,1,B02947P,Pool Taksi,-6.188631,106.934260,7,2023-04-01 06:44:54,B02548P,Pajak Dan Retribusi Cakung,-6.205256,106.93120,18,2023-04-01 07:51:44,3500.0
4,IPHW450W9A13US,570928206772,flazz,Embuh Mardhiyah,M,1982,12,Tanjung Priok - Pluit,1,P00241,Sunter Kelapa Gading Arah Utara,-6.142900,106.890760,15,2023-04-01 06:20:53,P00272,Walikota Jakarta Utara Arah Utara,-6.118842,106.89314,17,2023-04-01 07:54:29,3500.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,TPHD621G6C12YN,502072062834,flazz,Kalim Namaga,M,1971,9D,Pasar Minggu - Tanah Abang,0,B03153P,RS Medistra,-6.240411,106.834120,25,2023-04-01 05:22:19,B05931P,YBK,-6.270872,106.84660,39,2023-04-01 06:59:52,3500.0
96,HTTR792X7D01KW,30385017224303,bni,"dr. Vega Rahimah, S.Pt",M,2003,JAK.117,Tanjung Priok - Tanah Merdeka,1,B01056P,Jln. Elpa Putih II,-6.110053,106.887210,28,2023-04-01 06:43:26,B05728P,Term. Tj. Priok 1,-6.109800,106.88118,30,2023-04-01 08:36:24,0.0
97,JEXC673Q7B99TK,6590854673757012,dki,Bajragin Najmudin,M,1986,3H,Jelambar - Kota,1,P00294,Sawah Besar Arah Selatan,-6.157368,106.818661,4,2023-04-01 06:10:58,P00292,Harmoni Arah Selatan,-6.162687,106.81992,5,2023-04-01 07:16:24,
98,GDQN413G2M81WY,4963710913075879668,brizzi,Ophelia Firmansyah,M,2000,14,Jakarta International Stadium - Senen,1,B03885P,Sbr. Jembatan Sukamulya III,-6.170489,106.852050,3,2023-04-01 06:47:02,B06067P,Sunter Muara 1,-6.144331,106.85745,10,2023-04-01 07:47:27,3500.0


Unnamed: 0,transID,payCardID,payCardBank,payCardName,payCardSex,payCardBirthDate,corridorID,corridorName,direction,tapInStops,tapInStopsName,tapInStopsLat,tapInStopsLon,stopStartSeq,tapInTime,tapOutStops,tapOutStopsName,tapOutStopsLat,tapOutStopsLon,stopEndSeq,tapOutTime,payAmount


# **Creating Transaction Variances**

## 300 People that constantly go and back during weekdays MORNING - EVENING

In [301]:
dfTransaction = pd.DataFrame()
for i in range(3,8) :
    for t,g in zip([[5,7,1,20],[16,18,2,50]],['go','back']) :
        transaction = fake.routeDefiner(dfcust=[0,300],tanggal=[2023,4,i],waktu=t,goBack=g)
        dfTransaction = pd.concat([dfTransaction,transaction])
for i in range(10,15) :
    for t,g in zip([[5,7,1,20],[16,18,2,50]],['go','back']) :
        transaction = fake.routeDefiner(dfcust=[0,300],tanggal=[2023,4,i],waktu=t,goBack=g)
        dfTransaction = pd.concat([dfTransaction,transaction])
for i in range(17,22) :
    for t,g in zip([[5,7,1,20],[16,18,2,50]],['go','back']) :
        transaction = fake.routeDefiner(dfcust=[0,300],tanggal=[2023,4,i],waktu=t,goBack=g)
        dfTransaction = pd.concat([dfTransaction,transaction])
for i in range(24,29) :
    for t,g in zip([[5,7,1,20],[16,18,2,50]],['go','back']) :
        transaction = fake.routeDefiner(dfcust=[0,300],tanggal=[2023,4,i],waktu=t,goBack=g)
        dfTransaction = pd.concat([dfTransaction,transaction])
        
display(dfTransaction,
        dfTransaction[dfTransaction['payCardID'] == '4996225095064169'])

Unnamed: 0,transID,payCardID,payCardBank,payCardName,payCardSex,payCardBirthDate,corridorID,corridorName,direction,tapInStops,tapInStopsName,tapInStopsLat,tapInStopsLon,stopStartSeq,tapInTime,tapOutStops,tapOutStopsName,tapOutStopsLat,tapOutStopsLon,stopEndSeq,tapOutTime,payAmount
0,EIIW227B8L34VB,180062659848800,emoney,Bajragin Usada,M,2008,5,Matraman Baru - Ancol,1.0,P00142,Pal Putih,-6.184631,106.84402,7,2023-04-03 05:21:44,P00253,Tegalan,-6.203101,106.85715,12,2023-04-03 06:00:53,3500.0
1,LGXO740D2N47GZ,4885331907664776,dki,Gandi Widodo,F,1997,6C,Stasiun Tebet - Karet via Patra Kuningan,0.0,B01963P,Kemenkes 2,-6.228700,106.83302,13,2023-04-03 05:42:44,B03307P,Sampoerna Strategic,-6.217152,106.81892,21,2023-04-03 06:40:01,3500.0
2,DJWR385V2U57TO,4996225095064169,dki,Emong Wastuti,F,1992,R1A,Pantai Maju - Kota,0.0,B00499P,Gg. Kunir II,-6.133132,106.81435,38,2023-04-03 05:59:06,B04962P,Simpang Kunir Kemukus,-6.133731,106.81475,39,2023-04-03 06:50:55,3500.0
3,JTUZ800U7C86EH,639099174703,flazz,Surya Wacana,F,1978,11D,Pulo Gebang - Pulo Gadung 2 via PIK,0.0,B05587P,Taman Elok 1,-6.195743,106.93526,23,2023-04-03 05:44:51,B03090P,Raya Penggilingan,-6.183068,106.93194,29,2023-04-03 06:28:16,3500.0
4,VMLO535V7F95NJ,570928206772,flazz,Embuh Mardhiyah,M,1982,12,Tanjung Priok - Pluit,0.0,P00239,Sunter Boulevard Barat,-6.149650,106.88900,5,2023-04-03 06:17:35,P00098,Kali Besar Barat,-6.135355,106.81143,15,2023-04-03 06:57:03,3500.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,XVCZ091B3T18CE,4582951929146910,dki,H. Jarwadi Suartini,M,2002,JAK.11,Tanah Abang - Kebayoran Lama,0.0,B02972P,Pospol Jati Bunder,-6.189707,106.81317,8,2023-04-28 17:04:04,B03151P,RS Medika Permata Hijau 4,-6.218438,106.77855,38,2023-04-28 18:18:35,0.0
296,BMJC637T5M91KP,180005992866880,emoney,R.A. Puti Permadi,M,1987,9D,Pasar Minggu - Tanah Abang,1.0,B03183P,RS Tria Dipa,-6.250297,106.84261,13,2023-04-28 17:12:51,B02972P,Pospol Jati Bunder,-6.189707,106.81317,46,2023-04-28 18:50:55,3500.0
297,AJSH165D3N08EF,4684821109015770,dki,Paramita Budiyanto,F,1997,8C,Kebayoran Lama - Tanah Abang,0.0,B01974P,Kesatrian Kowal Cut Nyak Dhien,-6.194307,106.81436,22,2023-04-28 17:50:36,B02972P,Pospol Jati Bunder,-6.189707,106.81317,27,2023-04-28 18:48:43,3500.0
298,EPPR716E5Q34DG,213146091952345,emoney,Raina Kusmawati,F,1996,JAK.43B,Tongtek - Tebet Eco Park (TEP) - Cililitan,1.0,B01795P,Kalibata City 2,-6.255816,106.85237,6,2023-04-28 17:45:40,B05517P,STEKPI,-6.255574,106.84966,8,2023-04-28 19:36:12,0.0


Unnamed: 0,transID,payCardID,payCardBank,payCardName,payCardSex,payCardBirthDate,corridorID,corridorName,direction,tapInStops,tapInStopsName,tapInStopsLat,tapInStopsLon,stopStartSeq,tapInTime,tapOutStops,tapOutStopsName,tapOutStopsLat,tapOutStopsLon,stopEndSeq,tapOutTime,payAmount
2,DJWR385V2U57TO,4996225095064169,dki,Emong Wastuti,F,1992,R1A,Pantai Maju - Kota,0.0,B00499P,Gg. Kunir II,-6.133132,106.81435,38,2023-04-03 05:59:06,B04962P,Simpang Kunir Kemukus,-6.133731,106.81475,39,2023-04-03 06:50:55,3500.0
2,OYSZ254V7M55NZ,4996225095064169,dki,Emong Wastuti,F,1992,R1A,Pantai Maju - Kota,1.0,B02389P,Mayang Permai,-6.116421,106.75963,30,2023-04-03 17:36:54,B04990P,Simpang Mandara Permai 2,-6.116512,106.75622,31,2023-04-03 18:31:18,3500.0
2,DYQE022T3Z19UI,4996225095064169,dki,Emong Wastuti,F,1992,R1A,Pantai Maju - Kota,0.0,B00499P,Gg. Kunir II,-6.133132,106.81435,38,2023-04-04 06:25:26,B04962P,Simpang Kunir Kemukus,-6.133731,106.81475,39,2023-04-04 06:53:35,3500.0
2,ZTFJ102T6Y79RB,4996225095064169,dki,Emong Wastuti,F,1992,R1A,Pantai Maju - Kota,1.0,B02389P,Mayang Permai,-6.116421,106.75963,30,2023-04-04 16:33:32,B04990P,Simpang Mandara Permai 2,-6.116512,106.75622,31,2023-04-04 18:11:15,3500.0
2,THCL240K4U93FD,4996225095064169,dki,Emong Wastuti,F,1992,R1A,Pantai Maju - Kota,0.0,B00499P,Gg. Kunir II,-6.133132,106.81435,38,2023-04-05 05:05:13,B04962P,Simpang Kunir Kemukus,-6.133731,106.81475,39,2023-04-05 05:37:14,3500.0
2,TIZD791H3W95RT,4996225095064169,dki,Emong Wastuti,F,1992,R1A,Pantai Maju - Kota,1.0,B02389P,Mayang Permai,-6.116421,106.75963,30,2023-04-05 16:02:16,B04990P,Simpang Mandara Permai 2,-6.116512,106.75622,31,2023-04-05 17:42:21,3500.0
2,SLAN506W3P81TR,4996225095064169,dki,Emong Wastuti,F,1992,R1A,Pantai Maju - Kota,0.0,B00499P,Gg. Kunir II,-6.133132,106.81435,38,2023-04-06 06:07:45,B04962P,Simpang Kunir Kemukus,-6.133731,106.81475,39,2023-04-06 06:39:02,3500.0
2,LSIY645U7O81WC,4996225095064169,dki,Emong Wastuti,F,1992,R1A,Pantai Maju - Kota,1.0,B02389P,Mayang Permai,-6.116421,106.75963,30,2023-04-06 16:25:56,B04990P,Simpang Mandara Permai 2,-6.116512,106.75622,31,2023-04-06 17:19:03,3500.0
2,NSNZ553O6V53VD,4996225095064169,dki,Emong Wastuti,F,1992,R1A,Pantai Maju - Kota,0.0,B00499P,Gg. Kunir II,-6.133132,106.81435,38,2023-04-07 06:35:01,B04962P,Simpang Kunir Kemukus,-6.133731,106.81475,39,2023-04-07 07:26:58,3500.0
2,MHHC829X5I17IK,4996225095064169,dki,Emong Wastuti,F,1992,R1A,Pantai Maju - Kota,1.0,B02389P,Mayang Permai,-6.116421,106.75963,30,2023-04-07 17:32:41,B04990P,Simpang Mandara Permai 2,-6.116512,106.75622,31,2023-04-07 18:23:09,3500.0


## 500 people that constantly go and back during weekdays LATE MORING - LATE NIGHT

In [302]:
for i in range(3,8) :
    for t,g in zip([[6,10,2,20],[17,22,2,50]],['go','back']) :
        transaction = fake.routeDefiner(dfcust=[300,800],tanggal=[2023,4,i],waktu=t,goBack=g)
        dfTransaction = pd.concat([dfTransaction,transaction])
for i in range(10,15) :
    for t,g in zip([[6,10,2,20],[17,22,2,50]],['go','back']) :
        transaction = fake.routeDefiner(dfcust=[300,800],tanggal=[2023,4,i],waktu=t,goBack=g)
        dfTransaction = pd.concat([dfTransaction,transaction])
for i in range(17,22) :
    for t,g in zip([[6,10,2,20],[17,22,2,50]],['go','back']) :
        transaction = fake.routeDefiner(dfcust=[300,800],tanggal=[2023,4,i],waktu=t,goBack=g)
        dfTransaction = pd.concat([dfTransaction,transaction])
for i in range(24,29) :
    for t,g in zip([[6,10,2,20],[17,22,2,50]],['go','back']) :
        transaction = fake.routeDefiner(dfcust=[300,800],tanggal=[2023,4,i],waktu=t,goBack=g)
        dfTransaction = pd.concat([dfTransaction,transaction])
dfTransaction

Unnamed: 0,transID,payCardID,payCardBank,payCardName,payCardSex,payCardBirthDate,corridorID,corridorName,direction,tapInStops,tapInStopsName,tapInStopsLat,tapInStopsLon,stopStartSeq,tapInTime,tapOutStops,tapOutStopsName,tapOutStopsLat,tapOutStopsLon,stopEndSeq,tapOutTime,payAmount
0,EIIW227B8L34VB,180062659848800,emoney,Bajragin Usada,M,2008,5,Matraman Baru - Ancol,1.0,P00142,Pal Putih,-6.184631,106.84402,7,2023-04-03 05:21:44,P00253,Tegalan,-6.203101,106.85715,12,2023-04-03 06:00:53,3500.0
1,LGXO740D2N47GZ,4885331907664776,dki,Gandi Widodo,F,1997,6C,Stasiun Tebet - Karet via Patra Kuningan,0.0,B01963P,Kemenkes 2,-6.228700,106.83302,13,2023-04-03 05:42:44,B03307P,Sampoerna Strategic,-6.217152,106.81892,21,2023-04-03 06:40:01,3500.0
2,DJWR385V2U57TO,4996225095064169,dki,Emong Wastuti,F,1992,R1A,Pantai Maju - Kota,0.0,B00499P,Gg. Kunir II,-6.133132,106.81435,38,2023-04-03 05:59:06,B04962P,Simpang Kunir Kemukus,-6.133731,106.81475,39,2023-04-03 06:50:55,3500.0
3,JTUZ800U7C86EH,639099174703,flazz,Surya Wacana,F,1978,11D,Pulo Gebang - Pulo Gadung 2 via PIK,0.0,B05587P,Taman Elok 1,-6.195743,106.93526,23,2023-04-03 05:44:51,B03090P,Raya Penggilingan,-6.183068,106.93194,29,2023-04-03 06:28:16,3500.0
4,VMLO535V7F95NJ,570928206772,flazz,Embuh Mardhiyah,M,1982,12,Tanjung Priok - Pluit,0.0,P00239,Sunter Boulevard Barat,-6.149650,106.88900,5,2023-04-03 06:17:35,P00098,Kali Besar Barat,-6.135355,106.81143,15,2023-04-03 06:57:03,3500.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,VKAQ367F3C28ZY,4376541203354296,dki,Hafshah Permata,F,1985,JAK.29,Tanjung Priok - Sukapura,0.0,B01325P,Jln. Lorong 22,-6.108141,106.89932,42,2023-04-28 17:27:51,B05728P,Term. Tj. Priok 1,-6.109800,106.88118,50,2023-04-28 18:28:27,0.0
496,VXMV385H7O32BD,4949430360722628,dki,Fathonah Saefullah,F,1996,1Q,Rempoa - Blok M,1.0,B02373P,Masjid Pondok Indah 1,-6.261581,106.78234,14,2023-04-28 18:12:59,B01905P,Kel. Bintaro,-6.273047,106.76405,26,2023-04-28 20:07:00,3500.0
497,GZWU013Y7G06UB,213173837348075,emoney,"Ir. Emas Mandala, S.Sos",M,2012,11P,Rusun Pondok Bambu - Walikota Jakarta Timur,1.0,B06666P,Kodim 0505 1,-6.211958,106.94722,1,2023-04-28 20:42:31,B06665P,Pengadilan Negeri Jaktim 2,-6.212604,106.94082,4,2023-04-28 22:15:24,0.0
498,PQFW770X9O31SP,4857511536859376,dki,Laras Salahudin,F,2003,JAK.36,Cilangkap - Cililitan,0.0,B05110P,Simpang Raya Bogor Cililitan,-6.264191,106.86585,42,2023-04-28 18:46:53,B02637P,Ps. Kramat Jati 1,-6.269058,106.86691,44,2023-04-28 20:13:33,0.0


## 700 people once per Weekend

In [303]:
for i,k in zip(range(800,1500),range(801,1501)) :
    dateN = rd.randint(8,9)
    transaction = fake.routeDefiner(dfcust=[i,k],tanggal=[2023,4,dateN],waktu=[5,22,2,15])
    dfTransaction = pd.concat([dfTransaction,transaction])

for i,k in zip(range(800,1500),range(801,1501)) :
    dateN = rd.randint(15,16)
    transaction = fake.routeDefiner(dfcust=[i,k],tanggal=[2023,4,dateN],waktu=[5,22,2,15])
    dfTransaction = pd.concat([dfTransaction,transaction])

for i,k in zip(range(800,1500),range(801,1501)) :
    dateN = rd.randint(22,23)
    transaction = fake.routeDefiner(dfcust=[i,k],tanggal=[2023,4,dateN],waktu=[5,22,2,15])
    dfTransaction = pd.concat([dfTransaction,transaction])

for i,k in zip(range(800,1500),range(801,1501)) :
    dateN = rd.randint(29,30)
    transaction = fake.routeDefiner(dfcust=[i,k],tanggal=[2023,4,dateN],waktu=[5,22,2,15])
    dfTransaction = pd.concat([dfTransaction,transaction])

dfTransaction

Unnamed: 0,transID,payCardID,payCardBank,payCardName,payCardSex,payCardBirthDate,corridorID,corridorName,direction,tapInStops,tapInStopsName,tapInStopsLat,tapInStopsLon,stopStartSeq,tapInTime,tapOutStops,tapOutStopsName,tapOutStopsLat,tapOutStopsLon,stopEndSeq,tapOutTime,payAmount
0,EIIW227B8L34VB,180062659848800,emoney,Bajragin Usada,M,2008,5,Matraman Baru - Ancol,1.0,P00142,Pal Putih,-6.184631,106.84402,7,2023-04-03 05:21:44,P00253,Tegalan,-6.203101,106.85715,12,2023-04-03 06:00:53,3500.0
1,LGXO740D2N47GZ,4885331907664776,dki,Gandi Widodo,F,1997,6C,Stasiun Tebet - Karet via Patra Kuningan,0.0,B01963P,Kemenkes 2,-6.228700,106.83302,13,2023-04-03 05:42:44,B03307P,Sampoerna Strategic,-6.217152,106.81892,21,2023-04-03 06:40:01,3500.0
2,DJWR385V2U57TO,4996225095064169,dki,Emong Wastuti,F,1992,R1A,Pantai Maju - Kota,0.0,B00499P,Gg. Kunir II,-6.133132,106.81435,38,2023-04-03 05:59:06,B04962P,Simpang Kunir Kemukus,-6.133731,106.81475,39,2023-04-03 06:50:55,3500.0
3,JTUZ800U7C86EH,639099174703,flazz,Surya Wacana,F,1978,11D,Pulo Gebang - Pulo Gadung 2 via PIK,0.0,B05587P,Taman Elok 1,-6.195743,106.93526,23,2023-04-03 05:44:51,B03090P,Raya Penggilingan,-6.183068,106.93194,29,2023-04-03 06:28:16,3500.0
4,VMLO535V7F95NJ,570928206772,flazz,Embuh Mardhiyah,M,1982,12,Tanjung Priok - Pluit,0.0,P00239,Sunter Boulevard Barat,-6.149650,106.88900,5,2023-04-03 06:17:35,P00098,Kali Besar Barat,-6.135355,106.81143,15,2023-04-03 06:57:03,3500.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,LXUW913O4R70ED,4566988485083087,dki,Ana Anggraini,M,1973,5B,Stasiun Tebet - BNN,0.0,B01805P,Kampung Melayu Kecil,-6.224990,106.86306,2,2023-04-30 10:11:28,P00015,Bidara Cina,-6.229724,106.86721,9,2023-04-30 11:25:54,3500.0
0,KQSY277N1J01PJ,374240614589026,emoney,Raina Pradipta,F,1986,1P,Senen - Blok M,0.0,B05775P,Tosari 3,-6.200069,106.82335,20,2023-04-29 06:22:38,B05544P,Summitmas,-6.225826,106.80402,28,2023-04-29 08:08:59,3500.0
0,USBY114B7T73KE,6525432887576005,dki,Bagya Budiyanto,F,1991,M7B,BKN - Blok M,0.0,P00129,Mampang Prapatan,-6.242699,106.82572,17,2023-04-29 21:18:40,P00259,Tirtayasa,-6.239207,106.80777,22,2023-04-29 22:59:55,
0,DHEX181K6P51UV,4718462919104,online,Tgk. Shakila Prakasa,F,1988,JAK.112,Terminal Tanah Merah - Pulo Gadung,0.0,B05620P,Taman Maramba,-6.176230,106.90716,24,2023-04-30 09:03:50,B03000P,Pulo Gadung 5,-6.182228,106.90901,26,2023-04-30 09:45:23,0.0


## 200 random people anytime 2x go and back everyday at mid of the month

In [305]:
for i in range(13,20) :
    for t,g in zip([[5,14,2,20],[15,21,2,50]],['go','back']) :
        transaction = fake.routeDefiner(dfcust=[1500,1700],tanggal=[2023,4,i],waktu=t,goBack=g)
        dfTransaction = pd.concat([dfTransaction,transaction])
dfTransaction

Unnamed: 0,transID,payCardID,payCardBank,payCardName,payCardSex,payCardBirthDate,corridorID,corridorName,direction,tapInStops,tapInStopsName,tapInStopsLat,tapInStopsLon,stopStartSeq,tapInTime,tapOutStops,tapOutStopsName,tapOutStopsLat,tapOutStopsLon,stopEndSeq,tapOutTime,payAmount
0,EIIW227B8L34VB,180062659848800,emoney,Bajragin Usada,M,2008,5,Matraman Baru - Ancol,1.0,P00142,Pal Putih,-6.184631,106.844020,7,2023-04-03 05:21:44,P00253,Tegalan,-6.203101,106.85715,12,2023-04-03 06:00:53,3500.0
1,LGXO740D2N47GZ,4885331907664776,dki,Gandi Widodo,F,1997,6C,Stasiun Tebet - Karet via Patra Kuningan,0.0,B01963P,Kemenkes 2,-6.228700,106.833020,13,2023-04-03 05:42:44,B03307P,Sampoerna Strategic,-6.217152,106.81892,21,2023-04-03 06:40:01,3500.0
2,DJWR385V2U57TO,4996225095064169,dki,Emong Wastuti,F,1992,R1A,Pantai Maju - Kota,0.0,B00499P,Gg. Kunir II,-6.133132,106.814350,38,2023-04-03 05:59:06,B04962P,Simpang Kunir Kemukus,-6.133731,106.81475,39,2023-04-03 06:50:55,3500.0
3,JTUZ800U7C86EH,639099174703,flazz,Surya Wacana,F,1978,11D,Pulo Gebang - Pulo Gadung 2 via PIK,0.0,B05587P,Taman Elok 1,-6.195743,106.935260,23,2023-04-03 05:44:51,B03090P,Raya Penggilingan,-6.183068,106.93194,29,2023-04-03 06:28:16,3500.0
4,VMLO535V7F95NJ,570928206772,flazz,Embuh Mardhiyah,M,1982,12,Tanjung Priok - Pluit,0.0,P00239,Sunter Boulevard Barat,-6.149650,106.889000,5,2023-04-03 06:17:35,P00098,Kali Besar Barat,-6.135355,106.81143,15,2023-04-03 06:57:03,3500.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,SESN637R5X52CX,213119132205510,emoney,Atma Sirait,F,1988,JAK.39,Kalimalang - Duren Sawit,0.0,B05363P,SMPN 199,-6.230971,106.943090,23,2023-04-19 19:59:19,B02984P,PPKD Jaktim,-6.247777,106.94236,32,2023-04-19 21:46:46,0.0
196,EDDB868J0B37MK,4541880000962,online,Drs. Hani Pratama,F,2012,JAK.31,Blok M - Andara,0.0,B04766P,Simpang Cilandak Tengah Raya 1,-6.291253,106.801880,38,2023-04-19 18:51:40,B03817P,Sbr. Jln. Pinang IIIb,-6.308184,106.79797,49,2023-04-19 20:23:53,0.0
197,TXHF398R4I79OB,4321295069509396,dki,Gilda Kuswandari,F,1987,6T,Pasar Minggu - Velbak,1.0,B06740P,Sbr. Jln. Barito II,-6.248808,106.795903,7,2023-04-19 19:01:36,B03078P,Rawa Bambu,-6.294753,106.84078,34,2023-04-19 20:35:00,3500.0
198,HGDS800V6E65NQ,676371995694,flazz,"T. Tirta Andriani, S.Farm",F,1987,JAK.22,Dwikora - Penas Kalimalang,1.0,B02956P,Pos Jaga Trikora 1,-6.241638,106.879810,3,2023-04-19 15:48:47,B03321P,SD Angkasa 7,-6.250830,106.88358,7,2023-04-19 17:00:15,0.0


## 300 random people anytime once

In [306]:
for i,k in zip(range(1700,2000),range(1701,2001)) :
    dateN = rd.randint(1,30)
    transaction = fake.routeDefiner(dfcust=[i,k],tanggal=[2023,4,dateN],waktu=[5,22,3,20])
    dfTransaction = pd.concat([dfTransaction,transaction])
dfTransaction

Unnamed: 0,transID,payCardID,payCardBank,payCardName,payCardSex,payCardBirthDate,corridorID,corridorName,direction,tapInStops,tapInStopsName,tapInStopsLat,tapInStopsLon,stopStartSeq,tapInTime,tapOutStops,tapOutStopsName,tapOutStopsLat,tapOutStopsLon,stopEndSeq,tapOutTime,payAmount
0,EIIW227B8L34VB,180062659848800,emoney,Bajragin Usada,M,2008,5,Matraman Baru - Ancol,1.0,P00142,Pal Putih,-6.184631,106.84402,7,2023-04-03 05:21:44,P00253,Tegalan,-6.203101,106.85715,12,2023-04-03 06:00:53,3500.0
1,LGXO740D2N47GZ,4885331907664776,dki,Gandi Widodo,F,1997,6C,Stasiun Tebet - Karet via Patra Kuningan,0.0,B01963P,Kemenkes 2,-6.228700,106.83302,13,2023-04-03 05:42:44,B03307P,Sampoerna Strategic,-6.217152,106.81892,21,2023-04-03 06:40:01,3500.0
2,DJWR385V2U57TO,4996225095064169,dki,Emong Wastuti,F,1992,R1A,Pantai Maju - Kota,0.0,B00499P,Gg. Kunir II,-6.133132,106.81435,38,2023-04-03 05:59:06,B04962P,Simpang Kunir Kemukus,-6.133731,106.81475,39,2023-04-03 06:50:55,3500.0
3,JTUZ800U7C86EH,639099174703,flazz,Surya Wacana,F,1978,11D,Pulo Gebang - Pulo Gadung 2 via PIK,0.0,B05587P,Taman Elok 1,-6.195743,106.93526,23,2023-04-03 05:44:51,B03090P,Raya Penggilingan,-6.183068,106.93194,29,2023-04-03 06:28:16,3500.0
4,VMLO535V7F95NJ,570928206772,flazz,Embuh Mardhiyah,M,1982,12,Tanjung Priok - Pluit,0.0,P00239,Sunter Boulevard Barat,-6.149650,106.88900,5,2023-04-03 06:17:35,P00098,Kali Besar Barat,-6.135355,106.81143,15,2023-04-03 06:57:03,3500.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,ZWEC949B8Q87QG,4685818286724028395,brizzi,Kamila Mahendra,F,2004,6B,Ragunan - MH Thamrin via Semanggi,1.0,P00261,Tosari,-6.196892,106.82309,2,2023-04-21 18:18:37,P00228,SMK 57,-6.290967,106.82365,13,2023-04-21 19:55:49,3500.0
0,YHHK837P6Y95GN,6502902290603767,dki,Titi Siregar,M,1974,9N,Pinang Ranti - Pramuka,1.0,P00064,Garuda Taman Mini,-6.290154,106.88116,1,2023-04-18 21:52:31,P00179,Pinang Ranti,-6.291075,106.88634,2,2023-04-18 22:28:22,3500.0
0,YXPP627N4G95HO,213159426675861,emoney,drg. Zahra Nashiruddin,F,1976,1T,Cibubur - Balai Kota,1.0,B02873P,Plaza Sentral,-6.216247,106.81676,12,2023-04-04 10:29:47,B00226P,Buperta Cibubur,-6.370321,106.89628,14,2023-04-04 13:27:25,20000.0
0,RGVK175U2U98UV,377840859133591,emoney,Ana Agustina,M,1976,JAK.13,Tanah Abang - Jembatan Lima,1.0,B02505P,Museum Textile,-6.188656,106.80954,33,2023-04-15 19:59:26,B01787P,JPO Blok G,-6.188861,106.81135,34,2023-04-15 20:27:50,0.0


In [319]:
dfTransaction.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 37900 entries, 0 to 0
Data columns (total 22 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   transID           37900 non-null  object        
 1   payCardID         37900 non-null  object        
 2   payCardBank       37900 non-null  object        
 3   payCardName       37900 non-null  object        
 4   payCardSex        37900 non-null  object        
 5   payCardBirthDate  37900 non-null  int64         
 6   corridorID        37900 non-null  object        
 7   corridorName      37900 non-null  object        
 8   direction         37900 non-null  float64       
 9   tapInStops        37900 non-null  object        
 10  tapInStopsName    37900 non-null  object        
 11  tapInStopsLat     37900 non-null  float64       
 12  tapInStopsLon     37900 non-null  float64       
 13  stopStartSeq      37900 non-null  int64         
 14  tapInTime         37900 no

# **Creating randomize NaN Object**

In [334]:
dfTransNA = dfTransaction.copy()
dfTransNA.reset_index(drop=True,inplace=True)
dfTransNA

Unnamed: 0,transID,payCardID,payCardBank,payCardName,payCardSex,payCardBirthDate,corridorID,corridorName,direction,tapInStops,tapInStopsName,tapInStopsLat,tapInStopsLon,stopStartSeq,tapInTime,tapOutStops,tapOutStopsName,tapOutStopsLat,tapOutStopsLon,stopEndSeq,tapOutTime,payAmount
0,EIIW227B8L34VB,180062659848800,emoney,Bajragin Usada,M,2008,5,Matraman Baru - Ancol,1.0,P00142,Pal Putih,-6.184631,106.84402,7,2023-04-03 05:21:44,P00253,Tegalan,-6.203101,106.85715,12,2023-04-03 06:00:53,3500.0
1,LGXO740D2N47GZ,4885331907664776,dki,Gandi Widodo,F,1997,6C,Stasiun Tebet - Karet via Patra Kuningan,0.0,B01963P,Kemenkes 2,-6.228700,106.83302,13,2023-04-03 05:42:44,B03307P,Sampoerna Strategic,-6.217152,106.81892,21,2023-04-03 06:40:01,3500.0
2,DJWR385V2U57TO,4996225095064169,dki,Emong Wastuti,F,1992,R1A,Pantai Maju - Kota,0.0,B00499P,Gg. Kunir II,-6.133132,106.81435,38,2023-04-03 05:59:06,B04962P,Simpang Kunir Kemukus,-6.133731,106.81475,39,2023-04-03 06:50:55,3500.0
3,JTUZ800U7C86EH,639099174703,flazz,Surya Wacana,F,1978,11D,Pulo Gebang - Pulo Gadung 2 via PIK,0.0,B05587P,Taman Elok 1,-6.195743,106.93526,23,2023-04-03 05:44:51,B03090P,Raya Penggilingan,-6.183068,106.93194,29,2023-04-03 06:28:16,3500.0
4,VMLO535V7F95NJ,570928206772,flazz,Embuh Mardhiyah,M,1982,12,Tanjung Priok - Pluit,0.0,P00239,Sunter Boulevard Barat,-6.149650,106.88900,5,2023-04-03 06:17:35,P00098,Kali Besar Barat,-6.135355,106.81143,15,2023-04-03 06:57:03,3500.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37895,ZWEC949B8Q87QG,4685818286724028395,brizzi,Kamila Mahendra,F,2004,6B,Ragunan - MH Thamrin via Semanggi,1.0,P00261,Tosari,-6.196892,106.82309,2,2023-04-21 18:18:37,P00228,SMK 57,-6.290967,106.82365,13,2023-04-21 19:55:49,3500.0
37896,YHHK837P6Y95GN,6502902290603767,dki,Titi Siregar,M,1974,9N,Pinang Ranti - Pramuka,1.0,P00064,Garuda Taman Mini,-6.290154,106.88116,1,2023-04-18 21:52:31,P00179,Pinang Ranti,-6.291075,106.88634,2,2023-04-18 22:28:22,3500.0
37897,YXPP627N4G95HO,213159426675861,emoney,drg. Zahra Nashiruddin,F,1976,1T,Cibubur - Balai Kota,1.0,B02873P,Plaza Sentral,-6.216247,106.81676,12,2023-04-04 10:29:47,B00226P,Buperta Cibubur,-6.370321,106.89628,14,2023-04-04 13:27:25,20000.0
37898,RGVK175U2U98UV,377840859133591,emoney,Ana Agustina,M,1976,JAK.13,Tanah Abang - Jembatan Lima,1.0,B02505P,Museum Textile,-6.188656,106.80954,33,2023-04-15 19:59:26,B01787P,JPO Blok G,-6.188861,106.81135,34,2023-04-15 20:27:50,0.0


In [335]:
dfTransNA['tapInTime'] = dfTransNA['tapInTime'].astype('object')
dfTransNA['tapOutTime'] = dfTransNA['tapOutTime'].astype('object')
display(dfTransNA.info(),
        dfTransNA)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37900 entries, 0 to 37899
Data columns (total 22 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   transID           37900 non-null  object 
 1   payCardID         37900 non-null  object 
 2   payCardBank       37900 non-null  object 
 3   payCardName       37900 non-null  object 
 4   payCardSex        37900 non-null  object 
 5   payCardBirthDate  37900 non-null  int64  
 6   corridorID        37900 non-null  object 
 7   corridorName      37900 non-null  object 
 8   direction         37900 non-null  float64
 9   tapInStops        37900 non-null  object 
 10  tapInStopsName    37900 non-null  object 
 11  tapInStopsLat     37900 non-null  float64
 12  tapInStopsLon     37900 non-null  float64
 13  stopStartSeq      37900 non-null  int64  
 14  tapInTime         37900 non-null  object 
 15  tapOutStops       37900 non-null  object 
 16  tapOutStopsName   37900 non-null  object

None

Unnamed: 0,transID,payCardID,payCardBank,payCardName,payCardSex,payCardBirthDate,corridorID,corridorName,direction,tapInStops,tapInStopsName,tapInStopsLat,tapInStopsLon,stopStartSeq,tapInTime,tapOutStops,tapOutStopsName,tapOutStopsLat,tapOutStopsLon,stopEndSeq,tapOutTime,payAmount
0,EIIW227B8L34VB,180062659848800,emoney,Bajragin Usada,M,2008,5,Matraman Baru - Ancol,1.0,P00142,Pal Putih,-6.184631,106.84402,7,2023-04-03 05:21:44,P00253,Tegalan,-6.203101,106.85715,12,2023-04-03 06:00:53,3500.0
1,LGXO740D2N47GZ,4885331907664776,dki,Gandi Widodo,F,1997,6C,Stasiun Tebet - Karet via Patra Kuningan,0.0,B01963P,Kemenkes 2,-6.228700,106.83302,13,2023-04-03 05:42:44,B03307P,Sampoerna Strategic,-6.217152,106.81892,21,2023-04-03 06:40:01,3500.0
2,DJWR385V2U57TO,4996225095064169,dki,Emong Wastuti,F,1992,R1A,Pantai Maju - Kota,0.0,B00499P,Gg. Kunir II,-6.133132,106.81435,38,2023-04-03 05:59:06,B04962P,Simpang Kunir Kemukus,-6.133731,106.81475,39,2023-04-03 06:50:55,3500.0
3,JTUZ800U7C86EH,639099174703,flazz,Surya Wacana,F,1978,11D,Pulo Gebang - Pulo Gadung 2 via PIK,0.0,B05587P,Taman Elok 1,-6.195743,106.93526,23,2023-04-03 05:44:51,B03090P,Raya Penggilingan,-6.183068,106.93194,29,2023-04-03 06:28:16,3500.0
4,VMLO535V7F95NJ,570928206772,flazz,Embuh Mardhiyah,M,1982,12,Tanjung Priok - Pluit,0.0,P00239,Sunter Boulevard Barat,-6.149650,106.88900,5,2023-04-03 06:17:35,P00098,Kali Besar Barat,-6.135355,106.81143,15,2023-04-03 06:57:03,3500.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37895,ZWEC949B8Q87QG,4685818286724028395,brizzi,Kamila Mahendra,F,2004,6B,Ragunan - MH Thamrin via Semanggi,1.0,P00261,Tosari,-6.196892,106.82309,2,2023-04-21 18:18:37,P00228,SMK 57,-6.290967,106.82365,13,2023-04-21 19:55:49,3500.0
37896,YHHK837P6Y95GN,6502902290603767,dki,Titi Siregar,M,1974,9N,Pinang Ranti - Pramuka,1.0,P00064,Garuda Taman Mini,-6.290154,106.88116,1,2023-04-18 21:52:31,P00179,Pinang Ranti,-6.291075,106.88634,2,2023-04-18 22:28:22,3500.0
37897,YXPP627N4G95HO,213159426675861,emoney,drg. Zahra Nashiruddin,F,1976,1T,Cibubur - Balai Kota,1.0,B02873P,Plaza Sentral,-6.216247,106.81676,12,2023-04-04 10:29:47,B00226P,Buperta Cibubur,-6.370321,106.89628,14,2023-04-04 13:27:25,20000.0
37898,RGVK175U2U98UV,377840859133591,emoney,Ana Agustina,M,1976,JAK.13,Tanah Abang - Jembatan Lima,1.0,B02505P,Museum Textile,-6.188656,106.80954,33,2023-04-15 19:59:26,B01787P,JPO Blok G,-6.188861,106.81135,34,2023-04-15 20:27:50,0.0


In [336]:
for i in range(1143) :
    a = np.random.randint(0,len(dfTransNA))
    dfTransNA.loc[a,'corridorName'] = np.nan
    dfTransNA.loc[a,'corridorID'] = np.nan
dfTransNA.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37900 entries, 0 to 37899
Data columns (total 22 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   transID           37900 non-null  object 
 1   payCardID         37900 non-null  object 
 2   payCardBank       37900 non-null  object 
 3   payCardName       37900 non-null  object 
 4   payCardSex        37900 non-null  object 
 5   payCardBirthDate  37900 non-null  int64  
 6   corridorID        36777 non-null  object 
 7   corridorName      36777 non-null  object 
 8   direction         37900 non-null  float64
 9   tapInStops        37900 non-null  object 
 10  tapInStopsName    37900 non-null  object 
 11  tapInStopsLat     37900 non-null  float64
 12  tapInStopsLon     37900 non-null  float64
 13  stopStartSeq      37900 non-null  int64  
 14  tapInTime         37900 non-null  object 
 15  tapOutStops       37900 non-null  object 
 16  tapOutStopsName   37900 non-null  object

In [337]:
for i in range(138) :
    a = np.random.randint(0,len(dfTransNA))
    dfTransNA.loc[a,'corridorID'] = np.nan
for i in range(831) :
    a = np.random.randint(0,len(dfTransNA))
    dfTransNA.loc[a,'corridorName'] = np.nan
dfTransNA.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37900 entries, 0 to 37899
Data columns (total 22 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   transID           37900 non-null  object 
 1   payCardID         37900 non-null  object 
 2   payCardBank       37900 non-null  object 
 3   payCardName       37900 non-null  object 
 4   payCardSex        37900 non-null  object 
 5   payCardBirthDate  37900 non-null  int64  
 6   corridorID        36643 non-null  object 
 7   corridorName      35970 non-null  object 
 8   direction         37900 non-null  float64
 9   tapInStops        37900 non-null  object 
 10  tapInStopsName    37900 non-null  object 
 11  tapInStopsLat     37900 non-null  float64
 12  tapInStopsLon     37900 non-null  float64
 13  stopStartSeq      37900 non-null  int64  
 14  tapInTime         37900 non-null  object 
 15  tapOutStops       37900 non-null  object 
 16  tapOutStopsName   37900 non-null  object

In [338]:
for i in range(1234) :
    a = np.random.randint(0,len(dfTransNA))
    dfTransNA.loc[a,'tapInStops'] = np.nan
for i in range(991) :
    a = np.random.randint(0,len(dfTransNA))
    dfTransNA.loc[a,'tapOutStops'] = np.nan
dfTransNA.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37900 entries, 0 to 37899
Data columns (total 22 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   transID           37900 non-null  object 
 1   payCardID         37900 non-null  object 
 2   payCardBank       37900 non-null  object 
 3   payCardName       37900 non-null  object 
 4   payCardSex        37900 non-null  object 
 5   payCardBirthDate  37900 non-null  int64  
 6   corridorID        36643 non-null  object 
 7   corridorName      35970 non-null  object 
 8   direction         37900 non-null  float64
 9   tapInStops        36687 non-null  object 
 10  tapInStopsName    37900 non-null  object 
 11  tapInStopsLat     37900 non-null  float64
 12  tapInStopsLon     37900 non-null  float64
 13  stopStartSeq      37900 non-null  int64  
 14  tapInTime         37900 non-null  object 
 15  tapOutStops       36921 non-null  object 
 16  tapOutStopsName   37900 non-null  object

In [339]:
# for i in range(1234) :
#     a = np.random.randint(0,len(dfTransNA))
#     dfTransNA.loc[a,'tapInStops'] = np.nan
#     dfTransNA.loc[a,'tapInStopsName'] = np.nan
#     dfTransNA.loc[a,'tapInStopsLat'] = np.nan
#     dfTransNA.loc[a,'tapInStopsLon'] = np.nan
#     dfTransNA.loc[a,'stopStartSeq'] = np.nan
for i in range(1362) :
    a = np.random.randint(0,len(dfTransNA))
    dfTransNA.loc[a,'tapOutStops'] = np.nan
    dfTransNA.loc[a,'tapOutStopsName'] = np.nan
    dfTransNA.loc[a,'tapOutStopsLat'] = np.nan
    dfTransNA.loc[a,'tapOutStopsLon'] = np.nan
    dfTransNA.loc[a,'stopEndSeq'] = np.nan
    dfTransNA.loc[a,'tapOutTime'] = np.nan
dfTransNA.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37900 entries, 0 to 37899
Data columns (total 22 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   transID           37900 non-null  object 
 1   payCardID         37900 non-null  object 
 2   payCardBank       37900 non-null  object 
 3   payCardName       37900 non-null  object 
 4   payCardSex        37900 non-null  object 
 5   payCardBirthDate  37900 non-null  int64  
 6   corridorID        36643 non-null  object 
 7   corridorName      35970 non-null  object 
 8   direction         37900 non-null  float64
 9   tapInStops        36687 non-null  object 
 10  tapInStopsName    37900 non-null  object 
 11  tapInStopsLat     37900 non-null  float64
 12  tapInStopsLon     37900 non-null  float64
 13  stopStartSeq      37900 non-null  int64  
 14  tapInTime         37900 non-null  object 
 15  tapOutStops       35611 non-null  object 
 16  tapOutStopsName   36556 non-null  object

In [341]:
# Check total row that can be used if user choose to dropna all
dfTransNA.notna().all(axis=1).sum()

31730

In [342]:
dfTransNA['transID'].nunique()

37900

In [343]:
# dfTransaction.to_csv('dfTransjakartaV1Full.csv',index=False)
dfTransNA.to_csv('dfTransjakarta.csv',index=False)