# MSc Data Science Dissertation

## Passenger Profiles

#### Name: Chandana Karunaratne
#### Student ID: 1621633
#### Date: 24 August 2017

In [None]:
# The code below is used to create travel profiles for two passengers. These profiles contain the frequency distribution 
# of their stations of entry and the frequency distribution of their travel times when using their most frequently used 
# station of entry.

In [1]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import pylab



In [2]:
# Read in a file containing all station keys and their corresponding names.
# The path is relative, so the CSV must be in the notebook's working directory.

stationdic_df = pd.read_csv("StationsDictionary_CK2.csv")


In [3]:
# Show the head of the stationdic_df dataframe (head() with no argument returns
# the first five rows) as a quick check that the file loaded as expected

stationdic_df.head()

Unnamed: 0,STATIONKEY,STATIONNAME
0,-1,Unknown
1,0,?
2,1,Acton Town
3,2,Barbican
4,3,Aldgate


In [4]:
# Read in the files containing journey data for each passenger.
# Both paths are relative, so the CSVs must be in the notebook's working directory.
# NOTE(review): the second file name says "User3" although the notebook refers to
# this traveller as Passenger 2 / user2_df -- confirm this is the intended file.

user1_df = pd.read_csv("Traveller Matrix_User1_CK6.csv")
user2_df = pd.read_csv("Traveller Matrix_User3_CK3.csv")


### Passenger 1:

In [5]:
# Show the head (first five rows) of the dataframe containing journey data for
# the first passenger, to inspect the available columns

user1_df.head()

Unnamed: 0,tid,daykey,prestigeid,stationoffirstentrykey,stationoffirstentry,routeid,stationofexitkey,tramstationkey,stationkey,stationofentrykey,...,samestationexitflag,cardtypekey,transactiontime,receiveddaykey,trainingflag,utsdate,busroutename,direction,busstopid,daytype
0,2765100,12365,54908468,1349,719,U,85,-1,-1,-1,...,N,15,340,12365,N,12365,AAA,X,,Friday
1,3202772,12365,54908468,85,590,U,1349,-1,-1,-1,...,N,15,1046,12365,N,12365,AAA,X,,Friday
2,4870433,12365,54908468,1349,719,U,-1,-1,1349,-1,...,U,15,310,-1,X,-1,AAA,X,,Friday
3,5365403,12365,54908468,85,590,U,-1,-1,85,-1,...,U,15,1001,-1,X,-1,AAA,X,,Friday
4,21962353,12400,54908468,1349,719,U,154,-1,-1,-1,...,N,15,359,12400,N,12400,AAA,X,,Friday


In [6]:
# Show the frequency distribution of the stations of entry used by Passenger 1.
# value_counts() lists the keys in descending order of frequency, so the first
# row is the most frequently used station of entry.
# NOTE(review): key -1 is named "Unknown" in the stations dictionary preview, so it
# presumably marks journeys with no recorded station of entry -- confirm.

user1_df['stationoffirstentrykey'].value_counts()

 1349    62
 85      34
 210     26
-1       11
 1922     8
 159      6
 50       4
 13       2
 2061     2
Name: stationoffirstentrykey, dtype: int64

In [7]:
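# Assign a timeblock to each value of 'transactiontime': the 24-hour day is divided into twelve consecutive 2-hour 
# intervals numbered 1 to 12, and each transaction time is mapped to the interval it falls in (the bounds used below, 
# 0 to 1439, suggest that 'transactiontime' is recorded in minutes after midnight)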

# Source: https://stackoverflow.com/questions/26886653/pandas-create-new-column-based-on-values-from-other-columns?noredirect=1&lq=1

def timeblock(row):
    """Return the 2-hour time block number for a row's 'transactiontime'.

    The range 0 <= transactiontime < 1440 is split into twelve consecutive
    blocks of width 120, numbered 1 (0-119) through 12 (1320-1439). The value
    appears to be minutes after midnight -- TODO confirm against the data source.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row passed by ``apply(axis=1)``)
        Must support ``row['transactiontime']``.

    Returns
    -------
    int
        Block number 1-12, or the sentinel 99 when the value lies outside
        the 0-1439 window (NaN also ends up here, as it fails the range check).
    """
    minutes = row['transactiontime']

    # Guard first: anything outside the 24-hour window gets the sentinel value.
    if not (0 <= minutes < 1440):
        return 99

    # Floor-divide by the block width to get a 0-based index, then shift to 1-based.
    return int(minutes // 120) + 1


# Run timeblock on every row (axis=1) and store the results in a new 'Timeblock' column
user1_df['Timeblock'] = user1_df.apply(timeblock, axis=1)
    
    

In [8]:
# Show the head of the user1_df dataframe to check whether timeblocks were assigned correctly
# (compare each row's 'transactiontime' with the new 'Timeblock' column at the far right)

user1_df.head()

Unnamed: 0,tid,daykey,prestigeid,stationoffirstentrykey,stationoffirstentry,routeid,stationofexitkey,tramstationkey,stationkey,stationofentrykey,...,cardtypekey,transactiontime,receiveddaykey,trainingflag,utsdate,busroutename,direction,busstopid,daytype,Timeblock
0,2765100,12365,54908468,1349,719,U,85,-1,-1,-1,...,15,340,12365,N,12365,AAA,X,,Friday,3
1,3202772,12365,54908468,85,590,U,1349,-1,-1,-1,...,15,1046,12365,N,12365,AAA,X,,Friday,9
2,4870433,12365,54908468,1349,719,U,-1,-1,1349,-1,...,15,310,-1,X,-1,AAA,X,,Friday,3
3,5365403,12365,54908468,85,590,U,-1,-1,85,-1,...,15,1001,-1,X,-1,AAA,X,,Friday,9
4,21962353,12400,54908468,1349,719,U,154,-1,-1,-1,...,15,359,12400,N,12400,AAA,X,,Friday,3


In [9]:
# Keep only Passenger 1's journeys that started at station key 1349, the most frequent
# station of entry according to the frequency distribution shown earlier

user1_df_filtered = user1_df.loc[user1_df['stationoffirstentrykey'] == 1349]

In [10]:
# Show the head of the user1_df_filtered dataframe; every row should have
# 'stationoffirstentrykey' equal to 1349, the value used in the filter

user1_df_filtered.head()

Unnamed: 0,tid,daykey,prestigeid,stationoffirstentrykey,stationoffirstentry,routeid,stationofexitkey,tramstationkey,stationkey,stationofentrykey,...,cardtypekey,transactiontime,receiveddaykey,trainingflag,utsdate,busroutename,direction,busstopid,daytype,Timeblock
0,2765100,12365,54908468,1349,719,U,85,-1,-1,-1,...,15,340,12365,N,12365,AAA,X,,Friday,3
2,4870433,12365,54908468,1349,719,U,-1,-1,1349,-1,...,15,310,-1,X,-1,AAA,X,,Friday,3
4,21962353,12400,54908468,1349,719,U,154,-1,-1,-1,...,15,359,12400,N,12400,AAA,X,,Friday,3
6,24119298,12400,54908468,1349,719,U,-1,-1,1349,-1,...,15,311,-1,X,-1,AAA,X,,Friday,3
8,42054896,12347,54908468,1349,719,U,85,-1,-1,-1,...,15,341,12347,N,12347,AAA,X,,Monday,3


In [11]:
# Show Passenger 1's frequency distribution of her travel times when using her most frequently used station of entry.
# The index holds the 2-hour block numbers assigned by timeblock() (1-12, or 99 for out-of-range times)
# and the values are the number of journeys in each block, most frequent first.

user1_df_filtered['Timeblock'].value_counts()

3    58
8     3
9     1
Name: Timeblock, dtype: int64

### Passenger 2:

In [12]:
# Show the head (first five rows) of the dataframe containing journey data for
# the second passenger, to inspect the available columns

user2_df.head()

Unnamed: 0,tid,daykey,prestigeid,stationoffirstentrykey,stationoffirstentry,routeid,stationofexitkey,tramstationkey,stationkey,stationofentrykey,...,samestationexitflag,cardtypekey,transactiontime,receiveddaykey,trainingflag,utsdate,busroutename,direction,busstopid,daytype
0,2765105,12365,47155273,-1,-1,U,187,-1,-1,-1,...,N,3,306,12365,N,12365,AAA,X,,Friday
1,2765200,12365,47155273,187,702,U,187,-1,-1,-1,...,N,3,320,12365,N,12365,AAA,X,,Friday
2,2765668,12365,47155273,187,702,U,187,-1,-1,-1,...,N,3,320,12365,N,12365,AAA,X,,Friday
3,2765767,12365,47155273,187,702,U,187,-1,-1,-1,...,N,3,350,12365,N,12365,AAA,X,,Friday
4,2766526,12365,47155273,187,702,U,187,-1,-1,-1,...,N,3,326,12365,N,12365,AAA,X,,Friday


In [15]:
# Show the frequency distribution of the stations of entry used by Passenger 2.
# value_counts() lists the keys in descending order of frequency, so the first
# row is the most frequently used station of entry.
# NOTE(review): key -1 is named "Unknown" in the stations dictionary preview, so it
# presumably marks journeys with no recorded station of entry -- confirm.

user2_df['stationoffirstentrykey'].value_counts()

 187     3411
 96       111
 17        82
-1         26
 2067      16
 2061      13
 2066      10
 1349       2
 200        2
 120        2
Name: stationoffirstentrykey, dtype: int64

In [16]:
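# Assign a timeblock to each value of 'transactiontime': the 24-hour day is divided into twelve consecutive 2-hour 
# intervals numbered 1 to 12, and each transaction time is mapped to the interval it falls in (the bounds used below, 
# 0 to 1439, suggest that 'transactiontime' is recorded in minutes after midnight)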

# Source: https://stackoverflow.com/questions/26886653/pandas-create-new-column-based-on-values-from-other-columns?noredirect=1&lq=1

def timeblock(row):
    """Return the 2-hour time block number for a row's 'transactiontime'.

    The range 0 <= transactiontime < 1440 is split into twelve consecutive
    blocks of width 120, numbered 1 (0-119) through 12 (1320-1439). The value
    appears to be minutes after midnight -- TODO confirm against the data source.

    NOTE(review): a function named ``timeblock`` is also defined in the
    Passenger 1 section; this cell declares the name again, so running the
    notebook top to bottom rebinds it here.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row passed by ``apply(axis=1)``)
        Must support ``row['transactiontime']``.

    Returns
    -------
    int
        Block number 1-12, or the sentinel 99 when the value lies outside
        the 0-1439 window (NaN also ends up here, as it fails the range check).
    """
    minutes = row['transactiontime']

    # Guard first: anything outside the 24-hour window gets the sentinel value.
    if not (0 <= minutes < 1440):
        return 99

    # Floor-divide by the block width to get a 0-based index, then shift to 1-based.
    return int(minutes // 120) + 1


# Run timeblock on every row (axis=1) and store the results in a new 'Timeblock' column
user2_df['Timeblock'] = user2_df.apply(timeblock, axis=1)
    
    

In [17]:
# Show the head of the user2_df dataframe to check whether timeblocks were assigned correctly
# (compare each row's 'transactiontime' with the new 'Timeblock' column at the far right)

user2_df.head()

Unnamed: 0,tid,daykey,prestigeid,stationoffirstentrykey,stationoffirstentry,routeid,stationofexitkey,tramstationkey,stationkey,stationofentrykey,...,cardtypekey,transactiontime,receiveddaykey,trainingflag,utsdate,busroutename,direction,busstopid,daytype,Timeblock
0,2765105,12365,47155273,-1,-1,U,187,-1,-1,-1,...,3,306,12365,N,12365,AAA,X,,Friday,3
1,2765200,12365,47155273,187,702,U,187,-1,-1,-1,...,3,320,12365,N,12365,AAA,X,,Friday,3
2,2765668,12365,47155273,187,702,U,187,-1,-1,-1,...,3,320,12365,N,12365,AAA,X,,Friday,3
3,2765767,12365,47155273,187,702,U,187,-1,-1,-1,...,3,350,12365,N,12365,AAA,X,,Friday,3
4,2766526,12365,47155273,187,702,U,187,-1,-1,-1,...,3,326,12365,N,12365,AAA,X,,Friday,3


In [18]:
# Keep only Passenger 2's journeys that started at station key 187, the most frequent
# station of entry according to the frequency distribution shown earlier

user2_df_filtered = user2_df.loc[user2_df['stationoffirstentrykey'] == 187]

In [19]:
# Show the head of the user2_df_filtered dataframe; every row should have
# 'stationoffirstentrykey' equal to 187, the value used in the filter

user2_df_filtered.head()

Unnamed: 0,tid,daykey,prestigeid,stationoffirstentrykey,stationoffirstentry,routeid,stationofexitkey,tramstationkey,stationkey,stationofentrykey,...,cardtypekey,transactiontime,receiveddaykey,trainingflag,utsdate,busroutename,direction,busstopid,daytype,Timeblock
1,2765200,12365,47155273,187,702,U,187,-1,-1,-1,...,3,320,12365,N,12365,AAA,X,,Friday,3
2,2765668,12365,47155273,187,702,U,187,-1,-1,-1,...,3,320,12365,N,12365,AAA,X,,Friday,3
3,2765767,12365,47155273,187,702,U,187,-1,-1,-1,...,3,350,12365,N,12365,AAA,X,,Friday,3
4,2766526,12365,47155273,187,702,U,187,-1,-1,-1,...,3,326,12365,N,12365,AAA,X,,Friday,3
5,2769284,12365,47155273,187,702,U,187,-1,-1,-1,...,3,382,12365,N,12365,AAA,X,,Friday,4


In [20]:
# Show Passenger 2's frequency distribution of his travel times when using his most frequently used station of entry.
# The index holds the 2-hour block numbers assigned by timeblock() (1-12, or 99 for out-of-range times)
# and the values are the number of journeys in each block, most frequent first.

user2_df_filtered['Timeblock'].value_counts()

5     940
7     571
6     552
4     512
8     269
9     268
10    149
3     126
11     24
Name: Timeblock, dtype: int64