In [1]:
import pandas as pd 
dat = pd.read_excel("airport_choice_survey_EN_ver2.0_Capstone.xlsx")
dat

Unnamed: 0,ID,Airport,Airline,Age,Gender,Nationality,TripPurpose,TripDuration,FlyingCompanion,ProvinceResidence,...,SeatClass,Airfare,NoTransport,ModeTransport,AccessCost,AccessTime,Occupation,Income,MileageAirline,Mileage
0,1,1,1.0,49.0,1.0,1,2,7,0,3,...,1.0,80.0,1,6,8000.0,40.0,1,5.0,1,150000.0
1,2,1,1.0,49.0,2.0,1,1,4,4,3,...,1.0,41.0,1,6,8000.0,50.0,9,3.0,,
2,3,1,1.0,25.0,1.0,1,1,10,2,3,...,1.0,,2,4,1000.0,20.0,12,,,
3,4,1,1.0,29.0,1.0,1,2,7,2,3,...,1.0,40.0,1,1,,,8,7.0,2,100000.0
4,5,1,1.0,34.0,2.0,1,2,4,0,3,...,1.0,40.0,1,6,8000.0,50.0,1,3.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
483,484,2,4.0,39.0,1.0,1,4,8,3,8,...,1.0,62.0,1,1,,45.0,12,1.0,5,1.0
484,485,2,4.0,37.0,2.0,1,4,8,3,8,...,1.0,25.0,1,1,,45.0,5,5.0,,
485,486,2,1.0,63.0,1.0,5,2,28,1,8,...,1.0,,2,5,,30.0,2,7.0,3,
486,487,2,4.0,61.0,1.0,3,2,2,4,8,...,2.0,17.0,1,2,30000.0,40.0,1,7.0,2,30000.0


In [2]:
## Missing Value
# handle missing values of "Airline"
# discard the observation if both "Airline" and "FlightNo" are missing
mis_airline = dat[dat['Airline'].isnull() & dat['FlightNo'].isnull()].index
dat.drop(mis_airline, inplace=True)

In [3]:
# handle missing values of "SeatClass"
# group by 'Airline' and 'Destination' and fill missing values with mode
filled_seatclass = dat['SeatClass'].copy()

grouped = dat.groupby(['Airline', 'Destination'])['SeatClass']
for group_name, group_data in grouped:
    mode = group_data.mode()
    if not mode.empty:
        filled_seatclass.loc[group_data.index] = filled_seatclass.loc[group_data.index].fillna(mode.iloc[0])

dat = dat.assign(SeatClass=filled_seatclass)

In [4]:
# handle missing values of "Airfare"
# group the data by airline, destination and seat class, and calculate the average airfare
avgairfare_by_group = dat.groupby(['Airline', 'Destination', 'SeatClass'])['Airfare'].mean()

# assign the airfare by the average airfare by group
for index, row in dat[dat['Airfare'].isnull()].iterrows():
    if (row['Airline'], row['Destination'], row['SeatClass']) in avgairfare_by_group.index:
        dat.at[index, 'Airfare'] = avgairfare_by_group.loc[(row['Airline'], row['Destination'], row['SeatClass'])]

# discard the observation if we cannot assign the airfare
mis_airfare = dat[dat['Airfare'].isnull()].index
dat.drop(mis_airfare, inplace=True)

In [5]:
# handle missing values of "AccessTime"
# group the data by airport and province of residence, and calculate the average access time
avgaccesstime_by_group = dat.groupby(['Airport', 'ProvinceResidence'])['AccessTime'].mean()

# assign the access time by the average access time by group
for index, row in dat[dat['AccessTime'].isnull()].iterrows():
    if (row['Airport'], row['ProvinceResidence']) in avgaccesstime_by_group.index:
        dat.at[index, 'AccessTime'] = avgaccesstime_by_group.loc[(row['Airport'], row['ProvinceResidence'])]

In [6]:
# handle missing values of "Age"
# calculate the average age of respondents
avgage = dat['Age'].mean()

# assign the missing age by avg age
dat['Age'] = dat['Age'].fillna(avgage)

In [7]:
# handle missing values of "Gender"
# find the mode of gender
mostfreq_gender = dat['Gender'].mode()[0]

# assign the missing gender by the mode
dat['Gender'] = dat['Gender'].fillna(mostfreq_gender)

In [8]:
# handle missing values of "Nationality"
# find the mode of nationality
mostfreq_nationality = dat['Nationality'].mode()[0]

# assign the missing gender by the mode
dat['Nationality'] = dat['Nationality'].fillna(mostfreq_nationality)

In [9]:
# handle missing value of "Destination"
# find the mode of destination
mostfreq_dest = dat['Destination'].mode()[0]

# assign the missing gender by the mode
dat['Destination'] = dat['Destination'].fillna(mostfreq_dest)

In [10]:
# Drop observations with destination 4 as we only want to compare the common destinations of Gimpo and Incheon
dest_4 = dat[dat['Destination'] == 4].index
dat.drop(dest_4, inplace=True)

In [11]:
# Drop columns DepartureHr and DepartureMn as we have the relevant info from DepartureTime
dat = dat.drop(columns=['DepartureHr', 'DepartureMn'])

# Drop columns AccessCost, Mileage, MileageAirline and Income as there are too many missing values
dat = dat.drop(columns=['AccessCost', 'Mileage', 'MileageAirline', 'Income'])

# Drop column FlightNo as it is irrelevant to the analysis on airport and airline
dat = dat.drop(columns=['FlightNo'])

dat

Unnamed: 0,ID,Airport,Airline,Age,Gender,Nationality,TripPurpose,TripDuration,FlyingCompanion,ProvinceResidence,...,NoTripsLastYear,FrequentFlightDestination,Destination,DepartureTime,SeatClass,Airfare,NoTransport,ModeTransport,AccessTime,Occupation
0,1,1,1.0,49.0,1.0,1,2,7,0,3,...,3,1,3.0,2,1.0,80.000000,1,6,40.00,1
1,2,1,1.0,49.0,2.0,1,1,4,4,3,...,1,1,2.0,1,1.0,41.000000,1,6,50.00,9
2,3,1,1.0,25.0,1.0,1,1,10,2,3,...,1,2,1.0,1,1.0,42.496429,2,4,20.00,12
3,4,1,1.0,29.0,1.0,1,2,7,2,3,...,2,5,2.0,2,1.0,40.000000,1,1,48.86,8
4,5,1,1.0,34.0,2.0,1,2,4,0,3,...,2,2,1.0,2,1.0,40.000000,1,6,50.00,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
483,484,2,4.0,39.0,1.0,1,4,8,3,8,...,10,2,1.0,3,1.0,62.000000,1,1,45.00,12
484,485,2,4.0,37.0,2.0,1,4,8,3,8,...,6,2,1.0,3,1.0,25.000000,1,1,45.00,5
485,486,2,1.0,63.0,1.0,5,2,28,1,8,...,3,2,2.0,3,1.0,46.000000,2,5,30.00,2
486,487,2,4.0,61.0,1.0,3,2,2,4,8,...,15,4,2.0,3,2.0,17.000000,1,2,40.00,1


In [12]:
dat.isna().any()

ID                           False
Airport                      False
Airline                      False
Age                          False
Gender                       False
Nationality                  False
TripPurpose                  False
TripDuration                 False
FlyingCompanion              False
ProvinceResidence            False
GroupTravel                  False
NoTripsLastYear              False
FrequentFlightDestination    False
Destination                  False
DepartureTime                False
SeatClass                    False
Airfare                      False
NoTransport                  False
ModeTransport                False
AccessTime                   False
Occupation                   False
dtype: bool

In [13]:
## airline model
# calculate choice prob of Nationality
def count_helper2(column_name, col_value):
    counter = 0
    counter2 = 0

    dic = {}    
    i = 0 
    for item in dat["Airline"]:
        dic[i] = [item]
        i += 1  
    j = 0
    for item in (dat[column_name]):
        dic[j].append(item)
        j += 1   
    for lst in dic.values():
        if (lst[0] == 1 or lst[0] == 2) and lst[1] == col_value:
            counter += 1
        if lst[1] == col_value:
            counter2 += 1
    print(counter, counter2)
    return col_value, round(counter/counter2, 4)

print(count_helper2("Nationality", 1))
print(count_helper2("Nationality", 2))
print(count_helper2("Nationality", 3))
print(count_helper2("Nationality", 4))
print(count_helper2("Nationality", 5))

197 347
(1, 0.5677)
14 32
(2, 0.4375)
19 41
(3, 0.4634)
10 20
(4, 0.5)
7 11
(5, 0.6364)


In [14]:
# calcualte choice prob of TripPurpose
print(count_helper2("TripPurpose", 1))
print(count_helper2("TripPurpose", 2))
print(count_helper2("TripPurpose", 3))
print(count_helper2("TripPurpose", 4))

149 297
(1, 0.5017)
65 101
(2, 0.6436)
14 25
(3, 0.56)
19 28
(4, 0.6786)


In [15]:
# calcualte choice prob of ProvinceResidence
print(count_helper2("ProvinceResidence", 1))
print(count_helper2("ProvinceResidence", 2))
print(count_helper2("ProvinceResidence", 3))
print(count_helper2("ProvinceResidence", 4))
print(count_helper2("ProvinceResidence", 5))
print(count_helper2("ProvinceResidence", 6))
print(count_helper2("ProvinceResidence", 7))
print(count_helper2("ProvinceResidence", 8))

96 171
(1, 0.5614)
7 21
(2, 0.3333)
72 120
(3, 0.6)
11 13
(4, 0.8462)
17 27
(5, 0.6296)
5 9
(6, 0.5556)
0 4
(7, 0.0)
39 86
(8, 0.4535)


In [16]:
# calcualte choice prob of FrequentFlightDestination
print(count_helper2("FrequentFlightDestination", 1))
print(count_helper2("FrequentFlightDestination", 2))
print(count_helper2("FrequentFlightDestination", 3))
print(count_helper2("FrequentFlightDestination", 4))
print(count_helper2("FrequentFlightDestination", 5))
print(count_helper2("FrequentFlightDestination", 6))
print(count_helper2("FrequentFlightDestination", 7))

77 143
(1, 0.5385)
55 99
(2, 0.5556)
63 109
(3, 0.578)
5 10
(4, 0.5)
8 22
(5, 0.3636)
12 27
(6, 0.4444)
9 12
(7, 0.75)


In [17]:
# calcualte choice prob of Destination
print(count_helper2("Destination", 1))
print(count_helper2("Destination", 2))
print(count_helper2("Destination", 3))

86 132
(1, 0.6515)
96 156
(2, 0.6154)
65 163
(3, 0.3988)


In [18]:
# calcualte choice prob of DepartureTime
print(count_helper2("DepartureTime", 1))
print(count_helper2("DepartureTime", 2))
print(count_helper2("DepartureTime", 3))
print(count_helper2("DepartureTime", 4))

18 42
(1, 0.4286)
126 200
(2, 0.63)
94 184
(3, 0.5109)
9 25
(4, 0.36)


In [19]:
# calcualte choice prob of SeatClass
print(count_helper2("SeatClass", 1))
print(count_helper2("SeatClass", 2))
print(count_helper2("SeatClass", 3))

221 410
(1, 0.539)
22 32
(2, 0.6875)
4 9
(3, 0.4444)


In [20]:
# calcualte choice prob of ModeTransport
print(count_helper2("ModeTransport", 1))
print(count_helper2("ModeTransport", 2))
print(count_helper2("ModeTransport", 3))
print(count_helper2("ModeTransport", 4))
print(count_helper2("ModeTransport", 5))
print(count_helper2("ModeTransport", 6))
print(count_helper2("ModeTransport", 7))
print(count_helper2("ModeTransport", 8))
print(count_helper2("ModeTransport", 9))
print(count_helper2("ModeTransport", 10))
print(count_helper2("ModeTransport", 11))

59 103
(1, 0.5728)
51 91
(2, 0.5604)
18 35
(3, 0.5143)
6 10
(4, 0.6)
47 97
(5, 0.4845)
35 69
(6, 0.5072)
5 5
(7, 1.0)
14 29
(8, 0.4828)
6 6
(9, 1.0)
4 4
(10, 1.0)
2 2
(11, 1.0)


In [21]:
# calcualte choice prob of Occupation
print(count_helper2("Occupation", 1))
print(count_helper2("Occupation", 2))
print(count_helper2("Occupation", 3))
print(count_helper2("Occupation", 4))
print(count_helper2("Occupation", 5))
print(count_helper2("Occupation", 6))
print(count_helper2("Occupation", 7))
print(count_helper2("Occupation", 8))
print(count_helper2("Occupation", 9))
print(count_helper2("Occupation", 10))
print(count_helper2("Occupation", 11))
print(count_helper2("Occupation", 12))

21 33
(1, 0.6364)
45 112
(2, 0.4018)
8 15
(3, 0.5333)
3 6
(4, 0.5)
13 29
(5, 0.4483)
3 4
(6, 0.75)
12 18
(7, 0.6667)
31 50
(8, 0.62)
34 53
(9, 0.6415)
6 8
(10, 0.75)
11 12
(11, 0.9167)
60 111
(12, 0.5405)


In [22]:
# data progressing for airline model
# turn binary variables into 1 and 0
dat_airline = dat
dat_airline['Airport[Incheon]'] = 0
dat_airline.loc[dat_airline['Airport'].isin([1]), 'Airport[Incheon]'] = 1

dat_airline['Gender[Male]'] = 0
dat_airline.loc[dat_airline['Gender'].isin([1]), 'Gender[Male]'] = 1

dat_airline['GroupTravel[Yes]'] = 0
dat_airline.loc[dat_airline['GroupTravel'].isin([1]), 'GroupTravel[Yes]'] = 1
dat_airline

Unnamed: 0,ID,Airport,Airline,Age,Gender,Nationality,TripPurpose,TripDuration,FlyingCompanion,ProvinceResidence,...,DepartureTime,SeatClass,Airfare,NoTransport,ModeTransport,AccessTime,Occupation,Airport[Incheon],Gender[Male],GroupTravel[Yes]
0,1,1,1.0,49.0,1.0,1,2,7,0,3,...,2,1.0,80.000000,1,6,40.00,1,1,1,0
1,2,1,1.0,49.0,2.0,1,1,4,4,3,...,1,1.0,41.000000,1,6,50.00,9,1,0,0
2,3,1,1.0,25.0,1.0,1,1,10,2,3,...,1,1.0,42.496429,2,4,20.00,12,1,1,0
3,4,1,1.0,29.0,1.0,1,2,7,2,3,...,2,1.0,40.000000,1,1,48.86,8,1,1,0
4,5,1,1.0,34.0,2.0,1,2,4,0,3,...,2,1.0,40.000000,1,6,50.00,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
483,484,2,4.0,39.0,1.0,1,4,8,3,8,...,3,1.0,62.000000,1,1,45.00,12,0,1,0
484,485,2,4.0,37.0,2.0,1,4,8,3,8,...,3,1.0,25.000000,1,1,45.00,5,0,0,0
485,486,2,1.0,63.0,1.0,5,2,28,1,8,...,3,1.0,46.000000,2,5,30.00,2,0,1,0
486,487,2,4.0,61.0,1.0,3,2,2,4,8,...,3,2.0,17.000000,1,2,40.00,1,0,1,0


In [23]:
dat_airline['Airline[KoreanFSC]'] = 0
dat_airline.loc[dat['Airline'].isin([1, 2]), 'Airline[KoreanFSC]'] = 1

# nationality
dat_airline['Nationality[Korea]'] = 0
dat_airline.loc[dat_airline['Nationality'].isin([1]), 'Nationality[Korea]'] = 1

dat_airline['Nationality[ChinaJapan]'] = 0
dat_airline.loc[dat_airline['Nationality'].isin([2, 3]), 'Nationality[ChinaJapan]'] = 1


# truppurpose
dat_airline['TripPurpose[Leisure]'] = 0
dat_airline.loc[dat_airline['TripPurpose'].isin([1]), 'TripPurpose[Leisure]'] = 1

dat_airline['TripPurpose[Business]'] = 0
dat_airline.loc[dat_airline['TripPurpose'].isin([2]), 'TripPurpose[Business]'] = 1

# ProvinceResidence
dat_airline['ProvinceResidence[Seoul]'] = 0
dat_airline.loc[dat_airline['ProvinceResidence'].isin([1]), 
        'ProvinceResidence[Seoul]'] = 1

dat_airline['ProvinceResidence[Incheon]'] = 0
dat_airline.loc[dat_airline['ProvinceResidence'].isin([2]), 
        'ProvinceResidence[Incheon]'] = 1

dat_airline['ProvinceResidence[Kyungki-do]'] = 0
dat_airline.loc[dat_airline['ProvinceResidence'].isin([3]), 
        'ProvinceResidence[Kyungki-do]'] = 1

# FrequentFlightDestination
dat_airline['FrequentFlightDestination[SEAsia]'] = 0
dat_airline.loc[dat_airline['FrequentFlightDestination'].isin([1, 4]), 'FrequentFlightDestination[SEAsia]'] = 1

dat_airline['FrequentFlightDestination[China]'] = 0
dat_airline.loc[dat_airline['FrequentFlightDestination'].isin([2]), 'FrequentFlightDestination[China]'] = 1

dat_airline['FrequentFlightDestination[Japan]'] = 0
dat_airline.loc[dat_airline['FrequentFlightDestination'].isin([3, 5]), 'FrequentFlightDestination[Japan]'] = 1

# Destination
dat_airline['Destination[China]'] = 0
dat_airline.loc[dat_airline['Destination'].isin([1]), 'Destination[China]'] = 1

dat_airline['Destination[Japan]'] = 0
dat_airline.loc[dat_airline['Destination'].isin([2]), 'Destination[Japan]'] = 1


# DepatrureTime
dat_airline['DepartureTime[0612]'] = 0
dat_airline.loc[dat_airline['DepartureTime'].isin([1]), 'DepartureTime[0612]'] = 1

dat_airline['DepartureTime[1218]'] = 0
dat_airline.loc[dat_airline['DepartureTime'].isin([2]), 'DepartureTime[1218]'] = 1

dat_airline['DepartureTime[1821]'] = 0
dat_airline.loc[dat_airline['DepartureTime'].isin([3]), 'DepartureTime[1821]'] = 1

# SeatClass
dat_airline['SeatClass[Economy]'] = 0
dat_airline.loc[dat_airline['SeatClass'].isin([1]),'SeatClass[Economy]'] = 1

# ModeTransport
dat_airline['ModeTransport[Car]'] = 0
dat_airline.loc[dat_airline['ModeTransport'].isin([1]), 'ModeTransport[Car]'] = 1

dat_airline['ModeTransport[Taxi]'] = 0
dat_airline.loc[dat_airline['ModeTransport'].isin([2]), 'ModeTransport[Taxi]'] = 1

dat_airline['ModeTransport[BusLimo]'] = 0
dat_airline.loc[dat_airline['ModeTransport'].isin([3, 6]), 'ModeTransport[BusLimo]'] = 1

dat_airline['ModeTransport[SubwayKTX]'] = 0
dat_airline.loc[dat_airline['ModeTransport'].isin([5, 8]), 'ModeTransport[SubwayKTX]'] = 1

# Occupation
dat_airline['Occupation[EntrepreneurSelfE]'] = 0
dat_airline.loc[dat_airline['Occupation'].isin([1, 7]), 'Occupation[EntrepreneurSelfE]'] = 1

dat_airline['Occupation[BusProf]'] = 0
dat_airline.loc[dat_airline['Occupation'].isin([2, 5]), 'Occupation[BusProf]'] = 1

dat_airline['Occupation[StudHousewife]'] = 0
dat_airline.loc[dat_airline['Occupation'].isin([7, 8]), 'Occupation[StudHousewife]'] = 1

dat_airline


Unnamed: 0,ID,Airport,Airline,Age,Gender,Nationality,TripPurpose,TripDuration,FlyingCompanion,ProvinceResidence,...,DepartureTime[1218],DepartureTime[1821],SeatClass[Economy],ModeTransport[Car],ModeTransport[Taxi],ModeTransport[BusLimo],ModeTransport[SubwayKTX],Occupation[EntrepreneurSelfE],Occupation[BusProf],Occupation[StudHousewife]
0,1,1,1.0,49.0,1.0,1,2,7,0,3,...,1,0,1,0,0,1,0,1,0,0
1,2,1,1.0,49.0,2.0,1,1,4,4,3,...,0,0,1,0,0,1,0,0,0,0
2,3,1,1.0,25.0,1.0,1,1,10,2,3,...,0,0,1,0,0,0,0,0,0,0
3,4,1,1.0,29.0,1.0,1,2,7,2,3,...,1,0,1,1,0,0,0,0,0,1
4,5,1,1.0,34.0,2.0,1,2,4,0,3,...,1,0,1,0,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
483,484,2,4.0,39.0,1.0,1,4,8,3,8,...,0,1,1,1,0,0,0,0,0,0
484,485,2,4.0,37.0,2.0,1,4,8,3,8,...,0,1,1,1,0,0,0,0,1,0
485,486,2,1.0,63.0,1.0,5,2,28,1,8,...,0,1,1,0,0,0,1,0,1,0
486,487,2,4.0,61.0,1.0,3,2,2,4,8,...,0,1,0,0,1,0,0,1,0,0


In [24]:
dat_airline_c = dat_airline[["Airline[KoreanFSC]", "Airport[Incheon]", "Age", 
                             "Gender[Male]",
                             "Nationality[Korea]", "Nationality[ChinaJapan]",
                             "TripPurpose[Leisure]", "TripPurpose[Business]", 
                             "TripDuration", 
                             "FlyingCompanion", "ProvinceResidence[Seoul]",
                             "ProvinceResidence[Incheon]", "ProvinceResidence[Kyungki-do]", 
                            "GroupTravel[Yes]", "NoTripsLastYear", 
                             "FrequentFlightDestination[SEAsia]",
                            "FrequentFlightDestination[China]", "FrequentFlightDestination[Japan]",
                            "Destination[China]", "Destination[Japan]",
                            "DepartureTime[0612]", "DepartureTime[1218]", "DepartureTime[1821]",
                            "SeatClass[Economy]", 
                             "ModeTransport[Car]", "ModeTransport[Taxi]", "ModeTransport[BusLimo]", 
                             "ModeTransport[SubwayKTX]",
                            "Occupation[EntrepreneurSelfE]", "Occupation[BusProf]",
                            "Occupation[StudHousewife]", 
                            "Airfare", "NoTransport", "AccessTime"]]
dat_airline_c

Unnamed: 0,Airline[KoreanFSC],Airport[Incheon],Age,Gender[Male],Nationality[Korea],Nationality[ChinaJapan],TripPurpose[Leisure],TripPurpose[Business],TripDuration,FlyingCompanion,...,ModeTransport[Car],ModeTransport[Taxi],ModeTransport[BusLimo],ModeTransport[SubwayKTX],Occupation[EntrepreneurSelfE],Occupation[BusProf],Occupation[StudHousewife],Airfare,NoTransport,AccessTime
0,1,1,49.0,1,1,0,0,1,7,0,...,0,0,1,0,1,0,0,80.000000,1,40.00
1,1,1,49.0,0,1,0,1,0,4,4,...,0,0,1,0,0,0,0,41.000000,1,50.00
2,1,1,25.0,1,1,0,1,0,10,2,...,0,0,0,0,0,0,0,42.496429,2,20.00
3,1,1,29.0,1,1,0,0,1,7,2,...,1,0,0,0,0,0,1,40.000000,1,48.86
4,1,1,34.0,0,1,0,0,1,4,0,...,0,0,1,0,1,0,0,40.000000,1,50.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
483,0,0,39.0,1,1,0,0,0,8,3,...,1,0,0,0,0,0,0,62.000000,1,45.00
484,0,0,37.0,0,1,0,0,0,8,3,...,1,0,0,0,0,1,0,25.000000,1,45.00
485,1,0,63.0,1,0,0,0,1,28,1,...,0,0,0,1,0,1,0,46.000000,2,30.00
486,0,0,61.0,1,0,1,0,1,2,4,...,0,1,0,0,1,0,0,17.000000,1,40.00


In [25]:
# develop airport model
dat_airline_c["intercept"] = 1.0
dat_airline_slice = dat_airline_c[["Airline[KoreanFSC]",
                                    "GroupTravel[Yes]",
                                    "Destination[China]", "Destination[Japan]",
                                   "Airfare", "Age",
                                   "TripDuration", "Airport[Incheon]",
                                   "Nationality[ChinaJapan]", "DepartureTime[1218]"
                                   ,"intercept"]]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dat_airline_c["intercept"] = 1.0


In [26]:
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn import metrics
X_airline_train, X_airline_test, y_airline_train, y_airline_test = train_test_split(dat_airline_slice.iloc[:, 1:], 
                                                                                    dat_airline_slice["Airline[KoreanFSC]"], 
                                                                                    test_size = 0.3, random_state = 109)

logit_airline = sm.Logit(y_airline_train, X_airline_train).fit()
print(logit_airline.summary())

Optimization terminated successfully.
         Current function value: 0.559207
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:     Airline[KoreanFSC]   No. Observations:                  315
Model:                          Logit   Df Residuals:                      305
Method:                           MLE   Df Model:                            9
Date:                Wed, 06 Mar 2024   Pseudo R-squ.:                  0.1842
Time:                        13:04:04   Log-Likelihood:                -176.15
converged:                       True   LL-Null:                       -215.92
Covariance Type:            nonrobust   LLR p-value:                 1.993e-13
                              coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------------
GroupTravel[Yes]            1.0017      0.417      2.404      0.016       0.185   

In [27]:
y_airline_pred = logit_airline.predict(X_airline_test)
X_airline_test.loc[:, "prediction"] = 0
X_airline_test.loc[y_airline_pred > 0.5, "prediction"] = 1
pd.crosstab(y_airline_test,X_airline_test['prediction'],rownames =['actual'],colnames=['predicted'])

predicted,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,38,28
1,21,49


In [28]:
from sklearn.metrics import accuracy_score, precision_score, recall_score
accuracy_logit = accuracy_score(y_airline_test, X_airline_test['prediction'])
precision_logit = precision_score(y_airline_test, X_airline_test['prediction'])
recall_logit = recall_score(y_airline_test, X_airline_test['prediction'])

print(f"Accuracy of Logit model: {round(accuracy_logit, 4)}")
print(f"Precision of Logit model: {round(precision_logit, 4)}")
print(f"Recall of Logit model: {round(recall_logit, 4)}")

Accuracy of Logit model: 0.6397
Precision of Logit model: 0.6364
Recall of Logit model: 0.7


In [29]:
# Akaike Information Criterion
AIC = -2*-176.15+2*10
AIC

372.3

In [30]:
# Bayesian Information Criterion
import math
BIC = -2*-176.15+10*math.log(451)
BIC

413.4146733950268

In [31]:
# special vector machine
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC 
X_airline_train_svm, X_airline_test_svm, y_airline_train_svm, y_airline_test_svm = train_test_split(dat_airline_slice.iloc[:, 1:], 
                                                                                    dat_airline_slice["Airline[KoreanFSC]"], 
                                                                                    test_size = 0.3, random_state = 109)

scaler = StandardScaler()  
scaler.fit(X_airline_train_svm)
X_airline_train_svm = scaler.transform(X_airline_train_svm)  
X_airline_test_svm = scaler.transform(X_airline_test_svm) 

# set kernel to linear
svclassifier = SVC(kernel='linear') 
svclassifier.fit(X_airline_train_svm, y_airline_train_svm)  

y_pred_svm = svclassifier.predict(X_airline_test_svm)  
print(metrics.confusion_matrix(y_airline_test_svm, y_pred_svm)) 

accuracy_svm_linear = accuracy_score(y_airline_test_svm, y_pred_svm)
print(f"Accuracy of SVM (linear kernel): {round(accuracy_svm_linear, 4)}")
precision_svm_linear = precision_score(y_airline_test_svm, y_pred_svm)
print(f"Precision of SVM (linear kernel): {round(precision_svm_linear, 4)}")
recall_svm_linear = recall_score(y_airline_test_svm, y_pred_svm)
print(f"Recall of SVM (linear kernel): {round(recall_svm_linear, 4)}")


[[37 29]
 [21 49]]
Accuracy of SVM (linear kernel): 0.6324
Precision of SVM (linear kernel): 0.6282
Recall of SVM (linear kernel): 0.7


In [32]:
# calucalte the area under ROC (AUC)
from sklearn.metrics import roc_auc_score
auc = roc_auc_score(y_airline_test_svm, y_pred_svm)
print(f"AUC: {auc}")


AUC: 0.6303030303030303


In [33]:
# set cubic polynomial kernel
svclassifier2 = SVC(kernel='poly', degree=3)   
svclassifier2.fit(X_airline_train_svm, y_airline_train_svm)  
y_pred2_svm = svclassifier2.predict(X_airline_test_svm) 
print(metrics.confusion_matrix(y_airline_test_svm, y_pred2_svm)) 

accuracy_svm_poly = accuracy_score(y_airline_test_svm, y_pred2_svm)
print(f"Accuracy of SVM (poly kernel): {round(accuracy_svm_poly, 4)}")
precision_svm_poly = precision_score(y_airline_test_svm, y_pred2_svm)
print(f"Precision of SVM (poly kernel): {round(precision_svm_poly, 4)}")
recall_svm_poly = recall_score(y_airline_test_svm, y_pred2_svm)
print(f"Recall of SVM (poly kernel): {round(recall_svm_poly, 4)}")


[[40 26]
 [24 46]]
Accuracy of SVM (poly kernel): 0.6324
Precision of SVM (poly kernel): 0.6389
Recall of SVM (poly kernel): 0.6571


In [34]:
# set RBF kernel
svclassifier3 = SVC(kernel='rbf')   
svclassifier3.fit(X_airline_train_svm, y_airline_train_svm)  
y_pred3_svm = svclassifier3.predict(X_airline_test_svm) 
print(metrics.confusion_matrix(y_airline_test_svm, y_pred3_svm)) 

accuracy_svm_rbf = accuracy_score(y_airline_test_svm, y_pred3_svm)
print(f"Accuracy of SVM (rbf kernel): {round(accuracy_svm_rbf, 4)}")
precision_svm_rbf = precision_score(y_airline_test_svm, y_pred3_svm)
print(f"Precision of SVM (rbf kernel): {round(precision_svm_rbf, 4)}")
recall_svm_rbf = recall_score(y_airline_test_svm, y_pred3_svm)
print(f"Recall of SVM (rbf kernel): {round(recall_svm_rbf, 4)}")

[[45 21]
 [23 47]]
Accuracy of SVM (rbf kernel): 0.6765
Precision of SVM (rbf kernel): 0.6912
Recall of SVM (rbf kernel): 0.6714


In [35]:
# neural network
from sklearn.neural_network import MLPClassifier 
X_train_nn, X_test_nn, y_train_nn, y_test_nn = train_test_split(dat_airline_slice.iloc[:, 1:], 
                                                                dat_airline_slice["Airline[KoreanFSC]"], 
                                                                test_size = 0.3, random_state = 109)
scaler = StandardScaler()  
scaler.fit(X_train_nn)
X_train_nn = scaler.transform(X_train_nn)  
X_test_nn = scaler.transform(X_test_nn) 

# one hidden layer with three neurons
mlp = MLPClassifier(hidden_layer_sizes=(3), max_iter=1000)  
mlp.fit(X_train_nn, y_train_nn)  
y_pred_nn = mlp.predict(X_test_nn)  
print(metrics.confusion_matrix(y_test_nn, y_pred_nn)) 

accuracy_nn = accuracy_score(y_test_nn, y_pred_nn)
print(f"Accuracy of Neural Network (three neurons): {round(accuracy_nn, 4)}")
precision_nn = precision_score(y_test_nn, y_pred_nn)
print(f"Precision of Neural Network (three neurons): {round(precision_nn, 4)}")
recall_nn = recall_score(y_test_nn, y_pred_nn)
print(f"Recall of Neural Network (three neurons): {round(recall_nn, 4)}")

[[34 32]
 [16 54]]
Accuracy of Neural Network (three neurons): 0.6471
Precision of Neural Network (three neurons): 0.6279
Recall of Neural Network (three neurons): 0.7714
