# Mapping ports to airports

In [41]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

In [48]:
ports = pd.read_csv('../curated/ports_clean.csv')

In [11]:
ports.head()

Unnamed: 0,port,port_name,port_city,port_state,port_state_name
0,ALC,"ALCAN, AK",ALCAN,AK,Alaska
1,ANC,"ANCHORAGE, AK",ANCHORAGE,AK,Alaska
2,BAR,"BAKER AAF - BAKER ISLAND, AK",BAKER AAF - BAKER ISLAND,AK,Alaska
3,DAC,"DALTONS CACHE, AK",DALTONS CACHE,AK,Alaska
4,PIZ,"DEW STATION PT LAY DEW, AK",DEW STATION PT LAY DEW,AK,Alaska


In [49]:
airports = pd.read_csv('../curated/us_international_airports.csv')

In [5]:
airports.head()

Unnamed: 0,ident,type,name,elevation_ft,continent,iso_country,iso_region,municipality,gps_code,iata_code,local_code,coordinates,state
0,00A,heliport,Total Rf Heliport,11.0,,US,US-PA,Bensalem,00A,,00A,"-74.93360137939453, 40.07080078125",PA
1,00AA,small_airport,Aero B Ranch Airport,3435.0,,US,US-KS,Leoti,00AA,,00AA,"-101.473911, 38.704022",KS
2,00AK,small_airport,Lowell Field,450.0,,US,US-AK,Anchor Point,00AK,,00AK,"-151.695999146, 59.94919968",AK
3,00AL,small_airport,Epps Airpark,820.0,,US,US-AL,Harvest,00AL,,00AL,"-86.77030181884766, 34.86479949951172",AL
4,00AR,closed,Newport Hospital & Clinic Heliport,237.0,,US,US-AR,Newport,,,,"-91.254898, 35.6087",AR


Treat common airport terms as stopwords, because they can be wrongly matched:

In [271]:
def stop_words(df, n=100):
    """Return the first ``n`` values of the first column of ``df`` as a list.

    Intended for a term-frequency table that is already sorted by descending
    count, so the result is the ``n`` most common terms.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose first column holds the terms.
    n : int, default 100
        Number of leading rows to take. Fewer are returned if ``df`` is shorter.

    Returns
    -------
    list
        Values of the first column for the first ``n`` rows.
    """
    return df.iloc[:n, 0].to_list()

In [272]:
# NOTE: `airport_terms` is built in the "common words in airport names" section
# further down (cell In [250]), so that cell has to run before this one.
# `stop_words` returns a plain list (via `.to_list()`), despite the `_set` suffix.
airport_terms_set = airport_terms.pipe(stop_words)

In [274]:
def airport_corpus(row):
    """Build the text used to match an airport: its name followed by its municipality.

    Missing values (NaN/None) are skipped rather than interpolated, because
    formatting a NaN into the string would add a literal "nan" token to the
    airport's bag of words and distort the similarity score.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the keys ``'name'`` and ``'municipality'``.

    Returns
    -------
    str
        The non-missing parts joined by a single space; an empty string if
        both parts are missing.
    """
    parts = (row['name'], row['municipality'])
    return ' '.join(str(part) for part in parts if pd.notna(part))

In [275]:
def port_corpus(row):
    """Return the text used to match a port, which is just its ``port_city`` value."""
    city = row['port_city']
    return city

In [276]:
def calc_corpus(df, func):
    """Return a copy of ``df`` with a ``corpus`` column computed row by row.

    ``func`` is called once per row (``axis=1``) and its result becomes that
    row's ``corpus`` value. The input frame is not modified.
    """
    corpus = df.apply(func, axis=1)
    return df.assign(corpus=corpus)

In [277]:
def row_bag(row):
    """Return the set of distinct tokens in ``row['corpus']`` minus the airport stop terms.

    Tokenisation uses scikit-learn's ``CountVectorizer`` analyzer with the
    built-in English stop-word list (lower-casing and the default token
    pattern). Terms in the module-level ``airport_terms_set`` are then removed.

    The analyzer is used directly instead of fitting a vectorizer on a
    one-document corpus, which:
    * yields the same tokens as the fitted vocabulary for a single document,
    * does not rely on ``get_feature_names()``, which newer scikit-learn
      releases no longer provide,
    * returns an empty set, rather than raising "empty vocabulary", when the
      corpus consists only of stop words.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the key ``'corpus'`` holding a string.

    Returns
    -------
    set of str
    """
    analyzer = CountVectorizer(stop_words='english').build_analyzer()
    return set(analyzer(row['corpus'])).difference(airport_terms_set)

In [278]:
def calc_bags(df):
    """Return a copy of ``df`` with a ``bag`` column holding ``row_bag`` of every row."""
    bags = df.apply(row_bag, axis=1)
    return df.assign(bag=bags)

In [279]:
%%time
airport_bags = (
    airports
    .pipe(calc_corpus, airport_corpus)
    .pipe(calc_bags)
)

CPU times: user 10.9 s, sys: 47.1 ms, total: 11 s
Wall time: 11 s


In [280]:
airport_bags.head(1)

Unnamed: 0,ident,type,name,elevation_ft,continent,iso_country,iso_region,municipality,gps_code,iata_code,local_code,coordinates,state,corpus,bag
0,00A,heliport,Total Rf Heliport,11.0,,US,US-PA,Bensalem,00A,,00A,"-74.93360137939453, 40.07080078125",PA,Total Rf Heliport Bensalem,"{bensalem, total, rf}"


In [281]:
%%time
port_bags = (
    ports
    .pipe(calc_corpus, port_corpus)
    .pipe(calc_bags)
)

CPU times: user 289 ms, sys: 14.6 ms, total: 303 ms
Wall time: 291 ms


In [282]:
port_bags.head(1)

Unnamed: 0,port,port_name,port_city,port_state,port_state_name,corpus,bag
0,ALC,"ALCAN, AK",ALCAN,AK,Alaska,ALCAN,{alcan}


In [283]:
def cross_bags(df):
    """Pair every row of ``df`` with every port in the same state.

    Inner-joins ``df`` (key ``state``) against the module-level ``port_bags``
    frame (key ``port_state``); overlapping column names get pandas' default
    ``_x`` / ``_y`` suffixes.
    """
    return pd.merge(
        df,
        port_bags,
        how='inner',
        left_on='state',
        right_on='port_state',
    )

In [284]:
%%time
cross = (
    airport_bags
    .pipe(cross_bags)
)

CPU times: user 137 ms, sys: 31.1 ms, total: 168 ms
Wall time: 167 ms


In [285]:
cross

Unnamed: 0,ident,type,name,elevation_ft,continent,iso_country,iso_region,municipality,gps_code,iata_code,local_code,coordinates,state,corpus_x,bag_x,port,port_name,port_city,port_state,port_state_name,corpus_y,bag_y
0,00A,heliport,Total Rf Heliport,11.0,,US,US-PA,Bensalem,00A,,00A,"-74.93360137939453, 40.07080078125",PA,Total Rf Heliport Bensalem,"{bensalem, total, rf}",ERI,"ERIE, PA",ERIE,PA,Pennsylvania,ERIE,{erie}
1,00A,heliport,Total Rf Heliport,11.0,,US,US-PA,Bensalem,00A,,00A,"-74.93360137939453, 40.07080078125",PA,Total Rf Heliport Bensalem,"{bensalem, total, rf}",MDT,"HARRISBURG, PA",HARRISBURG,PA,Pennsylvania,HARRISBURG,{harrisburg}
2,00A,heliport,Total Rf Heliport,11.0,,US,US-PA,Bensalem,00A,,00A,"-74.93360137939453, 40.07080078125",PA,Total Rf Heliport Bensalem,"{bensalem, total, rf}",HSB,"HARRISONBURG, PA",HARRISONBURG,PA,Pennsylvania,HARRISONBURG,{harrisonburg}
3,00A,heliport,Total Rf Heliport,11.0,,US,US-PA,Bensalem,00A,,00A,"-74.93360137939453, 40.07080078125",PA,Total Rf Heliport Bensalem,"{bensalem, total, rf}",PHI,"PHILADELPHIA, PA",PHILADELPHIA,PA,Pennsylvania,PHILADELPHIA,{philadelphia}
4,00A,heliport,Total Rf Heliport,11.0,,US,US-PA,Bensalem,00A,,00A,"-74.93360137939453, 40.07080078125",PA,Total Rf Heliport Bensalem,"{bensalem, total, rf}",PIT,"PITTSBURG, PA",PITTSBURG,PA,Pennsylvania,PITTSBURG,{pittsburg}
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
354197,VI32,seaplane_base,Christiansted Harbor Seaplane Base,,,VI,VI-U-A,Christiansted St Croix,VI32,SSB,VI32,"-64.70490264892578, 17.74720001220703",VI,Christiansted Harbor Seaplane Base Christianst...,"{christiansted, croix}",CHA,"CHARLOTTE AMALIE, VI",CHARLOTTE AMALIE,VI,U.S. Virgin Islands,CHARLOTTE AMALIE,"{amalie, charlotte}"
354198,VI32,seaplane_base,Christiansted Harbor Seaplane Base,,,VI,VI-U-A,Christiansted St Croix,VI32,SSB,VI32,"-64.70490264892578, 17.74720001220703",VI,Christiansted Harbor Seaplane Base Christianst...,"{christiansted, croix}",CHR,"CHRISTIANSTED, VI",CHRISTIANSTED,VI,U.S. Virgin Islands,CHRISTIANSTED,{christiansted}
354199,VI32,seaplane_base,Christiansted Harbor Seaplane Base,,,VI,VI-U-A,Christiansted St Croix,VI32,SSB,VI32,"-64.70490264892578, 17.74720001220703",VI,Christiansted Harbor Seaplane Base Christianst...,"{christiansted, croix}",CRU,"CRUZ BAY, ST JOHN, VI","CRUZ BAY, ST JOHN",VI,U.S. Virgin Islands,"CRUZ BAY, ST JOHN",{cruz}
354200,VI32,seaplane_base,Christiansted Harbor Seaplane Base,,,VI,VI-U-A,Christiansted St Croix,VI32,SSB,VI32,"-64.70490264892578, 17.74720001220703",VI,Christiansted Harbor Seaplane Base Christianst...,"{christiansted, croix}",FRK,"FREDERIKSTED, VI",FREDERIKSTED,VI,U.S. Virgin Islands,FREDERIKSTED,{frederiksted}


In [286]:
def jaccard(row):
    """Jaccard similarity between the sets in ``row['bag_x']`` and ``row['bag_y']``.

    Returns |intersection| / |union|, or 0 when both sets are empty.
    """
    left, right = row['bag_x'], row['bag_y']
    combined = left.union(right)
    # Guard against division by zero when neither side has any tokens.
    if not combined:
        return 0
    return len(left.intersection(right)) / len(combined)

In [287]:
def cross_jaccard(df):
    """Return a copy of ``df`` with a ``jaccard`` column scored row by row with ``jaccard``."""
    scores = df.apply(jaccard, axis=1)
    return df.assign(jaccard=scores)

In [288]:
%%time
distances = cross.pipe(cross_jaccard)

CPU times: user 3.37 s, sys: 88.2 ms, total: 3.46 s
Wall time: 3.46 s


In [289]:
distances

Unnamed: 0,ident,type,name,elevation_ft,continent,iso_country,iso_region,municipality,gps_code,iata_code,local_code,coordinates,state,corpus_x,bag_x,port,port_name,port_city,port_state,port_state_name,corpus_y,bag_y,jaccard
0,00A,heliport,Total Rf Heliport,11.0,,US,US-PA,Bensalem,00A,,00A,"-74.93360137939453, 40.07080078125",PA,Total Rf Heliport Bensalem,"{bensalem, total, rf}",ERI,"ERIE, PA",ERIE,PA,Pennsylvania,ERIE,{erie},0.0
1,00A,heliport,Total Rf Heliport,11.0,,US,US-PA,Bensalem,00A,,00A,"-74.93360137939453, 40.07080078125",PA,Total Rf Heliport Bensalem,"{bensalem, total, rf}",MDT,"HARRISBURG, PA",HARRISBURG,PA,Pennsylvania,HARRISBURG,{harrisburg},0.0
2,00A,heliport,Total Rf Heliport,11.0,,US,US-PA,Bensalem,00A,,00A,"-74.93360137939453, 40.07080078125",PA,Total Rf Heliport Bensalem,"{bensalem, total, rf}",HSB,"HARRISONBURG, PA",HARRISONBURG,PA,Pennsylvania,HARRISONBURG,{harrisonburg},0.0
3,00A,heliport,Total Rf Heliport,11.0,,US,US-PA,Bensalem,00A,,00A,"-74.93360137939453, 40.07080078125",PA,Total Rf Heliport Bensalem,"{bensalem, total, rf}",PHI,"PHILADELPHIA, PA",PHILADELPHIA,PA,Pennsylvania,PHILADELPHIA,{philadelphia},0.0
4,00A,heliport,Total Rf Heliport,11.0,,US,US-PA,Bensalem,00A,,00A,"-74.93360137939453, 40.07080078125",PA,Total Rf Heliport Bensalem,"{bensalem, total, rf}",PIT,"PITTSBURG, PA",PITTSBURG,PA,Pennsylvania,PITTSBURG,{pittsburg},0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
354197,VI32,seaplane_base,Christiansted Harbor Seaplane Base,,,VI,VI-U-A,Christiansted St Croix,VI32,SSB,VI32,"-64.70490264892578, 17.74720001220703",VI,Christiansted Harbor Seaplane Base Christianst...,"{christiansted, croix}",CHA,"CHARLOTTE AMALIE, VI",CHARLOTTE AMALIE,VI,U.S. Virgin Islands,CHARLOTTE AMALIE,"{amalie, charlotte}",0.0
354198,VI32,seaplane_base,Christiansted Harbor Seaplane Base,,,VI,VI-U-A,Christiansted St Croix,VI32,SSB,VI32,"-64.70490264892578, 17.74720001220703",VI,Christiansted Harbor Seaplane Base Christianst...,"{christiansted, croix}",CHR,"CHRISTIANSTED, VI",CHRISTIANSTED,VI,U.S. Virgin Islands,CHRISTIANSTED,{christiansted},0.5
354199,VI32,seaplane_base,Christiansted Harbor Seaplane Base,,,VI,VI-U-A,Christiansted St Croix,VI32,SSB,VI32,"-64.70490264892578, 17.74720001220703",VI,Christiansted Harbor Seaplane Base Christianst...,"{christiansted, croix}",CRU,"CRUZ BAY, ST JOHN, VI","CRUZ BAY, ST JOHN",VI,U.S. Virgin Islands,"CRUZ BAY, ST JOHN",{cruz},0.0
354200,VI32,seaplane_base,Christiansted Harbor Seaplane Base,,,VI,VI-U-A,Christiansted St Croix,VI32,SSB,VI32,"-64.70490264892578, 17.74720001220703",VI,Christiansted Harbor Seaplane Base Christianst...,"{christiansted, croix}",FRK,"FREDERIKSTED, VI",FREDERIKSTED,VI,U.S. Virgin Islands,FREDERIKSTED,{frederiksted},0.0


In [290]:
def max_port_similarities(df):
    """Return the per-port maximum of every numeric column of ``df``.

    The result is indexed by ``port`` and contains one column per numeric
    input column (including ``jaccard``, the value of interest downstream).

    The original call was ``.max('jaccard')``. The first positional argument
    of ``GroupBy.max`` is ``numeric_only``, so the string did not select a
    column; it merely acted as a truthy flag. The flag is now passed
    explicitly, which keeps the output identical while making the behaviour
    obvious.
    """
    return df.groupby('port').max(numeric_only=True)

In [291]:
%%time
max_similarities = (
    distances
    .pipe(max_port_similarities)
)

CPU times: user 79.7 ms, sys: 22.2 ms, total: 102 ms
Wall time: 100 ms


In [292]:
max_similarities

Unnamed: 0_level_0,elevation_ft,jaccard
port,Unnamed: 1_level_1,Unnamed: 2_level_1
48Y,1799.0,0.5
5KE,5742.0,1.0
ABE,5130.0,0.5
ABG,2156.0,0.5
ABQ,8617.0,1.0
ABS,2156.0,0.5
ACY,1051.0,1.0
ADS,5000.0,0.5
ADT,5000.0,0.2
ADW,2933.0,0.25


In [293]:
def merge_similarities(df):
    """Recover the full candidate rows that achieve each port's best score.

    ``df`` is expected to be indexed by ``port`` and to carry ``jaccard`` and
    ``elevation_ft`` columns. ``elevation_ft`` is dropped, the index is turned
    back into a column, and the result is inner-joined on (``port``,
    ``jaccard``) against the module-level ``distances`` frame. Ties produce
    several rows per port.
    """
    keys = ['port', 'jaccard']
    best_scores = df.drop(columns=['elevation_ft']).reset_index()
    return best_scores.merge(distances, how='inner', left_on=keys, right_on=keys)

In [294]:
%%time
matches = max_similarities.pipe(merge_similarities)

CPU times: user 200 ms, sys: 38.1 ms, total: 238 ms
Wall time: 237 ms


In [295]:
pd.set_option('display.max_columns', None)

In [296]:
def remove_unmatched(df):
    """Keep only the rows whose ``jaccard`` score is strictly positive."""
    has_overlap = df['jaccard'].gt(0)
    return df[has_overlap]

In [297]:
def same_state(df):
    """Keep only the rows where the airport ``state`` equals the port's ``port_state``."""
    states_agree = df['state'].eq(df['port_state'])
    return df[states_agree]

In [298]:
pd.set_option('display.max_rows', 3000)

In [299]:
matches.head()

Unnamed: 0,port,jaccard,ident,type,name,elevation_ft,continent,iso_country,iso_region,municipality,gps_code,iata_code,local_code,coordinates,state,corpus_x,bag_x,port_name,port_city,port_state,port_state_name,corpus_y,bag_y
0,48Y,0.5,48Y,small_airport,Piney Pinecreek Border Airport,1078.0,,US,US-MN,Pinecreek,48Y,,48Y,"-95.98259735107422, 48.99959945678711",MN,Piney Pinecreek Border Airport Pinecreek,"{pinecreek, piney, border}","PINECREEK BORDER ARPT, MN",PINECREEK BORDER ARPT,MN,Minnesota,PINECREEK BORDER ARPT,"{pinecreek, arpt, border}"
1,5KE,1.0,5KE,seaplane_base,Ketchikan Harbor Seaplane Base,,,US,US-AK,Ketchikan,,WFB,5KE,"-131.677002, 55.349899",AK,Ketchikan Harbor Seaplane Base Ketchikan,{ketchikan},"KETCHIKAN, AK",KETCHIKAN,AK,Alaska,KETCHIKAN,{ketchikan}
2,5KE,1.0,PAKT,medium_airport,Ketchikan International Airport,89.0,,US,US-AK,Ketchikan,PAKT,KTN,KTN,"-131.7140045, 55.35559845",AK,Ketchikan International Airport Ketchikan,{ketchikan},"KETCHIKAN, AK",KETCHIKAN,AK,Alaska,KETCHIKAN,{ketchikan}
3,ABE,0.5,94WA,small_airport,Wishkah River Ranch Airport,36.0,,US,US-WA,Aberdeen,94WA,,94WA,"-123.77400207519531, 47.084800720214844",WA,Wishkah River Ranch Airport Aberdeen,"{wishkah, aberdeen}","ABERDEEN, WA",ABERDEEN,WA,Washington,ABERDEEN,{aberdeen}
4,ABE,0.5,WT00,heliport,Grays Harbor Community Hospital Heliport,700.0,,US,US-WA,Aberdeen,WT00,,WT00,"-123.847139, 46.979706",WA,Grays Harbor Community Hospital Heliport Aberdeen,"{grays, aberdeen}","ABERDEEN, WA",ABERDEEN,WA,Washington,ABERDEEN,{aberdeen}


In [302]:
(
    matches
    .pipe(remove_unmatched)
    #.pipe(same_state)
    [['state', 'port', 'port_name', 'municipality', 'name', 'jaccard']]
    #[['state', 'port', 'name']]
    #.groupby(['state', 'port']).count()
    .sort_values('jaccard', ascending=False)
)

Unnamed: 0,state,port,port_name,municipality,name,jaccard
30081,CA,LNB,"LONG BEACH, CA",Long Beach,Long Beach Memorial Medical Center Heliport,1.0
18805,ND,FAR,"FARGO, ND",Fargo,West Fargo Municipal Airport,1.0
30090,CA,LOS,"LOS ANGELES, CA",Los Angeles,Los Angeles International Airport,1.0
30089,CA,LOS,"LOS ANGELES, CA",Los Angeles,AT&T Center Heliport,1.0
30087,WA,LON,"LONGVIEW, WA",Longview,St John's Medical Center Heliport,1.0
30083,NM,LOB,"LORDSBURG, NM",Lordsburg,Lordsburg Municipal Airport,1.0
1,AK,5KE,"KETCHIKAN, AK",Ketchikan,Ketchikan Harbor Seaplane Base,1.0
27803,LA,LKC,"LAKE CHARLES, LA",Lake Charles,Lake Charles Memorial Heliport,1.0
27802,LA,LKC,"LAKE CHARLES, LA",Lake Charles,Lake Charles Regional Airport,1.0
27798,HI,LIH,"LIHUE, HI",Lihue,Lihue Airport,1.0


These are some of the matches that would have been hard to find with my previous methods:
```
49115	FL	SRQ	BRADENTON - SARASOTA, FL	Sarasota/Bradenton	Sarasota Bradenton International Airport	1.000000

25557	PR	JCP	CULEBRA - BENJAMIN RIVERA, PR	Culebra Island	Benjamin Rivera Noriega Airport	0.750000

312	AK	PIZ	DEW STATION PT LAY DEW, AK	Point Lay	Point Lay LRRS Airport	0.250000

51	ID	BOI	AIR TERM. (GOWEN FLD) BOISE, ID	Boise	Boise Air Terminal/Gowen Field	0.400000
```

There are some false positives: some of the matches with Jaccard similarity 0.25 are bad, but others are correct.

In [305]:
(
    matches
    .pipe(remove_unmatched)
    #.pipe(same_state)
)['port'].drop_duplicates().shape

(428,)

Let's get one match for each port:
- favor larger airports and those that are likely to be international
- do this by projecting airport types to ordinals, and an "international" flag
- the latter is determined by whether that term appears in the name, it is not surefire given that there are exceptions (such as Detroit's largest airports)
- sort by (international, jaccard, size) descending, group by port, and take the first row

In [325]:
airports['type'].drop_duplicates()

0              heliport
1         small_airport
4                closed
36        seaplane_base
1329        balloonport
6180     medium_airport
13507     large_airport
Name: type, dtype: object

In [328]:
# Ordinal rank for each airport `type` value. A higher number means a larger
# airport, which is preferred when several candidates match the same port
# (the ranking is sorted in descending order when picking one match per port).
airport_sizes = {   
    'closed': 0,
    'balloonport': 1,
    'seaplane_base': 2,
    'heliport': 3,
    'small_airport': 4,
    'medium_airport': 5,
    'large_airport': 6
}

In [330]:
def airport_type_to_size(df, sizes=None):
    """Return a copy of ``df`` with a ``size`` column: the ordinal rank of ``type``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``type`` column.
    sizes : mapping, optional
        Maps each airport type to its rank. Defaults to the module-level
        ``airport_sizes`` table.

    Returns
    -------
    pandas.DataFrame
        ``df`` plus a numeric ``size`` column. Types that are absent from the
        mapping become NaN instead of silently keeping their string value,
        which would make the column unsortable.

    Notes
    -----
    ``Series.map`` is used for the lookup; it is the idiomatic way to
    translate values through a dict and avoids the dtype-downcasting
    deprecation that ``Series.replace`` raises in recent pandas versions.
    The column must be accessed as ``df['size']`` because ``df.size`` is a
    built-in DataFrame attribute.
    """
    mapping = airport_sizes if sizes is None else sizes
    return df.assign(size=df['type'].map(mapping))

In [334]:
def airport_international_flag(df):
    """Return a copy of ``df`` with a boolean ``international`` column.

    The flag is set when the lower-cased airport ``name`` contains the
    substring ``international``.
    """
    lowered_names = df['name'].str.lower()
    is_international = lowered_names.str.contains('international')
    return df.assign(international=is_international)

In [352]:
def one_match(df):
    """Pick a single best airport row for every port.

    Candidates are ranked within each port by, in order of priority:
    ``international`` flag, ``jaccard`` score and ``size`` (all descending).
    The top-ranked row of every port is kept *as a whole row*.

    The previous implementation used ``groupby('port').first()``, which takes
    the first non-null value of each column independently. When the winning
    airport had a missing field (e.g. ``iata_code``), that field was silently
    filled from a lower-ranked airport, producing a row that mixed several
    airports. ``drop_duplicates`` keeps the winning row intact.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``port``, ``international``, ``jaccard`` and ``size``.

    Returns
    -------
    pandas.DataFrame
        One row per port, sorted by ``port``, with ``port`` as the first
        column and a fresh 0..n-1 index (same layout as before).
    """
    ranked = df.sort_values(
        ['port', 'international', 'jaccard', 'size'],
        ascending=[True, False, False, False],
    )
    best = (
        ranked
        # groupby() ignored rows without a port; keep that behaviour.
        .dropna(subset=['port'])
        .drop_duplicates(subset='port', keep='first')
        .reset_index(drop=True)
    )
    other_columns = [column for column in best.columns if column != 'port']
    return best[['port'] + other_columns]

In [356]:
def save_port_to_airports_mapping(df, path='../curated/port_airports.csv'):
    """Write the ``port`` -> ``ident`` mapping to a CSV file.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``port`` and ``ident`` columns; other columns are ignored.
    path : str or path-like, default '../curated/port_airports.csv'
        Destination file. The default keeps the original hard-coded location.

    Returns
    -------
    None
        The function is used for its side effect only. Rows are written
        sorted by ``port`` and without the index.
    """
    mapping = df.sort_values(['port'])[['port', 'ident']]
    mapping.to_csv(path, index=False)

In [357]:
(
    matches
    .pipe(remove_unmatched)
    .pipe(airport_type_to_size)
    .pipe(airport_international_flag)
    .pipe(one_match)
    #[['state', 'port', 'port_name', 'municipality', 'name', 'jaccard']]
    .sort_values(['state', 'municipality'])
)

Unnamed: 0,port,jaccard,ident,type,name,elevation_ft,continent,iso_country,iso_region,municipality,gps_code,iata_code,local_code,coordinates,state,corpus_x,bag_x,port_name,port_city,port_state,port_state_name,corpus_y,bag_y,size,international
20,ANC,1.0,6AK5,small_airport,Fire Island Airport,55.0,,US,US-AK,Anchorage,6AK5,,6AK5,"-150.16099548339844, 61.16830062866211",AK,Fire Island Airport Anchorage,{anchorage},"ANCHORAGE, AK",ANCHORAGE,AK,Alaska,ANCHORAGE,{anchorage},4,False
115,EGL,1.0,PAEG,small_airport,Eagle Airport,908.0,,US,US-AK,Eagle,PAEG,EAA,EAA,"-141.151001, 64.77639771",AK,Eagle Airport Eagle,{eagle},"EAGLE, AK",EAGLE,AK,Alaska,EAGLE,{eagle},4,False
255,MOS,0.333333,MOS,small_airport,Moses Point Airport,14.0,,US,US-AK,Elim,MOS,,MOS,"-162.0570068359375, 64.69819641113281",AK,Moses Point Airport Elim,"{moses, elim}","MOSES POINT INTERMEDIATE, AK",MOSES POINT INTERMEDIATE,AK,Alaska,MOSES POINT INTERMEDIATE,"{intermediate, moses}",4,False
140,FRB,1.0,PAFA,large_airport,Fairbanks International Airport,439.0,,US,US-AK,Fairbanks,PAFA,FAI,FAI,"-147.8560028, 64.81510162",AK,Fairbanks International Airport Fairbanks,{fairbanks},"FAIRBANKS, AK",FAIRBANKS,AK,Alaska,FAIRBANKS,{fairbanks},6,True
177,HOM,1.0,PAHO,medium_airport,Homer Airport,84.0,,US,US-AK,Homer,PAHO,HOM,HOM,"-151.4770050048828, 59.645599365234375",AK,Homer Airport Homer,{homer},"HOMER, AK",HOMER,AK,Alaska,HOMER,{homer},5,False
187,HYD,1.0,4Z7,seaplane_base,Hyder Seaplane Base,,,US,US-AK,Hyder,,WHD,4Z7,"-130.009975, 55.903324",AK,Hyder Seaplane Base Hyder,{hyder},"HYDER, AK",HYDER,AK,Alaska,HYDER,{hyder},2,False
200,JUN,1.0,PAJN,medium_airport,Juneau International Airport,21.0,,US,US-AK,Juneau,PAJN,JNU,JNU,"-134.5760040283203, 58.35499954223633",AK,Juneau International Airport Juneau,{juneau},"JUNEAU, AK",JUNEAU,AK,Alaska,JUNEAU,{juneau},5,True
1,5KE,1.0,PAKT,medium_airport,Ketchikan International Airport,89.0,,US,US-AK,Ketchikan,PAKT,KTN,KTN,"-131.7140045, 55.35559845",AK,Ketchikan International Airport Ketchikan,{ketchikan},"KETCHIKAN, AK",KETCHIKAN,AK,Alaska,KETCHIKAN,{ketchikan},5,True
202,KET,1.0,PAKT,medium_airport,Ketchikan International Airport,89.0,,US,US-AK,Ketchikan,PAKT,KTN,KTN,"-131.7140045, 55.35559845",AK,Ketchikan International Airport Ketchikan,{ketchikan},"KETCHIKAN, AK",KETCHIKAN,AK,Alaska,KETCHIKAN,{ketchikan},5,True
268,NIK,0.25,1AK5,heliport,Offshore Systems-Kenai Heliport,150.0,,US,US-AK,Nikiski,1AK5,,1AK5,"-151.30799865722656, 60.73970031738281",AK,Offshore Systems-Kenai Heliport Nikiski,"{nikiski, offshore, systems, kenai}","NIKISKI, AK",NIKISKI,AK,Alaska,NIKISKI,{nikiski},3,False


In [358]:
(
    matches
    .pipe(remove_unmatched)
    .pipe(airport_type_to_size)
    .pipe(airport_international_flag)
    .pipe(one_match)
    .pipe(save_port_to_airports_mapping)
)

In [359]:
pd.read_csv('../curated/port_airports.csv')

Unnamed: 0,port,ident
0,48Y,48Y
1,5KE,PAKT
2,ABE,94WA
3,ABG,VT38
4,ABQ,NM00
5,ABS,VT38
6,ACY,JY28
7,ADS,KADS
8,ADT,84TX
9,ADW,KADW


See how many immigration records can be matched to airports:

In [361]:
port_airports = (
    matches
    .pipe(remove_unmatched)
    .pipe(airport_type_to_size)
    .pipe(airport_international_flag)
    .pipe(one_match)
)

In [364]:
i94 = pd.read_csv('../../immigration_data_sample.csv')

There are 962 air arrivals out of 1000:

In [365]:
def only_air(df):
    """Keep only the rows whose ``i94mode`` equals 1 (treated here as arrival by air)."""
    arrived_by_air = df['i94mode'].eq(1)
    return df[arrived_by_air]

In [370]:
(
    i94
    .pipe(only_air)
)

Unnamed: 0.1,Unnamed: 0,cicid,i94yr,i94mon,i94cit,i94res,i94port,arrdate,i94mode,i94addr,depdate,i94bir,i94visa,count,dtadfile,visapost,occup,entdepa,entdepd,entdepu,matflag,biryear,dtaddto,gender,insnum,airline,admnum,fltno,visatype
0,2027561,4084316.0,2016.0,4.0,209.0,209.0,HHW,20566.0,1.0,HI,20573.0,61.0,2.0,1.0,20160422,,,G,O,,M,1955.0,07202016,F,,JL,5.658267e+10,00782,WT
1,2171295,4422636.0,2016.0,4.0,582.0,582.0,MCA,20567.0,1.0,TX,20568.0,26.0,2.0,1.0,20160423,MTR,,G,R,,M,1990.0,10222016,M,,*GA,9.436200e+10,XBLNG,B2
2,589494,1195600.0,2016.0,4.0,148.0,112.0,OGG,20551.0,1.0,FL,20571.0,76.0,2.0,1.0,20160407,,,G,O,,M,1940.0,07052016,M,,LH,5.578047e+10,00464,WT
3,2631158,5291768.0,2016.0,4.0,297.0,297.0,LOS,20572.0,1.0,CA,20581.0,25.0,2.0,1.0,20160428,DOH,,G,O,,M,1991.0,10272016,M,,QR,9.478970e+10,00739,B2
5,721257,1481650.0,2016.0,4.0,577.0,577.0,ATL,20552.0,1.0,GA,20606.0,51.0,2.0,1.0,20160408,,,T,N,,M,1965.0,10072016,M,,DL,7.368526e+08,910,B2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,2117909,4288772.0,2016.0,4.0,135.0,135.0,LVG,20567.0,1.0,NV,20572.0,32.0,2.0,1.0,20160423,,,G,O,,M,1984.0,07212016,M,,VS,5.914065e+10,00043,WT
996,1463022,2947585.0,2016.0,4.0,261.0,261.0,PSP,20560.0,1.0,HI,20567.0,35.0,1.0,1.0,20160416,JDD,,G,O,,M,1981.0,10152016,M,,SV,9.371186e+10,00041,B1
997,1414569,2883298.0,2016.0,4.0,111.0,111.0,MIA,20560.0,1.0,FL,20566.0,39.0,2.0,1.0,20160416,,,G,O,,M,1977.0,07142016,M,,AF,5.627747e+10,00090,WT
998,1094181,2264857.0,2016.0,4.0,582.0,582.0,ATL,20556.0,1.0,WI,20559.0,35.0,1.0,1.0,20160412,MTR,,G,O,,M,1981.0,10112016,M,,EV,9.334035e+10,05510,B1


In [366]:
def merge_immigration_with_airports(df, how):
    """Join immigration rows to their matched airport.

    ``df['i94port']`` is matched against ``port`` in the module-level
    ``port_airports`` frame; ``how`` is forwarded to pandas as the join type
    (e.g. ``'inner'`` or ``'left'``).
    """
    return pd.merge(
        df,
        port_airports,
        how=how,
        left_on='i94port',
        right_on='port',
    )

In [373]:
pd.set_option('display.max_rows', 50)

922 could be matched to a port:

In [369]:
(
    i94
    .pipe(only_air)
    .pipe(merge_immigration_with_airports, 'inner')
)

Unnamed: 0.1,Unnamed: 0,cicid,i94yr,i94mon,i94cit,i94res,i94port,arrdate,i94mode,i94addr,depdate,i94bir,i94visa,count,dtadfile,visapost,occup,entdepa,entdepd,entdepu,matflag,biryear,dtaddto,gender,insnum,airline,admnum,fltno,visatype,port,jaccard,ident,type,name,elevation_ft,continent,iso_country,iso_region,municipality,gps_code,iata_code,local_code,coordinates,state,corpus_x,bag_x,port_name,port_city,port_state,port_state_name,corpus_y,bag_y,size,international
0,2027561,4084316.0,2016.0,4.0,209.0,209.0,HHW,20566.0,1.0,HI,20573.0,61.0,2.0,1.0,20160422,,,G,O,,M,1955.0,07202016,F,,JL,5.658267e+10,00782,WT,HHW,0.5,HI38,heliport,The Queen's Medical Center Heliport,102.0,,US,US-HI,Honolulu,HI38,,HI38,"-157.853645, 21.308197",HI,The Queen's Medical Center Heliport Honolulu,"{honolulu, queen}","HONOLULU, HI",HONOLULU,HI,Hawaii,HONOLULU,{honolulu},3,False
1,1346671,2720247.0,2016.0,4.0,209.0,209.0,HHW,20559.0,1.0,HI,20563.0,54.0,2.0,1.0,20160415,,,G,O,,M,1962.0,07132016,M,,JL,5.621252e+10,00792,WT,HHW,0.5,HI38,heliport,The Queen's Medical Center Heliport,102.0,,US,US-HI,Honolulu,HI38,,HI38,"-157.853645, 21.308197",HI,The Queen's Medical Center Heliport Honolulu,"{honolulu, queen}","HONOLULU, HI",HONOLULU,HI,Hawaii,HONOLULU,{honolulu},3,False
2,2718518,5475279.0,2016.0,4.0,209.0,209.0,HHW,20573.0,1.0,HI,20577.0,39.0,2.0,1.0,20160429,,,G,O,,M,1977.0,07272016,M,,DL,5.945308e+10,00578,WT,HHW,0.5,HI38,heliport,The Queen's Medical Center Heliport,102.0,,US,US-HI,Honolulu,HI38,,HI38,"-157.853645, 21.308197",HI,The Queen's Medical Center Heliport Honolulu,"{honolulu, queen}","HONOLULU, HI",HONOLULU,HI,Hawaii,HONOLULU,{honolulu},3,False
3,2625403,5284360.0,2016.0,4.0,254.0,276.0,HHW,20572.0,1.0,HI,20576.0,11.0,2.0,1.0,20160428,,,G,O,,M,2005.0,07262016,M,,OZ,5.940346e+10,00232,WT,HHW,0.5,HI38,heliport,The Queen's Medical Center Heliport,102.0,,US,US-HI,Honolulu,HI38,,HI38,"-157.853645, 21.308197",HI,The Queen's Medical Center Heliport Honolulu,"{honolulu, queen}","HONOLULU, HI",HONOLULU,HI,Hawaii,HONOLULU,{honolulu},3,False
4,688961,1396100.0,2016.0,4.0,209.0,209.0,HHW,20552.0,1.0,HI,20556.0,46.0,2.0,1.0,20160408,,,G,O,,M,1970.0,07062016,F,,NH,5.580982e+10,00184,WT,HHW,0.5,HI38,heliport,The Queen's Medical Center Heliport,102.0,,US,US-HI,Honolulu,HI38,,HI38,"-157.853645, 21.308197",HI,The Queen's Medical Center Heliport Honolulu,"{honolulu, queen}","HONOLULU, HI",HONOLULU,HI,Hawaii,HONOLULU,{honolulu},3,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
917,2834630,5714501.0,2016.0,4.0,148.0,112.0,FMY,20574.0,1.0,FL,20600.0,37.0,2.0,1.0,20160430,,,G,O,,M,1979.0,07282016,M,,AB,5.953445e+10,07008,WT,FMY,0.5,KFMY,medium_airport,Page Field,17.0,,US,US-FL,Fort Myers,KFMY,FMY,FMY,"-81.86329650879999, 26.58659935",FL,Page Field Fort Myers,"{myers, page}","FORT MYERS, FL",FORT MYERS,FL,Florida,FORT MYERS,{myers},5,False
918,366458,751810.0,2016.0,4.0,582.0,582.0,OAK,20548.0,1.0,CA,20552.0,36.0,2.0,1.0,20160404,GDL,,G,O,,M,1980.0,10032016,M,,Y4,9.276826e+10,00990,B2,OAK,0.5,KOAK,large_airport,Metropolitan Oakland International Airport,9.0,,US,US-CA,Oakland,KOAK,OAK,OAK,"-122.221001, 37.721298",CA,Metropolitan Oakland International Airport Oak...,"{metropolitan, oakland}","OAKLAND, CA",OAKLAND,CA,California,OAKLAND,{oakland},6,True
919,1323990,2693162.0,2016.0,4.0,130.0,130.0,OAK,20559.0,1.0,CA,20566.0,54.0,2.0,1.0,20160415,,,G,O,,M,1962.0,07132016,F,,DY,5.621531e+10,07067,WT,OAK,0.5,KOAK,large_airport,Metropolitan Oakland International Airport,9.0,,US,US-CA,Oakland,KOAK,OAK,OAK,"-122.221001, 37.721298",CA,Metropolitan Oakland International Airport Oak...,"{metropolitan, oakland}","OAKLAND, CA",OAKLAND,CA,California,OAKLAND,{oakland},6,True
920,1382732,2814946.0,2016.0,4.0,582.0,582.0,ONT,20559.0,1.0,CA,,51.0,2.0,1.0,20160415,CDJ,,G,,,,1965.0,10132016,F,,AM,9.355623e+10,00780,B2,ONT,1.0,KONT,large_airport,Ontario International Airport,944.0,,US,US-CA,Ontario,KONT,ONT,ONT,"-117.60099792480469, 34.055999755859375",CA,Ontario International Airport Ontario,{ontario},"ONTARIO, CA",ONTARIO,CA,California,ONTARIO,{ontario},6,True


Let's look at those that couldn't be matched:

In [382]:
def unmatched_ports(df):
    """Return the distinct ``i94port`` codes of rows that have no airport ``ident``."""
    lacks_airport = df['ident'].isnull()
    codes = df[lacks_airport]['i94port']
    return codes.drop_duplicates()

In [384]:
ports_unmatched_to_airport = (
    i94
    .pipe(only_air)
    .pipe(merge_immigration_with_airports, 'left')
    .pipe(unmatched_ports)
)

In [385]:
ports_unmatched_to_airport

1      MCA
13     SAI
14     NAS
48     VCV
49     DUB
88     SHA
194    MON
375    TOR
421    MAA
901    NCA
952    HAM
Name: i94port, dtype: object

Let's focus on the port codes of those:
- only one of those ports (MCA) could be matched to a clean port
- the rest are presumably foreign ports, which I should have dropped earlier

In [377]:
ports = pd.read_csv('../curated/ports_clean.csv')

In [386]:
def merge_unmatched_ports(df):
    """Keep the rows of ``df`` whose ``port`` is among the unmatched port codes.

    Inner-joins ``df`` on ``port`` against the module-level
    ``ports_unmatched_to_airport`` series (key ``i94port``).
    """
    return pd.merge(
        df,
        ports_unmatched_to_airport,
        how='inner',
        left_on='port',
        right_on='i94port',
    )

In [387]:
(
    ports
    .pipe(merge_unmatched_ports)
)

Unnamed: 0,port,port_name,port_city,port_state,port_state_name,i94port
0,MCA,"MCALLEN, TX",MCALLEN,TX,Texas,MCA


In [389]:
airports[airports['iata_code'] == 'MFE']

Unnamed: 0,ident,type,name,elevation_ft,continent,iso_country,iso_region,municipality,gps_code,iata_code,local_code,coordinates,state
15146,KMFE,medium_airport,Mc Allen Miller International Airport,107.0,,US,US-TX,Mc Allen,KMFE,MFE,MFE,"-98.23860168, 26.17580032",TX


This seems like an important airport, and worth adding manually

- it is also quite clear why it couldn't be matched, "MCALLEN" vs "Mc Allen"
- TODO maybe better to use a different kind of shingle, 1-char n-grams after removing spaces

First confirm that it wasn't matched, to prevent accidental duplicates:

In [390]:
def manually_add_mcallen_airport(df):
    """Add the McAllen, TX mapping (port ``MCA`` -> airport ``KMFE``) if it is missing.

    The automatic matching misses this pair because the port is spelled
    "MCALLEN" while the airport data spells it "Mc Allen".

    The previous stub only printed "TODO" and returned ``None``, which would
    break any ``.pipe`` chain it was placed in. This version always returns a
    DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Port-to-airport mapping with at least ``port`` and ``ident`` columns.

    Returns
    -------
    pandas.DataFrame
        ``df`` unchanged if port ``MCA`` is already present (prevents
        accidental duplicates); otherwise a new frame with one extra row.
        Only ``port`` and ``ident`` are filled in for the added row; any other
        columns of ``df`` are left missing.
    """
    if df['port'].eq('MCA').any():
        return df
    mcallen = pd.DataFrame([{'port': 'MCA', 'ident': 'KMFE'}])
    return pd.concat([df, mcallen], ignore_index=True)

### common words in airport names

In [187]:
from collections import Counter

In [199]:
airport_bags['bag']

0                          {bensalem, heliport, total, rf}
1                            {aero, leoti, ranch, airport}
2                           {point, lowell, field, anchor}
3                                 {airpark, harvest, epps}
4                    {newport, heliport, hospital, clinic}
                               ...                        
22887    {heliport, waterfront, amalie, thomas, st, cha...
22888              {heliport, reef, frenchman, thomas, st}
22889    {beach, heliport, resort, amalie, stouffer, gr...
22890    {seaplane, base, amalie, harbor, thomas, st, c...
22891    {christiansted, croix, seaplane, base, harbor,...
Name: bag, Length: 22892, dtype: object

In [248]:
def term_frequency(df):
    """Count how often each lower-cased, space-separated word occurs in ``df['name']``.

    Returns a DataFrame with columns ``term`` and ``count``, sorted by
    ``count`` in descending order. The index reflects the order in which each
    term was first seen.
    """
    counts = Counter()
    for name in df['name'].to_list():
        counts.update(token.lower() for token in name.split(' '))
    table = pd.DataFrame(counts.items(), columns=['term', 'count'])
    return table.sort_values('count', ascending=False)

In [250]:
airport_terms = (
    airports
    .pipe(term_frequency)
)

In [254]:
airport_terms[:100]

Unnamed: 0,term,count
6,airport,12032
2,heliport,6417
8,field,1996
12,hospital,1551
65,municipal,1256
5,ranch,1216
44,center,936
147,county,876
43,medical,761
77,base,702
