See https://stackoverflow.com/questions/43938672/efficient-string-matching-in-apache-spark/45602605#45602605

In [17]:
import findspark
findspark.init()

from pyspark.ml import Pipeline
from pyspark.ml.feature import RegexTokenizer, NGram, HashingTF, MinHashLSH, StopWordsRemover, SQLTransformer

import pyspark.sql.functions as F

import pandas as pd
pd.set_option('display.max_rows', 1000)

from etl import SparkETL
from stopwords import Stopwords

In [2]:
etl = SparkETL()

In [3]:
spark = etl.get_spark()

22/05/07 14:56:04 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [4]:
airports = etl.read_clean_table('airport')

[Stage 0:>                                                          (0 + 1) / 1]                                                                                

In [5]:
ports = etl.read_clean_table('port')

In [6]:
def db(df):
    return (
        df
        .select(
            'airport_id',
            'state_id',
            'international',
            'type_id',
            F.lower(F.concat(
                F.col('city'), F.lit(' '), F.col('name')
            ))
            .alias('text')
        )
        .where(~F.col('text').isNull())
    )

In [67]:
def query(df):
    return df.select(
        'port_id',
        F.col('state_id').alias('port_state_id'),
        F.lower(F.col('name')).alias('text'),
    )

In [68]:
stopwords = (
    Stopwords([
        airports.pipe(db),
        ports.pipe(query)
    ])
    .stopwords(50)
)

                                                                                

In [69]:
def create_model(df):
    """
    df is the db
    """
    
    # to prevent https://stackoverflow.com/questions/55628049/string-matching-using-ml-pipeline-is-throwing-error-failed-to-execute-user-defin
    # see https://stackoverflow.com/questions/53371039/apache-spark-ml-pipeline-filter-empty-rows-in-dataset
    emptyRemover = SQLTransformer().setStatement(
          "SELECT * FROM __THIS__ WHERE size(tokens) > 0"
    )
    
    return (
        Pipeline(stages=[
            RegexTokenizer(
                pattern=Stopwords.token_separator, inputCol="text", outputCol="tokens0", minTokenLength=3
            ),
            #NGram(n=1, inputCol="tokens", outputCol="ngrams"),
            #HashingTF(inputCol="ngrams", outputCol="vectors"),
            StopWordsRemover(stopWords=stopwords, inputCol="tokens0", outputCol="tokens"),
            emptyRemover,
            HashingTF(inputCol="tokens", outputCol="vectors"),
            MinHashLSH(inputCol="vectors", outputCol="lsh")
        ])
        .fit(df)
    )

In [70]:
airport_db = db(airports)

In [71]:
model = create_model(airport_db)

22/05/07 17:07:31 WARN StopWordsRemover: Default locale set was [en_VN]; however, it was not found in available locales in JVM, falling back to en_US locale. Set param `locale` in order to respect another locale.


In [72]:
db_hashed = model.transform(airport_db)

In [73]:
db_hashed.toPandas()

                                                                                

Unnamed: 0,airport_id,state_id,international,type_id,text,tokens0,tokens,vectors,lsh
0,ZNC,AK,False,4,nyac nyac airport,"[nyac, nyac, airport]","[nyac, nyac]","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",[[1069733105.0]]
1,Z95,AZ,False,4,cibecue cibecue airport,"[cibecue, cibecue, airport]","[cibecue, cibecue]","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",[[716829069.0]]
2,Z93,AK,False,4,copper center copper center 2 airport,"[copper, center, copper, center, airport]","[copper, copper]","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",[[1683449085.0]]
3,Z92,MI,False,4,harsens island harsens island airport,"[harsens, island, harsens, island, airport]","[harsens, harsens]","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",[[1533694155.0]]
4,Z91,AK,False,4,birch creek birch creek airport,"[birch, creek, birch, creek, airport]","[birch, birch]","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",[[1441056856.0]]
...,...,...,...,...,...,...,...,...,...
22766,NSAS,AS,False,4,ofu village ofu village airport,"[ofu, village, ofu, village, airport]","[ofu, village, ofu, village]","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",[[1279419397.0]]
22767,AS-TAV,AS,False,4,tau village tau airport,"[tau, village, tau, airport]","[tau, village, tau]","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",[[1619637907.0]]
22768,PTRO,PW,False,5,babelthuap island babelthuap airport,"[babelthuap, island, babelthuap, airport]","[babelthuap, babelthuap]","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",[[626057715.0]]
22769,C23,PW,False,4,babelthuap island peleliu airport,"[babelthuap, island, peleliu, airport]","[babelthuap, peleliu]","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",[[626057715.0]]


In [74]:
query_hashed = model.transform(ports.pipe(query))

In [75]:
query_hashed.toPandas()

Unnamed: 0,port_id,port_state_id,text,tokens0,tokens,vectors,lsh
0,ALC,AK,alcan,[alcan],[alcan],"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",[[1012094596.0]]
1,ANC,AK,anchorage,[anchorage],[anchorage],"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",[[303125146.0]]
2,BAR,AK,baker aaf - baker island,"[baker, aaf, baker, island]","[baker, aaf, baker]","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",[[258879029.0]]
3,DAC,AK,daltons cache,"[daltons, cache]","[daltons, cache]","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",[[96918252.0]]
4,PIZ,AK,dew station pt lay dew,"[dew, station, lay, dew]","[dew, lay, dew]","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",[[103039126.0]]
5,DTH,AK,dutch harbor,"[dutch, harbor]","[dutch, harbor]","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",[[678093091.0]]
6,EGL,AK,eagle,[eagle],[eagle],"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",[[1454893026.0]]
7,FRB,AK,fairbanks,[fairbanks],[fairbanks],"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",[[607454173.0]]
8,HOM,AK,homer,[homer],[homer],"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",[[1168838925.0]]
9,HYD,AK,hyder,[hyder],[hyder],"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",[[508556509.0]]


In [81]:
distances = model.stages[-1].approxSimilarityJoin(db_hashed, query_hashed, 1)

In [82]:
distances.toPandas()

                                                                                

Unnamed: 0,datasetA,datasetB,distCol
0,"(WV52, WV, False, 4, green bank green bank obs...","(CUT, MT, cut bank, [cut, bank], [cut, bank], ...",0.750000
1,"(WI97, WI, False, 4, marshall mathaire field, ...","(HYD, AK, hyder, [hyder], [hyder], (0.0, 0.0, ...",0.500000
2,"(WA09, WA, False, 4, roche harbor roche harbor...","(FRI, WA, friday harbor, [friday, harbor], [fr...",0.666667
3,"(S00, MT, False, 4, opheim opheim airport, [op...","(OPH, MT, opheim, [opheim], [opheim], (0.0, 0....",0.000000
4,"(PN66, PA, False, 4, new alexandria dunlea air...","(AXB, NY, alexandria bay, [alexandria, bay], [...",0.500000
...,...,...,...
3854,"(65TX, TX, False, 4, salt flat flying eagle ra...","(SLC, UT, salt lake city, [salt, lake, city], ...",0.666667
3855,"(3OK3, OK, False, 2, locust grove cedar crest ...","(CID, IA, cedar rapids/iowa city, [cedar, rapi...",0.833333
3856,"(34OR, OR, False, 2, portland providence medic...","(POM, ME, portland, [portland], [portland], (0...",0.500000
3857,"(31AR, AR, False, 2, clinton van buren county ...","(VNB, ME, van buren, [van, buren], [van, buren...",0.333333


In [76]:
def project_distances(df):
    return (
        df
        .select(
            F.col('datasetA.airport_id').alias('airport_id'),
            F.col('datasetA.text').alias('airport_text'),
            F.col('datasetA.state_id').alias('state_id'),
            F.col('datasetA.international').alias('international'),
            F.col('datasetA.type_id').alias('type_id'),
            F.col('datasetB.port_id').alias('port_id'),
            F.col('datasetB.port_state_id').alias('port_state_id'),
            F.col('datasetB.text').alias('port_text'),
            'distCol'
        )
    )

In [78]:
def same_state(df):
    return df.where(F.col('port_state_id') == F.col('state_id'))

In [105]:
def sort_distances(df):
    return df.sort(
        F.col('port_state_id'),
        F.col('port_id'),
        F.desc(F.col('international')),
        F.desc(F.col('type_id')),
        F.desc(F.col('distCol')),
    )

In [109]:
def best_distance(df):
    return (
        df
        .groupby('port_id')
        .agg(
            F.first('port_state_id'),
            F.first('port_text'),
            F.first('airport_id').alias('airport_id'),
            F.first('airport_text'),
            F.first('distCol').alias('distance')
        )
    )

In [110]:
def project_schema(df):
    return df.select('port_id', 'airport_id')

In [114]:
def port_to_airport(df):
    return (
        df
        .pipe(project_distances)
        .pipe(same_state)
        .pipe(sort_distances)
        .pipe(best_distance)
        .pipe(project_schema)
    )

In [116]:
etl.save_clean_table(port_to_airport(distances).coalesce(1), 'port_to_airport')

                                                                                

In [118]:
etl.read_clean_table('port_to_airport').toPandas()

Unnamed: 0,port_id,airport_id
0,BGM,KBGR
1,FMY,FL90
2,LEB,KLEB
3,DNS,S28
4,EGL,PAEG
5,HEL,KHLN
6,PSE,TJPS
7,FRN,FHB
8,HVR,KHVR
9,BUR,8CL3


TODO NYC JFK not matched