In [1]:
# Do all imports and installs here - Done
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
import pandas as pd
import re
import configparser
import os

Parse config file for path configurations - Done

In [2]:
config = configparser.ConfigParser()
config.read('etl.cfg')

input_data_source = config.get('DIR','INPUT_DIR')
output_processed_data = config.get('DIR','OUTPUT_DIR')

i94immi_dataset = config.get('DATA','I94_IMMI')
worldtempe_dataset = config.get('DATA','WORLD_TEMPE')
citydemo_dataset = config.get('DATA','CITY_DEMOGRAPHIC')
airport_dataset = config.get('DATA','AIR_PORT')

Create Spark session:

In [5]:
# Create Spark session - Using for droduction only
spark = SparkSession.builder\
            .config("spark.jars.repositories", "https://repos.spark-packages.org/")\
            .config("spark.jars.packages", "saurfang:spark-sas7bdat:2.0.0-s_2.11")\
            .enableHiveSupport()\
            .getOrCreate()

### Describe and Gather Data

Take a look on datasets description includes schema, sample records, number of rows, attributes, number of data file (if need).  And then choose datasets will be using for data modeling.

Dataset assessment by counting number of records and data size. Take a look on schema, data column structure, attributes and some of sample records.

#### I94 Immigration

In [3]:
# Using for production
# i94immi_dataset = '../../data/18-83510-I94-Data-2016/i94_apr16_sub.sas7bdat'
# i94_immi_df = spark.read.format('com.github.saurfang.sas.spark').load('../../data/18-83510-I94-Data-2016/i94_apr16_sub.sas7bdat')

# Using for local development
i94immi_dataset = 'immigration_data_sample.csv'
i94immi_df = pd.read_csv(i94immi_dataset,sep=",")

In [4]:
pd.set_option('display.max_columns', 50)
i94immi_df.head(10)

Unnamed: 0.1,Unnamed: 0,cicid,i94yr,i94mon,i94cit,i94res,i94port,arrdate,i94mode,i94addr,depdate,i94bir,i94visa,count,dtadfile,visapost,occup,entdepa,entdepd,entdepu,matflag,biryear,dtaddto,gender,insnum,airline,admnum,fltno,visatype
0,2027561,4084316.0,2016.0,4.0,209.0,209.0,HHW,20566.0,1.0,HI,20573.0,61.0,2.0,1.0,20160422,,,G,O,,M,1955.0,7202016,F,,JL,56582670000.0,00782,WT
1,2171295,4422636.0,2016.0,4.0,582.0,582.0,MCA,20567.0,1.0,TX,20568.0,26.0,2.0,1.0,20160423,MTR,,G,R,,M,1990.0,10222016,M,,*GA,94362000000.0,XBLNG,B2
2,589494,1195600.0,2016.0,4.0,148.0,112.0,OGG,20551.0,1.0,FL,20571.0,76.0,2.0,1.0,20160407,,,G,O,,M,1940.0,7052016,M,,LH,55780470000.0,00464,WT
3,2631158,5291768.0,2016.0,4.0,297.0,297.0,LOS,20572.0,1.0,CA,20581.0,25.0,2.0,1.0,20160428,DOH,,G,O,,M,1991.0,10272016,M,,QR,94789700000.0,00739,B2
4,3032257,985523.0,2016.0,4.0,111.0,111.0,CHM,20550.0,3.0,NY,20553.0,19.0,2.0,1.0,20160406,,,Z,K,,M,1997.0,7042016,F,,,42322570000.0,LAND,WT
5,721257,1481650.0,2016.0,4.0,577.0,577.0,ATL,20552.0,1.0,GA,20606.0,51.0,2.0,1.0,20160408,,,T,N,,M,1965.0,10072016,M,,DL,736852600.0,910,B2
6,1072780,2197173.0,2016.0,4.0,245.0,245.0,SFR,20556.0,1.0,CA,20635.0,48.0,2.0,1.0,20160412,,,T,O,,M,1968.0,10112016,F,,CX,786312200.0,870,B2
7,112205,232708.0,2016.0,4.0,113.0,135.0,NYC,20546.0,1.0,NY,20554.0,33.0,2.0,1.0,20160402,,,G,O,,M,1983.0,6302016,F,,BA,55474490000.0,00117,WT
8,2577162,5227851.0,2016.0,4.0,131.0,131.0,CHI,20572.0,1.0,IL,20575.0,39.0,2.0,1.0,20160428,,,O,O,,M,1977.0,7262016,,,LX,59413420000.0,00008,WT
9,10930,13213.0,2016.0,4.0,116.0,116.0,LOS,20545.0,1.0,CA,20553.0,35.0,2.0,1.0,20160401,,,O,O,,M,1981.0,6292016,,,AA,55449790000.0,00109,WT


In [15]:
i94immi_df.columns

Index(['Unnamed: 0', 'cicid', 'i94yr', 'i94mon', 'i94cit', 'i94res', 'i94port',
       'arrdate', 'i94mode', 'i94addr', 'depdate', 'i94bir', 'i94visa',
       'count', 'dtadfile', 'visapost', 'occup', 'entdepa', 'entdepd',
       'entdepu', 'matflag', 'biryear', 'dtaddto', 'gender', 'insnum',
       'airline', 'admnum', 'fltno', 'visatype'],
      dtype='object')

In [16]:
# Show schema
i94immi_df.info(50)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 29 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  1000 non-null   int64  
 1   cicid       1000 non-null   float64
 2   i94yr       1000 non-null   float64
 3   i94mon      1000 non-null   float64
 4   i94cit      1000 non-null   float64
 5   i94res      1000 non-null   float64
 6   i94port     1000 non-null   object 
 7   arrdate     1000 non-null   float64
 8   i94mode     1000 non-null   float64
 9   i94addr     941 non-null    object 
 10  depdate     951 non-null    float64
 11  i94bir      1000 non-null   float64
 12  i94visa     1000 non-null   float64
 13  count       1000 non-null   float64
 14  dtadfile    1000 non-null   int64  
 15  visapost    382 non-null    object 
 16  occup       4 non-null      object 
 17  entdepa     1000 non-null   object 
 18  entdepd     954 non-null    object 
 19  entdepu     0 non-null      

In [6]:
# Indexing
i94immi_df.index

RangeIndex(start=0, stop=1000, step=1)

In [12]:
# Show dataset shape
i94immi_df.shape

(1000, 29)

In [13]:
# Show dataset sample records
i94immi_df

Unnamed: 0.1,Unnamed: 0,cicid,i94yr,i94mon,i94cit,i94res,i94port,arrdate,i94mode,i94addr,...,entdepu,matflag,biryear,dtaddto,gender,insnum,airline,admnum,fltno,visatype
0,2027561,4084316.0,2016.0,4.0,209.0,209.0,HHW,20566.0,1.0,HI,...,,M,1955.0,07202016,F,,JL,5.658267e+10,00782,WT
1,2171295,4422636.0,2016.0,4.0,582.0,582.0,MCA,20567.0,1.0,TX,...,,M,1990.0,10222016,M,,*GA,9.436200e+10,XBLNG,B2
2,589494,1195600.0,2016.0,4.0,148.0,112.0,OGG,20551.0,1.0,FL,...,,M,1940.0,07052016,M,,LH,5.578047e+10,00464,WT
3,2631158,5291768.0,2016.0,4.0,297.0,297.0,LOS,20572.0,1.0,CA,...,,M,1991.0,10272016,M,,QR,9.478970e+10,00739,B2
4,3032257,985523.0,2016.0,4.0,111.0,111.0,CHM,20550.0,3.0,NY,...,,M,1997.0,07042016,F,,,4.232257e+10,LAND,WT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,2117909,4288772.0,2016.0,4.0,135.0,135.0,LVG,20567.0,1.0,NV,...,,M,1984.0,07212016,M,,VS,5.914065e+10,00043,WT
996,1463022,2947585.0,2016.0,4.0,261.0,261.0,PSP,20560.0,1.0,HI,...,,M,1981.0,10152016,M,,SV,9.371186e+10,00041,B1
997,1414569,2883298.0,2016.0,4.0,111.0,111.0,MIA,20560.0,1.0,FL,...,,M,1977.0,07142016,M,,AF,5.627747e+10,00090,WT
998,1094181,2264857.0,2016.0,4.0,582.0,582.0,ATL,20556.0,1.0,WI,...,,M,1981.0,10112016,M,,EV,9.334035e+10,05510,B1


In [9]:
# N/A values estimation
i94immi_df.isna().sum()

Unnamed: 0       0
cicid            0
i94yr            0
i94mon           0
i94cit           0
i94res           0
i94port          0
arrdate          0
i94mode          0
i94addr         59
depdate         49
i94bir           0
i94visa          0
count            0
dtadfile         0
visapost       618
occup          996
entdepa          0
entdepd         46
entdepu       1000
matflag         46
biryear          0
dtaddto          0
gender         141
insnum         965
airline         33
admnum           0
fltno            8
visatype         0
dtype: int64

In [10]:
# NULL values estimation
i94immi_df.isnull().sum()

Unnamed: 0       0
cicid            0
i94yr            0
i94mon           0
i94cit           0
i94res           0
i94port          0
arrdate          0
i94mode          0
i94addr         59
depdate         49
i94bir           0
i94visa          0
count            0
dtadfile         0
visapost       618
occup          996
entdepa          0
entdepd         46
entdepu       1000
matflag         46
biryear          0
dtaddto          0
gender         141
insnum         965
airline         33
admnum           0
fltno            8
visatype         0
dtype: int64

#### World Temperature Data

In [11]:
# worldtempe_dataset = '../../data2/GlobalLandTemperaturesByCity.csv'
worldtempe_dataset = 'GlobalLandTemperaturesByCity_part9.csv'
worldtempe_df = pd.read_csv(worldtempe_dataset,sep=",")

In [12]:
# Show schema
worldtempe_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 7 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   dt                             100000 non-null  object 
 1   AverageTemperature             96552 non-null   float64
 2   AverageTemperatureUncertainty  96552 non-null   float64
 3   City                           100000 non-null  object 
 4   Country                        100000 non-null  object 
 5   Latitude                       100000 non-null  object 
 6   Longitude                      100000 non-null  object 
dtypes: float64(2), object(5)
memory usage: 5.3+ MB


In [13]:
# Indexing
worldtempe_df.index

RangeIndex(start=0, stop=100000, step=1)

In [14]:
# Show dataset shape
worldtempe_df.shape

(100000, 7)

In [15]:
# Show dataset sample records
worldtempe_df.head()

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude
0,1748-09-01,,,Belgorod,Russia,50.63N,36.76E
1,1748-10-01,,,Belgorod,Russia,50.63N,36.76E
2,1748-11-01,,,Belgorod,Russia,50.63N,36.76E
3,1748-12-01,,,Belgorod,Russia,50.63N,36.76E
4,1749-01-01,,,Belgorod,Russia,50.63N,36.76E


In [16]:
# N/A values estimation
worldtempe_df.isna().sum()

dt                                  0
AverageTemperature               3448
AverageTemperatureUncertainty    3448
City                                0
Country                             0
Latitude                            0
Longitude                           0
dtype: int64

In [17]:
# NULL values estimation
worldtempe_df.isnull().sum()

dt                                  0
AverageTemperature               3448
AverageTemperatureUncertainty    3448
City                                0
Country                             0
Latitude                            0
Longitude                           0
dtype: int64

#### U.S. City Demographic Data

In [18]:
# citydemo_dataset = './us-cities-demographics.csv'
citydemo_dataset = './us-cities-demographics.csv'
citydemo_df = pd.read_csv(citydemo_dataset,sep=";")

In [19]:
# Show schema
citydemo_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2891 entries, 0 to 2890
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   City                    2891 non-null   object 
 1   State                   2891 non-null   object 
 2   Median Age              2891 non-null   float64
 3   Male Population         2888 non-null   float64
 4   Female Population       2888 non-null   float64
 5   Total Population        2891 non-null   int64  
 6   Number of Veterans      2878 non-null   float64
 7   Foreign-born            2878 non-null   float64
 8   Average Household Size  2875 non-null   float64
 9   State Code              2891 non-null   object 
 10  Race                    2891 non-null   object 
 11  Count                   2891 non-null   int64  
dtypes: float64(6), int64(2), object(4)
memory usage: 271.2+ KB


In [20]:
# Indexing
citydemo_df.index

RangeIndex(start=0, stop=2891, step=1)

In [21]:
# Show dataset shape
citydemo_df.shape

(2891, 12)

In [22]:
# Show dataset sample records
citydemo_df.head()

Unnamed: 0,City,State,Median Age,Male Population,Female Population,Total Population,Number of Veterans,Foreign-born,Average Household Size,State Code,Race,Count
0,Silver Spring,Maryland,33.8,40601.0,41862.0,82463,1562.0,30908.0,2.6,MD,Hispanic or Latino,25924
1,Quincy,Massachusetts,41.0,44129.0,49500.0,93629,4147.0,32935.0,2.39,MA,White,58723
2,Hoover,Alabama,38.5,38040.0,46799.0,84839,4819.0,8229.0,2.58,AL,Asian,4759
3,Rancho Cucamonga,California,34.5,88127.0,87105.0,175232,5821.0,33878.0,3.18,CA,Black or African-American,24437
4,Newark,New Jersey,34.6,138040.0,143873.0,281913,5829.0,86253.0,2.73,NJ,White,76402


In [23]:
# N/A values estimation
citydemo_df.isna().sum()

City                       0
State                      0
Median Age                 0
Male Population            3
Female Population          3
Total Population           0
Number of Veterans        13
Foreign-born              13
Average Household Size    16
State Code                 0
Race                       0
Count                      0
dtype: int64

In [24]:
# NULL values estimation
citydemo_df.isnull().sum()

City                       0
State                      0
Median Age                 0
Male Population            3
Female Population          3
Total Population           0
Number of Veterans        13
Foreign-born              13
Average Household Size    16
State Code                 0
Race                       0
Count                      0
dtype: int64

In [25]:
citydemo_df.describe()

Unnamed: 0,Median Age,Male Population,Female Population,Total Population,Number of Veterans,Foreign-born,Average Household Size,Count
count,2891.0,2888.0,2888.0,2891.0,2878.0,2878.0,2875.0,2891.0
mean,35.494881,97328.43,101769.6,198966.8,9367.832523,40653.6,2.742543,48963.77
std,4.401617,216299.9,231564.6,447555.9,13211.219924,155749.1,0.433291,144385.6
min,22.9,29281.0,27348.0,63215.0,416.0,861.0,2.0,98.0
25%,32.8,39289.0,41227.0,80429.0,3739.0,9224.0,2.43,3435.0
50%,35.3,52341.0,53809.0,106782.0,5397.0,18822.0,2.65,13780.0
75%,38.0,86641.75,89604.0,175232.0,9368.0,33971.75,2.95,54447.0
max,70.5,4081698.0,4468707.0,8550405.0,156961.0,3212500.0,4.98,3835726.0


In [26]:
# Check unique values of column "City"
citydemo_df['City'].nunique()

567

In [27]:
# Check unique values of column "State"
citydemo_df['State'].nunique()

49

In [28]:
citydemo_df.columns

Index(['City', 'State', 'Median Age', 'Male Population', 'Female Population',
       'Total Population', 'Number of Veterans', 'Foreign-born',
       'Average Household Size', 'State Code', 'Race', 'Count'],
      dtype='object')

#### Airport Code

In [29]:
airport_dataset = './airport-codes_csv.csv'
airport_df = pd.read_csv(airport_dataset,sep=",")

In [30]:
# Show schema
airport_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55075 entries, 0 to 55074
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   ident         55075 non-null  object 
 1   type          55075 non-null  object 
 2   name          55075 non-null  object 
 3   elevation_ft  48069 non-null  float64
 4   continent     27356 non-null  object 
 5   iso_country   54828 non-null  object 
 6   iso_region    55075 non-null  object 
 7   municipality  49399 non-null  object 
 8   gps_code      41030 non-null  object 
 9   iata_code     9189 non-null   object 
 10  local_code    28686 non-null  object 
 11  coordinates   55075 non-null  object 
dtypes: float64(1), object(11)
memory usage: 5.0+ MB


In [31]:
# Indexing
airport_df.index

RangeIndex(start=0, stop=55075, step=1)

In [32]:
# Show dataset shape
airport_df.shape

(55075, 12)

In [33]:
# Show dataset sample records
airport_df.head()

Unnamed: 0,ident,type,name,elevation_ft,continent,iso_country,iso_region,municipality,gps_code,iata_code,local_code,coordinates
0,00A,heliport,Total Rf Heliport,11.0,,US,US-PA,Bensalem,00A,,00A,"-74.93360137939453, 40.07080078125"
1,00AA,small_airport,Aero B Ranch Airport,3435.0,,US,US-KS,Leoti,00AA,,00AA,"-101.473911, 38.704022"
2,00AK,small_airport,Lowell Field,450.0,,US,US-AK,Anchor Point,00AK,,00AK,"-151.695999146, 59.94919968"
3,00AL,small_airport,Epps Airpark,820.0,,US,US-AL,Harvest,00AL,,00AL,"-86.77030181884766, 34.86479949951172"
4,00AR,closed,Newport Hospital & Clinic Heliport,237.0,,US,US-AR,Newport,,,,"-91.254898, 35.6087"


In [34]:
# N/A values estimation
airport_df.isna().sum()

ident               0
type                0
name                0
elevation_ft     7006
continent       27719
iso_country       247
iso_region          0
municipality     5676
gps_code        14045
iata_code       45886
local_code      26389
coordinates         0
dtype: int64

In [35]:
# NULL values estimation
airport_df.isnull().sum()

ident               0
type                0
name                0
elevation_ft     7006
continent       27719
iso_country       247
iso_region          0
municipality     5676
gps_code        14045
iata_code       45886
local_code      26389
coordinates         0
dtype: int64

- Extract dictionary informations from *I94_SAS_Labels_Descriptions.SAS*.
    - I94CIT & I94RES --> i94cntyl.txt
    - I94PORT --> i94prtl.txt
    - I94MODE --> i94model.txt
    - I94ADDR --> i94addrl
    - I94VISA --> i94visa.txt

In [36]:
with open('./I94_SAS_Labels_Descriptions.SAS') as f:
    f_content = f.read()
    f_content = f_content.replace('\t', '')

In [37]:
def code_mapper(file, idx):
    f_content2 = f_content[f_content.index(idx):]
    f_content2 = f_content2[:f_content2.index(';')].split('\n')
    f_content2 = [i.replace("'", "") for i in f_content2]
    dic = [i.split('=') for i in f_content2[1:]]
    dic = dict([i[0].strip(), i[1].strip()] for i in dic if len(i) == 2)
    return dic

In [38]:
i94_cit_and_res = code_mapper(f_content, "i94cntyl")
i94_port = code_mapper(f_content, "i94prtl")
i94_mode = code_mapper(f_content, "i94model")
i94_addr = code_mapper(f_content, "i94addrl")
i94_visa = {'1':'Business',
            '2': 'Pleasure',
            '3' : 'Student'}

In [39]:
i94_cit_and_res

{'582': 'MEXICO Air Sea, and Not Reported (I-94, no land arrivals)',
 '236': 'AFGHANISTAN',
 '101': 'ALBANIA',
 '316': 'ALGERIA',
 '102': 'ANDORRA',
 '324': 'ANGOLA',
 '529': 'ANGUILLA',
 '518': 'ANTIGUA-BARBUDA',
 '687': 'ARGENTINA',
 '151': 'ARMENIA',
 '532': 'ARUBA',
 '438': 'AUSTRALIA',
 '103': 'AUSTRIA',
 '152': 'AZERBAIJAN',
 '512': 'BAHAMAS',
 '298': 'BAHRAIN',
 '274': 'BANGLADESH',
 '513': 'BARBADOS',
 '104': 'BELGIUM',
 '581': 'BELIZE',
 '386': 'BENIN',
 '509': 'BERMUDA',
 '153': 'BELARUS',
 '242': 'BHUTAN',
 '688': 'BOLIVIA',
 '717': 'BONAIRE, ST EUSTATIUS, SABA',
 '164': 'BOSNIA-HERZEGOVINA',
 '336': 'BOTSWANA',
 '689': 'BRAZIL',
 '525': 'BRITISH VIRGIN ISLANDS',
 '217': 'BRUNEI',
 '105': 'BULGARIA',
 '393': 'BURKINA FASO',
 '243': 'BURMA',
 '375': 'BURUNDI',
 '310': 'CAMEROON',
 '326': 'CAPE VERDE',
 '526': 'CAYMAN ISLANDS',
 '383': 'CENTRAL AFRICAN REPUBLIC',
 '384': 'CHAD',
 '690': 'CHILE',
 '245': 'CHINA, PRC',
 '721': 'CURACAO',
 '270': 'CHRISTMAS ISLAND',
 '271': 'COCO

In [40]:
i94_port

{'ALC': 'ALCAN, AK',
 'ANC': 'ANCHORAGE, AK',
 'BAR': 'BAKER AAF - BAKER ISLAND, AK',
 'DAC': 'DALTONS CACHE, AK',
 'PIZ': 'DEW STATION PT LAY DEW, AK',
 'DTH': 'DUTCH HARBOR, AK',
 'EGL': 'EAGLE, AK',
 'FRB': 'FAIRBANKS, AK',
 'HOM': 'HOMER, AK',
 'HYD': 'HYDER, AK',
 'JUN': 'JUNEAU, AK',
 '5KE': 'KETCHIKAN, AK',
 'KET': 'KETCHIKAN, AK',
 'MOS': 'MOSES POINT INTERMEDIATE, AK',
 'NIK': 'NIKISKI, AK',
 'NOM': 'NOM, AK',
 'PKC': 'POKER CREEK, AK',
 'ORI': 'PORT LIONS SPB, AK',
 'SKA': 'SKAGWAY, AK',
 'SNP': 'ST. PAUL ISLAND, AK',
 'TKI': 'TOKEEN, AK',
 'WRA': 'WRANGELL, AK',
 'HSV': 'MADISON COUNTY - HUNTSVILLE, AL',
 'MOB': 'MOBILE, AL',
 'LIA': 'LITTLE ROCK, AR (BPS)',
 'ROG': 'ROGERS ARPT, AR',
 'DOU': 'DOUGLAS, AZ',
 'LUK': 'LUKEVILLE, AZ',
 'MAP': 'MARIPOSA AZ',
 'NAC': 'NACO, AZ',
 'NOG': 'NOGALES, AZ',
 'PHO': 'PHOENIX, AZ',
 'POR': 'PORTAL, AZ',
 'SLU': 'SAN LUIS, AZ',
 'SAS': 'SASABE, AZ',
 'TUC': 'TUCSON, AZ',
 'YUI': 'YUMA, AZ',
 'AND': 'ANDRADE, CA',
 'BUR': 'BURBANK, CA',
 '

In [41]:
i94_mode

{'1': 'Air', '2': 'Sea', '3': 'Land', '9': 'Not reported'}

In [42]:
i94_addr

{'AL': 'ALABAMA',
 'AK': 'ALASKA',
 'AZ': 'ARIZONA',
 'AR': 'ARKANSAS',
 'CA': 'CALIFORNIA',
 'CO': 'COLORADO',
 'CT': 'CONNECTICUT',
 'DE': 'DELAWARE',
 'DC': 'DIST. OF COLUMBIA',
 'FL': 'FLORIDA',
 'GA': 'GEORGIA',
 'GU': 'GUAM',
 'HI': 'HAWAII',
 'ID': 'IDAHO',
 'IL': 'ILLINOIS',
 'IN': 'INDIANA',
 'IA': 'IOWA',
 'KS': 'KANSAS',
 'KY': 'KENTUCKY',
 'LA': 'LOUISIANA',
 'ME': 'MAINE',
 'MD': 'MARYLAND',
 'MA': 'MASSACHUSETTS',
 'MI': 'MICHIGAN',
 'MN': 'MINNESOTA',
 'MS': 'MISSISSIPPI',
 'MO': 'MISSOURI',
 'MT': 'MONTANA',
 'NC': 'N. CAROLINA',
 'ND': 'N. DAKOTA',
 'NE': 'NEBRASKA',
 'NV': 'NEVADA',
 'NH': 'NEW HAMPSHIRE',
 'NJ': 'NEW JERSEY',
 'NM': 'NEW MEXICO',
 'NY': 'NEW YORK',
 'OH': 'OHIO',
 'OK': 'OKLAHOMA',
 'OR': 'OREGON',
 'PA': 'PENNSYLVANIA',
 'PR': 'PUERTO RICO',
 'RI': 'RHODE ISLAND',
 'SC': 'S. CAROLINA',
 'SD': 'S. DAKOTA',
 'TN': 'TENNESSEE',
 'TX': 'TEXAS',
 'UT': 'UTAH',
 'VT': 'VERMONT',
 'VI': 'VIRGIN ISLANDS',
 'VA': 'VIRGINIA',
 'WV': 'W. VIRGINIA',
 'WA': 'WAS

In [43]:
i94_visa

{'1': 'Business', '2': 'Pleasure', '3': 'Student'}

In [44]:
def convert_city_to_i94port(city):
    results = [v for k, v in i94_port.items() if re.match(city, k)]

In [45]:
from pyspark.sql.functions import udf,col
convert_city_to_i94portUDF = udf(lambda z:convert_city_to_i94port(z))

temp_df_final = temp_df.withColumn('i94_port', convert_city_to_i94portUDF(col("city")))
temp_df_final.show(2)

NameError: name 'temp_df' is not defined