In [1]:
# Do all imports and installs here - Done
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
import pandas as pd
import re
import configparser
import os

#### Extract dictionary information from *I94_SAS_Labels_Descriptions.SAS*.
    - I94CIT & I94RES --> i94cntyl.txt
    - I94PORT --> i94prtl.txt
    - I94MODE --> i94model.txt
    - I94ADDR --> i94addrl.txt
    - I94VISA --> i94visa.txt

In [2]:
# Read the SAS label-description file in one go and drop every tab character,
# leaving the text in `f_content` for the section parser.
with open('./I94_SAS_Labels_Descriptions.SAS') as sas_file:
    f_content = sas_file.read().replace('\t', '')

In [12]:
def code_mapper(file, idx):
    """Parse one ``code = 'label'`` section of SAS label text into a dict.

    The section starts at the first occurrence of ``idx`` in ``file`` and ends
    at the next ``;``. The line containing ``idx`` itself is skipped; every
    following line that splits into exactly two parts on ``=`` contributes one
    entry, with single quotes removed and surrounding whitespace stripped.

    Args:
        file: Full text to search (a ``str``), e.g. the contents of the SAS
            labels file.
        idx: Marker that opens the wanted section, e.g. ``"i94model"``.

    Returns:
        dict mapping each code (str) to its label (str).

    Raises:
        ValueError: if ``idx`` is not found in ``file``, or no ``;`` follows it.
    """
    # Use the text that was passed in; the previous version ignored `file`
    # and always read the module-level `f_content` instead.
    section = file[file.index(idx):]
    section = section[:section.index(';')]

    mapping = {}
    # [1:] drops the header line that carries the section marker itself.
    for line in section.split('\n')[1:]:
        parts = line.replace("'", "").split('=')
        # Blank lines and anything that is not a single `code = label` pair
        # are skipped.
        if len(parts) == 2:
            mapping[parts[0].strip()] = parts[1].strip()
    return mapping

In [13]:
# One lookup dictionary per section of the SAS label text.
i94_cit_and_res = code_mapper(f_content, "i94cntyl")
i94_port = code_mapper(f_content, "i94prtl")
i94_mode = code_mapper(f_content, "i94model")
i94_addr = code_mapper(f_content, "i94addrl")

# The visa categories are written out by hand rather than parsed.
i94_visa = {
    '1': 'Business',
    '2': 'Pleasure',
    '3': 'Student',
}

In [14]:
# i94_cit_and_res: one column per code, written out as a CSV file.
df = pd.DataFrame({code: pd.Series(label) for code, label in i94_cit_and_res.items()})
df.to_csv('i94_cit_and_res.csv')

df

Unnamed: 0,582,236,101,316,102,324,529,518,687,151,...,400,485,503,589,592,791,849,914,944,996
0,"MEXICO Air Sea, and Not Reported (I-94, no lan...",AFGHANISTAN,ALBANIA,ALGERIA,ANDORRA,ANGOLA,ANGUILLA,ANTIGUA-BARBUDA,ARGENTINA,ARMENIA,...,No Country Code (400),No Country Code (485),No Country Code (503),No Country Code (589),No Country Code (592),No Country Code (791),No Country Code (849),No Country Code (914),No Country Code (944),No Country Code (996)


In [15]:
# i94_port: one column per code, written out as a CSV file.

df = pd.DataFrame({code: pd.Series(label) for code, label in i94_port.items()})
df.to_csv('i94_port.csv')

df

Unnamed: 0,ALC,ANC,BAR,DAC,PIZ,DTH,EGL,FRB,HOM,HYD,...,.GA,CLX,CP,FSC,NK,ADU,AKT,LIT,A2A,OSN
0,"ALCAN, AK","ANCHORAGE, AK","BAKER AAF - BAKER ISLAND, AK","DALTONS CACHE, AK","DEW STATION PT LAY DEW, AK","DUTCH HARBOR, AK","EAGLE, AK","FAIRBANKS, AK","HOMER, AK","HYDER, AK",...,No PORT Code (.GA),No PORT Code (CLX),No PORT Code (CP),No PORT Code (FSC),No PORT Code (NK),No PORT Code (ADU),No PORT Code (AKT),No PORT Code (LIT),No PORT Code (A2A),No PORT Code (OSN)


In [16]:
# i94_mode: one column per code, written out as a CSV file.

df = pd.DataFrame({code: pd.Series(label) for code, label in i94_mode.items()})
df.to_csv('i94_mode.csv')

df

Unnamed: 0,1,2,3,9
0,Air,Sea,Land,Not reported


In [17]:
# i94_addr: one column per code, written out as a CSV file.

df = pd.DataFrame({code: pd.Series(label) for code, label in i94_addr.items()})
df.to_csv('i94_addr.csv')

df

Unnamed: 0,AL,AK,AZ,AR,CA,CO,CT,DE,DC,FL,...,TX,UT,VT,VI,VA,WV,WA,WI,WY,99
0,ALABAMA,ALASKA,ARIZONA,ARKANSAS,CALIFORNIA,COLORADO,CONNECTICUT,DELAWARE,DIST. OF COLUMBIA,FLORIDA,...,TEXAS,UTAH,VERMONT,VIRGIN ISLANDS,VIRGINIA,W. VIRGINIA,WASHINGTON,WISCONSON,WYOMING,All Other Codes


In [18]:
# i94_visa: one column per code, written out as a CSV file.

df = pd.DataFrame({code: pd.Series(label) for code, label in i94_visa.items()})
df.to_csv('i94_visa.csv')

df

Unnamed: 0,1,2,3
0,Business,Pleasure,Student


In [44]:
def convert_city_to_i94port(city, port_map=None):
    """Return the I94 port code whose description starts with ``city``.

    The lookup maps port codes to descriptions (e.g. ``'ANC'`` ->
    ``'ANCHORAGE, AK'``), so the city is compared against the descriptions and
    the matching *code* is returned. The comparison is a case-insensitive
    prefix match and treats ``city`` as literal text, not as a regex.

    The previous version matched the city against the codes, used the raw
    city as a regex pattern, and never returned its result, so it always
    produced ``None``.

    Args:
        city: City name to look up. ``None``, non-string and blank values
            yield ``None``.
        port_map: Optional ``{code: description}`` dict. Defaults to the
            module-level ``i94_port`` lookup.

    Returns:
        The first matching port code in ``port_map`` iteration order, or
        ``None`` when nothing matches.
    """
    if port_map is None:
        port_map = i94_port

    # An empty pattern would match every description, so reject it up front.
    if not isinstance(city, str) or not city.strip():
        return None

    # re.escape keeps characters such as '.' or '(' in a city name literal.
    pattern = re.compile(re.escape(city.strip()), re.IGNORECASE)
    for code, description in port_map.items():
        if pattern.match(description):
            return code
    return None

In [None]:
from pyspark.sql.functions import udf, col

# Wrap the Python lookup in a Spark UDF so it can be applied to a column.
convert_city_to_i94portUDF = udf(
    lambda city_name: convert_city_to_i94port(city_name)
)

# Add an 'i94_port' column computed from the 'city' column, then preview two rows.
i94_port_column = convert_city_to_i94portUDF(col("city"))
temp_df_final = temp_df.withColumn('i94_port', i94_port_column)
temp_df_final.show(2)