In [6]:
# Do all imports and installs here - Done
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
import pandas as pd
import re
import configparser
import os
from pyspark.sql.types import StructType as R, StructField as Fld,\
    DoubleType as Dbl, StringType as Str, IntegerType as Int,\
    TimestampType as Timestamp, DateType as Date, LongType as Long


In [2]:
# Parse configurations - Done
config = configparser.ConfigParser()
config.read('etl.cfg')

['etl.cfg']

#### Path 
input_data_source = '.'

output_processed_data = './storage'

#

i94immi_data_source = '../../data/18-83510-I94-Data-2016/i94_apr16_sub.sas7bdat'

worldtempe_data_source = '../../data2/GlobalLandTemperaturesByCity.csv'

citydemo_data_source = './us-cities-demographics.csv'

airport_data_source = './airport-codes_csv.csv'

#

i94_immi_splited_dir = './storage/.sas7bdat'

world_tempe_splited_dir = './storage/.csv'

In [10]:
# Create spark session
spark = SparkSession.builder\
            .config("spark.jars.repositories", "https://repos.spark-packages.org/")\
            .config("spark.jars.packages", "saurfang:spark-sas7bdat:2.0.0-s_2.11")\
            .enableHiveSupport()\
            .getOrCreate()

In [None]:
# df = spark.read.options(header='True',inferSchema='True',delimiter=',').csv(worldtempe_data_source)
# df = spark.read.options(header='True',inferSchema='True',delimiter=';').csv(citydemo_data_source)
# df = spark.read.options(header='True',inferSchema='True',delimiter=',').csv(airport_data_source)
# df = spark.read.format('com.github.saurfang.sas.spark').load('../../data/18-83510-I94-Data-2016/i94_apr16_sub.sas7bdat')

In [94]:
airport_dataset = './airport-codes_csv.csv'
airport_df = pd.read_csv(airport_dataset,sep=",")
pd.set_option('display.max_columns', 50)
airport_df.head()

Unnamed: 0,ident,type,name,elevation_ft,continent,iso_country,iso_region,municipality,gps_code,iata_code,local_code,coordinates
0,00A,heliport,Total Rf Heliport,11.0,,US,US-PA,Bensalem,00A,,00A,"-74.93360137939453, 40.07080078125"
1,00AA,small_airport,Aero B Ranch Airport,3435.0,,US,US-KS,Leoti,00AA,,00AA,"-101.473911, 38.704022"
2,00AK,small_airport,Lowell Field,450.0,,US,US-AK,Anchor Point,00AK,,00AK,"-151.695999146, 59.94919968"
3,00AL,small_airport,Epps Airpark,820.0,,US,US-AL,Harvest,00AL,,00AL,"-86.77030181884766, 34.86479949951172"
4,00AR,closed,Newport Hospital & Clinic Heliport,237.0,,US,US-AR,Newport,,,,"-91.254898, 35.6087"


```
==========================================================================================================
extract from SAS_Labels
==========================================================================================================
```

In [7]:
def extract_data_from_SAS_labels_descriptions(input_label):
    '''
    A procedure that returns a cleaned list of code value pairs for the provided input label
    
    Parameters
    ----------
    input_label : str
        name of the label in the SAS labels descriptions file
    
    Returns
    -------
    code_value_list : list(tuple(str, str))
        a list of code values pairs extracted from the SAS labels descriptions file and cleaned
    '''

    with open('I94_SAS_Labels_Descriptions.SAS') as labels_descriptions:
            raw_labels = labels_descriptions.read()

    # extract only label data
    labels = raw_labels[raw_labels.index(input_label):]
    labels = labels[:labels.index(';')]
    
    
    # in each line remove unnecessary spaces and extract the code and its corresponding value 
    lines = labels.splitlines()
    code_value_list = []
    for line in lines:
        try:
            code, value = line.split('=')
            code = code.strip().strip("'").strip('"')
            value = value.strip().strip("'").strip('"').strip()
            code_value_list.append((code, value))
        except:
            pass
        
    return code_value_list

In [8]:
# define the schema the will be used for all label descriptions
schema = R([
        Fld("code", Str()),
        Fld("name", Str())
    ])

In [11]:
countries_df = spark.createDataFrame(
        data=extract_data_from_SAS_labels_descriptions('I94RES'),
        schema=schema
)

countries_df.limit(5).toPandas()

Unnamed: 0,code,name
0,582,"MEXICO Air Sea, and Not Reported (I-94, no lan..."
1,236,AFGHANISTAN
2,101,ALBANIA
3,316,ALGERIA
4,102,ANDORRA


In [12]:
ports_df = spark.createDataFrame(
        data=extract_data_from_SAS_labels_descriptions('I94PORT'),
        schema=schema
)
ports_df.limit(5).toPandas()

Unnamed: 0,code,name
0,ALC,"ALCAN, AK"
1,ANC,"ANCHORAGE, AK"
2,BAR,"BAKER AAF - BAKER ISLAND, AK"
3,DAC,"DALTONS CACHE, AK"
4,PIZ,"DEW STATION PT LAY DEW, AK"


In [13]:
states_df = spark.createDataFrame(
        data=extract_data_from_SAS_labels_descriptions('I94ADDR'),
        schema=schema
)

states_df.limit(5).toPandas()

Unnamed: 0,code,name
0,AL,ALABAMA
1,AK,ALASKA
2,AZ,ARIZONA
3,AR,ARKANSAS
4,CA,CALIFORNIA


In [14]:
travel_modes_df = spark.createDataFrame(
        data=extract_data_from_SAS_labels_descriptions('I94MODE'),
        schema=schema
)

travel_modes_df.limit(5).toPandas()

Unnamed: 0,code,name
0,1,Air
1,2,Sea
2,3,Land
3,9,Not reported


In [15]:
visa_categories_df = spark.createDataFrame(
        data=extract_data_from_SAS_labels_descriptions('I94VISA'),
        schema=schema
)

visa_categories_df.limit(5).toPandas()

Unnamed: 0,code,name
0,1,Business
1,2,Pleasure
2,3,Student


Performing verify extraced data

In [16]:
len(ports_df.toPandas()['code'].unique())

660

In [17]:
ports_df.toPandas()['code'].unique()

array(['ALC', 'ANC', 'BAR', 'DAC', 'PIZ', 'DTH', 'EGL', 'FRB', 'HOM',
       'HYD', 'JUN', '5KE', 'KET', 'MOS', 'NIK', 'NOM', 'PKC', 'ORI',
       'SKA', 'SNP', 'TKI', 'WRA', 'HSV', 'MOB', 'LIA', 'ROG', 'DOU',
       'LUK', 'MAP', 'NAC', 'NOG', 'PHO', 'POR', 'SLU', 'SAS', 'TUC',
       'YUI', 'AND', 'BUR', 'CAL', 'CAO', 'FRE', 'ICP', 'LNB', 'LOS',
       'BFL', 'OAK', 'ONT', 'OTM', 'BLT', 'PSP', 'SAC', 'SLS', 'SDP',
       'SFR', 'SNJ', 'SLO', 'SLI', 'SPC', 'SYS', 'SAA', 'STO', 'TEC',
       'TRV', 'APA', 'ASE', 'COS', 'DEN', 'DRO', 'BDL', 'BGC', 'GRT',
       'HAR', 'NWH', 'NWL', 'TST', 'WAS', 'DOV', 'DVD', 'WLL', 'BOC',
       'SRQ', 'CAN', 'DAB', 'FRN', 'FTL', 'FMY', 'FPF', 'HUR', 'GNV',
       'JAC', 'KEY', 'LEE', 'MLB', 'MIA', 'APF', 'OPF', 'ORL', 'PAN',
       'PEN', 'PCF', 'PEV', 'PSJ', 'SFB', 'SGJ', 'SAU', 'FPR', 'SPE',
       'TAM', 'WPB', 'ATL', 'BRU', 'AGS', 'SAV', 'AGA', 'HHW', 'OGG',
       'KOA', 'LIH', 'CID', 'DSM', 'BOI', 'EPI', 'IDA', 'PTL', 'SPI',
       'CHI', 'DPA',

In [18]:
states_df.toPandas()['code'].unique()

array(['AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'DC', 'FL', 'GA',
       'GU', 'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD',
       'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NC', 'ND', 'NE', 'NV', 'NH',
       'NJ', 'NM', 'NY', 'OH', 'OK', 'OR', 'PA', 'PR', 'RI', 'SC', 'SD',
       'TN', 'TX', 'UT', 'VT', 'VI', 'VA', 'WV', 'WA', 'WI', 'WY', '99'],
      dtype=object)

In [19]:
states_df.toPandas()['name'].unique()

array(['ALABAMA', 'ALASKA', 'ARIZONA', 'ARKANSAS', 'CALIFORNIA',
       'COLORADO', 'CONNECTICUT', 'DELAWARE', 'DIST. OF COLUMBIA',
       'FLORIDA', 'GEORGIA', 'GUAM', 'HAWAII', 'IDAHO', 'ILLINOIS',
       'INDIANA', 'IOWA', 'KANSAS', 'KENTUCKY', 'LOUISIANA', 'MAINE',
       'MARYLAND', 'MASSACHUSETTS', 'MICHIGAN', 'MINNESOTA',
       'MISSISSIPPI', 'MISSOURI', 'MONTANA', 'N. CAROLINA', 'N. DAKOTA',
       'NEBRASKA', 'NEVADA', 'NEW HAMPSHIRE', 'NEW JERSEY', 'NEW MEXICO',
       'NEW YORK', 'OHIO', 'OKLAHOMA', 'OREGON', 'PENNSYLVANIA',
       'PUERTO RICO', 'RHODE ISLAND', 'S. CAROLINA', 'S. DAKOTA',
       'TENNESSEE', 'TEXAS', 'UTAH', 'VERMONT', 'VIRGIN ISLANDS',
       'VIRGINIA', 'W. VIRGINIA', 'WASHINGTON', 'WISCONSON', 'WYOMING',
       'All Other Codes'], dtype=object)

In [20]:
len(states_df.toPandas()['code'].unique())

55

In [32]:
states_df.show()

+----+-----------------+
|code|             name|
+----+-----------------+
|  AL|          ALABAMA|
|  AK|           ALASKA|
|  AZ|          ARIZONA|
|  AR|         ARKANSAS|
|  CA|       CALIFORNIA|
|  CO|         COLORADO|
|  CT|      CONNECTICUT|
|  DE|         DELAWARE|
|  DC|DIST. OF COLUMBIA|
|  FL|          FLORIDA|
|  GA|          GEORGIA|
|  GU|             GUAM|
|  HI|           HAWAII|
|  ID|            IDAHO|
|  IL|         ILLINOIS|
|  IN|          INDIANA|
|  IA|             IOWA|
|  KS|           KANSAS|
|  KY|         KENTUCKY|
|  LA|        LOUISIANA|
+----+-----------------+
only showing top 20 rows



```
==========================================================================================================
Airport
==========================================================================================================
```

In [7]:
airport_df[airport_df['iso_country'].isna()].count()

ident           247
type            247
name            247
elevation_ft    231
continent       247
iso_country       0
iso_region      247
municipality    145
gps_code         74
iata_code        31
local_code        0
coordinates     247
dtype: int64

In [8]:
# Check airport distribution by country
airport_df[airport_df['iso_country'].isna()].shape

(247, 12)

In [4]:
# Check country unique
airport_df['iso_country'].nunique()

243

In [5]:
airport_df = airport_df[airport_df['iso_country'] == 'US']
airport_df['iso_country'].nunique()

1

In [6]:
airport_df.isnull().sum()

ident               0
type                0
name                0
elevation_ft      239
continent       22756
iso_country         0
iso_region          0
municipality      102
gps_code         1773
iata_code       20738
local_code       1521
coordinates         0
dtype: int64

In [7]:
airport_df.duplicated().sum()

0

In [8]:
airport_df = airport_df.drop(['elevation_ft', 'continent', 'gps_code', 'local_code', 'coordinates'], axis=1)

In [9]:
airport_df.head()

Unnamed: 0,ident,type,name,iso_country,iso_region,municipality,iata_code
0,00A,heliport,Total Rf Heliport,US,US-PA,Bensalem,
1,00AA,small_airport,Aero B Ranch Airport,US,US-KS,Leoti,
2,00AK,small_airport,Lowell Field,US,US-AK,Anchor Point,
3,00AL,small_airport,Epps Airpark,US,US-AL,Harvest,
4,00AR,closed,Newport Hospital & Clinic Heliport,US,US-AR,Newport,


In [19]:
airport_df = airport_df.dropna(subset=["iata_code"])

In [24]:
airport_df.shape

(2019, 7)

```
==========================================================================================================
Convert column name lower with _
==========================================================================================================
```

In [25]:
def convert_column_names(df):
    cols = df.columns
    column_name_changed = []

    for col in cols:
        new_column = col.lstrip().rstrip().lower().replace (" ", "_").replace ("-", "_") #strip beginning spaces, makes lowercase, add underscpre
        column_name_changed.append(new_column)

    df.columns = column_name_changed

In [27]:
convert_column_names(airport_df)
airport_df.columns

Index(['ident', 'type', 'name', 'iso_country', 'iso_region', 'municipality',
       'iata_code'],
      dtype='object')

```
==========================================================================================================
Adding i94_port to Temperature dataset
==========================================================================================================
```

In [None]:
def convert_city_to_i94port(city):

    for port in i94_port:
        if city.lower() in i94_port[port].lower():
            return port


temp_df_final = temp_df.withColumn('i94_port', convert_city_to_i94port(temp_df['city']))

In [None]:
# # Mapping the U.S cities to city port abbreviations (i94port from SAS label)

# # Add iport94 code based on city name
df_temperature_data = df_temperature_data.withColumn("i94port", get_i94port(df_temperature_data.City))

# Remove data points with no iport94 code
df_temperature_data = df_temperature_data.filter(df_temperature_data.i94port != 'null')



# Create udf to map city full name to city port abbreviation

@udf(StringType())
def city_to_port(city):
    for key in valid_ports:
        if city.lower() in valid_ports[key].lower():
            return key

# Clean temperature data

# Only use temperatures from United States
# Map full name to city port abbreviation
# Remove invalid ports
cleaned_temp_df = temperature_df.filter(temperature_df["Country"] == "United States") \
    .withColumn("year", year(temperature_df['dt'])) \
    .withColumn("month", month(temperature_df["dt"])) \
    .withColumn("i94port", city_to_port(temperature_df["City"])) \
    .withColumn("AverageTemperature", col("AverageTemperature").cast("float")) \
    .dropna(how='any', subset=["i94port"])



@udf()
def get_i94port(city):
    '''
    Input: City name 
    Output: Corresponding i94port
    '''
    
    for key in i94portvalid:
        if city.lower() in i94portvalid[key][0].lower():
            return key

        
        
        
# Add iport94 code based on city name
df_temperature_data = df_temperature_data.withColumn("i94port", get_i94port(df_temperature_data.City))
df_temperature_data.show()

```
==========================================================================================================
Samples i94_port script
==========================================================================================================
```

In [None]:
pd.set_option('display.max_columns', 50)
i94immi_df.head(10)

Unnamed: 0.1,Unnamed: 0,cicid,i94yr,i94mon,i94cit,i94res,i94port,arrdate,i94mode,i94addr,depdate,i94bir,i94visa,count,dtadfile,visapost,occup,entdepa,entdepd,entdepu,matflag,biryear,dtaddto,gender,insnum,airline,admnum,fltno,visatype,arrive_date,departure_date
0,2027561,4084316.0,2016.0,4.0,209.0,209.0,HHW,20566.0,1.0,HI,20573.0,61.0,2.0,1.0,20160422,,,G,O,,M,1955.0,7202016,F,,JL,56582670000.0,00782,WT,2016-04-22,2016-04-29
1,2171295,4422636.0,2016.0,4.0,582.0,582.0,MCA,20567.0,1.0,TX,20568.0,26.0,2.0,1.0,20160423,MTR,,G,R,,M,1990.0,10222016,M,,*GA,94362000000.0,XBLNG,B2,2016-04-23,2016-04-24
2,589494,1195600.0,2016.0,4.0,148.0,112.0,OGG,20551.0,1.0,FL,20571.0,76.0,2.0,1.0,20160407,,,G,O,,M,1940.0,7052016,M,,LH,55780470000.0,00464,WT,2016-04-07,2016-04-27
3,2631158,5291768.0,2016.0,4.0,297.0,297.0,LOS,20572.0,1.0,CA,20581.0,25.0,2.0,1.0,20160428,DOH,,G,O,,M,1991.0,10272016,M,,QR,94789700000.0,00739,B2,2016-04-28,2016-05-07
4,3032257,985523.0,2016.0,4.0,111.0,111.0,CHM,20550.0,3.0,NY,20553.0,19.0,2.0,1.0,20160406,,,Z,K,,M,1997.0,7042016,F,,,42322570000.0,LAND,WT,2016-04-06,2016-04-09
5,721257,1481650.0,2016.0,4.0,577.0,577.0,ATL,20552.0,1.0,GA,20606.0,51.0,2.0,1.0,20160408,,,T,N,,M,1965.0,10072016,M,,DL,736852600.0,910,B2,2016-04-08,2016-06-01
6,1072780,2197173.0,2016.0,4.0,245.0,245.0,SFR,20556.0,1.0,CA,20635.0,48.0,2.0,1.0,20160412,,,T,O,,M,1968.0,10112016,F,,CX,786312200.0,870,B2,2016-04-12,2016-06-30
7,112205,232708.0,2016.0,4.0,113.0,135.0,NYC,20546.0,1.0,NY,20554.0,33.0,2.0,1.0,20160402,,,G,O,,M,1983.0,6302016,F,,BA,55474490000.0,00117,WT,2016-04-02,2016-04-10
8,2577162,5227851.0,2016.0,4.0,131.0,131.0,CHI,20572.0,1.0,IL,20575.0,39.0,2.0,1.0,20160428,,,O,O,,M,1977.0,7262016,,,LX,59413420000.0,00008,WT,2016-04-28,2016-05-01
9,10930,13213.0,2016.0,4.0,116.0,116.0,LOS,20545.0,1.0,CA,20553.0,35.0,2.0,1.0,20160401,,,O,O,,M,1981.0,6292016,,,AA,55449790000.0,00109,WT,2016-04-01,2016-04-09


Verify record with missing value NA or NULL

In [None]:
i94immi_df.isna().sum()

Unnamed: 0           0
cicid                0
i94yr                0
i94mon               0
i94cit               0
i94res               0
i94port              0
arrdate              0
i94mode              0
i94addr             59
depdate             49
i94bir               0
i94visa              0
count                0
dtadfile             0
visapost           618
occup              996
entdepa              0
entdepd             46
entdepu           1000
matflag             46
biryear              0
dtaddto              0
gender             141
insnum             965
airline             33
admnum               0
fltno                8
visatype             0
arrive_date          0
departure_date      49
dtype: int64

In [None]:
i94immi_df.isnull().sum()

Unnamed: 0           0
cicid                0
i94yr                0
i94mon               0
i94cit               0
i94res               0
i94port              0
arrdate              0
i94mode              0
i94addr             59
depdate             49
i94bir               0
i94visa              0
count                0
dtadfile             0
visapost           618
occup              996
entdepa              0
entdepd             46
entdepu           1000
matflag             46
biryear              0
dtaddto              0
gender             141
insnum             965
airline             33
admnum               0
fltno                8
visatype             0
arrive_date          0
departure_date      49
dtype: int64

Remove records with missing value on column 'depdate'

In [None]:
i94immi_df = i94immi_df.dropna(subset="i94yr",how="all")

In [None]:
i94immi_df = i94immi_df.dropna(subset="i94mon",how="all")

In [None]:
i94immi_df = i94immi_df.dropna(subset="arrdate",how="all")

In [None]:
i94immi_df = i94immi_df.dropna(subset="depdate",how="all")

In [None]:
i94immi_df = i94immi_df.dropna(subset="i94addr",how="all")

In [None]:
i94immi_df = i94immi_df.dropna(subset="i94mode",how="all")

In [None]:
i94immi_df = i94immi_df.dropna(subset="dtaddto",how="all")

In [None]:
i94immi_df.count()

Unnamed: 0        899
cicid             899
i94yr             899
i94mon            899
i94cit            899
i94res            899
i94port           899
arrdate           899
i94mode           899
i94addr           899
depdate           899
i94bir            899
i94visa           899
count             899
dtadfile          899
visapost          345
occup               3
entdepa           899
entdepd           899
entdepu             0
matflag           899
biryear           899
dtaddto           899
gender            775
insnum             18
airline           880
admnum            899
fltno             898
visatype          899
arrive_date       899
departure_date    899
dtype: int64

In [None]:
i94immi_df.isnull().sum()

Unnamed: 0          0
cicid               0
i94yr               0
i94mon              0
i94cit              0
i94res              0
i94port             0
arrdate             0
i94mode             0
i94addr             0
depdate             0
i94bir              0
i94visa             0
count               0
dtadfile            0
visapost          554
occup             896
entdepa             0
entdepd             0
entdepu           899
matflag             0
biryear             0
dtaddto             0
gender            124
insnum            881
airline            19
admnum              0
fltno               1
visatype            0
arrive_date         0
departure_date      0
dtype: int64

In [None]:
i94immi_df = i94immi_df.drop(['count', 'tadfile', 'visapost', 'occup', 'entdepa', 'entdepd', 'entdepu', 'matflag'], axis=1)

In [None]:
# Convert year and month to int64 data type
i94immi_df['cicid'] = i94immi_df['cicid'].astype(int)
i94immi_df['i94yr'] = i94immi_df['i94yr'].astype(int)
i94immi_df['i94mon'] = i94immi_df['i94mon'].astype(int)
i94immi_df['i94cit'] = i94immi_df['i94cit'].astype(int)
i94immi_df['i94mode'] = i94immi_df['i94mode'].astype(int)
i94immi_df['arrdate'] = i94immi_df['arrdate'].astype(int)
i94immi_df['depdate'] = i94immi_df['depdate'].astype(int)
i94immi_df['i94visa'] = i94immi_df['i94visa'].astype(int)

i94immi_df.head()

Unnamed: 0,cicid,i94yr,i94mon,i94cit,arrdate,i94mode,depdate,i94visa,visatype
0,4084316.0,2016,4,209,20566,1,20573,2,WT
1,4422636.0,2016,4,582,20567,1,20568,2,B2
2,1195600.0,2016,4,148,20551,1,20571,2,WT
3,5291768.0,2016,4,297,20572,1,20581,2,B2
4,985523.0,2016,4,111,20550,3,20553,2,WT


In [None]:
i94immi_columns = ["cicid","i94yr","i94mon","i94cit","arrdate","i94mode","depdate","i94visa", 'visatype']
i94immi_df = i94immi_df[i94immi_columns]


In [None]:
# Show dataset sample records
i94immi_df = i94immi_df.set_index('cicid')
i94immi_df.head()

Unnamed: 0,cicid,i94yr,i94mon,i94cit,arrdate,i94mode,depdate,i94visa,visatype
0,4084316.0,2016.0,4.0,209.0,20566.0,1.0,20573.0,2.0,WT
1,4422636.0,2016.0,4.0,582.0,20567.0,1.0,20568.0,2.0,B2
2,1195600.0,2016.0,4.0,148.0,20551.0,1.0,20571.0,2.0,WT
3,5291768.0,2016.0,4.0,297.0,20572.0,1.0,20581.0,2.0,B2
4,985523.0,2016.0,4.0,111.0,20550.0,3.0,20553.0,2.0,WT


In [None]:
i94immi_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   cicid     1000 non-null   float64
 1   i94yr     1000 non-null   float64
 2   i94mon    1000 non-null   float64
 3   i94cit    1000 non-null   float64
 4   arrdate   1000 non-null   float64
 5   i94mode   1000 non-null   float64
 6   depdate   951 non-null    float64
 7   i94visa   1000 non-null   float64
 8   visatype  1000 non-null   object 
dtypes: float64(8), object(1)
memory usage: 70.4+ KB


In [None]:
# Show dataset sample records
i94immi_df.shape

(951, 9)

In [None]:
i94immi_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 951 entries, 0 to 999
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   cicid     951 non-null    object
 1   i94yr     951 non-null    int32 
 2   i94mon    951 non-null    int32 
 3   i94cit    951 non-null    int32 
 4   arrdate   951 non-null    int32 
 5   i94mode   951 non-null    int32 
 6   depdate   951 non-null    int32 
 7   i94visa   951 non-null    int32 
 8   visatype  951 non-null    object
dtypes: int32(7), object(2)
memory usage: 48.3+ KB


In [None]:
# Check unique
i94immi_df[i94immi_columns].value_counts()

cicid      i94yr  i94mon  i94cit  arrdate  i94mode  depdate  i94visa  visatype
1000074.0  2016   4       129     20550    1        20564    2        WT          1
480428.0   2016   4       148     20547    1        20559    2        WT          1
4718122.0  2016   4       209     20569    1        20573    2        WT          1
4718538.0  2016   4       209     20569    1        20574    2        WT          1
4729596.0  2016   4       245     20569    1        20584    2        B2          1
                                                                                 ..
2863583.0  2016   4       689     20559    1        20569    2        B2          1
2865787.0  2016   4       691     20559    1        20566    1        B1          1
2865828.0  2016   4       691     20559    1        20600    2        B2          1
2867437.0  2016   4       691     20559    1        20590    2        B2          1
999282.0   2016   4       129     20550    1        20553    1        WB         

In [None]:
# Check unique
i94immi_df[i94immi_columns].nunique()

cicid       951
i94yr         1
i94mon        1
i94cit       87
arrdate      30
i94mode       3
depdate     109
i94visa       3
visatype      8
dtype: int64

In [None]:
i94immi_df[i94immi_columns].sort_values('arrdate',na_position="last")

Unnamed: 0,cicid,i94yr,i94mon,i94cit,arrdate,i94mode,depdate,i94visa,visatype
215,25478.0,2016,4,131,20545,1,20633,2,WT
770,67523.0,2016,4,245,20545,1,20560,1,B1
244,86265.0,2016,4,368,20545,1,20553,2,B2
867,18310.0,2016,4,123,20545,1,20548,2,WT
665,32582.0,2016,4,135,20545,1,20552,2,WT
...,...,...,...,...,...,...,...,...,...
115,5883463.0,2016,4,687,20574,1,20589,2,B2
109,5756066.0,2016,4,260,20574,3,20576,2,B2
256,5899181.0,2016,4,696,20574,1,20583,1,B1
371,6057910.0,2016,4,252,20574,1,20578,2,GMT


In [None]:
i94immi_df.sort_values(by=['visatype','cicid'], ascending=True)

Unnamed: 0,cicid,i94yr,i94mon,i94cit,arrdate,i94mode,depdate,i94visa,visatype
892,1215382.0,2016,4,245,20551,1,20569,1,B1
421,1330915.0,2016,4,687,20551,1,20558,1,B1
126,1346007.0,2016,4,746,20551,1,20566,1,B1
627,1346274.0,2016,4,746,20551,1,20554,1,B1
123,1643294.0,2016,4,273,20553,1,20558,1,B1
...,...,...,...,...,...,...,...,...,...
420,982263.0,2016,4,103,20550,1,20592,2,WT
848,982461.0,2016,4,103,20550,1,20551,2,WT
4,985523.0,2016,4,111,20550,3,20553,2,WT
897,991350.0,2016,4,111,20550,1,20555,2,WT


In [None]:
# Check unique
i94immi_df[i94immi_columns].value_counts()

cicid      i94yr  i94mon  i94cit  arrdate  i94mode  depdate  i94visa  visatype
1000074.0  2016   4       129     20550    1        20564    2        WT          1
480428.0   2016   4       148     20547    1        20559    2        WT          1
4718122.0  2016   4       209     20569    1        20573    2        WT          1
4718538.0  2016   4       209     20569    1        20574    2        WT          1
4729596.0  2016   4       245     20569    1        20584    2        B2          1
                                                                                 ..
2863583.0  2016   4       689     20559    1        20569    2        B2          1
2865787.0  2016   4       691     20559    1        20566    1        B1          1
2865828.0  2016   4       691     20559    1        20600    2        B2          1
2867437.0  2016   4       691     20559    1        20590    2        B2          1
999282.0   2016   4       129     20550    1        20553    1        WB         

In [None]:
i94immi_df['visatype'].value_counts(normalize=True)*100

WT     45.215563
B2     34.700315
WB      9.463722
B1      6.414301
GMT     2.523659
F1      0.841220
CP      0.525762
E2      0.315457
Name: visatype, dtype: float64

In [None]:
i94immi_df["visaranking"] = i94immi_df["visatype"].rank(ascending = True).astype("int")
i94immi_df.sort_values(by=['visatype','Visa Ranking'], ascending=True)

Unnamed: 0,cicid,i94yr,i94mon,i94cit,arrdate,i94mode,depdate,i94visa,visatype,Visa Ranking
30,5692439.0,2016,4,133,20574,1,20580,1,B1,31
34,4805034.0,2016,4,582,20569,1,20573,1,B1,31
41,692716.0,2016,4,245,20548,1,20651,1,B1,31
68,3293058.0,2016,4,691,20561,1,20565,1,B1,31
94,95870.0,2016,4,528,20545,1,20549,1,B1,31
...,...,...,...,...,...,...,...,...,...,...
989,1360834.0,2016,4,117,20552,1,20556,2,WT,736
992,3874218.0,2016,4,148,20565,1,20582,2,WT,736
994,5081809.0,2016,4,254,20571,1,20582,2,WT,736
995,4288772.0,2016,4,135,20567,1,20572,2,WT,736


In [None]:
i94immi_df['visatype'].value_counts()

WT     430
B2     330
WB      90
B1      61
GMT     24
F1       8
CP       5
E2       3
Name: visatype, dtype: int64

In [None]:
# Write to parquet partitioned by arrdate - Run on production
i94immi_df.write.partitionBy("arrdate").parquet(os.path.join(output_data, table), mode="overwrite")

=====================================================================================================

=====================================================================================================

In [None]:
def nan_percentage_calc(df):
    nan_demographics_df = pd.DataFrame(data=df.isnull().sum(), columns=['NaN'])
    nan_demographics_df.drop(nan_demographics_df[nan_demographics_df['NaN'] == 0].index, inplace = True)
    nan_demographics_df['% of NaN'] = (nan_demographics_df['NaN']/df.count())*100
    return nan_demographics_df

In [None]:
nan_percentage_calc(worldtempe_df)

Unnamed: 0,NaN,% of NaN
AverageTemperature,3448,3.571133
AverageTemperatureUncertainty,3448,3.571133


In [None]:
worldtempe_df.dropna(axis=0, inplace=True)

In [None]:
nan_percentage_calc(worldtempe_df)

Unnamed: 0,NaN,% of NaN
dt,,
AverageTemperature,,
AverageTemperatureUncertainty,,
City,,
Country,,
Latitude,,
Longitude,,
year,,
month,,


In [None]:
# Dropping 3 columns

worldtempe_df = worldtempe_df.drop(['AverageTemperatureUncertainty', 'Latitude', 'Longitude'], axis=1)
worldtempe_df.columns

Index(['dt', 'AverageTemperature', 'City', 'Country', 'year', 'month'], dtype='object')

In [None]:
def convert_column_names(df):
    cols = df.columns
    column_name_changed = []

    for col in cols:
        new_column = col.lstrip().rstrip().lower().replace (" ", "_").replace ("-", "_") #strip beginning spaces, makes lowercase, add underscpre
        column_name_changed.append(new_column)

    df.columns = column_name_changed

In [None]:
convert_column_names(worldtempe_df)
worldtempe_df.columns

Index(['dt', 'averagetemperature', 'city', 'country', 'year', 'month'], dtype='object')

```
===========================================================================================================
    demographics
===========================================================================================================
```

In [None]:
citydemo_df

In [None]:
citydemo_df.shape

(2891, 12)

In [None]:
citydemo_df['city'].nunique()

567

In [None]:
citydemo_df['state'].nunique()

49

In [None]:
# Percentage ratio between male population with total population
citydemo_df['pct_male_pop'] = (citydemo_df['male_population']/citydemo_df['total_population'])*100

# Percentage ratio between female population with total population
citydemo_df['pct_female_pop'] = (citydemo_df['Female Population']/citydemo_df['total_population'])*100

# American Indian and Alaska Native
citydemo_df['pct_native'] = (citydemo_df['American Indian and Alaska Native']/citydemo_df['total_population'])*100

# Asian
citydemo_df['pct_asian'] = (citydemo_df['Asian']/citydemo_df['total_population'])*100

# Black or African-American
citydemo_df['pct_black'] = (citydemo_df['Black or African-American']/citydemo_df['total_population'])*100

# Hispanic or Latino
citydemo_df['pct_hispanic'] = (citydemo_df['Hispanic or Latino']/citydemo_df['total_population'])*100

# White
citydemo_df['pct_white'] = (citydemo_df['White']/citydemo_df['citydemo_df'])*100

In [None]:
citydemo_df = citydemo_df.drop([], axis=1)
citydemo_df.columns

Perform convert data type

In [None]:
citydemo_clean_column_name_df['state'] = citydemo_clean_column_name_df['state'].astype(str)
citydemo_clean_column_name_df['median_age'] = citydemo_clean_column_name_df['median_age'].astype(int)
citydemo_clean_column_name_df['male_population'] = citydemo_clean_column_name_df['male_population'].astype(int)
citydemo_clean_column_name_df['female_population'] = citydemo_clean_column_name_df['female_population'].astype(int)
citydemo_clean_column_name_df['total_population'] = citydemo_clean_column_name_df['total_population'].astype(int)
citydemo_clean_column_name_df['number_of_veterans'] = citydemo_clean_column_name_df['number_of_veterans'].astype(int)
citydemo_clean_column_name_df['foreign_born'] = citydemo_clean_column_name_df['foreign_born'].astype(int)

citydemo_clean_column_name_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  citydemo_clean_column_name_df['state'] = citydemo_clean_column_name_df['state'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  citydemo_clean_column_name_df['median_age'] = citydemo_clean_column_name_df['median_age'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  citydemo_cl

Unnamed: 0,city,state,median_age,male_population,female_population,total_population,number_of_veterans,foreign_born,average_household_size,state_code,race,count
0,SILVER SPRING,Maryland,33,40601,41862,82463,1562,30908,2.6,MD,Hispanic or Latino,25924
1,QUINCY,Massachusetts,41,44129,49500,93629,4147,32935,2.39,MA,White,58723
2,HOOVER,Alabama,38,38040,46799,84839,4819,8229,2.58,AL,Asian,4759
3,RANCHO CUCAMONGA,California,34,88127,87105,175232,5821,33878,3.18,CA,Black or African-American,24437
4,NEWARK,New Jersey,34,138040,143873,281913,5829,86253,2.73,NJ,White,76402


Verify converted datatype

In [None]:
citydemo_clean_column_name_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2875 entries, 0 to 2890
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   city                    2875 non-null   object 
 1   state                   2875 non-null   object 
 2   median_age              2875 non-null   int32  
 3   male_population         2875 non-null   int32  
 4   female_population       2875 non-null   int32  
 5   total_population        2875 non-null   int32  
 6   number_of_veterans      2875 non-null   int32  
 7   foreign_born            2875 non-null   int32  
 8   average_household_size  2875 non-null   float64
 9   state_code              2875 non-null   object 
 10  race                    2875 non-null   object 
 11  count                   2875 non-null   int64  
dtypes: float64(1), int32(6), int64(1), object(4)
memory usage: 224.6+ KB


```
===========================================================================================================
    teperature
===========================================================================================================
```

In [None]:
def nan_percentage_calc(df):
    nan_demographics_df = pd.DataFrame(data=df.isnull().sum(), columns=['NaN'])
    nan_demographics_df.drop(nan_demographics_df[nan_demographics_df['NaN'] == 0].index, inplace = True)
    nan_demographics_df['% of NaN'] = (nan_demographics_df['NaN']/df.count())*100
    return nan_demographics_df

In [None]:
worldtempe_df.dropna(axis=0, inplace=True)

In [None]:
nan_percentage_calc(worldtempe_df)

In [None]:
# Dropping 3 columns

worldtempe_df = worldtempe_df.drop(['AverageTemperatureUncertainty', 'Latitude', 'Longitude'], axis=1)
worldtempe_df.columns

In [None]:
def convert_column_names(df):
    cols = df.columns
    column_name_changed = []

    for col in cols:
        new_column = col.lstrip().rstrip().lower().replace (" ", "_").replace ("-", "_") #strip beginning spaces, makes lowercase, add underscpre
        column_name_changed.append(new_column)

    df.columns = column_name_changed

In [None]:
convert_column_names(worldtempe_df)
worldtempe_df.columns