## Import files to create model¶

In [1]:
import configparser
import datetime as dt
from datetime import datetime
import os
import glob
from pyspark.sql import SparkSession
from pyspark.sql.types import DateType
from pyspark.sql.functions import udf, col, lit, year, month, upper, to_date
from pyspark.sql.functions import monotonically_increasing_id
import pandas as pd
from pyspark.sql.types import *
from pyspark.sql.types import StructType as R, StructField as Fld, DoubleType as Dbl, StringType as Str, IntegerType as Int, DateType as Date

## Load configuration data

In [2]:
config = configparser.ConfigParser()
config.read_file(open('dl.cfg'))

os.environ["AWS_ACCESS_KEY_ID"]= config['AWS']['AWS_ACCESS_KEY_ID']
os.environ["AWS_SECRET_ACCESS_KEY"]= config['AWS']['AWS_SECRET_ACCESS_KEY']

## Create Spark session

In [3]:
#creating the session
spark = SparkSession \
        .builder \
        .config("spark.jars.repositories", "https://repos.spark-packages.org/")\
        .config("spark.jars.packages", "saurfang:spark-sas7bdat:2.0.0-s_2.11")\
        .enableHiveSupport().getOrCreate()

In [12]:
output_data = "Capstone_Project/"

table_name = ['Immigrations/','Immigrants/','Airports/','Populations/','Population_Statistics/','Temperatures/',\
              'Temperature_Statistics/','Countries/','States/','Ports/','Visas/']

In [22]:
# Get Tables from 
def get_table(output_data, table_name):
    """Count checks on fact and dimension table to ensure that tables are not empty.
    :param df: spark dataframe to check counts on
    :param table_name: corresponding name of table
    """
    name = output_data + table_name
    df= spark.read.parquet(name)
    return df
        

In [25]:
Immigrations = get_table(output_data, 'Immigrations/')
Immigrations.show(5)

+------+----+-----+---------+---------+---------+------------+--------------+----------+--------------+----------+
|cic_id|year|month|port_code|mode_code|visa_code|arrival_date|departure_date|match_flag|immigration_id|state_code|
+------+----+-----+---------+---------+---------+------------+--------------+----------+--------------+----------+
|  1830|2016|    4|      SFB|        1|        2|     20545.0|       20552.0|         M| 1486058684426|        FL|
|  5055|2016|    4|      NEW|        1|        2|     20545.0|       20559.0|         M| 1486058684440|        FL|
|  9066|2016|    4|      MIA|        1|        2|     20545.0|       20554.0|         M| 1486058684461|        FL|
|  9181|2016|    4|      MIA|        1|        2|     20545.0|       20560.0|         M| 1486058684462|        FL|
|  9347|2016|    4|      MIA|        1|        2|     20545.0|       20555.0|         M| 1486058684463|        FL|
+------+----+-----+---------+---------+---------+------------+--------------+---

In [27]:
Populations = get_table(output_data, 'Populations/')
Populations.limit(5).toPandas()

Unnamed: 0,city,state_code,male_population,female_population,total_population,num_of_veterans,foreign_born,race,population_id,state
0,AUGUSTA-RICHMOND COUNTY CONSOLIDATED GOVERNMENT,GA,94662,101917,196579,19085,7915,American Indian and Alaska Native,584115552262,GEORGIA
1,LOUISVILLE/JEFFERSON COUNTY METRO GOVERNMENT,KY,298451,316938,615389,39364,37875,American Indian and Alaska Native,1468878815250,KENTUCKY
2,ATHENS-CLARKE COUNTY UNIFIED GOVERNMENT,GA,57415,65148,122563,3953,12868,American Indian and Alaska Native,652835029006,GEORGIA
3,PALO ALTO,CA,31451,35397,66848,2025,23348,American Indian and Alaska Native,704374636547,CALIFORNIA
4,OXNARD,CA,101906,105346,207252,6367,78678,American Indian and Alaska Native,704374636548,CALIFORNIA


In [None]:
## do people from countries with warmer or cold climate immigrate to the US in large numbers?

In [29]:
Immigrants = get_table(output_data, 'Immigrants/')
Immigrants.limit(5).toPandas()

Unnamed: 0,cic_id,citizen_country,residence_country,age,gender,ins_num,immigrants_id
0,171,103,103,30,F,,1297080123392
1,235,103,103,27,F,,1297080123393
2,944,104,104,18,M,,1297080123394
3,962,104,104,64,F,,1297080123395
4,1071,104,104,71,M,,1297080123396


In [30]:
Temperatures = get_table(output_data, 'Temperatures/')
Temperatures.limit(5).toPandas()

Unnamed: 0,date,year,month,country,city,latitude,longitude
0,1828-05-01,1828,5,INDONESIA,BONTANG,0.80N,118.13E
1,1835-01-01,1835,1,INDONESIA,BONTANG,0.80N,118.13E
2,1846-11-01,1846,11,INDONESIA,BONTANG,0.80N,118.13E
3,1853-04-01,1853,4,CONGO (DEMOCRATIC REPUBLIC OF THE),BUTEMBO,0.80N,29.73E
4,1870-09-01,1870,9,CONGO (DEMOCRATIC REPUBLIC OF THE),BUTEMBO,0.80N,29.73E


In [31]:
Temperature_Statistics = get_table(output_data, 'Temperature_Statistics/')
Temperature_Statistics.limit(5).toPandas()

Unnamed: 0,date,year,month,country,city,avg_temp,avg_temp_uncertainty
0,1907-07-01,1907,7,CANADA,EDMONTON,14.739,0.624
1,1907-08-01,1907,8,CANADA,EDMONTON,12.001,0.603
2,1907-09-01,1907,9,CANADA,EDMONTON,8.319,0.324
3,1907-10-01,1907,10,CANADA,EDMONTON,6.339,0.77
4,1907-11-01,1907,11,CANADA,EDMONTON,-1.668,0.787


In [32]:
Countries = get_table(output_data, 'Countries/')
Countries.limit(5).toPandas()

Unnamed: 0,code,country
0,582,"MEXICO Air Sea, and Not Reported (I-94, no lan..."
1,719,INVALID: BOUVET ISLAND (ANTARCTICA/NORWAY TERR.)
2,739,INVALID: DRONNING MAUD LAND (ANTARCTICA-NORWAY)
3,740,INVALID: NEUTRAL ZONE (S. ARABIA/IRAQ)
4,394,INVALID: FRENCH SOUTHERN AND ANTARCTIC


In [35]:
df = Immigrants.join(Countries, Immigrants.citizen_country == Countries.code, 'left')

## Find Country name of Residency Country

In [38]:
df.limit(5).toPandas()

Unnamed: 0,cic_id,citizen_country,residence_country,age,gender,ins_num,immigrants_id,code,country
0,171,103,103,30,F,,1297080123392,103,AUSTRIA
1,235,103,103,27,F,,1297080123393,103,AUSTRIA
2,944,104,104,18,M,,1297080123394,104,BELGIUM
3,962,104,104,64,F,,1297080123395,104,BELGIUM
4,1071,104,104,71,M,,1297080123396,104,BELGIUM


## Find Immigrants below 30, coming from cold countries

In [45]:
immi_30 = df.select('cic_id','country','age').filter(df.age<=30)
immi_30_temp = immi_30.join(Temperature_Statistics, immi_30.country == Temperature_Statistics.country, 'inner')
immi_30_temp_cold = immi_30_temp.filter(immi_30_temp.avg_temp <10)
immi_30_temp_cold.limit(10).toPandas() 

Unnamed: 0,cic_id,country,age,date,year,month,country.1,city,avg_temp,avg_temp_uncertainty
0,386657,ARMENIA,23,1780-01-01,1780,1,ARMENIA,GYUMRI,-4.888,3.353
1,386657,ARMENIA,23,1780-02-01,1780,2,ARMENIA,GYUMRI,-2.479,2.98
2,386657,ARMENIA,23,1780-03-01,1780,3,ARMENIA,GYUMRI,2.809,2.57
3,386657,ARMENIA,23,1780-10-01,1780,10,ARMENIA,GYUMRI,9.975,2.398
4,386657,ARMENIA,23,1780-11-01,1780,11,ARMENIA,GYUMRI,5.266,2.525
5,386657,ARMENIA,23,1780-12-01,1780,12,ARMENIA,GYUMRI,-3.199,2.698
6,386657,ARMENIA,23,1781-01-01,1781,1,ARMENIA,GYUMRI,-4.199,4.698
7,386657,ARMENIA,23,1781-02-01,1781,2,ARMENIA,GYUMRI,-3.88,5.982
8,386657,ARMENIA,23,1781-03-01,1781,3,ARMENIA,GYUMRI,1.618,2.467
9,386657,ARMENIA,23,1781-04-01,1781,4,ARMENIA,GYUMRI,6.145,2.414
