In [7]:
from pyspark.sql import functions as F

In [1]:
# Echo the active SparkSession so its details are visible in the notebook.
# NOTE(review): `spark` is not created in any visible cell - presumably it is
# injected by the kernel/environment; confirm before running on a fresh kernel.
spark

In [2]:
# Read the three transformed parquet datasets (immigration, temperature,
# demographics) in one unpacking assignment; order of paths matches the names.
immigration_df, temperature_df, demographics_df = (
    spark.read.parquet(path)
    for path in (
        "transformed_data/i94_transformed/",
        "transformed_data/temperature_transformed/",
        "transformed_data/demographics_transformed/",
    )
)

In [3]:
# Print column names, types and nullability of the immigration dataset.
immigration_df.printSchema()

root
 |-- immigration_year: integer (nullable = true)
 |-- immigration_month: integer (nullable = true)
 |-- immigration_source: string (nullable = true)
 |-- immigration_destination_port: string (nullable = true)
 |-- immigrant_age: integer (nullable = true)
 |-- immigration_state: string (nullable = true)
 |-- visa_code: double (nullable = true)
 |-- gender: string (nullable = true)
 |-- visa_type: string (nullable = true)



In [4]:
# Print column names, types and nullability of the temperature dataset.
temperature_df.printSchema()

root
 |-- city: string (nullable = true)
 |-- month: string (nullable = true)
 |-- avg_temperature: double (nullable = true)



In [5]:
# Print column names, types and nullability of the demographics dataset.
demographics_df.printSchema()

root
 |-- city: string (nullable = true)
 |-- median_age: double (nullable = true)
 |-- male_population: integer (nullable = true)
 |-- female_population: integer (nullable = true)
 |-- total_population: integer (nullable = true)
 |-- foreign_born: integer (nullable = true)
 |-- houshold_size: double (nullable = true)
 |-- state: string (nullable = true)
 |-- african_american: long (nullable = true)
 |-- hispanic: long (nullable = true)
 |-- white: long (nullable = true)
 |-- asian: long (nullable = true)
 |-- native_american: long (nullable = true)



**Top-10 Cities with most immigration in 2016**

In [8]:
# Ten destination ports with the most immigration records, busiest first.
(
    immigration_df
    .groupBy('immigration_destination_port')
    .count()
    .orderBy(F.desc("count"))
    .show(n=10, truncate=False)
)

+----------------------------+-------+
|immigration_destination_port|count  |
+----------------------------+-------+
|new york                    |6678555|
|miami                       |5122889|
|los angeles                 |4602847|
|san francisco               |2309621|
|honolulu                    |2249967|
|newark/teterboro            |1867734|
|chicago                     |1798697|
|orlando                     |1698944|
|houston                     |1347272|
|agana                       |1337940|
+----------------------------+-------+
only showing top 10 rows



In [39]:
# Gather the busiest destination ports (by record count) into a set of names.
# NOTE(review): the variable is called "top_20" but take(100) keeps the 100
# busiest ports - confirm which of the two is intended.
top_20_cities = {
    row['immigration_destination_port']
    for row in (
        immigration_df
        .groupBy('immigration_destination_port')
        .count()
        .orderBy(F.desc("count"))
        .take(100)
    )
}
top_20_cities

{'agana',
 'alexandria bay',
 'anchorage',
 'anzalduas',
 'atlanta',
 'austin',
 'baltimore',
 'bangor',
 'blaine',
 'boston',
 'bradenton - sarasota',
 'bridge of americas',
 'brownsville',
 'buffalo',
 'calais',
 'calexico',
 'cannon intl - reno/tahoe',
 'champlain',
 'charlotte',
 'charlotte amalie',
 'chicago',
 'cincinnati',
 'cleveland',
 'columbus',
 'dallas',
 'denver',
 'derby line',
 'detroit',
 'dublin',
 'fort lauderdale',
 'fort myers',
 'hakai pass',
 'hamilton',
 'hidalgo',
 'highgate springs',
 'honolulu',
 'houlton',
 'houston',
 'juarez-lincoln bridge',
 'kahului - maui',
 'keahole-kona',
 'laredo',
 'las vegas',
 'lewiston',
 'long beach',
 'los angeles',
 'lynden',
 'mcallen',
 'miami',
 'montreal',
 'nassau',
 'new orleans',
 'new york',
 'newark/teterboro',
 'niagara falls',
 'nogales',
 'north caicos',
 'oakland',
 'ontario',
 'opa locka',
 'orlando',
 'otay mesa',
 'palm springs',
 'peace bridge',
 'pembina',
 'philadelphia',
 'phoenix',
 'piegan',
 'pittsburg',

**Top-10 cities with most immigrants for studies**

In [13]:
# Ten destination ports with the most F1-visa records, busiest first.
(
    immigration_df
    .where("visa_type = 'F1'")
    .groupBy('immigration_destination_port')
    .count()
    .orderBy(F.desc("count"))
    .show(n=10, truncate=False)
)

+----------------------------+------+
|immigration_destination_port|count |
+----------------------------+------+
|new york                    |262438|
|los angeles                 |194823|
|chicago                     |170813|
|san francisco               |114824|
|boston                      |89755 |
|unknown                     |75599 |
|newark/teterboro            |67041 |
|miami                       |64788 |
|seattle                     |63296 |
|dallas                      |61066 |
+----------------------------+------+
only showing top 10 rows



**Top-10 cities with most B-visa visitors (B1 business / B2 tourism)**

In [15]:
# Ten destination ports with the most records on a B1 or B2 visa, busiest first.
(
    immigration_df
    .where("visa_type = 'B1' OR visa_type = 'B2'")
    .groupBy('immigration_destination_port')
    .count()
    .orderBy(F.desc("count"))
    .show(n=10, truncate=False)
)

+----------------------------+-------+
|immigration_destination_port|count  |
+----------------------------+-------+
|miami                       |3414807|
|new york                    |2703032|
|los angeles                 |1900427|
|houston                     |916565 |
|san francisco               |877266 |
|orlando                     |817880 |
|fort lauderdale             |817167 |
|chicago                     |774546 |
|unknown                     |620965 |
|newark/teterboro            |575530 |
+----------------------------+-------+
only showing top 10 rows



**Top-10 cities with most Visa Waiver Program visitors (WB business / WT tourism)**

In [17]:
# Ten destination ports with the most records on a WB or WT visa, busiest first.
(
    immigration_df
    .where("visa_type = 'WB' OR visa_type = 'WT'")
    .groupBy('immigration_destination_port')
    .count()
    .orderBy(F.desc("count"))
    .show(n=10, truncate=False)
)

+----------------------------+-------+
|immigration_destination_port|count  |
+----------------------------+-------+
|new york                    |3643075|
|los angeles                 |2463068|
|honolulu                    |2056488|
|miami                       |1597582|
|san francisco               |1293873|
|newark/teterboro            |1204346|
|orlando                     |857383 |
|chicago                     |816490 |
|boston                      |561680 |
|atlanta                     |531525 |
+----------------------------+-------+
only showing top 10 rows



**Which month has the most student travelers?**

In [44]:
# Month value with the largest number of F1-visa records: count per month,
# sort descending, and read the month off the first collected row.
max_month_students = (
    immigration_df
    .where("visa_type = 'F1'")
    .groupBy('immigration_month')
    .count()
    .orderBy(F.desc("count"))
    .collect()[0]['immigration_month']
)

In [None]:
# F1-visa record counts per month, busiest month first, so the answer to this
# section's question is actually displayed. Grouping is by month rather than
# by destination port (the per-port F1 ranking is already shown in the
# "for studies" section). 12 rows are enough to show every calendar month.
(
    immigration_df
    .filter("visa_type = 'F1'")
    .groupby('immigration_month')
    .count()
    .sort(F.col("count").desc())
    .show(12, False)
)

In [16]:
# Number of immigration records per visa type (default display: up to 20 rows).
(
    immigration_df
    .groupby('visa_type')
    .count()
    .show(n=20, truncate=True)
)

+---------+--------+
|visa_type|   count|
+---------+--------+
|       F2|   68866|
|      GMB|    2728|
|       B2|15188785|
|       F1| 1487432|
|      CPL|     236|
|       I1|    2825|
|       WB| 2940456|
|       M1|   16306|
|       B1| 2282096|
|       WT|16915615|
|       M2|     667|
|       CP|  272007|
|      GMT| 1265275|
|       E1|   48905|
|        I|   39054|
|       E2|  259215|
|      SBP|      61|
+---------+--------+



In [51]:
from pyspark.sql.types import FloatType
from pyspark.sql.functions import udf
def diverse_score(a, b, c, d, e):
    """Return the population standard deviation of five group counts.

    Args:
        a, b, c, d, e: Numeric counts, one per group. ``None`` marks a
            missing value.

    Returns:
        float: ``statistics.pstdev`` of the five values, where every missing
        (``None``) value is replaced by the placeholder 10.

    NOTE(review): a standard deviation of raw counts grows with the absolute
    size of the counts, so larger inputs yield larger scores - confirm this is
    the intended notion of "diversity".
    """
    import statistics  # function-local import keeps the UDF body self-contained

    fallback = 10  # placeholder substituted for missing counts
    # Test against None explicitly: a genuine count of 0 is valid data and
    # must not be overwritten by the placeholder (a plain truthiness check
    # would replace it).
    counts = [fallback if value is None else value for value in (a, b, c, d, e)]
    return float(statistics.pstdev(counts))
# Expose the scorer to Spark as a UDF producing a FloatType column.
diverse_score_udf = udf(diverse_score, returnType=FloatType())

# Immigration records limited to the ports in `top_20_cities` and to the
# month stored in `max_month_students`.
immigration_df_top_20 = (
    immigration_df
    .where(F.col('immigration_destination_port').isin(top_20_cities))
    .where(F.col('immigration_month') == max_month_students)
)

# Demographics with an extra `diversity` column computed from the five
# per-group population columns.
diversity_df = demographics_df.withColumn(
    'diversity',
    diverse_score_udf(
        'african_american',
        'white',
        'hispanic',
        'asian',
        'native_american',
    ),
)
# The 20 cities with the highest diversity score, as a list of Row(city=...).
# take(20) lets Spark return only the first 20 ordered rows instead of
# collecting every city to the driver and slicing the list locally.
top_20_diverse_cities = diversity_df.orderBy(F.desc('diversity')).select('city').take(20)

In [52]:
# Display the collected Row(city=...) objects, ordered by descending diversity.
top_20_diverse_cities

[Row(city='new york'),
 Row(city='los angeles'),
 Row(city='houston'),
 Row(city='san antonio'),
 Row(city='chicago'),
 Row(city='phoenix'),
 Row(city='san diego'),
 Row(city='dallas'),
 Row(city='philadelphia'),
 Row(city='el paso'),
 Row(city='austin'),
 Row(city='fort worth'),
 Row(city='detroit'),
 Row(city='indianapolis'),
 Row(city='columbus'),
 Row(city='jacksonville'),
 Row(city='denver'),
 Row(city='portland'),
 Row(city='seattle'),
 Row(city='san jose')]

In [None]:
# Attach demographics and the diversity score to the filtered immigration
# records. Joining against `diversity_df` (demographics with `diversity`
# already computed) yields the same columns in the same order as joining
# `demographics_df` and re-applying the UDF afterwards, but the Python UDF is
# evaluated once per demographics row instead of once per joined immigration
# row.
# NOTE(review): the join key is the city name only; demographics also carries
# a `state` column, so same-named cities in different states would each match
# - confirm whether the state should be part of the key.
immigration_demographics = immigration_df_top_20.join(
    diversity_df,
    immigration_df_top_20.immigration_destination_port == diversity_df.city,
    how='inner',
)

immigration_demographics.printSchema()

In [47]:
# One row per (city, diversity) pair, highest diversity first, up to 100 rows.
# NOTE(review): dropDuplicates keeps an arbitrary row for each pair, so the
# visa_type displayed is whichever record survived, not a per-city summary.
(
    immigration_demographics
    .dropDuplicates(["city", "diversity"])
    .orderBy(F.desc('diversity'))
    .select('city', 'visa_type', 'diversity')
    .show(100)
)

+---------------+---------+----------+
|           city|visa_type| diversity|
+---------------+---------+----------+
|       new york|       F1| 1246493.5|
|    los angeles|       WT|  863757.4|
|        houston|       B1|  515606.9|
|    san antonio|       B2| 511733.94|
|        chicago|       WT|  488449.4|
|        phoenix|       WT| 438764.22|
|      san diego|       B2|  328560.4|
|         dallas|       F1| 310543.16|
|   philadelphia|       WT| 286383.22|
|         austin|       B2|  257024.5|
|        detroit|       WB| 203990.31|
|       columbus|       B1|  202806.1|
|         denver|       B2|  197874.1|
|       portland|       WT|  191821.5|
|        seattle|       F1| 184081.45|
|       san jose|       WT| 179433.14|
|  san francisco|       WT| 165125.94|
|      charlotte|       WT| 164356.67|
|         tucson|       B1| 152321.97|
|      baltimore|       F1|  150909.9|
|      las vegas|       WT| 150903.98|
|          miami|       B2| 149269.52|
|       san juan|       W