In [1]:
from pyspark.sql import functions as F

In [2]:
# Evaluate `spark` so the notebook displays it. The name is not defined in the visible
# cells — presumably the SparkSession provided by the notebook environment (TODO confirm).
spark

### Read the transformed data

In [3]:
# Load the three parquet datasets from the transformed_data directory, in the same
# order as the names on the left-hand side.
immigration_df, temperature_df, demographics_df = (
    spark.read.parquet(parquet_path)
    for parquet_path in (
        "transformed_data/i94_transformed/",
        "transformed_data/temperature_transformed/",
        "transformed_data/demographics_transformed/",
    )
)

In [4]:
# Print the column names, types and nullability of the immigration DataFrame.
immigration_df.printSchema()

root
 |-- immigration_year: integer (nullable = true)
 |-- immigration_month: integer (nullable = true)
 |-- immigration_source: string (nullable = true)
 |-- immigration_destination_port: string (nullable = true)
 |-- immigrant_age: integer (nullable = true)
 |-- immigration_state: string (nullable = true)
 |-- visa_code: double (nullable = true)
 |-- gender: string (nullable = true)
 |-- visa_type: string (nullable = true)



In [5]:
# Print the column names, types and nullability of the temperature DataFrame.
temperature_df.printSchema()

root
 |-- city: string (nullable = true)
 |-- month: string (nullable = true)
 |-- avg_temperature: double (nullable = true)



In [6]:
# Print the column names, types and nullability of the demographics DataFrame.
demographics_df.printSchema()

root
 |-- city: string (nullable = true)
 |-- median_age: double (nullable = true)
 |-- male_population: integer (nullable = true)
 |-- female_population: integer (nullable = true)
 |-- total_population: integer (nullable = true)
 |-- foreign_born: integer (nullable = true)
 |-- houshold_size: double (nullable = true)
 |-- state: string (nullable = true)
 |-- african_american: long (nullable = true)
 |-- hispanic: long (nullable = true)
 |-- white: long (nullable = true)
 |-- asian: long (nullable = true)
 |-- native_american: long (nullable = true)



### Top-20 Cities with most immigration in 2016

In [9]:
# Count rows per destination port, order by descending count and keep the 20 largest.
# The resulting list holds the raw port values, including the placeholder 'unknown'.
port_counts = immigration_df.groupby('immigration_destination_port').count()
busiest_ports = port_counts.orderBy(F.desc("count")).take(20)
top_20_cities = [port_row['immigration_destination_port'] for port_row in busiest_ports]
print("Top 20 cities with most immigration in 2016: ")
print(top_20_cities)

Top 20 cities with most immigration in 2016: 
['new york', 'miami', 'los angeles', 'san francisco', 'honolulu', 'newark/teterboro', 'chicago', 'orlando', 'houston', 'agana', 'unknown', 'atlanta', 'fort lauderdale', 'dallas', 'boston', 'las vegas', 'seattle', 'saipan', 'detroit', 'philadelphia']


**Top-10 cities with most immigrants as students**

In [10]:
# Rows with visa_type 'F1' (the student arrivals of this section), counted per
# destination port; the ten largest counts are shown without truncating the port names.
student_arrivals = immigration_df.filter("visa_type = 'F1'")
student_port_counts = student_arrivals.groupby('immigration_destination_port').count()
student_port_counts.orderBy(F.desc("count")).show(10, False)

+----------------------------+------+
|immigration_destination_port|count |
+----------------------------+------+
|new york                    |262438|
|los angeles                 |194823|
|chicago                     |170813|
|san francisco               |114824|
|boston                      |89755 |
|unknown                     |75599 |
|newark/teterboro            |67041 |
|miami                       |64788 |
|seattle                     |63296 |
|dallas                      |61066 |
+----------------------------+------+
only showing top 10 rows



**This makes sense as NY, LA, Chicago, SF, Boston are known for their colleges and universities.**
NOTE: Here `unknown` means we could not find a mapping for the port in the I-94 dataset.

**Top-10 cities with most immigrants for tourism**

In [11]:
# Rows with visa_type 'B1' or 'B2', counted per destination port; the ten largest
# counts are shown without truncating the port names.
# NOTE(review): B1 is presumably the business-visitor class and B2 the tourist class, so
# this "tourism" table may also count business visitors — confirm against the I-94
# visa-type codes.
b_visa_arrivals = immigration_df.filter("visa_type = 'B1' OR visa_type = 'B2'")
b_visa_port_counts = b_visa_arrivals.groupby('immigration_destination_port').count()
b_visa_port_counts.orderBy(F.desc("count")).show(10, False)

+----------------------------+-------+
|immigration_destination_port|count  |
+----------------------------+-------+
|miami                       |3414807|
|new york                    |2703032|
|los angeles                 |1900427|
|houston                     |916565 |
|san francisco               |877266 |
|orlando                     |817880 |
|fort lauderdale             |817167 |
|chicago                     |774546 |
|unknown                     |620965 |
|newark/teterboro            |575530 |
+----------------------------+-------+
only showing top 10 rows



**This makes sense as we see a lot of tourist-y places here like Miami, NY and even Orlando**

**Top-10 cities with most business travel**

In [13]:
# Rows with visa_type 'WB' or 'WT', counted per destination port; the ten largest
# counts are shown without truncating the port names.
# NOTE(review): WB/WT look like the visa-waiver business / tourist classes. If so, WT
# (by far the larger of the two in the visa_type counts) would dominate this
# "business travel" table — confirm against the I-94 visa-type codes.
waiver_arrivals = immigration_df.filter("visa_type = 'WB' OR visa_type = 'WT'")
waiver_port_counts = waiver_arrivals.groupby('immigration_destination_port').count()
waiver_port_counts.orderBy(F.desc("count")).show(10, False)

+----------------------------+-------+
|immigration_destination_port|count  |
+----------------------------+-------+
|new york                    |3643075|
|los angeles                 |2463068|
|honolulu                    |2056488|
|miami                       |1597582|
|san francisco               |1293873|
|newark/teterboro            |1204346|
|orlando                     |857383 |
|chicago                     |816490 |
|boston                      |561680 |
|atlanta                     |531525 |
+----------------------------+-------+
only showing top 10 rows



**Which month has the most student travelers?**

In [14]:
# Count the 'F1' rows per month, order by descending count and read the month of the
# first (largest) row.
student_month_counts = immigration_df.filter("visa_type = 'F1'").groupby('immigration_month').count()
busiest_month_row = student_month_counts.orderBy(F.desc("count")).collect()[0]
max_month_students = busiest_month_row['immigration_month']
print("Maximum student travel happens in month: ", max_month_students)

Maximum student travel happens in month:  8


**This is explained by the fact that a lot of universities begin their Fall intake around August/September**

In [15]:
# Number of immigration rows for every visa_type value.
visa_type_counts = immigration_df.groupBy('visa_type').count()
visa_type_counts.show()

+---------+--------+
|visa_type|   count|
+---------+--------+
|       F2|   68866|
|      GMB|    2728|
|       B2|15188785|
|       F1| 1487432|
|      CPL|     236|
|       I1|    2825|
|       WB| 2940456|
|       M1|   16306|
|       B1| 2282096|
|       WT|16915615|
|       M2|     667|
|       CP|  272007|
|      GMT| 1265275|
|       E1|   48905|
|        I|   39054|
|       E2|  259215|
|      SBP|      61|
+---------+--------+



**Below we compute a "diversity" score for each city by taking the population standard deviation of its per-race population counts**

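NOTE: This is a rather naive method, but for now we stick with this heuristic. Because the score is computed from raw counts and is not normalised by total population, larger cities tend to score higher.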

In [18]:
from pyspark.sql.types import FloatType
from pyspark.sql.functions import udf
def diverse_score(a, b, c, d, e):
    """Return the population standard deviation of five counts.

    Any falsy argument (``None`` or ``0``) is replaced by 10 before the
    deviation is computed.

    Args:
        a, b, c, d, e: The five numeric counts; each may be ``None``.

    Returns:
        float: ``statistics.pstdev`` of the five (possibly substituted) values.
    """
    # Imported inside the function body, as in the original cell.
    import statistics
    counts = [value if value else 10 for value in (a, b, c, d, e)]
    return float(statistics.pstdev(counts))
# Wrap the Python scoring function as a Spark UDF producing a FloatType column.
diverse_score_udf = udf(diverse_score, FloatType())

# Immigration rows restricted to the 20 busiest destination ports and to the month with
# the most 'F1' arrivals (both computed in earlier cells).
immigration_df_top_20 = immigration_df.filter(
    (F.col('immigration_destination_port').isin(top_20_cities)) &
    (F.col('immigration_month')==max_month_students)
)

# Demographics with the 'diversity' score appended as an extra column.
diversity_df = demographics_df.withColumn('diversity',
                        diverse_score_udf('african_american', 'white', 'hispanic', 'asian', 'native_american'))

# take(20) returns only the 20 highest-scoring rows; collect()[:20] would first bring
# every row of the sorted DataFrame to the driver and then discard all but 20.
top_20_diverse_cities = [row['city'] for row in diversity_df.orderBy(F.desc('diversity')).select('city').take(20)]

In [19]:
# Display the 20 city names with the highest diversity score, highest score first.
top_20_diverse_cities

['new york',
 'los angeles',
 'houston',
 'san antonio',
 'chicago',
 'phoenix',
 'san diego',
 'dallas',
 'philadelphia',
 'el paso',
 'austin',
 'fort worth',
 'detroit',
 'indianapolis',
 'columbus',
 'jacksonville',
 'denver',
 'portland',
 'seattle',
 'san jose']

**Here, we see a lot of coastal cities and urban areas in the diverse cities**

In [None]:
# Inner-join the filtered immigration rows with the demographics of the destination city.
# diversity_df is demographics_df plus the 'diversity' column, so joining it yields the
# same columns as joining demographics_df and re-applying the UDF afterwards, without
# running the Python UDF again on every joined immigration row.
# NOTE(review): the join key is the city name only; if the demographics data contains the
# same city name for more than one state, matching rows would be multiplied — confirm.
immigration_demographics = immigration_df_top_20.join(
    diversity_df,
    immigration_df_top_20.immigration_destination_port == diversity_df.city,
    how='inner',
)

immigration_demographics.printSchema()

In [None]:
# Keep one row per (city, diversity) pair, order by descending diversity score and show
# up to 100 rows.
# NOTE(review): dropDuplicates keeps an arbitrary row of each group, so the visa_type
# shown next to a city is not meaningful on its own — confirm whether it should be shown.
city_diversity = immigration_demographics.dropDuplicates(["city", "diversity"])
city_diversity_ranked = city_diversity.orderBy(F.desc('diversity'))
city_diversity_ranked.select('city', 'visa_type', 'diversity').show(100)