## Exploratory Analysis in Pandas
##### Use this notebook only with a subset of the immigration data. If the full dataset was loaded, use Spark SQL to select a subset of the data before running the analysis.

In [None]:
# Import packages
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import desc
from pyspark.sql.functions import asc
from pyspark.sql.functions import sum as Fsum
import configparser
from datetime import datetime
import os
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Create Spark session
# The query result is later collected to the driver with toPandas(), so the
# driver-side limits are the ones that matter here.
spark = (
    SparkSession.builder
    .appName("Analysis of Immigration Data")
    .config("spark.driver.memory", "30g")
    .config("spark.executor.memory", "8g")
    # "0" means no cap on the total size of results collected to the driver.
    # The property name is `maxResultSize`; the earlier spelling
    # `maxResultsSize` is not a Spark property, so the setting had no effect.
    .config("spark.driver.maxResultSize", "0")
    .getOrCreate()
)

In [3]:
# Names of the parquet datasets; each one is registered as a Spark temp view
# under the same name.
table_name = [
    'immigration',
    'demographic',
    'state_race',
    'time',
    'mode',
    'visa',
    'port',
    'country',
    'temperature',
]

In [4]:
# Create views for all tables
# Each parquet dataset is registered as a temp view named after the table,
# then checked to make sure it is not empty.
for table in table_name:
    df_table = spark.read.parquet("./parquet files/" + table)
    df_table.createOrReplaceTempView(table)
    count_rows = spark.sql('''
        SELECT count(*) FROM {}'''.format(table))
    if count_rows.collect()[0][0] == 0:
        # .format() belongs on the message string; calling it on the
        # ValueError instance raised AttributeError instead of this error.
        raise ValueError("Data quality check failed. {} returned no results".format(table))

In [5]:
# Immigration data joined with its lookup tables.
# - `country` is joined twice: alias `c` on cit_ctry (citizenship) and alias
#   `r` on res_ctry (residence).
# - All joins are inner joins, so rows without a match in any lookup table
#   (country, mode, port, visa, time) are dropped.
# - The WHERE clause keeps only rows with gender 'F' or 'M', a non-null
#   depart_date and a non-null age.
imm = spark.sql('''
    select cicid, 
    c.name as citizen_country,
    c.continent as citizen_continent,
    r.name as residential_country,
    m.mode,
    address_state,
    arrival_date,
    t.month as arrival_month,
    t.weekday as arrival_weekday,
    depart_date,
    age,
    birth_year,
    dpmt_visa,
    occupation,
    visa_expiry_date,
    gender,
    airline,
    p.city,
    p.state,
    v.visa,
    visa_code
    from immigration i
    inner join country c on c.code = i.cit_ctry
    inner join country r on r.code = i.res_ctry
    inner join mode m on m.code = i.trnps_mode_code
    inner join port p on p.code = i.port_code
    inner join visa v on v.code = i.visa
    inner join time t on t.date = i.arrival_date
    where gender in ('F','M') 
    and depart_date is not null 
    and age is not null
''')

In [7]:
# Show the column names and types of the joined result.
imm.printSchema()

root
 |-- cicid: integer (nullable = true)
 |-- citizen_country: string (nullable = true)
 |-- citizen_continent: string (nullable = true)
 |-- residential_country: string (nullable = true)
 |-- mode: string (nullable = true)
 |-- address_state: string (nullable = true)
 |-- arrival_date: date (nullable = true)
 |-- arrival_month: integer (nullable = true)
 |-- arrival_weekday: integer (nullable = true)
 |-- depart_date: date (nullable = true)
 |-- age: integer (nullable = true)
 |-- birth_year: integer (nullable = true)
 |-- dpmt_visa: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- visa_expiry_date: date (nullable = true)
 |-- gender: string (nullable = true)
 |-- airline: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- visa: string (nullable = true)
 |-- visa_code: string (nullable = true)



In [None]:
# Collect the whole Spark result into a pandas DataFrame on the driver.
# NOTE(review): this only fits in memory for a subset of the data, as the
# notebook header says — confirm the input size before running.
imm_df=imm.toPandas()

In [None]:
# Preview the first rows of the pandas frame.
imm_df.head()

#### Cleaning

In [None]:
# Number of rows with a missing age (summing the boolean mask counts the True values)
int(imm_df['age'].isna().sum())

In [None]:
# Check gender field's validity: list the distinct values present.
imm_df['gender'].unique()

In [None]:
# Number of rows without a depart_date
int(imm_df['depart_date'].isna().sum())

In [None]:
# Data cleaning - keep only rows that have a depart date
imm_df_dn = imm_df.dropna(subset=['depart_date'])

In [None]:
# Data cleaning - remove missing/invalid gender.
# isin() is False for NaN, so one mask covers both the "missing" and the
# "not F/M" cases. The explicit copy makes imm_df_dn an independent frame, so
# adding columns to it later does not trigger SettingWithCopyWarning.
imm_df_dn = imm_df_dn[imm_df_dn['gender'].isin(['F', 'M'])].copy()

In [None]:
# Function for annotation
def show_values_on_bars(axs):
    """Label every bar with its height, formatted to two decimals.

    Parameters
    ----------
    axs : a single axes-like object, or a numpy array of them
        Each object must expose ``patches`` (items with ``get_x``, ``get_y``,
        ``get_width`` and ``get_height``) and a ``text`` method.

    The label is centred horizontally on the bar and placed at the bar's top
    edge (``y + height``). Returns None.
    """
    def _annotate(axis):
        for bar in axis.patches:
            height = bar.get_height()
            x_mid = bar.get_x() + bar.get_width() / 2
            y_top = bar.get_y() + height
            axis.text(x_mid, y_top, '{:.2f}'.format(height), ha="center", fontsize=12)

    # A grid of subplots arrives as an ndarray; a single plot arrives bare.
    targets = axs.flat if isinstance(axs, np.ndarray) else [axs]
    for axis in targets:
        _annotate(axis)

#fig, ax = plt.subplots(1, 2)
#show_values_on_bars(ax)

#### Q: Which 10 countries sent the most visitors to the U.S.?

In [None]:
# Ten most frequent citizenship countries in the cleaned data
top_ctry = imm_df_dn['citizen_country'].value_counts().head(10)
fig, ax = plt.subplots(figsize=(12, 4))
# x/y are passed by keyword: seaborn 0.12 and later reject positional data
# vectors. ax=ax draws on the axes that is annotated below.
sns.barplot(x=top_ctry.index, y=top_ctry.values, palette='coolwarm', ax=ax)
plt.title("Top 10 countries visited U.S", fontsize=15)
plt.xticks(fontsize=15, rotation=25)
plt.yticks(fontsize=15)
show_values_on_bars(ax)
# Run the layout last so it accounts for the title and rotated tick labels.
plt.tight_layout()

##### A: The UK contributes the most visitors to the U.S.

#### Q: Which 20 U.S. states were visited the most?

In [None]:
# Twenty most frequent address states in the cleaned data
top20_state = imm_df_dn['address_state'].value_counts().head(20)
fig, ax1 = plt.subplots(figsize=(12, 4))
# x/y are passed by keyword: seaborn 0.12 and later reject positional data
# vectors. ax=ax1 draws on the axes that is annotated below.
sns.barplot(x=top20_state.index, y=top20_state.values, palette='coolwarm', ax=ax1)
plt.title("Top 20 States visited in U.S", fontsize=15)
plt.xticks(fontsize=12, rotation=25)
plt.yticks(fontsize=12)
show_values_on_bars(ax1)
# Run the layout last so it accounts for the title and rotated tick labels.
plt.tight_layout()

##### A: Florida is the most visited state.

#### Q: When do visitors usually arrive in the U.S.?

##### Month

In [None]:
# Number of arrivals per month, most frequent first.
imm_df_dn['arrival_month'].value_counts()

In [None]:
# Bar chart of arrivals per month.
sns.countplot(x='arrival_month',data=imm_df_dn)

##### Weekday

In [None]:
# Number of arrivals per weekday, most frequent first.
# NOTE(review): the weekday column is an integer; which day each value stands
# for depends on how the `time` table was built — confirm before labelling days.
imm_df_dn['arrival_weekday'].value_counts()

In [None]:
# Bar chart of arrivals per weekday.
sns.countplot(x='arrival_weekday',data=imm_df_dn)

##### A: In general, ports saw the most arrivals on Saturday, followed by Friday. Wednesday and Sunday saw fewer arrivals.

#### Q: How long did they stay?

In [None]:
# Length of stay in whole days.
# Both columns are converted to datetime64 so the difference is a timedelta
# Series and .dt.days gives the day count directly. The previous approach
# parsed str(timedelta), which is "0:00:00" for a same-day departure and
# cannot be passed to int().
# assign() returns a new frame, so nothing is written into a filtered slice.
imm_df_dn = imm_df_dn.assign(
    stay_len=(
        pd.to_datetime(imm_df_dn['depart_date'])
        - pd.to_datetime(imm_df_dn['arrival_date'])
    ).dt.days
)

In [None]:
# Distribution of length of stay (50 bins).
# NOTE(review): distplot is deprecated in recent seaborn releases in favour of
# histplot/displot — confirm against the installed seaborn version.
plt.figure(figsize=(12,5))
sns.distplot(imm_df_dn['stay_len'],bins=50)

In [None]:
# Distinct visa values (the `visa` column comes from the visa lookup table).
imm_df_dn['visa'].unique()

In [None]:
# Number of rows per visa_code, most frequent first.
imm_df_dn['visa_code'].value_counts()

In [None]:
# Length of stay by visa value.
sns.boxplot(x='visa',y='stay_len',data=imm_df_dn,palette='coolwarm')

In [None]:
# Distinct gender values left after cleaning.
imm_df_dn['gender'].unique()

In [None]:
# Length of stay by visa value, split by gender.
sns.boxplot(x='visa',y='stay_len',data=imm_df_dn,hue='gender',palette='coolwarm')

In [None]:
# Length of stay by visa_code, split by gender.
sns.boxplot(x='visa_code',y='stay_len',data=imm_df_dn,palette='coolwarm',hue='gender')

In [None]:
# Length of stay by mode of arrival.
sns.boxplot(x='mode',y='stay_len',data=imm_df_dn,palette='coolwarm')

In [None]:
# Distribution of visitor age (80 bins).
# NOTE(review): distplot is deprecated in recent seaborn releases in favour of
# histplot/displot — confirm against the installed seaborn version.
sns.distplot(imm_df_dn['age'],bins=80)

In [None]:
#plt.scatter(data=imm_df_dn,x='age',y='stay_len')

In [None]:
#plt.hist(imm_df_dn[imm_df_dn['mode']=='Air']['stay_len'],bins=50,alpha=0.5,color='blue')
#plt.hist(imm_df_dn[imm_df_dn['mode']=='Land']['stay_len'],bins=50,alpha=0.5,color='red')
#plt.hist(imm_df_dn[imm_df_dn['mode']=='Sea']['stay_len'],bins=50,alpha=0.5,color='yellow')

In [None]:
# Number of rows per gender.
imm_df_dn['gender'].value_counts()

In [None]:
# Visitor counts per (citizen_country, gender). After count(), the `cicid`
# column holds the number of rows in each group.
# This uses imm_df_dn: the previously referenced `imm_df_dn1` is not defined
# anywhere in the visible notebook, so the cell raised NameError on a fresh kernel.
df = (
    imm_df_dn[['cicid', 'citizen_country', 'gender']]
    .groupby(['citizen_country', 'gender'])
    .count()
    .reset_index()
)
df.head()

In [None]:
# Ten most frequent citizenship countries. Uses imm_df_dn, because
# `imm_df_dn1` is not defined anywhere in the visible notebook.
top10 = imm_df_dn['citizen_country'].value_counts().head(10)

In [None]:
# Keep only the per-gender counts for the top-10 countries, with a fresh 0..n-1 index
top10_df = (
    df.loc[df['citizen_country'].isin(top10.index)]
    .reset_index(drop=True)
)

In [None]:
# Visitor counts by gender for the top-10 countries; `order` keeps the bars in
# the same order as top10 (most frequent first). `cicid` holds the group count.
plt.figure(figsize=(15,5))
sns.barplot(data=top10_df,x='citizen_country',y='cicid',order = list(top10.index),hue='gender')

In [None]:
# Joint distribution of age and length of stay, drawn as a hexbin plot.
sns.jointplot(x='age',y='stay_len',data=imm_df_dn,kind='hex')

In [None]:
# Boolean mask: True where occupation is present.
# NOTE(review): despite the `_df` suffix this is a boolean Series, not a
# filtered DataFrame, and it is not used in the visible cells — confirm whether
# imm_df_dn[imm_df_dn['occupation'].notna()] was intended.
ocpt_df=imm_df_dn['occupation'].notna()

In [None]:
# Number of rows that have an occupation recorded
int(imm_df_dn['occupation'].notna().sum())