# Capstone dataset exploration

In [1]:
import pandas as pd
import json

## Step 1 and 2, gathering and exploring data

Focus: Travel to the US by country of residence and month

Immigration data

In [2]:
months = {'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6, 
          'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12}
month = 'jun'

In [3]:
# read in data, use chunked months

fname = '../../data/18-83510-I94-Data-2016/i94_jun16_sub.sas7bdat'

immigration_data = []
immigration = pd.read_sas(fname, 'sas7bdat', encoding="ISO-8859-1", chunksize=10000)

for chunk in immigration:
    immigration_data.append(chunk)
immigration_df = pd.concat(immigration_data)

In [4]:
immigration_df.columns

Index(['cicid', 'i94yr', 'i94mon', 'i94cit', 'i94res', 'i94port', 'arrdate',
       'i94mode', 'i94addr', 'depdate', 'i94bir', 'i94visa', 'count',
       'validres', 'delete_days', 'delete_mexl', 'delete_dup', 'delete_visa',
       'delete_recdup', 'dtadfile', 'visapost', 'occup', 'entdepa', 'entdepd',
       'entdepu', 'matflag', 'biryear', 'dtaddto', 'gender', 'insnum',
       'airline', 'admnum', 'fltno', 'visatype'],
      dtype='object')

In [5]:
len(immigration_df.duplicated())

3574989

In [6]:
# filter to i94visa = 2 (recreational travel)

rec_travel = immigration_df[immigration_df['i94visa']==2]

In [7]:
# group by i94res (country of residence), i94port (inbound airport) -- uses cicid to count individuals

rec_travel_grouped = rec_travel[['i94res','i94port','cicid']].groupby(['i94res', 'i94port']).count().reset_index().rename({'cicid':'count'},axis=1)

In [8]:
# without recreational filter -- year and month will be added back later
immigration_grouped = immigration_df[['i94res','i94port', 'i94visa','cicid']].groupby(['i94res', 'i94port', 'i94visa']).count().reset_index().rename({'cicid':'count'},axis=1)

In [9]:
print(len(immigration_grouped))
print(immigration_grouped.head())

18436
   i94res i94port  i94visa  count
0   101.0     ATL      1.0      7
1   101.0     ATL      2.0     18
2   101.0     ATL      3.0      1
3   101.0     BLA      2.0      2
4   101.0     BOS      1.0      6


In [10]:
print(len(rec_travel_grouped))
print(len(immigration_grouped))

9912
18436


In [11]:
#cast to int for merging
rec_travel_grouped['i94res'] = rec_travel_grouped['i94res'].astype(int)
rec_travel_grouped = rec_travel_grouped.rename({'cicid':'count'}, axis=1)
rec_travel_grouped.columns

Index(['i94res', 'i94port', 'count'], dtype='object')

In [12]:
rec_travel_grouped['month'] = months[month]

Airport codes

In [11]:
codes = pd.read_csv('airport-codes_csv.csv')

In [98]:
codes.head()

Unnamed: 0,ident,type,name,elevation_ft,continent,iso_country,iso_region,municipality,gps_code,iata_code,local_code,coordinates
0,00A,heliport,Total Rf Heliport,11.0,,US,US-PA,Bensalem,00A,,00A,"-74.93360137939453, 40.07080078125"
1,00AA,small_airport,Aero B Ranch Airport,3435.0,,US,US-KS,Leoti,00AA,,00AA,"-101.473911, 38.704022"
2,00AK,small_airport,Lowell Field,450.0,,US,US-AK,Anchor Point,00AK,,00AK,"-151.695999146, 59.94919968"
3,00AL,small_airport,Epps Airpark,820.0,,US,US-AL,Harvest,00AL,,00AL,"-86.77030181884766, 34.86479949951172"
4,00AR,closed,Newport Hospital & Clinic Heliport,237.0,,US,US-AR,Newport,,,,"-91.254898, 35.6087"


In [12]:
# slice to columns of interest, to establish where each airport is

codes = codes[['ident', 'municipality', 'iso_country', 'iso_region']]

In [24]:
# merge to grouped travel data
rec_travel_port = rec_travel_grouped.merge(codes, left_on='i94port', right_on='ident', how='inner').drop(['iso_country', 'i94port'],axis=1)

In [25]:
rec_travel_port.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 530 entries, 0 to 529
Data columns (total 6 columns):
i94res          530 non-null int64
count           530 non-null int64
ident           530 non-null object
municipality    467 non-null object
iso_country     530 non-null object
iso_region      530 non-null object
dtypes: int64(2), object(4)
memory usage: 29.0+ KB


In [26]:
rec_travel_port.sample(20)

Unnamed: 0,i94res,count,ident,municipality,iso_country,iso_region
411,213,4,OTT,CotriguaÃ§u,BR,BR-MT
45,163,27,SFR,Los Angeles,US,US-CA
508,268,1,VIB,Ciudad ConstituciÃ³n,MX,MX-BCS
357,584,1,DER,Derim,PG,PG-MPL
346,387,1,DER,Derim,PG,PG-MPL
401,502,1,HTM,Hatgal,MN,MN-041
183,745,114,SFR,Los Angeles,US,US-CA
275,343,3,AUS,,US,US-TX
397,368,2,HTM,Hatgal,MN,MN-041
381,158,4,HTM,Hatgal,MN,MN-041


In [22]:
visitors_by_country = rec_travel_port[['i94res', 'iso_country', 'municipality', 'count']].groupby(['i94res', 'municipality']).sum().sort_values('count', ascending=False).reset_index()

In [23]:
visitors_by_country.head()

Unnamed: 0,i94res,municipality,count
0,245,Los Angeles,25802
1,213,Los Angeles,13751
2,135,Los Angeles,12518
3,438,Los Angeles,10290
4,112,Los Angeles,9288


In [131]:
len(visitors_by_country)

277

In [15]:
# country names read in for easier reading by humans -- codes taken from I94_SAS_Labels_Descriptions.SAS
countries_df = pd.read_json('countries.json')
countries_df['code'] = countries_df['code'].astype(int)

In [27]:
# merge for country names
visitors_by_country = visitors_by_country.merge(countries_df, left_on='i94res', right_on='code', how='left').drop('code', axis=1)

In [28]:
visitors_by_country.head(10)

Unnamed: 0,i94res,municipality,count,country
0,245,Los Angeles,25802,"CHINA, PRC"
1,213,Los Angeles,13751,INDIA
2,135,Los Angeles,12518,UNITED KINGDOM
3,438,Los Angeles,10290,AUSTRALIA
4,112,Los Angeles,9288,GERMANY
5,276,Los Angeles,9012,SOUTH KOREA
6,268,Los Angeles,7799,TAIWAN
7,111,Los Angeles,7406,FRANCE
8,209,Los Angeles,7390,JAPAN
9,582,Los Angeles,4904,"MEXICO Air Sea, and Not Reported (I-94, no lan..."


In [30]:
# strip country name of whitespace if exists (Argentina)
'ARGENTINA ' in visitors_by_country['country'].values

False

In [28]:
len(visitors_by_country)

562

In [29]:
visitors_by_country[['country','count']].groupby('country').sum().sort_values('count', ascending=False)

Unnamed: 0_level_0,count
country,Unnamed: 1_level_1
"CHINA, PRC",13312
UNITED KINGDOM,9817
FRANCE,8459
JAPAN,7526
SOUTH KOREA,7455
GERMANY,6874
INDIA,6662
AUSTRALIA,5678
"MEXICO Air Sea, and Not Reported (I-94, no land arrivals)",3960
TAIWAN,2835


Temperature data

In [6]:
# read in temperature -- measurements listed in C
temp = pd.read_csv('../../data2/GlobalLandTemperaturesByCity.csv', usecols=['dt', 'AverageTemperature', 'Country'])

In [15]:
temp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8599212 entries, 0 to 8599211
Data columns (total 5 columns):
dt                    datetime64[ns]
AverageTemperature    float64
Country               object
month                 int64
year                  int64
dtypes: datetime64[ns](1), float64(1), int64(2), object(1)
memory usage: 328.0+ MB


In [8]:
len(temp)

8599212

In [14]:
# convert timestamp from type object to datetime for easy year/month extraction
temp['dt'] = pd.to_datetime(temp['dt'])
temp['month'] = temp['dt'].dt.month
temp['year'] = temp['dt'].dt.year

In [81]:
# filter to most recent complete year available (2012)
temp_2012 = temp[temp['year']==2012]

In [82]:
len(temp_2012)

42120

In [83]:
set(temp_2012['month'])

{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}

In [84]:
# find avg temp per country per month
temp_avg = temp_2012[['AverageTemperature', 'Country', 'month']].groupby(['Country', 'month']).mean().reset_index()

In [85]:
print(temp_avg.head())
print(len(temp_avg))
print(set(temp_avg['month']))

       Country  month  AverageTemperature
0  Afghanistan      1            -0.27400
1  Afghanistan      2             0.17425
2  Afghanistan      3             8.31550
3  Afghanistan      4            15.58475
4  Afghanistan      5            20.26750
1908
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}


In [86]:
# calculate F for Americans
temp_avg['AverageTemperatureF'] = (temp_avg['AverageTemperature'] * 9/5) + 32

In [87]:
temp_avg.head()

Unnamed: 0,Country,month,AverageTemperature,AverageTemperatureF
0,Afghanistan,1,-0.274,31.5068
1,Afghanistan,2,0.17425,32.31365
2,Afghanistan,3,8.3155,46.9679
3,Afghanistan,4,15.58475,60.05255
4,Afghanistan,5,20.2675,68.4815


In [88]:
# get country code for table insertion & filtering
temp_avg['Country'] = temp_avg['Country'].str.upper()

# fix some country names to match countries json
temp_avg['Country'] = temp_avg['Country'].replace('BOSNIA AND HERZEGOVINA','BOSNIA-HERZEGOVINA')
temp_avg['Country'] = temp_avg['Country'].replace('CHINA', 'CHINA, PRC')
temp_avg['Country'] = temp_avg['Country'].replace("CÔTE D'IVOIRE", 'IVORY COAST')
temp_avg['Country'] = temp_avg['Country'].replace('GUINEA BISSAU', 'GUINEA-BISSAU')
temp_avg['Country'] = temp_avg['Country'].replace('CONGO (DEMOCRATIC REPUBLIC OF THE)', 'CONGO')

countries_df['country'] = countries_df['country'].replace('MEXICO Air Sea, and Not Reported (I-94, no land arrivals)', 'MEXICO')
countries_df['country'] = countries_df['country'].str.replace('INVALID: ', '')

temp_avg_ccode = pd.merge(temp_avg, countries_df, left_on='Country', right_on='country')

# drop extra US code (407 wanted)
temp_avg_ccode = temp_avg_ccode[temp_avg_ccode['code']!=583]

In [89]:
print(len(temp_avg))
print(len(countries_df))
print(len(temp_avg_ccode))

print([e for e in set(temp_avg['Country'].values) if e not in set(temp_avg_ccode['Country'].values)])

# Cambodia does not have a known code and visitor data is not being well tracked. Requires further investigation.

1908
289
1896
['CAMBODIA']


In [90]:
temp_avg[temp_avg['Country']=="CAMBODIA"]

Unnamed: 0,Country,month,AverageTemperature,AverageTemperatureF
276,CAMBODIA,1,26.571,79.8278
277,CAMBODIA,2,28.15,82.67
278,CAMBODIA,3,29.317,84.7706
279,CAMBODIA,4,29.14,84.452
280,CAMBODIA,5,28.514,83.3252
281,CAMBODIA,6,28.372,83.0696
282,CAMBODIA,7,27.814,82.0652
283,CAMBODIA,8,28.016,82.4288
284,CAMBODIA,9,27.409,81.3362
285,CAMBODIA,10,27.535,81.563


In [74]:
set(temp_avg['month'])

{1, 2, 3, 4, 5, 6, 7, 8, 9}