# Data Quality notebook

The purpose of this notebook is to perform the Data Quality evaluation in the datasets

In [2]:
import pandas as pd

# I 94 Aux Database

In [2]:
i94auxdb_file = open('Data/I94_SAS_Labels_Descriptions.SAS')
filecontent = i94auxdb_file.readlines()

### Build function to ingest country codes and port codes from the Labels Descriptions file


In [3]:
country_codes = filecontent[9:298]

country_codes_dic = {}
for rec in country_codes:
    country_codes_key = int(rec.split(' = ')[0].strip())
    country_codes_val = rec.split(' = ')[1].replace("'", "").strip("\n").strip(";").strip()
    country_codes_dic[country_codes_key] = country_codes_val

In [4]:
port_codes = filecontent[302:962]

port_codes_dic = {}
for rec in port_codes:
    port_codes_key = rec.split('\t=\t')[0].strip().replace("'", "")
    port_codes_val = rec.split('\t=\t')[1].replace("'", "").strip()
    port_codes_dic[port_codes_key] = port_codes_val

In [5]:
port_codes_dic

{'ALC': 'ALCAN, AK',
 'ANC': 'ANCHORAGE, AK',
 'BAR': 'BAKER AAF - BAKER ISLAND, AK',
 'DAC': 'DALTONS CACHE, AK',
 'PIZ': 'DEW STATION PT LAY DEW, AK',
 'DTH': 'DUTCH HARBOR, AK',
 'EGL': 'EAGLE, AK',
 'FRB': 'FAIRBANKS, AK',
 'HOM': 'HOMER, AK',
 'HYD': 'HYDER, AK',
 'JUN': 'JUNEAU, AK',
 '5KE': 'KETCHIKAN, AK',
 'KET': 'KETCHIKAN, AK',
 'MOS': 'MOSES POINT INTERMEDIATE, AK',
 'NIK': 'NIKISKI, AK',
 'NOM': 'NOM, AK',
 'PKC': 'POKER CREEK, AK',
 'ORI': 'PORT LIONS SPB, AK',
 'SKA': 'SKAGWAY, AK',
 'SNP': 'ST. PAUL ISLAND, AK',
 'TKI': 'TOKEEN, AK',
 'WRA': 'WRANGELL, AK',
 'HSV': 'MADISON COUNTY - HUNTSVILLE, AL',
 'MOB': 'MOBILE, AL',
 'LIA': 'LITTLE ROCK, AR (BPS)',
 'ROG': 'ROGERS ARPT, AR',
 'DOU': 'DOUGLAS, AZ',
 'LUK': 'LUKEVILLE, AZ',
 'MAP': 'MARIPOSA AZ',
 'NAC': 'NACO, AZ',
 'NOG': 'NOGALES, AZ',
 'PHO': 'PHOENIX, AZ',
 'POR': 'PORTAL, AZ',
 'SLU': 'SAN LUIS, AZ',
 'SAS': 'SASABE, AZ',
 'TUC': 'TUCSON, AZ',
 'YUI': 'YUMA, AZ',
 'AND': 'ANDRADE, CA',
 'BUR': 'BURBANK, CA',
 '

In [6]:
# function to return key for any value
def get_key(val):
    for key, value in port_codes_dic.items():
         if val == value:
             return key
 
    return "key doesn't exist"


get_key('ANCHORAGE, AK')

'ANC'

In [7]:
port_codes_dic['NYC']

'NEW YORK, NY'

# I94 Database

### Import dataset

In [8]:
i94_fname = 'Data/18-83510-I94-Data-2016/i94_apr16_sub.sas7bdat'
i94_df = pd.read_sas(i94_fname, 'sas7bdat', encoding="ISO-8859-1")

In [9]:
#i94_df.head().to_clipboard()

### Renaming the columns for better understanding

In [10]:
df_sasdatadict = pd.read_excel('Data/DataDict_SASfile.xlsx', sheet_name=0) 
newSAScolumns = df_sasdatadict.loc[0, :].values.tolist()

i94_df.columns = newSAScolumns

In [11]:
i94_df.head()

Unnamed: 0,ID,YEAR,MONTH,COUNTRYCODEPROCESSING_CIT,COUNTRYCODEPROCESSING_RES,PORTCODE,ARRIVALDATE,MODAL,STATE,DEPARTUREDATE,...,UPDATEFLAG,MATCHARRDEP,YEARBIRTH,ADDMITTEDUNTIL,GENDER,INS_NUM,AIRLINECODE,ADMINNUMBER,FLIGHTNUMBER,VISATYPE
0,6.0,2016.0,4.0,692.0,692.0,XXX,20573.0,,,,...,U,,1979.0,10282016,,,,1897628000.0,,B2
1,7.0,2016.0,4.0,254.0,276.0,ATL,20551.0,1.0,AL,,...,Y,,1991.0,D/S,M,,,3736796000.0,296.0,F1
2,15.0,2016.0,4.0,101.0,101.0,WAS,20545.0,1.0,MI,20691.0,...,,M,1961.0,09302016,M,,OS,666643200.0,93.0,B2
3,16.0,2016.0,4.0,101.0,101.0,NYC,20545.0,1.0,MA,20567.0,...,,M,1988.0,09302016,,,AA,92468460000.0,199.0,B2
4,17.0,2016.0,4.0,101.0,101.0,NYC,20545.0,1.0,MA,20567.0,...,,M,2012.0,09302016,,,AA,92468460000.0,199.0,B2


In [12]:
i94_df.columns

Index(['ID', 'YEAR', 'MONTH', 'COUNTRYCODEPROCESSING_CIT',
       'COUNTRYCODEPROCESSING_RES', 'PORTCODE', 'ARRIVALDATE', 'MODAL',
       'STATE', 'DEPARTUREDATE', 'RESPONDENTAGE', 'VISACODE', 'SUMSTATS',
       'DATE', 'WHEREVISAISSUED', 'OCCUPATIONWILLBEPERFORMEDINUS',
       'ARRIVALFLAG', 'DEPARTUREFLAG', 'UPDATEFLAG', 'MATCHARRDEP',
       'YEARBIRTH', 'ADDMITTEDUNTIL', 'GENDER', 'INS_NUM', 'AIRLINECODE',
       'ADMINNUMBER', 'FLIGHTNUMBER', 'VISATYPE'],
      dtype='object')

### Check total counts for Arrival Flag per item

*Decided not to use this field for lack of relevance

In [103]:
i94_df['ARRIVALFLAG'].value_counts()

G    2399582
O     413057
A     108560
Z      64864
T      61144
K      17076
P      14397
H      14341
U       2371
B        401
N        252
M         27
F          3
Name: ARRIVALFLAG, dtype: int64

In [104]:
i94_df['DEPARTUREFLAG'].value_counts()

O    2513632
I      99846
D      96518
N      76192
K      70624
Q      52729
R      41879
W       3887
J       1758
V        762
L         44
M         13
Name: DEPARTUREFLAG, dtype: int64

### General checks (counts and statistics)

In [63]:
i94_df.count()

ID                               3096313
YEAR                             3096313
MONTH                            3096313
COUNTRYCODEPROCESSING_CIT        3096313
COUNTRYCODEPROCESSING_RES        3096313
PORTCODE                         3096313
ARRIVALDATE                      3096313
MODAL                            3096074
STATE                            2943721
DEPARTUREDATE                    2953856
RESPONDENTAGE                    3095511
VISACODE                         3096313
SUMSTATS                         3096313
DATE                             3096312
WHEREVISAISSUED                  1215063
OCCUPATIONWILLBEPERFORMEDINUS       8126
ARRIVALFLAG                      3096075
DEPARTUREFLAG                    2957884
UPDATEFLAG                           392
MATCHARRDEP                      2957884
YEARBIRTH                        3095511
ADDMITTEDUNTIL                   3095836
GENDER                           2682044
INS_NUM                           113708
AIRLINECODE     

In [64]:
i94_df.describe()

Unnamed: 0,ID,YEAR,MONTH,COUNTRYCODEPROCESSING_CIT,COUNTRYCODEPROCESSING_RES,ARRIVALDATE,MODAL,DEPARTUREDATE,RESPONDENTAGE,VISACODE,SUMSTATS,YEARBIRTH,ADMINNUMBER
count,3096313.0,3096313.0,3096313.0,3096313.0,3096313.0,3096313.0,3096074.0,2953856.0,3095511.0,3096313.0,3096313.0,3095511.0,3096313.0
mean,3078652.0,2016.0,4.0,304.9069,303.2838,20559.85,1.07369,20573.95,41.76761,1.845393,1.0,1974.232,70828850000.0
std,1763278.0,0.0,0.0,210.0269,208.5832,8.777339,0.5158963,29.35697,17.42026,0.398391,0.0,17.42026,22154420000.0
min,6.0,2016.0,4.0,101.0,101.0,20545.0,1.0,15176.0,-3.0,1.0,1.0,1902.0,0.0
25%,1577790.0,2016.0,4.0,135.0,131.0,20552.0,1.0,20561.0,30.0,2.0,1.0,1962.0,56035230000.0
50%,3103507.0,2016.0,4.0,213.0,213.0,20560.0,1.0,20570.0,41.0,2.0,1.0,1975.0,59360940000.0
75%,4654341.0,2016.0,4.0,512.0,504.0,20567.0,1.0,20579.0,54.0,2.0,1.0,1986.0,93509870000.0
max,6102785.0,2016.0,4.0,999.0,760.0,20574.0,9.0,45427.0,114.0,3.0,1.0,2019.0,99915570000.0


In [65]:
i94_df[i94_df.duplicated(keep=False)]

Unnamed: 0,ID,YEAR,MONTH,COUNTRYCODEPROCESSING_CIT,COUNTRYCODEPROCESSING_RES,PORTCODE,ARRIVALDATE,MODAL,STATE,DEPARTUREDATE,...,UPDATEFLAG,MATCHARRDEP,YEARBIRTH,ADDMITTEDUNTIL,GENDER,INS_NUM,AIRLINECODE,ADMINNUMBER,FLIGHTNUMBER,VISATYPE


### Check automatic data types

In [66]:
i94_df.dtypes

ID                               float64
YEAR                             float64
MONTH                            float64
COUNTRYCODEPROCESSING_CIT        float64
COUNTRYCODEPROCESSING_RES        float64
PORTCODE                          object
ARRIVALDATE                      float64
MODAL                            float64
STATE                             object
DEPARTUREDATE                    float64
RESPONDENTAGE                    float64
VISACODE                         float64
SUMSTATS                         float64
DATE                              object
WHEREVISAISSUED                   object
OCCUPATIONWILLBEPERFORMEDINUS     object
ARRIVALFLAG                       object
DEPARTUREFLAG                     object
UPDATEFLAG                        object
MATCHARRDEP                       object
YEARBIRTH                        float64
ADDMITTEDUNTIL                    object
GENDER                            object
INS_NUM                           object
AIRLINECODE     

In [98]:
i94_df['ARRIVALDATE'].unique()

array([20573., 20551., 20545., 20546., 20547., 20548., 20549., 20550.,
       20552., 20553., 20554., 20555., 20556., 20557., 20558., 20559.,
       20560., 20561., 20562., 20563., 20564., 20565., 20566., 20567.,
       20568., 20569., 20570., 20571., 20572., 20574.])

# Airport Codes


In [5]:
airport_df = pd.read_csv('Data/airport-codes.csv')

In [6]:
airport_df

Unnamed: 0,ident,type,name,elevation_ft,continent,iso_country,iso_region,municipality,gps_code,iata_code,local_code,coordinates
0,00A,heliport,Total Rf Heliport,11.0,,US,US-PA,Bensalem,00A,,00A,"-74.93360137939453, 40.07080078125"
1,00AA,small_airport,Aero B Ranch Airport,3435.0,,US,US-KS,Leoti,00AA,,00AA,"-101.473911, 38.704022"
2,00AK,small_airport,Lowell Field,450.0,,US,US-AK,Anchor Point,00AK,,00AK,"-151.695999146, 59.94919968"
3,00AL,small_airport,Epps Airpark,820.0,,US,US-AL,Harvest,00AL,,00AL,"-86.77030181884766, 34.86479949951172"
4,00AR,closed,Newport Hospital & Clinic Heliport,237.0,,US,US-AR,Newport,,,,"-91.254898, 35.6087"
...,...,...,...,...,...,...,...,...,...,...,...,...
57416,ZYYK,medium_airport,Yingkou Lanqi Airport,0.0,AS,CN,CN-21,Yingkou,ZYYK,YKH,,"122.3586, 40.542524"
57417,ZYYY,medium_airport,Shenyang Dongta Airport,,AS,CN,CN-21,Shenyang,ZYYY,,,"123.49600219726562, 41.784400939941406"
57418,ZZ-0001,heliport,Sealand Helipad,40.0,EU,GB,GB-ENG,Sealand,,,,"1.4825, 51.894444"
57419,ZZ-0002,small_airport,Glorioso Islands Airstrip,11.0,AF,TF,TF-U-A,Grande Glorieuse,,,,"47.296388888900005, -11.584277777799999"


### Confirmation that IATA code was not covering the cases of NYC and Newark

In [19]:
airport_df[airport_df['iata_code']=='NEW']

Unnamed: 0,ident,type,name,elevation_ft,continent,iso_country,iso_region,municipality,gps_code,iata_code,local_code,coordinates
29610,KNEW,medium_airport,Lakefront Airport,8.0,,US,US-LA,New Orleans,KNEW,NEW,NEW,"-90.028297424316, 30.042400360107"


### General checks (counts and statistics)

In [70]:
airport_df.count()

ident           57421
type            57421
name            57421
elevation_ft    49608
continent       28978
iso_country     57175
iso_region      57421
municipality    51527
gps_code        41561
iata_code        9225
local_code      30030
coordinates     57421
dtype: int64

In [71]:
airport_df.describe()

Unnamed: 0,elevation_ft
count,49608.0
mean,1253.371775
std,1615.482592
min,-1266.0
25%,208.0
50%,722.0
75%,1519.0
max,29977.0


In [72]:
airport_df[airport_df.duplicated(keep=False)]

Unnamed: 0,ident,type,name,elevation_ft,continent,iso_country,iso_region,municipality,gps_code,iata_code,local_code,coordinates


In [73]:
airport_df.dtypes

ident            object
type             object
name             object
elevation_ft    float64
continent        object
iso_country      object
iso_region       object
municipality     object
gps_code         object
iata_code        object
local_code       object
coordinates      object
dtype: object

# Cities Temperature Database

In [74]:
citiestemp_df = pd.read_csv('Data/GlobalLandTemperaturesByCity.csv')

In [75]:
citiestemp_df.head()

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude
0,1743-11-01,6.068,1.737,Århus,Denmark,57.05N,10.33E
1,1743-12-01,,,Århus,Denmark,57.05N,10.33E
2,1744-01-01,,,Århus,Denmark,57.05N,10.33E
3,1744-02-01,,,Århus,Denmark,57.05N,10.33E
4,1744-03-01,,,Århus,Denmark,57.05N,10.33E


### General data quality checks

In [79]:
citiestemp_df['dt'].max()

'2013-09-01'

In [93]:
Selected_city = 'Denver'

print(citiestemp_df[citiestemp_df['City'] == Selected_city]['AverageTemperature'].min(), \
    citiestemp_df[citiestemp_df['City'] == Selected_city]['AverageTemperature'].max(), \
        citiestemp_df[citiestemp_df['City'] == Selected_city]['AverageTemperature'].mean())

-10.466 24.511 8.777836262323188


In [76]:
citiestemp_df.count()

dt                               8599212
AverageTemperature               8235082
AverageTemperatureUncertainty    8235082
City                             8599212
Country                          8599212
Latitude                         8599212
Longitude                        8599212
dtype: int64

In [77]:
citiestemp_df.describe()

Unnamed: 0,AverageTemperature,AverageTemperatureUncertainty
count,8235082.0,8235082.0
mean,16.72743,1.028575
std,10.35344,1.129733
min,-42.704,0.034
25%,10.299,0.337
50%,18.831,0.591
75%,25.21,1.349
max,39.651,15.396


In [7]:
citiestemp_df[citiestemp_df.duplicated(keep=False)]

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude


In [11]:
citiestemp_df.dtypes

dt                                object
AverageTemperature               float64
AverageTemperatureUncertainty    float64
City                              object
Country                           object
Latitude                          object
Longitude                         object
dtype: object

# Cities Demographics

In [3]:
citiesdemo_df = pd.read_json('Data/us-cities-demographics.json')

In [4]:
citiesdemo_df

Unnamed: 0,datasetid,recordid,fields,record_timestamp
0,us-cities-demographics,85458783ecf5da6572ee00e7120f68eff4fd0d61,"{'count': 76402, 'city': 'Newark', 'number_of_...",1969-12-31T17:00:00-07:00
1,us-cities-demographics,a5ad84bdb4d72688fb6ae19a8bee43bcb01f9fea,"{'count': 1343, 'city': 'Peoria', 'number_of_v...",1969-12-31T17:00:00-07:00
2,us-cities-demographics,c54cd5021a16eb5f7b83987742bd495229b2155e,"{'count': 2583, 'city': 'O'Fallon', 'number_of...",1969-12-31T17:00:00-07:00
3,us-cities-demographics,914487cd3d80f15f9eacf24bf26fe28d518b670f,"{'count': 70303, 'city': 'Hampton', 'number_of...",1969-12-31T17:00:00-07:00
4,us-cities-demographics,a324ac81d2bed29a4b90d74c839e1698e9d06328,"{'count': 33630, 'city': 'Lakewood', 'number_o...",1969-12-31T17:00:00-07:00
...,...,...,...,...
2886,us-cities-demographics,e78367fdc5d0fa80f508e58ae8497e36bd613a11,"{'count': 14735, 'city': 'Boca Raton', 'number...",1969-12-31T17:00:00-07:00
2887,us-cities-demographics,06463e5a978a54d2f5557941eb2fb6131926ea1f,"{'count': 28148, 'city': 'Fort Lauderdale', 'n...",1969-12-31T17:00:00-07:00
2888,us-cities-demographics,ec62c23a4709c2de7851c2f1f306d9ab4e60ae33,"{'count': 165423, 'city': 'Oxnard', 'number_of...",1969-12-31T17:00:00-07:00
2889,us-cities-demographics,846d96ba3e95caf3446f6d7cb5eda32f5bba435c,"{'count': 62927, 'city': 'Somerville', 'number...",1969-12-31T17:00:00-07:00


### Exploding the JSON values into new dataframe

In [27]:
citiesdemo_fields_df = pd.DataFrame(citiesdemo_df.fields.values.tolist())

In [28]:
citiesdemo_fields_df

Unnamed: 0,count,city,number_of_veterans,male_population,foreign_born,average_household_size,median_age,state,race,total_population,state_code,female_population
0,76402,Newark,5829.0,138040.0,86253.0,2.73,34.6,New Jersey,White,281913,NJ,143873.0
1,1343,Peoria,6634.0,56229.0,7517.0,2.40,33.1,Illinois,American Indian and Alaska Native,118661,IL,62432.0
2,2583,O'Fallon,5783.0,41762.0,3269.0,2.77,36.0,Missouri,Hispanic or Latino,85032,MO,43270.0
3,70303,Hampton,19638.0,66214.0,6204.0,2.48,35.5,Virginia,Black or African-American,136454,VA,70240.0
4,33630,Lakewood,9988.0,76013.0,14169.0,2.29,37.7,Colorado,Hispanic or Latino,152589,CO,76576.0
...,...,...,...,...,...,...,...,...,...,...,...,...
2886,14735,Boca Raton,4367.0,44760.0,21117.0,2.22,47.3,Florida,Hispanic or Latino,93226,FL,48466.0
2887,28148,Fort Lauderdale,8897.0,93948.0,47582.0,2.38,42.8,Florida,Hispanic or Latino,178587,FL,84639.0
2888,165423,Oxnard,6367.0,101906.0,78678.0,4.08,31.0,California,White,207252,CA,105346.0
2889,62927,Somerville,2103.0,41028.0,22292.0,2.43,31.0,Massachusetts,White,80334,MA,39306.0


In [29]:
citiesdemo_df = citiesdemo_df.join(citiesdemo_fields_df)

In [36]:
citiesdemo_df.sort_values(by='record_timestamp', ascending=False).head()


Unnamed: 0,datasetid,recordid,fields,record_timestamp,count,city,number_of_veterans,male_population,foreign_born,average_household_size,median_age,state,race,total_population,state_code,female_population
0,us-cities-demographics,85458783ecf5da6572ee00e7120f68eff4fd0d61,"{'count': 76402, 'city': 'Newark', 'number_of_...",1969-12-31T17:00:00-07:00,76402,Newark,5829.0,138040.0,86253.0,2.73,34.6,New Jersey,White,281913,NJ,143873.0
1920,us-cities-demographics,8a0fad2a44502979aa00499a22e8bc1046d431fc,"{'count': 1186, 'city': 'Lauderhill', 'number_...",1969-12-31T17:00:00-07:00,1186,Lauderhill,1965.0,32813.0,25471.0,3.02,35.7,Florida,American Indian and Alaska Native,71574,FL,38761.0
1922,us-cities-demographics,2d682a506cc1ebf752a956b2274a583a1c28a2f2,"{'count': 42331, 'city': 'Tuscaloosa', 'number...",1969-12-31T17:00:00-07:00,42331,Tuscaloosa,3647.0,47293.0,4706.0,2.67,29.1,Alabama,Black or African-American,98338,AL,51045.0
1923,us-cities-demographics,2f4cbc5d4f18707354566ca56ff090b7d2d43a8c,"{'count': 9393, 'city': 'Perris', 'number_of_v...",1969-12-31T17:00:00-07:00,9393,Perris,1321.0,41623.0,23277.0,4.78,26.9,California,Black or African-American,74959,CA,33336.0
1924,us-cities-demographics,5b24ce5bdf3ec70d42e59c69180b22ce48482432,"{'count': 10379, 'city': 'Midland', 'number_of...",1969-12-31T17:00:00-07:00,10379,Midland,5753.0,66595.0,17061.0,2.78,32.1,Texas,Black or African-American,132950,TX,66355.0


### Confirming that the record_timestamp field has a single value

In [37]:
citiesdemo_df['record_timestamp'].unique()

array(['1969-12-31T17:00:00-07:00'], dtype=object)