# Create Dimension Tables

## I94ADDR

In [56]:
import pandas as pd
with open("I94_SAS_Labels_Descriptions.SAS") as f:
    content = f.readlines()
content = [x.strip().replace("'","") for x in content[981:1035]]
df_addr=pd.DataFrame()
for line in content:
    value = line.split("=")[0].strip()
    i94addrl = line.split("=")[-1].strip()
    df_addr=df_addr.append(
        {"state_code" : value, "state_name": i94addrl}, ignore_index=True
    )
df_addr.head()

Unnamed: 0,state_code,state_name
0,AL,ALABAMA
1,AK,ALASKA
2,AZ,ARIZONA
3,AR,ARKANSAS
4,CA,CALIFORNIA


In [57]:
states = list(set(df_addr['state_code'].values))

In [58]:
df_addr.to_csv("dimensions/us_states.csv", index=False)

## I94PORT

In [59]:
import pandas as pd

with open("I94_SAS_Labels_Descriptions.SAS") as f:
    content = f.readlines()
content = [x.strip().replace("'","") for x in content[302:962]]
df_port_locations=pd.DataFrame()
for line in content:
    port_code = line.split("=")[0].strip()
    port_city = line.split("=")[1].strip().split(",")[0].strip()
    port_state = line.split("=")[1].strip().split(",")[-1].strip()
    if port_state == port_city or port_state not in states:
        continue
    else:
        if " " in port_state:
            port_state = port_state.split(" ")[0]
    df_port_locations=df_port_locations.append(
        {"port_code" : port_code, "municipality": port_city, "state_code": port_state}, ignore_index=True
    )
df_port_locations.head()
municipality_port = list(set(df_port_locations['municipality'].values))

In [60]:
df_port_locations.head()

Unnamed: 0,municipality,port_code,state_code
0,ALCAN,ALC,AK
1,ANCHORAGE,ANC,AK
2,BAKER AAF - BAKER ISLAND,BAR,AK
3,DALTONS CACHE,DAC,AK
4,DEW STATION PT LAY DEW,PIZ,AK


In [61]:
df_port_locations.to_csv("dimensions/us_ports.csv", index=False)

## I94CIT & I94RES

In [11]:
with open("I94_SAS_Labels_Descriptions.SAS") as f:
    content = f.readlines()
content = [x.strip().replace("'","") for x in content[10:299]]
df_cit_res=pd.DataFrame()
for line in content:
    value = line.split("=")[0].strip()
    i94cntyl = line.split("=")[-1].strip()
    if "INVALID" in i94cntyl or "Not Reported" in i94cntyl or "Collapsed" in i94cntyl or value in i94cntyl:
        continue
    df_cit_res=df_cit_res.append(
        {"country_code" : value, "country_name": i94cntyl}, ignore_index=True
    )
df_cit_res.head()

Unnamed: 0,country_code,country_name
0,236,AFGHANISTAN
1,101,ALBANIA
2,316,ALGERIA
3,102,ANDORRA
4,324,ANGOLA


In [12]:
df_cit_res.to_csv("dimensions/countries.csv", index=False)

## Airports

In [62]:
import pandas as pd
df_ac = pd.read_csv('airport-codes_csv.csv')

df_ac=df_ac[df_ac['iso_country']=='US']
df_ac=df_ac.dropna(subset=['iata_code'])
new=df_ac["coordinates"].str.split(",", n = 1, expand = True)
df_ac["latitude"]= new[1]
df_ac["longitude"]= new[0]
new=df_ac["iso_region"].str.split("-", n = 1, expand = True)
df_ac["state_code"]= new[1]
df_ac = df_ac.drop(['coordinates', 'iso_country', 'continent', 'iso_region'], axis=1)
df_ac = df_ac.rename(columns={"ident": "id"})
df_ac.head()

Unnamed: 0,id,type,name,elevation_ft,municipality,gps_code,iata_code,local_code,latitude,longitude,state_code
440,07FA,small_airport,Ocean Reef Club Airport,8.0,Key Largo,07FA,OCA,07FA,25.325399398804,-80.274803161621,FL
594,0AK,small_airport,Pilot Station Airport,305.0,Pilot Station,,PQS,0AK,61.934601,-162.899994,AK
673,0CO2,small_airport,Crested Butte Airpark,8980.0,Crested Butte,0CO2,CSE,0CO2,38.851918,-106.928341,CO
1088,0TE7,small_airport,LBJ Ranch Airport,1515.0,Johnson City,0TE7,JCY,0TE7,30.251800537100003,-98.6224975586,TX
1402,13MA,small_airport,Metropolitan Airport,418.0,Palmer,13MA,PMX,13MA,42.2233009338,-72.31140136719999,MA


In [63]:
df_ac[df_ac['iata_code']=='FCA']

Unnamed: 0,id,type,name,elevation_ft,municipality,gps_code,iata_code,local_code,latitude,longitude,state_code
27362,KGPI,medium_airport,Glacier Park International Airport,2977.0,Kalispell,KGPI,FCA,GPI,48.31050109863281,-114.25599670410156,MT


In [20]:
df_ac.to_csv("dimensions/us_airport_codes.csv", index=False)

## Temperature Data 

In [3]:
df_temper = pd.read_csv('../../data2/GlobalLandTemperaturesByCity.csv')
df_temper=df_temper[df_temper['Country']=='United States']
df_temper = df_temper.dropna()
df_temper = df_temper.drop_duplicates(['dt', 'City', 'Country'],keep= 'first')
df_temper =df_temper.drop(columns=['AverageTemperatureUncertainty', 'Latitude', 'Longitude', 'Country'])

df_temper = df_temper.rename(columns=
                             {
                                 "AverageTemperature": "avg_temp", 
                                 "City": "city"
                             })

df_temper.head()

Unnamed: 0,dt,avg_temp,city
47555,1820-01-01,2.101,Abilene
47556,1820-02-01,6.926,Abilene
47557,1820-03-01,10.767,Abilene
47558,1820-04-01,17.989,Abilene
47559,1820-05-01,21.809,Abilene


In [4]:
df_temper.columns

Index(['dt', 'avg_temp', 'city'], dtype='object')

In [5]:
df_temper.shape

(639649, 3)

In [6]:
df_temper.to_csv("dimensions/us_temperature.csv", index=False)

## Demographics Data 

In [64]:
import pandas as pd
df_uscd = pd.read_csv('us-cities-demographics.csv', delimiter=';')
df_uscd['City'] = df_uscd['City'].apply(lambda x: x.upper())
df_uscd['State'] = df_uscd['State'].apply(lambda x: x.upper())
df_uscd.columns = [i.lower().replace(" ", "_").replace("-", "_") for i in df_uscd.columns]
df_uscd =df_uscd.drop(columns=['state'])
df_uscd.head()

Unnamed: 0,city,median_age,male_population,female_population,total_population,number_of_veterans,foreign_born,average_household_size,state_code,race,count
0,SILVER SPRING,33.8,40601.0,41862.0,82463,1562.0,30908.0,2.6,MD,Hispanic or Latino,25924
1,QUINCY,41.0,44129.0,49500.0,93629,4147.0,32935.0,2.39,MA,White,58723
2,HOOVER,38.5,38040.0,46799.0,84839,4819.0,8229.0,2.58,AL,Asian,4759
3,RANCHO CUCAMONGA,34.5,88127.0,87105.0,175232,5821.0,33878.0,3.18,CA,Black or African-American,24437
4,NEWARK,34.6,138040.0,143873.0,281913,5829.0,86253.0,2.73,NJ,White,76402


In [22]:
len(pd.unique(df_uscd['city'])) 

567

In [37]:
dem_citiy = set(df_uscd['city'].values)
por_cities = set(municipality_port)

In [39]:
print(len(dem_citiy))
print(len(por_cities))
print(len(por_cities.intersection(dem_citiy)))

567
485
115


In [49]:
df_uscd[df_uscd['city']=='COLUMBIA']

Unnamed: 0,city,median_age,male_population,female_population,total_population,number_of_veterans,foreign_born,average_household_size,state_code,race,count
62,COLUMBIA,37.9,52202.0,51265.0,103467,6526.0,23249.0,2.68,MD,Black or African-American,30075
760,COLUMBIA,37.9,52202.0,51265.0,103467,6526.0,23249.0,2.68,MD,Hispanic or Latino,8033
761,COLUMBIA,37.9,52202.0,51265.0,103467,6526.0,23249.0,2.68,MD,Asian,17821
874,COLUMBIA,26.8,56544.0,62554.0,119098,4548.0,10729.0,2.37,MO,Asian,8673
1384,COLUMBIA,26.8,56544.0,62554.0,119098,4548.0,10729.0,2.37,MO,Black or African-American,15489
1442,COLUMBIA,37.9,52202.0,51265.0,103467,6526.0,23249.0,2.68,MD,White,58343
1514,COLUMBIA,26.8,56544.0,62554.0,119098,4548.0,10729.0,2.37,MO,White,96067
1543,COLUMBIA,28.0,67686.0,65707.0,133393,5708.0,6074.0,2.32,SC,White,73232
1617,COLUMBIA,26.8,56544.0,62554.0,119098,4548.0,10729.0,2.37,MO,Hispanic or Latino,4956
1618,COLUMBIA,26.8,56544.0,62554.0,119098,4548.0,10729.0,2.37,MO,American Indian and Alaska Native,1713


In [67]:
df_dem_gen = df_uscd[['city','state_code','median_age', 'male_population', 'female_population', 'total_population', 'number_of_veterans', 'foreign_born', 'average_household_size']].drop_duplicates()

In [50]:
df_dem_gen.head()

Unnamed: 0,city,state_code,median_age,male_population,female_population,total_population,number_of_veterans,average_household_size
0,SILVER SPRING,MD,33.8,40601.0,41862.0,82463,1562.0,2.6
1,QUINCY,MA,41.0,44129.0,49500.0,93629,4147.0,2.39
2,HOOVER,AL,38.5,38040.0,46799.0,84839,4819.0,2.58
3,RANCHO CUCAMONGA,CA,34.5,88127.0,87105.0,175232,5821.0,3.18
4,NEWARK,NJ,34.6,138040.0,143873.0,281913,5829.0,2.73


In [68]:
df_dem_gen.to_csv("dimensions/us-cities-demographics_general.csv", index=False)

In [53]:
df_dem_race = df_uscd[['city','state_code','race', 'count']].drop_duplicates()
df_dem_race.head()

Unnamed: 0,city,state_code,race,count
0,SILVER SPRING,MD,Hispanic or Latino,25924
1,QUINCY,MA,White,58723
2,HOOVER,AL,Asian,4759
3,RANCHO CUCAMONGA,CA,Black or African-American,24437
4,NEWARK,NJ,White,76402


In [54]:
df_dem_race.to_csv("dimensions/us-cities-demographics_race.csv", index=False)

In [43]:
len(pd.unique(df_dem_gen['city'])) 

567

In [44]:
len(df_dem_gen['city']) 

596