# Explore Data

## Imports and installs

In [1]:
import pandas as pd
import numpy as np
import pyspark
import os
import configparser

## Exploring Immigration Data

#### Read in the Immigration Data & View 1st 5 rows

In [2]:
# Read in the Immigration Data
df_immigration = pd.read_csv("immigration_data_sample.csv")
df_immigration.head(5)

Unnamed: 0.1,Unnamed: 0,cicid,i94yr,i94mon,i94cit,i94res,i94port,arrdate,i94mode,i94addr,...,entdepu,matflag,biryear,dtaddto,gender,insnum,airline,admnum,fltno,visatype
0,2027561,4084316.0,2016.0,4.0,209.0,209.0,HHW,20566.0,1.0,HI,...,,M,1955.0,7202016,F,,JL,56582670000.0,00782,WT
1,2171295,4422636.0,2016.0,4.0,582.0,582.0,MCA,20567.0,1.0,TX,...,,M,1990.0,10222016,M,,*GA,94362000000.0,XBLNG,B2
2,589494,1195600.0,2016.0,4.0,148.0,112.0,OGG,20551.0,1.0,FL,...,,M,1940.0,7052016,M,,LH,55780470000.0,00464,WT
3,2631158,5291768.0,2016.0,4.0,297.0,297.0,LOS,20572.0,1.0,CA,...,,M,1991.0,10272016,M,,QR,94789700000.0,00739,B2
4,3032257,985523.0,2016.0,4.0,111.0,111.0,CHM,20550.0,3.0,NY,...,,M,1997.0,7042016,F,,,42322570000.0,LAND,WT


#### Learn more about the columns

In [3]:
# View Columns and their respective Data type
df_immigration.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 29 columns):
Unnamed: 0    1000 non-null int64
cicid         1000 non-null float64
i94yr         1000 non-null float64
i94mon        1000 non-null float64
i94cit        1000 non-null float64
i94res        1000 non-null float64
i94port       1000 non-null object
arrdate       1000 non-null float64
i94mode       1000 non-null float64
i94addr       941 non-null object
depdate       951 non-null float64
i94bir        1000 non-null float64
i94visa       1000 non-null float64
count         1000 non-null float64
dtadfile      1000 non-null int64
visapost      382 non-null object
occup         4 non-null object
entdepa       1000 non-null object
entdepd       954 non-null object
entdepu       0 non-null float64
matflag       954 non-null object
biryear       1000 non-null float64
dtaddto       1000 non-null object
gender        859 non-null object
insnum        35 non-null float64
airline       967 non

#### Create Fact Table

In [4]:
#Create Fact Table
Immigrations = df_immigration[['cicid', 'i94yr', 'i94mon','i94addr', 'i94port', 'i94mode','i94visa', 'arrdate', 'depdate' , 'matflag']]
Immigrations.columns = ['cic_id', 'year', 'month', 'state_code','port_code', 'mode_code','visa_code','arrival_date', 'departure_date','match_flag']

#### View 1st 5 rows in Fact Table: Immigrations

In [5]:
#View Top 5 Value of Fact Table
Immigrations.head(5)

Unnamed: 0,cic_id,year,month,state_code,port_code,mode_code,visa_code,arrival_date,departure_date,match_flag
0,4084316.0,2016.0,4.0,HI,HHW,1.0,2.0,20566.0,20573.0,M
1,4422636.0,2016.0,4.0,TX,MCA,1.0,2.0,20567.0,20568.0,M
2,1195600.0,2016.0,4.0,FL,OGG,1.0,2.0,20551.0,20571.0,M
3,5291768.0,2016.0,4.0,CA,LOS,1.0,2.0,20572.0,20581.0,M
4,985523.0,2016.0,4.0,NY,CHM,3.0,2.0,20550.0,20553.0,M


### Clean up Fact Table

#### Convert arrive_date and departure_date from SAS format to pandas data time format

In [6]:
# Define function to convert SAS to data format
def Convert_SAS_to_date(date):
    return pd.to_timedelta(date, unit='D') + pd.Timestamp('1960-1-1')

In [7]:
Immigrations['arrival_date'] = Convert_SAS_to_date(Immigrations['arrival_date'])
Immigrations['departure_date'] = Convert_SAS_to_date(Immigrations['departure_date'])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


#### Convert cici_id, year, month, mode_code and visa_code to datatype integer

In [8]:
# using dictionary to convert specific columns
convert_dict = {'cic_id': int,
                'year': int,
                'month': int,
                'mode_code': int,
                'visa_code': int,
               }
  
Immigrations = Immigrations.astype(convert_dict)
print(Immigrations.dtypes)

cic_id                     int64
year                       int64
month                      int64
state_code                object
port_code                 object
mode_code                  int64
visa_code                  int64
arrival_date      datetime64[ns]
departure_date    datetime64[ns]
match_flag                object
dtype: object


#### View Fact Table: Immigrations after clean up

In [9]:
Immigrations.head(5)

Unnamed: 0,cic_id,year,month,state_code,port_code,mode_code,visa_code,arrival_date,departure_date,match_flag
0,4084316,2016,4,HI,HHW,1,2,2016-04-22,2016-04-29,M
1,4422636,2016,4,TX,MCA,1,2,2016-04-23,2016-04-24,M
2,1195600,2016,4,FL,OGG,1,2,2016-04-07,2016-04-27,M
3,5291768,2016,4,CA,LOS,1,2,2016-04-28,2016-05-07,M
4,985523,2016,4,NY,CHM,3,2,2016-04-06,2016-04-09,M


#### Create 1st Dimension Table: Immigrants

In [10]:
#Create 1st Dimension Table
Immigrants = df_immigration[['cicid', 'i94cit', 'i94res', 'i94bir', 'gender', 'insnum']]
Immigrants.columns = ['cic_id','citizen_country', 'residence_country','age','gender','ins_num']

#### View 1st 5 rows of Dimension Table: Immigrants

In [11]:
#View Top 10 Values from Immigrant Table
Immigrants.head(5)

Unnamed: 0,cic_id,citizen_country,residence_country,age,gender,ins_num
0,4084316.0,209.0,209.0,61.0,F,
1,4422636.0,582.0,582.0,26.0,M,
2,1195600.0,148.0,112.0,76.0,M,
3,5291768.0,297.0,297.0,25.0,M,
4,985523.0,111.0,111.0,19.0,F,


### Clean Up Dimension Table: Immigrants

####  Convert cic_id, citizen_country, residence_country and age to datatype integer

In [12]:
# using dictionary to convert specific columns

convert_dict = {'cic_id': int,
                'citizen_country': int,
                'residence_country': int,
                'age': int
               }
  
Immigrants = Immigrants.astype(convert_dict)
print(Immigrants.dtypes)

cic_id                 int64
citizen_country        int64
residence_country      int64
age                    int64
gender                object
ins_num              float64
dtype: object


#### View Dimension Table: Immigrants after clean up

In [13]:
Immigrants.head(5)

Unnamed: 0,cic_id,citizen_country,residence_country,age,gender,ins_num
0,4084316,209,209,61,F,
1,4422636,582,582,26,M,
2,1195600,148,112,76,M,
3,5291768,297,297,25,M,
4,985523,111,111,19,F,


#### Create 2nd Dimension Table: Airports

In [14]:
#Create 2nd Dimension Table
Airports = df_immigration[['cicid','airline','fltno','admnum','visatype']]
Airports.columns = ['cic_id','airline','flight_number','admin_number','visa_type']

#### View 1st 5 rows of Dimension Table: Airports

In [15]:
#View top 5 Airport Table
Airports.head(5)

Unnamed: 0,cic_id,airline,flight_number,admin_number,visa_type
0,4084316.0,JL,00782,56582670000.0,WT
1,4422636.0,*GA,XBLNG,94362000000.0,B2
2,1195600.0,LH,00464,55780470000.0,WT
3,5291768.0,QR,00739,94789700000.0,B2
4,985523.0,,LAND,42322570000.0,WT


### Clean Up Dimension Table: Airports

#### Convert cic_id to datatype int & admin_number to 

In [16]:
convert_dict = {'cic_id': int}
Airports = Airports.astype(convert_dict)
Airports['admin_number'] = Airports['admin_number'].apply(lambda x: '{:18.0f}'.format(x))
print(Airports.dtypes)

cic_id            int64
airline          object
flight_number    object
admin_number     object
visa_type        object
dtype: object


#### View Dimension Table: Airports after cleaning 

In [17]:
Airports.head(5)


Unnamed: 0,cic_id,airline,flight_number,admin_number,visa_type
0,4084316,JL,00782,56582674633,WT
1,4422636,*GA,XBLNG,94361995930,B2
2,1195600,LH,00464,55780468433,WT
3,5291768,QR,00739,94789696030,B2
4,985523,,LAND,42322572633,WT


## Exploring U.S City Demographic Data

#### Read in U.S City Demographic Data & view 1st 5 rows

In [18]:
#Read in U.S City Demographic Data
df_demographic = pd.read_csv("us-cities-demographics.csv", delimiter=';')
df_demographic.head(5)

Unnamed: 0,City,State,Median Age,Male Population,Female Population,Total Population,Number of Veterans,Foreign-born,Average Household Size,State Code,Race,Count
0,Silver Spring,Maryland,33.8,40601.0,41862.0,82463,1562.0,30908.0,2.6,MD,Hispanic or Latino,25924
1,Quincy,Massachusetts,41.0,44129.0,49500.0,93629,4147.0,32935.0,2.39,MA,White,58723
2,Hoover,Alabama,38.5,38040.0,46799.0,84839,4819.0,8229.0,2.58,AL,Asian,4759
3,Rancho Cucamonga,California,34.5,88127.0,87105.0,175232,5821.0,33878.0,3.18,CA,Black or African-American,24437
4,Newark,New Jersey,34.6,138040.0,143873.0,281913,5829.0,86253.0,2.73,NJ,White,76402


#### Learn more about the columns

In [19]:
# View Columns and their respective Data type
df_demographic.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2891 entries, 0 to 2890
Data columns (total 12 columns):
City                      2891 non-null object
State                     2891 non-null object
Median Age                2891 non-null float64
Male Population           2888 non-null float64
Female Population         2888 non-null float64
Total Population          2891 non-null int64
Number of Veterans        2878 non-null float64
Foreign-born              2878 non-null float64
Average Household Size    2875 non-null float64
State Code                2891 non-null object
Race                      2891 non-null object
Count                     2891 non-null int64
dtypes: float64(6), int64(2), object(4)
memory usage: 271.1+ KB


#### Create Dimension Table: Populations

In [20]:
Populations = df_demographic[['City','State','State Code','Male Population','Female Population','Total Population','Number of Veterans','Foreign-born','Race']]
Populations.columns = ['city','state','state_code','male_population','female_population','total_population','num_of_veterans','foreign_born','race']

#### View 1st 5 rows of Dimension Table: Populations

In [21]:
Populations.head(5)

Unnamed: 0,city,state,state_code,male_population,female_population,total_population,num_of_veterans,foreign_born,race
0,Silver Spring,Maryland,MD,40601.0,41862.0,82463,1562.0,30908.0,Hispanic or Latino
1,Quincy,Massachusetts,MA,44129.0,49500.0,93629,4147.0,32935.0,White
2,Hoover,Alabama,AL,38040.0,46799.0,84839,4819.0,8229.0,Asian
3,Rancho Cucamonga,California,CA,88127.0,87105.0,175232,5821.0,33878.0,Black or African-American
4,Newark,New Jersey,NJ,138040.0,143873.0,281913,5829.0,86253.0,White


### Clean Dimension Table: Populations

#### Convert Male Population, Female Population, Number of Veterans and Foreign-born as datatype int

In [22]:
Populations['male_population'] = Populations['male_population'].fillna(0)
Populations['female_population'] = Populations['female_population'].fillna(0)
Populations['num_of_veterans'] = Populations['num_of_veterans'].fillna(0)
Populations['foreign_born'] = Populations['foreign_born'].fillna(0)


convert_dict = {'male_population': int,
                'female_population': int,
                'num_of_veterans': int,
                'foreign_born': int
               }
  
Populations = Populations.astype(convert_dict)
print(Populations.dtypes)

city                 object
state                object
state_code           object
male_population       int64
female_population     int64
total_population      int64
num_of_veterans       int64
foreign_born          int64
race                 object
dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See

Convert city and state to upper case

In [23]:
Populations['city'] = Populations['city'].str.upper()
Populations['state'] = Populations['state'].str.upper()

#### View changes to Dimension Table: Populations

In [24]:
Populations.head(5)

Unnamed: 0,city,state,state_code,male_population,female_population,total_population,num_of_veterans,foreign_born,race
0,SILVER SPRING,MARYLAND,MD,40601,41862,82463,1562,30908,Hispanic or Latino
1,QUINCY,MASSACHUSETTS,MA,44129,49500,93629,4147,32935,White
2,HOOVER,ALABAMA,AL,38040,46799,84839,4819,8229,Asian
3,RANCHO CUCAMONGA,CALIFORNIA,CA,88127,87105,175232,5821,33878,Black or African-American
4,NEWARK,NEW JERSEY,NJ,138040,143873,281913,5829,86253,White


#### Create Dimension Table: Populations_Statistics

In [25]:
Population_Statistics= df_demographic[['City','State','State Code','Median Age','Average Household Size']]
Population_Statistics.columns = ['city','state','state_code','median_age','avg_household_size']

#### View 1st 5 rows from Dimension Table: Populations_Statistics

In [26]:
Population_Statistics.head(5)

Unnamed: 0,city,state,state_code,median_age,avg_household_size
0,Silver Spring,Maryland,MD,33.8,2.6
1,Quincy,Massachusetts,MA,41.0,2.39
2,Hoover,Alabama,AL,38.5,2.58
3,Rancho Cucamonga,California,CA,34.5,3.18
4,Newark,New Jersey,NJ,34.6,2.73


### Clean Dimension Table: Population_Statistics

#### Convert Median Age and Average Household Size to datatype float

In [27]:
convert_dict = {
                'median_age': float,
                'avg_household_size': float
               }
  
Population_Statistics = Population_Statistics.astype(convert_dict)
print(Population_Statistics.dtypes)

city                   object
state                  object
state_code             object
median_age            float64
avg_household_size    float64
dtype: object


#### Convert city and state to uppercase

In [28]:
Population_Statistics['city'] = Population_Statistics['city'].str.upper()
Population_Statistics['state'] = Population_Statistics['state'].str.upper()

#### View Dimension Table: Population_Statistics after cleaning

In [29]:
Population_Statistics.head(5)

Unnamed: 0,city,state,state_code,median_age,avg_household_size
0,SILVER SPRING,MARYLAND,MD,33.8,2.6
1,QUINCY,MASSACHUSETTS,MA,41.0,2.39
2,HOOVER,ALABAMA,AL,38.5,2.58
3,RANCHO CUCAMONGA,CALIFORNIA,CA,34.5,3.18
4,NEWARK,NEW JERSEY,NJ,34.6,2.73


## Exploring World Temperature Data

#### Read in World Temperature Data & View 1st 5 rows

In [30]:
fname = '../../data2/GlobalLandTemperaturesByCity.csv'
df_temperature = pd.read_csv(fname)
df_temperature.head(5)

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude
0,1743-11-01,6.068,1.737,Århus,Denmark,57.05N,10.33E
1,1743-12-01,,,Århus,Denmark,57.05N,10.33E
2,1744-01-01,,,Århus,Denmark,57.05N,10.33E
3,1744-02-01,,,Århus,Denmark,57.05N,10.33E
4,1744-03-01,,,Århus,Denmark,57.05N,10.33E


#### Learn more about the coloums

In [31]:
df_temperature.info(verbose=True)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8599212 entries, 0 to 8599211
Data columns (total 7 columns):
dt                               object
AverageTemperature               float64
AverageTemperatureUncertainty    float64
City                             object
Country                          object
Latitude                         object
Longitude                        object
dtypes: float64(2), object(5)
memory usage: 459.2+ MB


### Add year and month to dataframe

In [32]:
df_temperature['dt'] = pd.to_datetime(df_temperature['dt'])
df_temperature['year'] = pd.DatetimeIndex(df_temperature['dt']).year
df_temperature['month'] =pd.DatetimeIndex(df_temperature['dt']).month


### Convert city and country to uppercase

In [34]:
df_temperature['City'] = df_temperature['City'].str.upper()
df_temperature['Country'] = df_temperature['Country'].str.upper()

In [35]:
df_temperature.head(5)

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude,year,month
0,1743-11-01,6.068,1.737,ÅRHUS,DENMARK,57.05N,10.33E,1743,11
1,1743-12-01,,,ÅRHUS,DENMARK,57.05N,10.33E,1743,12
2,1744-01-01,,,ÅRHUS,DENMARK,57.05N,10.33E,1744,1
3,1744-02-01,,,ÅRHUS,DENMARK,57.05N,10.33E,1744,2
4,1744-03-01,,,ÅRHUS,DENMARK,57.05N,10.33E,1744,3


#### Create Dimension Table: Temperature

In [36]:
Temperatures = df_temperature[['dt','year','month','City','Country','Latitude','Longitude']]
Temperatures.columns = ['date','year','month','city','country','latitude','longitude']

#### View Dimension Table: Temperatures

In [37]:
Temperatures.head(5)

Unnamed: 0,date,year,month,city,country,latitude,longitude
0,1743-11-01,1743,11,ÅRHUS,DENMARK,57.05N,10.33E
1,1743-12-01,1743,12,ÅRHUS,DENMARK,57.05N,10.33E
2,1744-01-01,1744,1,ÅRHUS,DENMARK,57.05N,10.33E
3,1744-02-01,1744,2,ÅRHUS,DENMARK,57.05N,10.33E
4,1744-03-01,1744,3,ÅRHUS,DENMARK,57.05N,10.33E


#### View Dimension Table: Temperature after cleaning

In [38]:
Temperatures.head(5)

Unnamed: 0,date,year,month,city,country,latitude,longitude
0,1743-11-01,1743,11,ÅRHUS,DENMARK,57.05N,10.33E
1,1743-12-01,1743,12,ÅRHUS,DENMARK,57.05N,10.33E
2,1744-01-01,1744,1,ÅRHUS,DENMARK,57.05N,10.33E
3,1744-02-01,1744,2,ÅRHUS,DENMARK,57.05N,10.33E
4,1744-03-01,1744,3,ÅRHUS,DENMARK,57.05N,10.33E


Create Dimension Table: Temperature_Statistics

In [39]:
Temperature_Statistics = df_temperature[['dt','year','month','City','Country','AverageTemperature','AverageTemperatureUncertainty']]
Temperature_Statistics.columns = ['date','year','month','city','country','average_temperature','average_temperature_uncertainty']

#### View Dimension Table: Temperature_Statistics

In [40]:
Temperature_Statistics.head(5)

Unnamed: 0,date,year,month,city,country,average_temperature,average_temperature_uncertainty
0,1743-11-01,1743,11,ÅRHUS,DENMARK,6.068,1.737
1,1743-12-01,1743,12,ÅRHUS,DENMARK,,
2,1744-01-01,1744,1,ÅRHUS,DENMARK,,
3,1744-02-01,1744,2,ÅRHUS,DENMARK,,
4,1744-03-01,1744,3,ÅRHUS,DENMARK,,


### Clean Dimension Table: Temperature_Statistics

#### Convert Na to 0 and conform datatype to float 

In [41]:
Temperature_Statistics['average_temperature'] = Temperature_Statistics['average_temperature'].fillna(0)
Temperature_Statistics['average_temperature_uncertainty'] = Temperature_Statistics['average_temperature_uncertainty'].fillna(0)
convert_dict = {
                'average_temperature': float,
                'average_temperature_uncertainty': float
               }
  
Temperature_Statistics = Temperature_Statistics.astype(convert_dict)
print(Temperature_Statistics.dtypes)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


date                               datetime64[ns]
year                                        int64
month                                       int64
city                                       object
country                                    object
average_temperature                       float64
average_temperature_uncertainty           float64
dtype: object


#### View Dimension Table: Temperature_Statistics after cleaning

In [42]:
Temperature_Statistics.head(5)

Unnamed: 0,date,year,month,city,country,average_temperature,average_temperature_uncertainty
0,1743-11-01,1743,11,ÅRHUS,DENMARK,6.068,1.737
1,1743-12-01,1743,12,ÅRHUS,DENMARK,0.0,0.0
2,1744-01-01,1744,1,ÅRHUS,DENMARK,0.0,0.0
3,1744-02-01,1744,2,ÅRHUS,DENMARK,0.0,0.0
4,1744-03-01,1744,3,ÅRHUS,DENMARK,0.0,0.0


## Use I94_SAS_Labels_Descriptions to create dataframes to get the actual names of Countries, States, Port, Modes and Visas.

#### Open "I94_SAS_Labels_Descriptions.SAS"
    

In [43]:
with open("I94_SAS_Labels_Descriptions.SAS") as library:
    lines = library.readlines()

### Create Auxilary Tables

#### Create Countries Table

In [44]:
# Create Countries Dataframe
col_names =  ['code', 'country']
Countries  = pd.DataFrame(columns = col_names)


#  Insert Data into Dataframe
country_data = lines[9:298]
for data in country_data:
    temp = data.split('=')
    list = [temp[0].strip(), temp[1].strip().strip("'")]
    a_series = pd.Series(list, index = Countries.columns)
    Countries = Countries.append(a_series, ignore_index=True)

# View 1st 5 Columns in Countries Auxilary Table
Countries.head(5)

Unnamed: 0,code,country
0,582,"MEXICO Air Sea, and Not Reported (I-94, no lan..."
1,236,AFGHANISTAN
2,101,ALBANIA
3,316,ALGERIA
4,102,ANDORRA


#### Create States Table

In [45]:
# Create States Dataframe
col_names =  ['state_code', 'state']
States  = pd.DataFrame(columns = col_names)


#  Insert Data into Dataframe
state_data = lines[981:1036]
for data in state_data:
    temp = data.split('=')
    list = [temp[0].strip().strip("'"), temp[1].strip().strip("'")]
    a_series = pd.Series(list, index = States.columns)
    States = States.append(a_series, ignore_index=True)

# View 1st 5 Columns in States Auxilary Table
States.head(5)

Unnamed: 0,state_code,state
0,AL,ALABAMA
1,AK,ALASKA
2,AZ,ARIZONA
3,AR,ARKANSAS
4,CA,CALIFORNIA


#### Create Port Tables

In [46]:
# Create Ports Dataframe 
col_names =  ['port_code','port_city']
Ports  = pd.DataFrame(columns = col_names)


#  Insert Data into Dataframe
port_data = lines[302:961]
for data in port_data:
    temp = data.split('=')
    list = [temp[0].strip().strip("'"),temp[1].strip().strip("'").split(',')[0]]
    a_series = pd.Series(list, index = Ports.columns)
    Ports = Ports.append(a_series, ignore_index=True)


# View 1st 5 Columns in Ports Auxilary Table
Ports.head(5)

Unnamed: 0,port_code,port_city
0,ALC,ALCAN
1,ANC,ANCHORAGE
2,BAR,BAKER AAF - BAKER ISLAND
3,DAC,DALTONS CACHE
4,PIZ,DEW STATION PT LAY DEW


#### Create Modes Table

In [47]:
# Create Modes Dataframe 
col_names =  ['mode_code', 'mode']
Modes  = pd.DataFrame(columns = col_names)

#  Insert Data into Dataframe
mode_data = lines[972:976]
for data in mode_data:
    temp = data.split('=')
    list = [temp[0].strip().strip("'"), temp[1].strip().strip("'").strip(";")]
    a_series = pd.Series(list, index = Modes.columns)
    Modes = Modes.append(a_series, ignore_index=True)

# View 1st 5 Columns in Modes Auxilary Table
Ports.head(5)


Unnamed: 0,port_code,port_city
0,ALC,ALCAN
1,ANC,ANCHORAGE
2,BAR,BAKER AAF - BAKER ISLAND
3,DAC,DALTONS CACHE
4,PIZ,DEW STATION PT LAY DEW


### Create Visas Table

In [48]:
# Create Visas Dataframe 
col_names =  ['visa_code', 'visa']
Visas = pd.DataFrame(columns = col_names)

#  Insert Data into Dataframe
visa_data = lines[1046:1049]
for data in visa_data:
    temp = data.split('=')
    list = [temp[0].strip().strip("'"), temp[1].strip().strip("'").strip(";")]
    a_series = pd.Series(list, index = Visas.columns)
    Visas = Visas.append(a_series, ignore_index=True)

# View 1st 5 Columns in Modes Auxilary Table
Visas.head(5)


Unnamed: 0,visa_code,visa
0,1,Business
1,2,Pleasure
2,3,Student
