In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import os
import sqlite3

In [2]:
# Load a single year's workbook (2012) and show its first five rows
# to get a feel for the column layout before processing all years.
sample = pd.read_excel(io='Resources/2012_immsuptable1d.xls')
sample.head(5)

Unnamed: 0,Region and country of birth,Total,Alabama,Alaska,Arizona,Arkansas,California,Colorado,Connecticut,Delaware,...,Texas,U.S. Armed Services Posts,U.S. Territories,Utah,Vermont,Virginia,Washington,West Virginia,Wisconsin,Wyoming
0,REGION,,,,,,,,,,...,,,,,,,,,,
1,Total,1031631.0,3873.0,1612.0,18434.0,2795.0,196622.0,13327.0,12237.0,2208.0,...,95557.0,108.0,1559.0,5932.0,877.0,28227.0,23060.0,779.0,6049.0,427.0
2,Africa,107241.0,519.0,118.0,1517.0,129.0,7150.0,2471.0,1093.0,497.0,...,9513.0,8.0,8.0,514.0,142.0,5429.0,3160.0,124.0,768.0,34.0
3,Asia,429599.0,1784.0,910.0,6225.0,1077.0,107825.0,5003.0,3941.0,857.0,...,32341.0,67.0,694.0,2090.0,476.0,13748.0,12347.0,382.0,2773.0,161.0
4,Europe,81671.0,330.0,186.0,967.0,180.0,11121.0,1086.0,1952.0,156.0,...,3834.0,14.0,24.0,440.0,126.0,1945.0,2853.0,93.0,660.0,69.0


In [3]:
# One Excel workbook per year (2005-2019); the first four characters of each
# filename are the year. Extensions differ between years (.xls / .xlsx).
files = ['2005_immsuptable1d.xls','2006_immsuptable1d.xls',
         '2007_immsuptable1d.xls','2008_immsuptable1d.xls','2009_immsuptable1d.xls',
         '2010_immsuptable1d.xlsx','2011_immsuptable1d.xls',
         '2012_immsuptable1d.xls', '2013_immsuptable1d.xls', '2014_immsuptable1d.xls',
         '2015_immsuptable1d.xls', '2016_immsuptable1d.xls', '2017_immsuptable1d.xlsx',
         '2018_immsuptable1d.xlsx', '2019_immsuptable1d.xlsx']

# Collect the per-year frames in a list and concatenate once at the end.
# Growing a DataFrame with pd.concat inside the loop re-copies all previously
# loaded rows on every iteration, and seeding it with an empty frame forces
# the 'Year' column to object dtype.
yearly_frames = []
for file in files:
    # Load file into DataFrame
    df = pd.read_excel('Resources/' + file)

    # Extract year from filename and add it as the first column
    year = int(file[:4])
    df.insert(0, 'Year', year)

    yearly_frames.append(df)

# Column names differ between years, so the result holds the union of all
# columns; cells a given year does not provide are NaN.
immigration_df = pd.concat(yearly_frames, ignore_index=True)

# Display the first 50 rows of the DataFrame
immigration_df.head(50)


Unnamed: 0,Year,Region and country of birth,Total,Alabama,Alaska,Arizona,Arkansas,California,Colorado,Connecticut,...,Wyoming,U.S. Armed Services posts,U.S. possessions,Unknown,U.S. Armed Services Posts,U.S. Dependencies,Guam,Puerto Rico,U.S. Territories,U.S. Territories1
0,2005,REGION,,,,,,,,,...,,,,,,,,,,
1,2005,Total,1122373.0,4200,1525,18988,2698,232023,11977,15335,...,321,128,5868,9,,,,,,
2,2005,Africa,85102.0,341,47,1229,115,6813,1300,900,...,D,D,25,-,,,,,,
3,2005,Asia,400135.0,1789,735,4418,952,106540,3519,4641,...,90,67,1683,4,,,,,,
4,2005,Europe,176569.0,695,366,2529,267,22806,2572,4094,...,86,26,170,1,,,,,,
5,2005,North America,345575.0,1049,272,10000,1213,84337,3810,2751,...,120,17,3161,2,,,,,,
6,2005,Oceania,6546.0,28,36,96,13,2142,139,71,...,D,D,14,2,,,,,,
7,2005,South America,103143.0,261,65,639,125,7616,622,2844,...,15,9,715,-,,,,,,
8,2005,Unknown,5303.0,37,4,77,13,1769,15,34,...,-,4,100,-,,,,,,
9,2005,COUNTRY,,,,,,,,,...,,,,,,,,,,


In [4]:
# Normalise every column except the country/region label to int64.
# In the source tables 'D' marks withheld data and '-' represents zero; both
# markers, as well as missing cells, are stored as 0 here.
int_columns = immigration_df.columns.drop('Region and country of birth')

numeric_part = immigration_df[int_columns].replace({'D': 0, '-': 0})
numeric_part = numeric_part.fillna(0)
immigration_df[int_columns] = numeric_part.astype(np.int64)

immigration_df.head()

Unnamed: 0,Year,Region and country of birth,Total,Alabama,Alaska,Arizona,Arkansas,California,Colorado,Connecticut,...,Wyoming,U.S. Armed Services posts,U.S. possessions,Unknown,U.S. Armed Services Posts,U.S. Dependencies,Guam,Puerto Rico,U.S. Territories,U.S. Territories1
0,2005,REGION,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2005,Total,1122373,4200,1525,18988,2698,232023,11977,15335,...,321,128,5868,9,0,0,0,0,0,0
2,2005,Africa,85102,341,47,1229,115,6813,1300,900,...,0,0,25,0,0,0,0,0,0,0
3,2005,Asia,400135,1789,735,4418,952,106540,3519,4641,...,90,67,1683,4,0,0,0,0,0,0
4,2005,Europe,176569,695,366,2529,267,22806,2572,4094,...,86,26,170,1,0,0,0,0,0,0


In [5]:
# Replace every NaN still left in the frame with 0. This covers all columns,
# so a missing 'Region and country of birth' label also becomes the integer 0.
immigration_df = immigration_df.fillna(value=0)

In [6]:
# List all column names of the combined frame to see which columns
# exist across the yearly files.
immigration_df.columns

Index(['Year', 'Region and country of birth', 'Total', 'Alabama', 'Alaska',
       'Arizona', 'Arkansas', 'California', 'Colorado', 'Connecticut',
       'Delaware', 'District of Columbia', 'Florida', 'Georgia', 'Hawaii',
       'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky',
       'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan',
       'Minnesota', 'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada',
       'New Hampshire', 'New Jersey', 'New Mexico', 'New York',
       'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 'Oregon',
       'Pennsylvania', 'Rhode Island', 'South Carolina', 'South Dakota',
       'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington',
       'West Virginia', 'Wisconsin', 'Wyoming', 'U.S. Armed Services posts',
       'U.S. possessions', 'Unknown', 'U.S. Armed Services Posts',
       'U.S. Dependencies', 'Guam', 'Puerto Rico', 'U.S. Territories',
       'U.S. Territories1'],
      dtype='object')

In [7]:
# Remove the columns that are not individual states (armed-services posts,
# territories/possessions/dependencies, Guam, Puerto Rico, Unknown).
# The spelling variants are separate columns, so each one is listed explicitly.
non_state_columns = [
    'U.S. Armed Services Posts',
    'U.S. Armed Services posts',
    'U.S. Territories',
    'U.S. Territories1',
    'U.S. Dependencies',
    'U.S. possessions',
    'Unknown',
    'Guam',
    'Puerto Rico',
]
immigration_df = immigration_df.drop(columns=non_state_columns)

In [8]:
# Show the remaining column names to confirm the drop.
immigration_df.columns

Index(['Year', 'Region and country of birth', 'Total', 'Alabama', 'Alaska',
       'Arizona', 'Arkansas', 'California', 'Colorado', 'Connecticut',
       'Delaware', 'District of Columbia', 'Florida', 'Georgia', 'Hawaii',
       'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky',
       'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan',
       'Minnesota', 'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada',
       'New Hampshire', 'New Jersey', 'New Mexico', 'New York',
       'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 'Oregon',
       'Pennsylvania', 'Rhode Island', 'South Carolina', 'South Dakota',
       'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington',
       'West Virginia', 'Wisconsin', 'Wyoming'],
      dtype='object')

In [9]:
# Labels in 'Region and country of birth' that are not individual countries:
# regional aggregates, section header rows, and footnote text rows.
region_labels = ['Africa', 'Asia', 'Europe', 'North America', 'Oceania',
                 'South America', 'Unknown']
header_labels = ['REGION', 'COUNTRY']
footnote_labels = ['D Data withheld to limit disclosure.',
                   "'- Represents zero.",
                   '1 Includes American Samoa, Northern Mariana Islands, and U.S. Virgin Islands.',
                   'Source: U.S. Department of Homeland Security.']
values_to_remove = region_labels + header_labels + footnote_labels

# Keep only rows whose label is an exact match for none of the values above
keep_mask = ~immigration_df['Region and country of birth'].isin(values_to_remove)
immigration_df = immigration_df.loc[keep_mask]
immigration_df

Unnamed: 0,Year,Region and country of birth,Total,Alabama,Alaska,Arizona,Arkansas,California,Colorado,Connecticut,...,South Dakota,Tennessee,Texas,Utah,Vermont,Virginia,Washington,West Virginia,Wisconsin,Wyoming
1,2005,Total,1122373,4200,1525,18988,2698,232023,11977,15335,...,881,8962,95958,5082,1042,27100,26482,847,7909,321
10,2005,Total,1122373,4200,1525,18988,2698,232023,11977,15335,...,881,8962,95958,5082,1042,27100,26482,847,7909,321
11,2005,Afghanistan,4749,5,0,244,0,1331,64,50,...,20,64,273,90,3,456,88,5,29,0
12,2005,Albania,5947,0,0,35,5,94,9,387,...,0,17,97,3,14,80,26,0,73,0
13,2005,Algeria,1115,4,0,11,0,180,18,7,...,0,3,59,0,0,54,21,0,10,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3483,2019,"Virgin Islands, U.S.",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3484,2019,Western Sahara,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3485,2019,Yemen,3717,43,0,0,14,608,4,59,...,0,74,22,0,0,53,22,3,0,0
3486,2019,Zambia,511,4,0,21,0,25,7,4,...,3,5,73,8,0,6,12,0,10,0


In [10]:
# Give the 'Total' column a descriptive name: 'Total Permanent Residents'
immigration_df = immigration_df.rename(columns={'Total': 'Total Permanent Residents'})

In [11]:
# Each file contains two 'Total' rows, so keep only the first row for every
# (year, label) pair.
key_columns = ["Year", "Region and country of birth"]
immigration_df = immigration_df.drop_duplicates(subset=key_columns, keep="first")

In [12]:
# Add a new column named 'Percentage' right after 'Total Permanent Residents'.
# It is seeded with 0.0 (float) rather than 0 (int): the percentages written
# into it afterwards are fractional, and assigning floats into slices of an
# integer column is an incompatible-dtype assignment in recent pandas.
immigration_df.insert(immigration_df.columns.get_loc('Total Permanent Residents') + 1, 'Percentage', 0.0)

In [13]:
# Fill 'Percentage' with each row's share of its year's overall total,
# in percent, rounded to two decimal places.

# The yearly denominator comes from the row labelled exactly 'Total'.
# An exact comparison is used instead of .str.contains('Total'): it cannot
# match other labels that merely contain the word, and it returns False
# (not NaN) for non-string labels.
is_total_row = immigration_df['Region and country of birth'].eq('Total')
yearly_totals = (
    immigration_df.loc[is_total_row]
    .set_index('Year')['Total Permanent Residents']
)

# Fail loudly instead of producing wrong or NaN percentages when a year has
# several 'Total' rows or none at all.
if not yearly_totals.index.is_unique:
    raise ValueError("Expected exactly one 'Total' row per year")
missing_years = set(immigration_df['Year']) - set(yearly_totals.index)
if missing_years:
    raise ValueError(f"No 'Total' row found for year(s): {sorted(missing_years)}")

# Map every row's year to that year's total and compute all shares at once
immigration_df['Percentage'] = (
    immigration_df['Total Permanent Residents']
    / immigration_df['Year'].map(yearly_totals)
    * 100
).round(2)

immigration_df.head(50)

Unnamed: 0,Year,Region and country of birth,Total Permanent Residents,Percentage,Alabama,Alaska,Arizona,Arkansas,California,Colorado,...,South Dakota,Tennessee,Texas,Utah,Vermont,Virginia,Washington,West Virginia,Wisconsin,Wyoming
1,2005,Total,1122373,100.0,4200,1525,18988,2698,232023,11977,...,881,8962,95958,5082,1042,27100,26482,847,7909,321
11,2005,Afghanistan,4749,0.42,5,0,244,0,1331,64,...,20,64,273,90,3,456,88,5,29,0
12,2005,Albania,5947,0.53,0,0,35,5,94,9,...,0,17,97,3,14,80,26,0,73,0
13,2005,Algeria,1115,0.1,4,0,11,0,180,18,...,0,3,59,0,0,54,21,0,10,0
14,2005,American Samoa,15,0.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15,2005,Angola,188,0.02,0,0,14,0,7,0,...,0,0,17,0,0,0,0,0,0,0
16,2005,Anguilla,35,0.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17,2005,Antigua-Barbuda,440,0.04,0,0,0,0,0,0,...,0,4,7,0,0,6,0,0,0,0
18,2005,Argentina,7081,0.63,6,3,50,15,868,72,...,10,32,348,91,5,144,61,11,34,0
19,2005,Armenia,2591,0.23,0,0,13,0,1969,20,...,0,3,26,8,5,28,37,0,3,0


In [14]:
# Write the cleaned table, including the 'Percentage' column, to CSV.
# The row index is not written.
all_countries_csv = 'Resources/immigration_data_2005_2019.csv'
immigration_df.to_csv(all_countries_csv, index=False)

In [16]:
# For every year, print the ten rows with the highest 'Percentage'
years = immigration_df['Year'].unique()

# Rows whose label contains "Total" are excluded from the ranking.
# na=False treats non-string labels as not containing "Total", so they stay in.
is_not_total = ~immigration_df['Region and country of birth'].str.contains('Total', na=False)
report_columns = ['Region and country of birth', 'Percentage', 'Total Permanent Residents']

for year in years:
    # Restrict to the current year's non-total rows
    year_rows = immigration_df[(immigration_df['Year'] == year) & is_not_total]

    # Ten largest shares, in descending order
    top_10 = year_rows.nlargest(10, 'Percentage')

    # Year header, the label/percentage/total columns, then a blank separator line
    print(f"Year: {year}")
    print(top_10[report_columns])
    print()


Year: 2005
    Region and country of birth  Percentage  Total Permanent Residents
139                      Mexico       14.38                     161445
102                       India        7.54                      84681
52     China, People's Republic        6.23                      69967
167                 Philippines        5.41                      60748
61                         Cuba        3.23                      36261
225                     Vietnam        2.92                      32784
68           Dominican Republic        2.45                      27504
115                       Korea        2.37                      26562
53                     Colombia        2.28                      25571
217                     Ukraine        2.03                      22761

Year: 2006
    Region and country of birth  Percentage  Total Permanent Residents
373                      Mexico       13.72                     173753
287    China, People's Republic        6.90           

In [17]:
# Restrict the frame to the overall "Total" row plus the eight countries
# listed below (exact label matches only).
labels_to_keep = [
    "Iran", "Mexico", "China, People's Republic", "Pakistan", "India",
    "United Kingdom", "Dominican Republic", "Philippines", "Total",
]
selection_mask = immigration_df["Region and country of birth"].isin(labels_to_keep)
immigration_df = immigration_df[selection_mask]

immigration_df

Unnamed: 0,Year,Region and country of birth,Total Permanent Residents,Percentage,Alabama,Alaska,Arizona,Arkansas,California,Colorado,...,South Dakota,Tennessee,Texas,Utah,Vermont,Virginia,Washington,West Virginia,Wisconsin,Wyoming
1,2005,Total,1122373,100.00,4200,1525,18988,2698,232023,11977,...,881,8962,95958,5082,1042,27100,26482,847,7909,321
52,2005,"China, People's Republic",69967,6.23,328,92,543,202,17668,765,...,51,637,4139,217,50,1327,1508,101,593,28
68,2005,Dominican Republic,27504,2.45,5,42,22,0,82,6,...,0,22,119,11,0,90,18,6,39,0
102,2005,India,84681,7.54,431,15,739,215,14724,516,...,23,900,7139,147,74,2776,1747,133,876,0
104,2005,Iran,13887,1.24,48,4,285,9,7059,131,...,0,150,1002,135,4,562,318,18,48,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3363,2019,Iran,6640,0.64,16,0,92,26,2693,70,...,3,46,701,40,3,214,258,5,54,12
3398,2019,Mexico,156052,15.12,507,77,8520,891,47725,4243,...,74,1286,38736,2034,8,796,3266,44,1418,113
3419,2019,Pakistan,13921,1.35,44,0,67,28,1706,60,...,8,67,2095,72,0,943,286,23,126,0
3425,2019,Philippines,45920,4.45,235,514,916,180,14934,387,...,47,672,2629,202,23,1019,1492,70,252,31


In [18]:
# Print a summary of immigration_df: number of entries, and for each column
# its non-null count and dtype.
immigration_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 135 entries, 1 to 3475
Data columns (total 55 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Year                         135 non-null    int64  
 1   Region and country of birth  135 non-null    object 
 2   Total Permanent Residents    135 non-null    int64  
 3   Percentage                   135 non-null    float64
 4   Alabama                      135 non-null    int64  
 5   Alaska                       135 non-null    int64  
 6   Arizona                      135 non-null    int64  
 7   Arkansas                     135 non-null    int64  
 8   California                   135 non-null    int64  
 9   Colorado                     135 non-null    int64  
 10  Connecticut                  135 non-null    int64  
 11  Delaware                     135 non-null    int64  
 12  District of Columbia         135 non-null    int64  
 13  Florida            

In [19]:
# Write the reduced table (selected countries plus 'Total') to its own CSV.
# The row index is not written.
selected_csv = 'Resources/immigration_selected_2005_2019.csv'
immigration_df.to_csv(selected_csv, index=False)

In [20]:
# Write the data to a SQLite database
# Create a connection to the SQLite database (the file is created if missing)
conn = sqlite3.connect('Resources/immigration_selected_2005_2019_sqlite.sqlite')
try:
    # Write the data to a sqlite table, replacing the table if it already
    # exists; the DataFrame index is not stored.
    immigration_df.to_sql('immigration_selected_2005_2019_sqlite', conn, if_exists='replace', index=False)

    # Commit any changes
    conn.commit()
finally:
    # Always release the connection, even if the write above raises
    conn.close()

In [21]:
# Re-open the SQLite database and load the table back to check that the
# data has been stored correctly
conn = sqlite3.connect('Resources/immigration_selected_2005_2019_sqlite.sqlite')
try:
    # Load the full table from the SQLite database into a DataFrame
    immigration_df_sqlite = pd.read_sql_query("SELECT * from immigration_selected_2005_2019_sqlite", conn)
finally:
    # Always release the connection, even if the query raises
    conn.close()

# Basic round-trip check: same number of rows and columns as the frame written
assert immigration_df_sqlite.shape == immigration_df.shape, \
    "Round-trip through SQLite changed the table's shape"

# View the DataFrame
immigration_df_sqlite

Unnamed: 0,Year,Region and country of birth,Total Permanent Residents,Percentage,Alabama,Alaska,Arizona,Arkansas,California,Colorado,...,South Dakota,Tennessee,Texas,Utah,Vermont,Virginia,Washington,West Virginia,Wisconsin,Wyoming
0,2005,Total,1122373,100.00,4200,1525,18988,2698,232023,11977,...,881,8962,95958,5082,1042,27100,26482,847,7909,321
1,2005,"China, People's Republic",69967,6.23,328,92,543,202,17668,765,...,51,637,4139,217,50,1327,1508,101,593,28
2,2005,Dominican Republic,27504,2.45,5,42,22,0,82,6,...,0,22,119,11,0,90,18,6,39,0
3,2005,India,84681,7.54,431,15,739,215,14724,516,...,23,900,7139,147,74,2776,1747,133,876,0
4,2005,Iran,13887,1.24,48,4,285,9,7059,131,...,0,150,1002,135,4,562,318,18,48,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
130,2019,Iran,6640,0.64,16,0,92,26,2693,70,...,3,46,701,40,3,214,258,5,54,12
131,2019,Mexico,156052,15.12,507,77,8520,891,47725,4243,...,74,1286,38736,2034,8,796,3266,44,1418,113
132,2019,Pakistan,13921,1.35,44,0,67,28,1706,60,...,8,67,2095,72,0,943,286,23,126,0
133,2019,Philippines,45920,4.45,235,514,916,180,14934,387,...,47,672,2629,202,23,1019,1492,70,252,31
