In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import os
import sqlite3

In [2]:
# Load a single year's workbook and show its first rows, to see how the
# sheet is laid out before the full set of files is read.
sample_path = 'Resources/2012_immsuptable1d.xls'
sample = pd.read_excel(sample_path)
sample.head()

Unnamed: 0,Region and country of birth,Total,Alabama,Alaska,Arizona,Arkansas,California,Colorado,Connecticut,Delaware,...,Texas,U.S. Armed Services Posts,U.S. Territories,Utah,Vermont,Virginia,Washington,West Virginia,Wisconsin,Wyoming
0,REGION,,,,,,,,,,...,,,,,,,,,,
1,Total,1031631.0,3873.0,1612.0,18434.0,2795.0,196622.0,13327.0,12237.0,2208.0,...,95557.0,108.0,1559.0,5932.0,877.0,28227.0,23060.0,779.0,6049.0,427.0
2,Africa,107241.0,519.0,118.0,1517.0,129.0,7150.0,2471.0,1093.0,497.0,...,9513.0,8.0,8.0,514.0,142.0,5429.0,3160.0,124.0,768.0,34.0
3,Asia,429599.0,1784.0,910.0,6225.0,1077.0,107825.0,5003.0,3941.0,857.0,...,32341.0,67.0,694.0,2090.0,476.0,13748.0,12347.0,382.0,2773.0,161.0
4,Europe,81671.0,330.0,186.0,967.0,180.0,11121.0,1086.0,1952.0,156.0,...,3834.0,14.0,24.0,440.0,126.0,1945.0,2853.0,93.0,660.0,69.0


In [3]:
# One Excel workbook per year; the first four characters of each file name
# are the year the workbook covers.
files = [
    '2005_immsuptable1d.xls',
    '2006_immsuptable1d.xls',
    '2007_immsuptable1d.xls',
    '2008_immsuptable1d.xls',
    '2009_immsuptable1d.xls',
    '2010_immsuptable1d.xlsx',
    '2011_immsuptable1d.xls',
    '2012_immsuptable1d.xls',
    '2013_immsuptable1d.xls',
    '2014_immsuptable1d.xls',
    '2015_immsuptable1d.xls',
    '2016_immsuptable1d.xls',
    '2017_immsuptable1d.xlsx',
    '2018_immsuptable1d.xlsx',
    '2019_immsuptable1d.xlsx',
    '2020_immsuptable1d.xlsx',
    '2021_immsuptable1d.xlsx',
]

# Collect one DataFrame per file and concatenate them in a single call.
# Concatenating inside the loop re-copies the accumulated data on every
# iteration, and starting from an empty DataFrame forces 'Year' to object dtype.
yearly_frames = []
for file in files:
    # Load file into DataFrame
    df = pd.read_excel('Resources/' + file)

    # Extract year from filename and add it as the first column
    year = int(file[:4])
    df.insert(0, 'Year', year)

    yearly_frames.append(df)

# Columns are matched by header text, so headers that differ between
# workbooks (e.g. 'U.S. Armed Services posts' vs 'U.S. Armed Services Posts')
# end up as separate, partly empty columns.
immigration_df = pd.concat(yearly_frames, ignore_index=True)

# Display the first 50 rows of the DataFrame
immigration_df.head(50)


Unnamed: 0,Year,Region and country of birth,Total,Alabama,Alaska,Arizona,Arkansas,California,Colorado,Connecticut,...,Wyoming,U.S. Armed Services posts,U.S. possessions,Unknown,U.S. Armed Services Posts,U.S. Dependencies,Guam,Puerto Rico,U.S. Territories,U.S. Territories1
0,2005,REGION,,,,,,,,,...,,,,,,,,,,
1,2005,Total,1122373.0,4200,1525,18988,2698,232023,11977,15335,...,321,128,5868,9,,,,,,
2,2005,Africa,85102.0,341,47,1229,115,6813,1300,900,...,D,D,25,-,,,,,,
3,2005,Asia,400135.0,1789,735,4418,952,106540,3519,4641,...,90,67,1683,4,,,,,,
4,2005,Europe,176569.0,695,366,2529,267,22806,2572,4094,...,86,26,170,1,,,,,,
5,2005,North America,345575.0,1049,272,10000,1213,84337,3810,2751,...,120,17,3161,2,,,,,,
6,2005,Oceania,6546.0,28,36,96,13,2142,139,71,...,D,D,14,2,,,,,,
7,2005,South America,103143.0,261,65,639,125,7616,622,2844,...,15,9,715,-,,,,,,
8,2005,Unknown,5303.0,37,4,77,13,1769,15,34,...,-,4,100,-,,,,,,
9,2005,COUNTRY,,,,,,,,,...,,,,,,,,,,


In [4]:
# Convert every column except the region/country label to int64.
# The placeholder strings 'D' and '-' and any missing values become 0 first,
# so the integer cast cannot fail on them.
int_columns = immigration_df.drop(columns='Region and country of birth').columns
numeric_part = immigration_df[int_columns].replace({'D': 0, '-': 0})
numeric_part = numeric_part.fillna(0)
immigration_df[int_columns] = numeric_part.astype(np.int64)
immigration_df.head()

Unnamed: 0,Year,Region and country of birth,Total,Alabama,Alaska,Arizona,Arkansas,California,Colorado,Connecticut,...,Wyoming,U.S. Armed Services posts,U.S. possessions,Unknown,U.S. Armed Services Posts,U.S. Dependencies,Guam,Puerto Rico,U.S. Territories,U.S. Territories1
0,2005,REGION,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2005,Total,1122373,4200,1525,18988,2698,232023,11977,15335,...,321,128,5868,9,0,0,0,0,0,0
2,2005,Africa,85102,341,47,1229,115,6813,1300,900,...,0,0,25,0,0,0,0,0,0,0
3,2005,Asia,400135,1789,735,4418,952,106540,3519,4641,...,90,67,1683,4,0,0,0,0,0,0
4,2005,Europe,176569,695,366,2529,267,22806,2572,4094,...,86,26,170,1,0,0,0,0,0,0


In [5]:
# Fill every remaining missing value with 0.
# The numeric columns were already filled during the integer conversion, so
# at this point only 'Region and country of birth' can still contain NaN;
# those labels become the integer 0.
immigration_df = immigration_df.fillna(value=0)

In [6]:
# Remove the columns that are not carried forward.
# NOTE(review): the similarly named 'U.S. Armed Services posts' column
# (lower-case "posts") is not in this list and is kept — confirm that is intended.
columns_to_drop = ['U.S. Armed Services Posts', 'U.S. Territories', 'U.S. Territories1', 'Unknown']
immigration_df = immigration_df.drop(columns=columns_to_drop)

In [7]:
# Give the 'Total' column a more descriptive name: 'Total Permanent Residents'.
immigration_df = immigration_df.rename(columns={'Total': 'Total Permanent Residents'})

In [8]:
# Keep only the first row for each (Year, region/country) pair.
# Each file contains two 'Total' rows, so the repeated one is discarded here.
key_columns = ["Year", "Region and country of birth"]
is_repeat = immigration_df.duplicated(subset=key_columns)
immigration_df = immigration_df[~is_repeat]

In [9]:
# Add a 'Percentage' column directly after 'Total Permanent Residents'.
# It is seeded with the float 0.0 rather than the integer 0: the values stored
# in it are fractional, and writing floats into an int64 column relies on
# silent upcasting, which recent pandas versions deprecate.
percentage_position = immigration_df.columns.get_loc('Total Permanent Residents') + 1

# DataFrame.insert raises ValueError if the column already exists, so skip the
# insert when this cell is re-run on the same DataFrame.
if 'Percentage' not in immigration_df.columns:
    immigration_df.insert(percentage_position, 'Percentage', 0.0)

In [10]:
# Compute, for every row, its share of that year's 'Total' row:
#     Percentage = Total Permanent Residents / yearly total * 100, rounded to 2 dp.

# Rows whose label contains 'Total'; na=False treats non-string labels as
# "no match" instead of propagating NaN into the boolean mask.
region_names = immigration_df['Region and country of birth']
is_total_row = region_names.str.contains('Total', na=False)

# Series mapping Year -> that year's total permanent residents.
yearly_totals = immigration_df.loc[is_total_row].set_index('Year')['Total Permanent Residents']

# Get unique years from the 'Year' column and check that each one has exactly
# one total to divide by; otherwise the percentages would be ambiguous or undefined.
years = immigration_df['Year'].unique()
if not yearly_totals.index.is_unique:
    raise ValueError("More than one 'Total' row found for at least one year")
missing_years = sorted(set(years) - set(yearly_totals.index))
if missing_years:
    raise ValueError(f"No 'Total' row found for years: {missing_years}")

# Vectorised over all years at once. assign() replaces the whole column, so
# 'Percentage' becomes float64 without writing floats into an integer column.
immigration_df = immigration_df.assign(
    Percentage=(
        immigration_df['Total Permanent Residents']
        / immigration_df['Year'].map(yearly_totals)
        * 100
    ).round(2)
)

immigration_df.head(50)

Unnamed: 0,Year,Region and country of birth,Total Permanent Residents,Percentage,Alabama,Alaska,Arizona,Arkansas,California,Colorado,...,Virginia,Washington,West Virginia,Wisconsin,Wyoming,U.S. Armed Services posts,U.S. possessions,U.S. Dependencies,Guam,Puerto Rico
0,2005,REGION,0,0.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2005,Total,1122373,100.0,4200,1525,18988,2698,232023,11977,...,27100,26482,847,7909,321,128,5868,0,0,0
2,2005,Africa,85102,7.58,341,47,1229,115,6813,1300,...,4398,2054,52,575,0,0,25,0,0,0
3,2005,Asia,400135,35.65,1789,735,4418,952,106540,3519,...,12229,9688,454,3020,90,67,1683,0,0,0
4,2005,Europe,176569,15.73,695,366,2529,267,22806,2572,...,3159,9731,158,2099,86,26,170,0,0,0
5,2005,North America,345575,30.79,1049,272,10000,1213,84337,3810,...,4456,4024,127,1733,120,17,3161,0,0,0
6,2005,Oceania,6546,0.58,28,36,96,13,2142,139,...,118,352,5,57,0,0,14,0,0,0
7,2005,South America,103143,9.19,261,65,639,125,7616,622,...,2710,596,51,395,15,9,715,0,0,0
8,2005,Unknown,5303,0.47,37,4,77,13,1769,15,...,30,37,0,30,0,4,100,0,0,0
9,2005,COUNTRY,0,0.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# For each year, print the 20 rows with the highest 'Percentage', after
# leaving out the 'Total' rows and the region-level rows listed below.
excluded_regions = ['Africa', 'Asia', 'Europe', 'North America', 'Oceania', 'South America', 'Unknown']
report_columns = ['Region and country of birth', 'Percentage', 'Total Permanent Residents']

# These row filters do not depend on the year, so build them once up front.
birthplace = immigration_df['Region and country of birth']
keep_row = ~birthplace.str.contains('Total', na=False) & ~birthplace.isin(excluded_regions)

years = immigration_df['Year'].unique()
for year in years:
    # Rows of the current year that survive the exclusions above
    filtered_df = immigration_df[(immigration_df['Year'] == year) & keep_row]

    # The 20 largest percentages, in descending order
    top_20 = filtered_df.nlargest(20, 'Percentage')

    # Show the year followed by the selected columns of its top rows
    print(f"Year: {year}")
    print(top_20[report_columns])
    print()


Year: 2005
    Region and country of birth  Percentage  Total Permanent Residents
139                      Mexico       14.38                     161445
102                       India        7.54                      84681
52     China, People's Republic        6.23                      69967
167                 Philippines        5.41                      60748
61                         Cuba        3.23                      36261
225                     Vietnam        2.92                      32784
68           Dominican Republic        2.45                      27504
115                       Korea        2.37                      26562
53                     Colombia        2.28                      25571
217                     Ukraine        2.03                      22761
46                       Canada        1.95                      21878
71                  El Salvador        1.90                      21359
219              United Kingdom        1.76                      1

In [12]:
# Restrict the data to the selected countries of birth plus the 'Total' row.
countries_to_keep = ["Iran", "Mexico", "China, People's Republic", "Pakistan", "India",
                     "United Kingdom", "Dominican Republic", "Philippines", "Total"]
keep_mask = immigration_df["Region and country of birth"].isin(countries_to_keep)
immigration_df = immigration_df[keep_mask]

immigration_df

Unnamed: 0,Year,Region and country of birth,Total Permanent Residents,Percentage,Alabama,Alaska,Arizona,Arkansas,California,Colorado,...,Virginia,Washington,West Virginia,Wisconsin,Wyoming,U.S. Armed Services posts,U.S. possessions,U.S. Dependencies,Guam,Puerto Rico
1,2005,Total,1122373,100.00,4200,1525,18988,2698,232023,11977,...,27100,26482,847,7909,321,128,5868,0,0,0
52,2005,"China, People's Republic",69967,6.23,328,92,543,202,17668,765,...,1327,1508,101,593,28,13,184,0,0,0
68,2005,Dominican Republic,27504,2.45,5,42,22,0,82,6,...,90,18,6,39,0,0,2346,0,0,0
102,2005,India,84681,7.54,431,15,739,215,14724,516,...,2776,1747,133,876,0,0,24,0,0,0
104,2005,Iran,13887,1.24,48,4,285,9,7059,131,...,562,318,18,48,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3835,2021,Iran,5734,0.77,21,0,86,17,2206,89,...,218,187,9,38,0,0,0,0,0,0
3870,2021,Mexico,107230,14.49,323,33,6859,610,31715,3131,...,586,2539,20,949,69,0,0,0,0,39
3893,2021,Pakistan,9691,1.31,31,0,59,41,1104,52,...,784,166,17,57,3,0,0,0,0,0
3899,2021,Philippines,27511,3.72,152,190,555,129,6478,228,...,648,737,69,236,27,0,0,0,300,9


In [13]:
# Print a summary of the filtered DataFrame: its index range, and for each
# column the name, non-null count and dtype. Useful for confirming that the
# count columns are int64 and 'Percentage' is float64 before exporting.
immigration_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 153 entries, 1 to 3948
Data columns (total 60 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Year                         153 non-null    int64  
 1   Region and country of birth  153 non-null    object 
 2   Total Permanent Residents    153 non-null    int64  
 3   Percentage                   153 non-null    float64
 4   Alabama                      153 non-null    int64  
 5   Alaska                       153 non-null    int64  
 6   Arizona                      153 non-null    int64  
 7   Arkansas                     153 non-null    int64  
 8   California                   153 non-null    int64  
 9   Colorado                     153 non-null    int64  
 10  Connecticut                  153 non-null    int64  
 11  Delaware                     153 non-null    int64  
 12  District of Columbia         153 non-null    int64  
 13  Florida            

In [14]:
# Export the final DataFrame as CSV; index=False leaves the row index out of the file.
# NOTE(review): the file name says 2012_2021, but the workbooks loaded in this
# notebook start at 2005 — confirm the name before other code depends on it.
output_csv = 'Resources/immigration_data_2012_2021.csv'
immigration_df.to_csv(output_csv, index=False)

In [15]:
# Write the data to a SQLite database
# Create a connection to the SQLite database
conn = sqlite3.connect('Resources/immigration_data_sqlite.db')
try:
    # Write the data to a sqlite table, replacing the table if it already
    # exists; the DataFrame index is not stored.
    immigration_df.to_sql('immigration_data_sqlite', conn, if_exists='replace', index=False)

    # Commit any changes
    conn.commit()
finally:
    # Always release the connection, even if the write above fails, so the
    # database file is not left open by the kernel.
    conn.close()

In [16]:
# Read the table back from the SQLite database to check that the data has
# been stored correctly.
conn = sqlite3.connect('Resources/immigration_data_sqlite.db')
try:
    # Load every row and column of the table into a DataFrame
    immigration_df_sqlite = pd.read_sql_query("SELECT * from immigration_data_sqlite", conn)
finally:
    # Close the connection even if the query fails
    conn.close()

# View the DataFrame
immigration_df_sqlite

Unnamed: 0,Year,Region and country of birth,Total Permanent Residents,Percentage,Alabama,Alaska,Arizona,Arkansas,California,Colorado,...,Virginia,Washington,West Virginia,Wisconsin,Wyoming,U.S. Armed Services posts,U.S. possessions,U.S. Dependencies,Guam,Puerto Rico
0,2005,Total,1122373,100.00,4200,1525,18988,2698,232023,11977,...,27100,26482,847,7909,321,128,5868,0,0,0
1,2005,"China, People's Republic",69967,6.23,328,92,543,202,17668,765,...,1327,1508,101,593,28,13,184,0,0,0
2,2005,Dominican Republic,27504,2.45,5,42,22,0,82,6,...,90,18,6,39,0,0,2346,0,0,0
3,2005,India,84681,7.54,431,15,739,215,14724,516,...,2776,1747,133,876,0,0,24,0,0,0
4,2005,Iran,13887,1.24,48,4,285,9,7059,131,...,562,318,18,48,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148,2021,Iran,5734,0.77,21,0,86,17,2206,89,...,218,187,9,38,0,0,0,0,0,0
149,2021,Mexico,107230,14.49,323,33,6859,610,31715,3131,...,586,2539,20,949,69,0,0,0,0,39
150,2021,Pakistan,9691,1.31,31,0,59,41,1104,52,...,784,166,17,57,3,0,0,0,0,0
151,2021,Philippines,27511,3.72,152,190,555,129,6478,228,...,648,737,69,236,27,0,0,0,300,9
