In [1]:
import pandas as pd
import numpy as np
import os
import re

In [2]:
# Peek at one workbook to see the column layout before loading every year
sample_file = '2012_immsuptable1d.xls'
sample = pd.read_excel(sample_file)
sample.head()

Unnamed: 0,Region and country of birth,Total,Alabama,Alaska,Arizona,Arkansas,California,Colorado,Connecticut,Delaware,...,Texas,U.S. Armed Services Posts,U.S. Territories,Utah,Vermont,Virginia,Washington,West Virginia,Wisconsin,Wyoming
0,REGION,,,,,,,,,,...,,,,,,,,,,
1,Total,1031631.0,3873.0,1612.0,18434.0,2795.0,196622.0,13327.0,12237.0,2208.0,...,95557.0,108.0,1559.0,5932.0,877.0,28227.0,23060.0,779.0,6049.0,427.0
2,Africa,107241.0,519.0,118.0,1517.0,129.0,7150.0,2471.0,1093.0,497.0,...,9513.0,8.0,8.0,514.0,142.0,5429.0,3160.0,124.0,768.0,34.0
3,Asia,429599.0,1784.0,910.0,6225.0,1077.0,107825.0,5003.0,3941.0,857.0,...,32341.0,67.0,694.0,2090.0,476.0,13748.0,12347.0,382.0,2773.0,161.0
4,Europe,81671.0,330.0,186.0,967.0,180.0,11121.0,1086.0,1952.0,156.0,...,3834.0,14.0,24.0,440.0,126.0,1945.0,2853.0,93.0,660.0,69.0


In [3]:
# File names (one workbook per fiscal year; the first four characters are the year)
files = ['2012_immsuptable1d.xls', '2013_immsuptable1d.xls', '2014_immsuptable1d.xls',
         '2015_immsuptable1d.xls', '2016_immsuptable1d.xls', '2017_immsuptable1d.xlsx',
         '2018_immsuptable1d.xlsx', '2019_immsuptable1d.xlsx', '2020_immsuptable1d.xlsx',
         '2021_immsuptable1d.xlsx']

# Read every workbook into its own DataFrame and collect them in a list.
# Concatenating once at the end avoids re-copying the accumulated rows on each
# iteration, and avoids seeding the result with an empty frame (which would
# make 'Year' an object column and trigger pandas' empty-frame concat warning).
yearly_frames = []
for file_name in files:
    df = pd.read_excel(file_name)

    # Extract the year from the file name
    year = int(file_name[:4])

    # Add the year as the first column so rows stay attributable after stacking
    df.insert(0, 'Year', year)

    yearly_frames.append(df)

# Stack all years; columns missing from a given year are filled with NaN
immigration_df = pd.concat(yearly_frames, ignore_index=True)

# Display the first 50 rows of the DataFrame
immigration_df.head(50)

Unnamed: 0,Year,Region and country of birth,Total,Alabama,Alaska,Arizona,Arkansas,California,Colorado,Connecticut,...,U.S. Territories,Utah,Vermont,Virginia,Washington,West Virginia,Wisconsin,Wyoming,U.S. Territories1,Unknown
0,2012,REGION,,,,,,,,,...,,,,,,,,,,
1,2012,Total,1031631,3873,1612,18434,2795,196622,13327,12237,...,1559,5932,877,28227,23060,779,6049,427,,
2,2012,Africa,107241,519,118,1517,129,7150,2471,1093,...,8,514,142,5429,3160,124,768,34,,
3,2012,Asia,429599,1784,910,6225,1077,107825,5003,3941,...,694,2090,476,13748,12347,382,2773,161,,
4,2012,Europe,81671,330,186,967,180,11121,1086,1952,...,24,440,126,1945,2853,93,660,69,,
5,2012,North America,327771,995,272,9166,1315,63037,4140,3180,...,794,1821,83,4271,3823,104,1500,137,,
6,2012,Oceania,4742,7,37,83,9,1732,128,46,...,7,147,6,78,266,8,27,4,,
7,2012,South America,79401,232,88,460,79,5620,488,2016,...,25,917,43,2735,596,68,300,22,,
8,2012,Unknown,1206,6,1,16,6,137,11,9,...,7,3,1,21,15,-,21,-,,
9,2012,COUNTRY,,,,,,,,,...,,,,,,,,,,


In [4]:
# Every column except the birthplace label is treated as a count column:
# map the 'D' and '-' placeholders to 0, fill missing cells with 0, then cast to int64
int_columns = immigration_df.columns.drop('Region and country of birth')
counts = immigration_df[int_columns].replace({'D': 0, '-': 0})
counts = counts.fillna(0)
immigration_df[int_columns] = counts.astype(np.int64)
immigration_df.head()

Unnamed: 0,Year,Region and country of birth,Total,Alabama,Alaska,Arizona,Arkansas,California,Colorado,Connecticut,...,U.S. Territories,Utah,Vermont,Virginia,Washington,West Virginia,Wisconsin,Wyoming,U.S. Territories1,Unknown
0,2012,REGION,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2012,Total,1031631,3873,1612,18434,2795,196622,13327,12237,...,1559,5932,877,28227,23060,779,6049,427,0,0
2,2012,Africa,107241,519,118,1517,129,7150,2471,1093,...,8,514,142,5429,3160,124,768,34,0,0
3,2012,Asia,429599,1784,910,6225,1077,107825,5003,3941,...,694,2090,476,13748,12347,382,2773,161,0,0
4,2012,Europe,81671,330,186,967,180,11121,1086,1952,...,24,440,126,1945,2853,93,660,69,0,0


In [5]:
# Replace any remaining NaN with 0 (the count columns were already filled in the
# previous cell, so only 'Region and country of birth' can still hold NaN here)
immigration_df = immigration_df.fillna(value=0)

In [6]:
# Remove the columns that are not needed downstream
unused_columns = ['U.S. Armed Services Posts', 'U.S. Territories', 'U.S. Territories1', 'Unknown']
immigration_df = immigration_df.drop(columns=unused_columns)

In [7]:
# Countries of birth to keep; all other rows are discarded
countries_to_keep = [
    "Iran", "Mexico", "China, People's Republic", "Pakistan", "India", "United Kingdom",
    "Dominican Republic", "Philippines",
]
is_selected = immigration_df["Region and country of birth"].isin(countries_to_keep)
immigration_df = immigration_df[is_selected]
immigration_df.head(50)

Unnamed: 0,Year,Region and country of birth,Total,Alabama,Alaska,Arizona,Arkansas,California,Colorado,Connecticut,...,South Dakota,Tennessee,Texas,Utah,Vermont,Virginia,Washington,West Virginia,Wisconsin,Wyoming
52,2012,"China, People's Republic",81784,299,49,676,169,22424,637,609,...,45,539,3203,273,69,1385,2017,64,467,27
67,2012,Dominican Republic,41566,17,91,33,5,171,33,694,...,0,39,220,23,5,157,32,11,51,3
98,2012,India,66434,330,18,978,320,13951,483,1143,...,22,573,5844,159,36,2473,2180,66,632,15
100,2012,Iran,12916,26,4,223,13,6591,105,62,...,6,94,1160,75,0,567,361,16,23,4
135,2012,Mexico,146406,561,75,8075,961,49595,3316,255,...,54,1031,37852,1334,7,792,2408,27,1142,100
157,2012,Pakistan,14740,45,4,91,30,1731,67,270,...,4,68,1877,73,7,989,208,28,93,9
163,2012,Philippines,57327,145,600,849,164,22484,350,327,...,55,346,2778,135,28,1315,1804,68,216,43
214,2012,United Kingdom,12014,39,13,170,43,2090,179,282,...,19,121,1070,126,22,283,381,22,80,10
284,2013,"China, People's Republic",71798,312,64,511,125,20134,499,618,...,42,479,3281,231,42,1348,1842,64,389,0
299,2013,Dominican Republic,41311,23,45,51,5,129,12,624,...,3,38,316,27,6,205,41,9,40,0


In [8]:
# Summarize the filtered frame: per-column dtype and non-null count
immigration_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 80 entries, 52 to 2324
Data columns (total 56 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   Year                         80 non-null     int64 
 1   Region and country of birth  80 non-null     object
 2   Total                        80 non-null     int64 
 3   Alabama                      80 non-null     int64 
 4   Alaska                       80 non-null     int64 
 5   Arizona                      80 non-null     int64 
 6   Arkansas                     80 non-null     int64 
 7   California                   80 non-null     int64 
 8   Colorado                     80 non-null     int64 
 9   Connecticut                  80 non-null     int64 
 10  Delaware                     80 non-null     int64 
 11  District of Columbia         80 non-null     int64 
 12  Florida                      80 non-null     int64 
 13  Georgia                      80 no

In [9]:
# Write the cleaned, filtered frame to a new CSV file (row index omitted)
output_csv = 'immigration_data_2012_2021.csv'
immigration_df.to_csv(output_csv, index=False)