In [1]:
# Import the modules
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

In [2]:
# Load the data
df = pd.read_csv('Resources/immigration_data_2012_2021.csv')
df

Unnamed: 0,Year,Region and country of birth,Total Permanent Residents,Percentage,Alabama,Alaska,Arizona,Arkansas,California,Colorado,...,South Dakota,Tennessee,Texas,Utah,Vermont,Virginia,Washington,West Virginia,Wisconsin,Wyoming
0,2012,Total,1031631,100.00,3873,1612,18434,2795,196622,13327,...,1521,8573,95557,5932,877,28227,23060,779,6049,427
1,2012,"China, People's Republic",81784,7.93,299,49,676,169,22424,637,...,45,539,3203,273,69,1385,2017,64,467,27
2,2012,Dominican Republic,41566,4.03,17,91,33,5,171,33,...,0,39,220,23,5,157,32,11,51,3
3,2012,India,66434,6.44,330,18,978,320,13951,483,...,22,573,5844,159,36,2473,2180,66,632,15
4,2012,Iran,12916,1.25,26,4,223,13,6591,105,...,6,94,1160,75,0,567,361,16,23,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85,2021,Iran,5734,0.77,21,0,86,17,2206,89,...,7,62,473,34,0,218,187,9,38,0
86,2021,Mexico,107230,14.49,323,33,6859,610,31715,3131,...,66,809,25282,1374,6,586,2539,20,949,69
87,2021,Pakistan,9691,1.31,31,0,59,41,1104,52,...,0,53,1464,25,0,784,166,17,57,3
88,2021,Philippines,27511,3.72,152,190,555,129,6478,228,...,35,759,1680,154,26,648,737,69,236,27


In [3]:
df = df[df['Region and country of birth'] != 'Total']
df = df.drop(columns="Percentage")
df

Unnamed: 0,Year,Region and country of birth,Total Permanent Residents,Alabama,Alaska,Arizona,Arkansas,California,Colorado,Connecticut,...,South Dakota,Tennessee,Texas,Utah,Vermont,Virginia,Washington,West Virginia,Wisconsin,Wyoming
1,2012,"China, People's Republic",81784,299,49,676,169,22424,637,609,...,45,539,3203,273,69,1385,2017,64,467,27
2,2012,Dominican Republic,41566,17,91,33,5,171,33,694,...,0,39,220,23,5,157,32,11,51,3
3,2012,India,66434,330,18,978,320,13951,483,1143,...,22,573,5844,159,36,2473,2180,66,632,15
4,2012,Iran,12916,26,4,223,13,6591,105,62,...,6,94,1160,75,0,567,361,16,23,4
5,2012,Mexico,146406,561,75,8075,961,49595,3316,255,...,54,1031,37852,1334,7,792,2408,27,1142,100
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85,2021,Iran,5734,21,0,86,17,2206,89,39,...,7,62,473,34,0,218,187,9,38,0
86,2021,Mexico,107230,323,33,6859,610,31715,3131,247,...,66,809,25282,1374,6,586,2539,20,949,69
87,2021,Pakistan,9691,31,0,59,41,1104,52,111,...,0,53,1464,25,0,784,166,17,57,3
88,2021,Philippines,27511,152,190,555,129,6478,228,178,...,35,759,1680,154,26,648,737,69,236,27


In [4]:
countries = df['Region and country of birth'].unique()
states = df.columns[4:]

In [8]:
# Dictionary to store the predictions
predictions_dict = {'Country': [], 'State': [], 'Predicted_Immigrants_2025': []}

# Loop through countries and states
for country in countries:
    # First, predict 'Total Permanent Residents' based on 'Year'
    X_total = df[df['Region and country of birth'] == country][['Year']]
    y_total = df[df['Region and country of birth'] == country]['Total Permanent Residents']
    
    model_total = LinearRegression()
    model_total.fit(X_total, y_total)
    predicted_total_2025 = model_total.predict([[2025]])
    
    for state in states:
        # Get the data for the current country and state
        X = df[df['Region and country of birth'] == country][['Total Permanent Residents', 'Year']]
        y = df[df['Region and country of birth'] == country][state]
        
        # Create and train the linear regression model
        model = LinearRegression()
        model.fit(X, y)
        
        # Predict the number of immigrants for the state in 2025
        predicted_immigrants_2025 = model.predict([[predicted_total_2025[0], 2025]])
        
        # Store the predictions in the dictionary
        predictions_dict['Country'].append(country)
        predictions_dict['State'].append(state)
        predictions_dict['Predicted_Immigrants_2025'].append(predicted_immigrants_2025[0])
        
# Create a new DataFrame to store the predictions
predictions_df = pd.DataFrame(predictions_dict)

# Print the DataFrame with the predictions
predictions_df










Unnamed: 0,Country,State,Predicted_Immigrants_2025
0,"China, People's Republic",Alaska,5.830303
1,"China, People's Republic",Arizona,314.515152
2,"China, People's Republic",Arkansas,30.793939
3,"China, People's Republic",California,16309.824242
4,"China, People's Republic",Colorado,329.884848
...,...,...,...
411,United Kingdom,Virginia,179.163636
412,United Kingdom,Washington,319.042424
413,United Kingdom,West Virginia,8.769697
414,United Kingdom,Wisconsin,62.800000


In [6]:
predictions_df

Unnamed: 0,Country,State,Predicted_Immigrants_2025
0,"China, People's Republic",Alaska,5.830303
1,"China, People's Republic",Arizona,314.515152
2,"China, People's Republic",Arkansas,30.793939
3,"China, People's Republic",California,16309.824242
4,"China, People's Republic",Colorado,329.884848
...,...,...,...
411,United Kingdom,Virginia,179.163636
412,United Kingdom,Washington,319.042424
413,United Kingdom,West Virginia,8.769697
414,United Kingdom,Wisconsin,62.800000
