In [1]:
# Import the modules
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
import sqlite3
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Connect to SQLite database
# Create a connection to the SQLite database to load the data and check if the data has been stored correctly
conn = sqlite3.connect('Resources/immigration_selected_2005_2021_sqlite.sqlite')

# Write a SQL query to load the data from the table in the SQLite database
df = pd.read_sql_query("SELECT * from immigration_selected_2005_2021_sqlite", conn)
df.head()

Unnamed: 0,Year,Region and country of birth,Total Permanent Residents,Percentage,Alabama,Alaska,Arizona,Arkansas,California,Colorado,...,South Dakota,Tennessee,Texas,Utah,Vermont,Virginia,Washington,West Virginia,Wisconsin,Wyoming
0,2005,Total,1122373,100.0,4200,1525,18988,2698,232023,11977,...,881,8962,95958,5082,1042,27100,26482,847,7909,321
1,2005,"China, People's Republic",69967,6.23,328,92,543,202,17668,765,...,51,637,4139,217,50,1327,1508,101,593,28
2,2005,Dominican Republic,27504,2.45,5,42,22,0,82,6,...,0,22,119,11,0,90,18,6,39,0
3,2005,India,84681,7.54,431,15,739,215,14724,516,...,23,900,7139,147,74,2776,1747,133,876,0
4,2005,Iran,13887,1.24,48,4,285,9,7059,131,...,0,150,1002,135,4,562,318,18,48,0


In [3]:
# Drop the "Percentage" column
df = df.drop('Percentage', axis=1)

In [4]:
# Add a new 'Covid' column
df['Covid'] = df['Year'].apply(lambda x: 1 if x in [2020, 2021] else 0)

In [5]:
df

Unnamed: 0,Year,Region and country of birth,Total Permanent Residents,Alabama,Alaska,Arizona,Arkansas,California,Colorado,Connecticut,...,Tennessee,Texas,Utah,Vermont,Virginia,Washington,West Virginia,Wisconsin,Wyoming,Covid
0,2005,Total,1122373,4200,1525,18988,2698,232023,11977,15335,...,8962,95958,5082,1042,27100,26482,847,7909,321,0
1,2005,"China, People's Republic",69967,328,92,543,202,17668,765,894,...,637,4139,217,50,1327,1508,101,593,28,0
2,2005,Dominican Republic,27504,5,42,22,0,82,6,319,...,22,119,11,0,90,18,6,39,0,0
3,2005,India,84681,431,15,739,215,14724,516,1571,...,900,7139,147,74,2776,1747,133,876,0,0
4,2005,Iran,13887,48,4,285,9,7059,131,88,...,150,1002,135,4,562,318,18,48,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148,2021,Iran,5734,21,0,86,17,2206,89,39,...,62,473,34,0,218,187,9,38,0,1
149,2021,Mexico,107230,323,33,6859,610,31715,3131,247,...,809,25282,1374,6,586,2539,20,949,69,1
150,2021,Pakistan,9691,31,0,59,41,1104,52,111,...,53,1464,25,0,784,166,17,57,3,1
151,2021,Philippines,27511,152,190,555,129,6478,228,178,...,759,1680,154,26,648,737,69,236,27,1


In [6]:
# List of unique countries
countries = df['Region and country of birth'].unique()
countries

array(['Total', "China, People's Republic", 'Dominican Republic', 'India',
       'Iran', 'Mexico', 'Pakistan', 'Philippines', 'United Kingdom'],
      dtype=object)

In [7]:
# Drop 'Total' from the unique countries
countries = np.delete(countries, np.where(countries == 'Total'))
countries

array(["China, People's Republic", 'Dominican Republic', 'India', 'Iran',
       'Mexico', 'Pakistan', 'Philippines', 'United Kingdom'],
      dtype=object)

In [8]:
# DataFrame to store predictions
df_predictions = pd.DataFrame()

In [9]:
# List of all states
states = df.columns[3:-1]

In [10]:
# Collect all predictions and true values for final R-squared calculation
all_preds = []
all_true = []

In [11]:
# Loop through all countries
for country in countries:
    # Prepare the data for the specific country
    df_country = df[df['Region and country of birth'] == country]

    # Loop through all states
    for state in states:
        # Prepare the state-specific data
        X = df_country[['Year', 'Covid']]  # Features
        y = df_country[state].values  # Target for the specific state

        # Split the data into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        # Create and train the model
        model = LinearRegression()
        model.fit(X_train, y_train)

        # Calculate predicted values
        y_pred = model.predict(X_test)

        # Append predicted and true values to the respective lists
        all_preds.extend(y_pred)
        all_true.extend(y_test)

        # Create a list of next 5 years
        next_5_years = pd.DataFrame({
            'Year': np.array(range(max(df_country['Year']) + 1, max(df_country['Year']) + 6)),
            'Covid': [1, 0, 0, 0, 0]  
        })

        # Predict the immigration for the next 5 years and store in the predictions DataFrame
        predictions = model.predict(next_5_years)
        for i, year in enumerate(next_5_years['Year'].values):
            df_predictions = df_predictions.append({
                'Year': year,
                'Country': country,
                'State': state,
                'Predicted Immigration': predictions[i]
            }, ignore_index=True)

  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predic

  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predic

  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predic

  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predic

  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predic

  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predic

  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predic

  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predic

  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predic

  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predic

  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predic

  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predic

  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predic

  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predic

  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predic

  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predic

  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predic

  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predic

  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predic

  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predictions = df_predictions.append({
  df_predic

In [12]:
# Calculate overall R2 score
overall_r2 = r2_score(all_true, all_preds)
print(f'Overall R2 score: {overall_r2}')

Overall R2 score: 0.9718216096185182


In [13]:
# Pivot the DataFrame to have each state in a separate column
df_pivot = df_predictions.pivot_table(index=['Year', 'Country'], columns='State', values='Predicted Immigration')

In [14]:
# Reset the index
df_pivot.reset_index(inplace=True)

In [15]:
# Save the predictions DataFrame to a CSV file
df_pivot.to_csv('immigration_predictions.csv', index=False)