In [1]:

# Import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn as sk
import seaborn as sns
import pycountry
import re
import ipywidgets as widgets
import cufflinks as cf
import geopandas as gpd

from ipywidgets import interact, interactive, fixed, interact_manual
from mpl_toolkits.axes_grid1 import make_axes_locatable

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix
from sklearn.datasets import load_digits
from sklearn.decomposition import PCA


# Global environment settings so that wide frames and plots are easy to view
pd.options.display.max_columns = None  # never truncate the column list when a frame is displayed
plt.rcParams['figure.figsize'] = [15, 9]  # default figure size (width, height)

In [2]:
! pip install geopandas




In [3]:
# Load the population CSV, turn the '..' placeholder into NaN and drop the unneeded first column
pop_data = (
    pd.read_csv('./Datasets/Population Data 1960-2050.csv', low_memory=False)
    .replace('..', np.nan)
    .iloc[:, 1:]  # everything except the first column
)
pop_data.head(10)

FileNotFoundError: [Errno 2] No such file or directory: './Datasets/Population Data 1960-2050.csv'

In [None]:
# Types of columns in the dataset
pop_data.dtypes

# All columns are of dtype object, so before doing any computation we need to convert the data to float,
# since most of the values are floating-point numbers.

In [None]:
# This section focuses on net migration, so keep only the rows
# whose Series_Name is "Net migration"
netMigration = pop_data.loc[pop_data['Series_Name'] == "Net migration"]

In [None]:
# Keep only the first 70 columns (positional slice); later cells index this frame by position.
# NOTE(review): 70 is a magic number - presumably it cuts the frame off at a particular year
# column; confirm against the dataset's column layout.
netMigration = netMigration.iloc[:, :70]

In [None]:
netMigration

In [None]:
# Split the net-migration rows into one frame per continent


Afr, Asi, Eur, Nor, Oce, Sou = (
    netMigration.loc[netMigration['Continent_Name'] == continent_name]
    for continent_name in ('Africa', 'Asia', 'Europe', 'North America', 'Oceania', 'South America')
)

In [None]:
Afr

In [None]:
def DataCleaning(data):
    """Reduce a continent's net-migration frame to complete, numeric rows.

    The frame is indexed purely by position: the column at position 3 is kept
    as the label column (presumably the country name - confirm against the
    dataset), and every 5th column starting at position 9 is kept as a
    migration value and converted to float64 so it can be summed or averaged.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with at least 10 columns laid out as described above.

    Returns
    -------
    pandas.DataFrame
        The label column followed by the selected migration columns, with
        every row containing a missing value removed and a contiguous
        0..n-1 index.

    Raises
    ------
    ValueError
        If a selected migration column holds a value that cannot be
        converted to float.
    """
    countries = data.iloc[:, [3]]
    migration = data.iloc[:, 9::5].astype('float64')
    # Drop incomplete rows first and renumber afterwards; resetting the index
    # before dropna would leave gaps in the returned frame's index.
    country_migration = (
        countries.join(migration)
        .dropna(axis=0, how='any')
        .reset_index(drop=True)
    )
    return country_migration

- The issue with the net migration data is that it is recorded only every 5th year, i.e., in 1962, then in 1967, and so on. So I have cleaned the data by keeping only the values for every 5th year.
- I have also converted the net migration columns to the float64 dtype so that the sum and average of the data can be calculated.

In [None]:

# Clean every continent frame and renumber its rows from 0.
# Note: the names are rebound to the cleaned frames, so re-running this cell
# would apply the positional cleaning to already-cleaned data.
Afr, Asi, Eur, Nor, Oce, Sou = (
    DataCleaning(continent_frame).reset_index(drop=True)
    for continent_frame in (Afr, Asi, Eur, Nor, Oce, Sou)
)

In [None]:
def SumByContinentByYear(data):
    """Collapse a frame into a single row holding the sum of each column.

    The resulting one-row frame is labelled with the last index value of
    the input frame.
    """
    column_totals = data.sum().to_dict()
    last_label = data.index.values[-1]
    return pd.DataFrame(column_totals, index=[last_label])

In [None]:
# Labels later assigned as the column names of the per-continent totals.
# The order must match the order in which the continent frames are combined
# below (Afr, Asi, Eur, Nor, Oce, Sou), because the labels are applied by position.
continents = ['Africa', 'Asia', 'Europe', 'North America', 'Oceania', 'South America']

In [None]:
# Build one row of per-year totals for each continent and stack them into a single frame.
# DataFrame.append was removed in pandas 2.0, so the rows are combined with one pd.concat call.
# Row order (Afr, Asi, Eur, Nor, Oce, Sou) matters: the rows are labelled by position later on.
ContinentalGrowthAvgByYear = pd.concat(
    [SumByContinentByYear(continent_frame) for continent_frame in (Afr, Asi, Eur, Nor, Oce, Sou)],
    ignore_index=True,
)


In [None]:
ContinentalGrowthAvgByYear

In [None]:
# The summed Country_Name column is not a meaningful total, so remove it
ContinentalGrowthAvgByYear = ContinentalGrowthAvgByYear.drop('Country_Name', axis=1)

In [None]:

# Flip rows and columns so that each continent becomes a column.
# Note: re-running this cell on its own flips the frame back again.
ContinentalGrowthAvgByYear = ContinentalGrowthAvgByYear.T


In [None]:
# Name the columns after the continents; labels are applied by position, so this
# relies on `continents` being in the same order as the rows were combined.
ContinentalGrowthAvgByYear.columns = continents

In [None]:
ContinentalGrowthAvgByYear

- In the section above I have calculated the sum of net migration for each year for each continent and created a dataframe from the results.
- Below I have plotted a graph showing the trend in net migration over the years.

In [None]:
# Draw one line per continent and save the resulting figure to disk
migration_axes = ContinentalGrowthAvgByYear.plot(
    kind='line',
    xlabel='Year',
    ylabel='Migration',
    title='Migration by Year Per Continent',
)
migration_axes.figure.savefig('./Images/MigrationTrendsByContinent.jpg', bbox_inches='tight')

- This graph gives us a clear view of the net migration.
- From the above graph we can see that from 1962 to 2022 there has been a trend of heavy migration towards Western countries from the Asian and African continents, with a smaller amount of migration from South America.

In [None]:
## Here I am collecting the net migration for all the countries combined so as to train a global model.


# Column at position 0 is the label; every 5th column from position 9 holds the migration values
Continents = netMigration.iloc[:, [0]]
Migration = netMigration.iloc[:, 9::5]
continent_migration = (
    Continents.join(Migration.astype('float64'))
    .reset_index(drop=True)
    .dropna(axis=0, how='any')
)
# Renumber the rows once more so the index is contiguous after incomplete rows were dropped
Global = continent_migration.reset_index(drop=True)

In [None]:
Global

In [None]:
Global.iloc[:,1:13]

In [None]:
Global.iloc[:,-1]

In [None]:
# Split dataset into training set and test set.
# Features are every yearly column except the last one; the target is the last column.
# The previous slice `1:12` silently left out the column just before the target, which
# disagreed with the `1:13` preview shown above; `1:-1` keeps all remaining yearly columns.
# (train_test_split is already imported in the first cell.)
X_train, X_test, y_train, y_test = train_test_split(
    Global.iloc[:, 1:-1], Global.iloc[:, -1], test_size=0.4, random_state=109
)

In [None]:
## Fit a linear regression model on the training split and predict the test split

from sklearn.linear_model import LinearRegression

reg = LinearRegression()
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)

In [None]:
y_pred

In [None]:
## Mean squared error of the predictions on the test split

from sklearn.metrics import mean_squared_error

mean_squared_error(y_true=y_test, y_pred=y_pred)

- Here I separated out the net migration data and tried to build a regression model on it.
- With this model we tried to predict the migration rate for the year 2021 using the data of the previous years.
- However, this dataframe has just 196 data points and 12 columns to train on.
- The dimensionality of the data is too high and the number of data points is too low to obtain a good mean squared error.
- The model was trained using a simple linear regression, but it gave a very bad result.
- Initially we assumed that all the columns must be related to each other and that there must be a trend in the net migration over the years. But after building the model it was clear that our assumption was wrong, as there is not much relation between the net migration values in the different year columns.
- The net migration rate depends on more complicated factors such as development, climate, education, etc., and cannot be modeled using a simple regression model on past migration values alone.