In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.metrics import r2_score
from sklearn import linear_model
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelEncoder

# Import NumPy, Pandas, Matplotlib, Seaborn libraries, Load the following libraries

In [2]:
import warnings

warnings.filterwarnings('ignore')
# Suppress the warnings

Link to the dataset: Only 2023-2022-2021-2020 was selected

https://data.cso.ie/table/PEA11



In [3]:
missing_values = ['/',' ','-','--','na','Na','n/a','NA','n.a.','?','nan','Nan','nul','Nul','null','Null']

df= pd.read_csv('PopulationEstimatesDatabase.csv',na_values=missing_values)

# Create a list with all the possible missing value formats. Create a name for the dataframe as report ('PopulationEstimatesDatabase') and apply the
#pd.read_csv function in order to read the csv file containing the report
# Include the variable 'na_values' and define it with the 'missing values' list previously created

In [4]:
df.isnull().sum()

# Check missing values by applying the .isnull().sum method to the dataframe

STATISTIC Label       0
Year                  0
Single Year of Age    0
Sex                   0
UNIT                  0
VALUE                 0
dtype: int64

In [5]:
df.info()

# Check the information of the dataframe by applying the .info() function

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1212 entries, 0 to 1211
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   STATISTIC Label     1212 non-null   object
 1   Year                1212 non-null   int64 
 2   Single Year of Age  1212 non-null   object
 3   Sex                 1212 non-null   object
 4   UNIT                1212 non-null   object
 5   VALUE               1212 non-null   int64 
dtypes: int64(2), object(4)
memory usage: 56.9+ KB


In [6]:
df.head(5)

# Print the first 5 rows of the dataframe by applying the.head() function

Unnamed: 0,STATISTIC Label,Year,Single Year of Age,Sex,UNIT,VALUE
0,Population estimates from 1926,2020,All ages,Both sexes,Number,5029875
1,Population estimates from 1926,2020,All ages,Male,Number,2491778
2,Population estimates from 1926,2020,All ages,Female,Number,2538097
3,Population estimates from 1926,2020,Under 1 year,Both sexes,Number,58167
4,Population estimates from 1926,2020,Under 1 year,Male,Number,29845


In [7]:
df.tail(5)

# Print the last 5 rows of the dataframe by applying the .tail() function

Unnamed: 0,STATISTIC Label,Year,Single Year of Age,Sex,UNIT,VALUE
1207,Population estimates from 1926,2023,98 years,Male,Number,197
1208,Population estimates from 1926,2023,98 years,Female,Number,632
1209,Population estimates from 1926,2023,99 years and over,Both sexes,Number,1600
1210,Population estimates from 1926,2023,99 years and over,Male,Number,352
1211,Population estimates from 1926,2023,99 years and over,Female,Number,1248


In [8]:
df.shape

# Check the number of rows and columns of the new_house dataframe by applying the .shape attribute

(1212, 6)

In [9]:
df.dtypes

# Check the data types of the dataframe by applying the .dtypes attribute to the dataframe

STATISTIC Label       object
Year                   int64
Single Year of Age    object
Sex                   object
UNIT                  object
VALUE                  int64
dtype: object

In [10]:
df.columns

# Check the column names of the dataframe by applying the .columns attribute

Index(['STATISTIC Label', 'Year', 'Single Year of Age', 'Sex', 'UNIT',
       'VALUE'],
      dtype='object')

In [11]:
df.drop(columns=["STATISTIC Label", "UNIT"], inplace=True)

# Apply .drop() to remove the spesific column as it contains the same index values

In [12]:
df.head(5)

Unnamed: 0,Year,Single Year of Age,Sex,VALUE
0,2020,All ages,Both sexes,5029875
1,2020,All ages,Male,2491778
2,2020,All ages,Female,2538097
3,2020,Under 1 year,Both sexes,58167
4,2020,Under 1 year,Male,29845


In [13]:
df = df.rename(columns={'Single Year of Age': 'Single_Year_of_Age','VALUE': 'Value'})

# To correct only the "VALUE" spelling in columns-Rename the columns of the dataframe by applying the .rename() method,
# spaces between words in column names have been replaced by "_" in order to Other columns are suitable.

In [14]:
df.head(5)

Unnamed: 0,Year,Single_Year_of_Age,Sex,Value
0,2020,All ages,Both sexes,5029875
1,2020,All ages,Male,2491778
2,2020,All ages,Female,2538097
3,2020,Under 1 year,Both sexes,58167
4,2020,Under 1 year,Male,29845


In [15]:
df.describe(include = "object")

Unnamed: 0,Single_Year_of_Age,Sex
count,1212,1212
unique,101,3
top,All ages,Both sexes
freq,12,404


In [16]:
l_encoder = LabelEncoder()

In [17]:
df["Sex_Encoded"] = l_encoder.fit_transform(df["Sex"])

In [18]:
df["Single_Year_of_Age_Encoded"] = l_encoder.fit_transform(df["Single_Year_of_Age"])

In [19]:
df.head(100)

Unnamed: 0,Year,Single_Year_of_Age,Sex,Value,Sex_Encoded,Single_Year_of_Age_Encoded
0,2020,All ages,Both sexes,5029875,0,99
1,2020,All ages,Male,2491778,2,99
2,2020,All ages,Female,2538097,1,99
3,2020,Under 1 year,Both sexes,58167,0,100
4,2020,Under 1 year,Male,29845,2,100
...,...,...,...,...,...,...
95,2020,30 years,Female,31397,1,23
96,2020,31 years,Both sexes,65626,0,24
97,2020,31 years,Male,32265,2,24
98,2020,31 years,Female,33361,1,24


In [20]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Year,1212.0,2021.5,1.118496,2020.0,2020.75,2021.5,2022.25,2023.0
Value,1212.0,67888.188119,357883.363991,102.0,20923.5,32384.0,43172.25,5281612.0
Sex_Encoded,1212.0,1.0,0.816834,0.0,0.0,1.0,2.0,2.0
Single_Year_of_Age_Encoded,1212.0,50.0,29.166794,0.0,25.0,50.0,75.0,100.0


In [21]:
df.drop(columns=["Single_Year_of_Age", "Sex"], inplace=True)

In [22]:
df.head(10)

Unnamed: 0,Year,Value,Sex_Encoded,Single_Year_of_Age_Encoded
0,2020,5029875,0,99
1,2020,2491778,2,99
2,2020,2538097,1,99
3,2020,58167,0,100
4,2020,29845,2,100
5,2020,28322,1,100
6,2020,59948,0,0
7,2020,30775,2,0
8,2020,29173,1,0
9,2020,62394,0,11


In [23]:
df.shape

(1212, 4)

In [24]:
len(df)

1212

In [25]:
train_validation = df.sample(frac = 0.8, random_state = 200)
test = df.drop(train_validation.index)
len(test)

242

We allocated 80% of the data as the training and validation set, and the remaining 20% as the test set. I stated what percentage I would divide the amount of “.example” by the accumulation of “frak”. The “random_state” command, like most commands, enables the division of progress by setting the “seed” part. The “.drop” cut in the second line throws the remaining data from the original data, except the parts we reserved for train and verification, to the test set.

In [26]:
len(train_validation)

970

In [27]:
len(test)

242

In the last 2 lines, we look at the line numbers in order to verify the operation. The number of rows of the train and validation set almost should be equal to 4 times the number of rows of the test set.

In [28]:
from sklearn.linear_model import LinearRegression
y = train_validation["Value"]
x = train_validation.drop("Value", axis = 1)
lr = LinearRegression()
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
sfs = SFS(estimator = lr,
k_features = (1, 3),forward = True,floating = False,cv = 0)
sfs_fit = sfs.fit(X = x.values, y = y.values)
sfs_fit.subsets_

{1: {'feature_idx': (2,),
  'cv_scores': array([0.01508134]),
  'avg_score': 0.015081342429616673,
  'feature_names': ('2',)},
 2: {'feature_idx': (1, 2),
  'cv_scores': array([0.01781473]),
  'avg_score': 0.01781472940784201,
  'feature_names': ('1', '2')},
 3: {'feature_idx': (0, 1, 2),
  'cv_scores': array([0.01815075]),
  'avg_score': 0.018150747642184606,
  'feature_names': ('0', '1', '2')}}

First of all, I import LinearRegression from the sklearn.linear_model module with the "import" command. The commands we will use are in this library. Then, I introduce to the program what the dependent and independent variables are for feature selection. Since "y", the dependent variable, will be the population value, I filter the number rate from the train_validation set. I assign everything else to the “x” object as estimators, that is, independent variables, by giving the necessary arguments to the “.drop” command. In the next line, I assign the method I will use for estimation to the “lr” object. Since I will be selecting features using Linear Regression, I perform this process with the “LinearRegression()” command. In the next step, I create the method for selecting the feature and assign it to the "sfs" object. The “SFS” command here is the abbreviation of “Sequential Feature Selection”, that is, the method that selects features in an incremental manner. After giving the "lr" object as an estimator in the function's arguments, I tell it to choose the best feature combination in a certain range in the "k_features" argument. Since I have 3 columns, I want it to find the best combination of variables starting from 1 to 3. The “Forward” argument being “True” means that the selection should be forward-looking, that is, by adding the features sequentially. The “Floating” argument adds an intermediate step to each step and continues the algorithm by removing variables that reduce performance. This argument was selected as “False”. Finally, when I said “cv = 0”, I stated that we did not want any cross-validation. I did not use the cross-validation method at this stage. This will allow us to see the difference clearly. After determining this method, we started the process with the “sfs.fit” command. In the last line of the code block, I used the “.subsets_” command to see the models at each step.

{1: {'feature_idx': (2,),
  'cv_scores': array([0.01508134]),
  'avg_score': 0.015081342429616673
  
I only took the first line of the result of the command, this will be enough for us. In the first step, train_validation was the second variable that minimized the average error in the set. It can be seen in the “avg_score” section that the average score for this variable is around 0.015081.

In [29]:
type(sfs_fit.subsets_)

dict

In [None]:
plt.hist(df['Value'])
plt.xlabel('Year')
plt.ylabel('Value')
plt.show()

In [None]:
range_min = 0
range_max = 500000

EsPopulation_data = df[(df['Value'] >= range_min) & (df['Year'] <= range_max)]

EsPopulation_data['Value'].plot(kind='density')

plt.title('Population by Year Plot')
plt.xlabel('Value')
plt.ylabel('density')

plt.show()