# Create Time-Series ML Model

## Load data

First, we will load the libraries.

In [9]:
import re
import pandas as pd
import geopandas as gpd
import pprint
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import logging
import pysal as ps
import contextily
from splot.esda import plot_moran
from pysal.viz import splot
from unidecode import unidecode
from pysal.explore import esda
from pysal.lib import weights
from numpy.random import seed
from sklearn.model_selection import train_test_split
from typing import Tuple, List

# Shared pretty-printer: nested structures are indented by two spaces.
pp = pprint.PrettyPrinter(indent=2)

# Log line layout: "<timestamp> - <level> - <message>", timestamps to the second.
LOG_FORMAT = "%(asctime)s - %(levelname)s - %(message)s"
LOG_DATE_FORMAT = "%Y-%m-%d %H:%M:%S"

# Configure the root logger to emit INFO and above using the layout defined here.
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT, datefmt=LOG_DATE_FORMAT)

Now, we load the dataset.

In [10]:
# Read every column as a string (`dtype=str`), so pandas performs no numeric
# type inference on load.
# NOTE(review): presumably this keeps leading zeros in the identifier codes
# intact — confirm; numeric columns will need an explicit cast before modelling.
df = pd.read_csv("../../data/output/only_votes.csv", dtype=str)

In [11]:
# Summarise the loaded frame: number of rows and columns, dtypes and memory use.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5083 entries, 0 to 5082
Columns: 1264 entries, MUNICIPI to vots_valids_percentage_82484191_M20111
dtypes: object(1264)
memory usage: 49.0+ MB


## Prepare data

In [12]:
# Discard the `vots_*`, `votants_percentage_*` and `vots_valids_percentage_*`
# column families with a single mask. Matching is by substring anywhere in the
# column name; the last fragment is already covered by "vots_" and is listed
# only to keep the documented intent explicit.
dropped_fragments = ["vots_", "votants_percentage_", "vots_valids_percentage_"]
is_dropped = df.columns.str.contains("|".join(dropped_fragments))
df_filtered = df.loc[:, ~is_dropped]

In [13]:
# Index the rows by "MUNDISSEC", then remove the remaining census section
# identifier columns in the same chained expression.
df_filtered = df_filtered.set_index("MUNDISSEC").drop(
    columns=["MUNICIPI", "DISTRICTE", "SECCIO"]
)

In [25]:
def create_timeseries_df(df: pd.DataFrame) -> pd.DataFrame:
    """Reshape a wide per-section table into one row per election and section.

    Every column of ``df`` is expected to be named ``<variable>_<electionid>``:
    the text after the last underscore is taken as the election id and the
    text before it as the variable name.

    Args:
        df: Wide table whose index is named ``MUNDISSEC`` and holds string
            labels, with one column per (variable, election) pair.

    Returns:
        A new DataFrame indexed by ``electionid_munidssec``
        (``"<electionid>_<MUNDISSEC>"``) with one column per variable name.
        Cell values are passed through unchanged (no type conversion), and
        variable/election combinations absent from ``df`` are NaN. ``df``
        itself is not modified.

    Raises:
        KeyError: If no ``MUNDISSEC`` column exists after resetting the index.
        ValueError: If the same (election, section, variable) combination
            occurs more than once, so the long table cannot be pivoted.
    """
    # Name the melted columns explicitly: by default `melt` labels the variable
    # column after `df.columns.name`, which would break the lookups below
    # whenever the columns axis carries a name.
    df_long = df.reset_index().melt(
        id_vars=["MUNDISSEC"], var_name="variable", value_name="value"
    )

    # Split each distinct column name once at its last underscore instead of
    # re-parsing the same name for every melted row.
    variable_by_column = {}
    election_by_column = {}
    for column in df_long["variable"].unique():
        variable_by_column[column], _, election_by_column[column] = column.rpartition("_")

    # Explicit object dtype keeps the lookups string-typed even when empty.
    variable_lookup = pd.Series(variable_by_column, dtype=object)
    election_lookup = pd.Series(election_by_column, dtype=object)

    df_long = df_long.assign(
        variable_name=df_long["variable"].map(variable_lookup),
        # The label spelling ("munidssec") is kept as-is so that existing
        # consumers of the returned index name keep working.
        electionid_munidssec=(
            df_long["variable"].map(election_lookup) + "_" + df_long["MUNDISSEC"]
        ),
    )

    # Pivot back to wide format: one row per election/section, one column per
    # variable. Selecting a single `values` column yields flat column labels.
    df_timeseries = df_long.pivot(
        index="electionid_munidssec", columns="variable_name", values="value"
    )

    # Drop the axis name left by the pivot so columns are plain variable names.
    df_timeseries.columns.name = None

    return df_timeseries

# Reshape the filtered wide table so that each row is one election/section pair.
df_timeseries = create_timeseries_df(df_filtered)

In [26]:
# Display the reshaped frame (pandas abbreviates the rendering of large frames).
df_timeseries

Unnamed: 0_level_0,cens_electoral_percentage_10,cens_electoral_percentage_1000,cens_electoral_percentage_1003,cens_electoral_percentage_1007,cens_electoral_percentage_1008,cens_electoral_percentage_1013,cens_electoral_percentage_1015,cens_electoral_percentage_1016,cens_electoral_percentage_1031,cens_electoral_percentage_1096,...,cens_electoral_percentage_12,cens_electoral_percentage_2019838,cens_electoral_percentage_237,cens_electoral_percentage_301,cens_electoral_percentage_412,cens_electoral_percentage_6,cens_electoral_percentage_698,cens_electoral_percentage_82484191,cens_electoral_percentage_86,cens_electoral_percentage_999999999
electionid_munidssec,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A20101_08001801001,3.577512776831346,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,17.802385008517888,0.0,0.0,2.555366269165247,0.0,14.821124361158432,0.0,0.0,7.240204429301533,0.5465644520159
A20101_08001801002,1.992337164750958,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,15.402298850574713,0.0,0.0,1.7624521072796935,0.0,17.164750957854405,0.0,0.0,9.808429118773947,0.38633461047254153
A20101_08001801003,3.6764705882352944,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,22.610294117647058,0.0,0.0,4.595588235294118,0.0,11.213235294117647,0.0,0.0,8.088235294117647,0.5055147058823529
A20101_08001801004,4.7658175842235,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,21.446179129005753,0.0,0.0,2.7937551355792936,0.0,13.064913722267871,0.0,0.0,6.0805258833196385,0.45877841687208987
A20101_08001801005,2.5584795321637426,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,14.473684210526317,0.0,0.0,1.9005847953216373,0.0,14.25438596491228,0.0,0.0,8.552631578947368,0.5451998050682261
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
M20191_43906001002,21.239837398373986,0.0,0.0,0.0,0.0,0.0,0.0,0.0,43.292682926829265,0.0,...,0.0,0.0,0.0,8.536585365853659,0.0,0.0,0.0,0.0,0.0,0.0
M20191_43907601001,4.483925549915398,0.0,7.952622673434856,0.0,0.0,0.0,0.0,0.0,5.583756345177665,0.0,...,0.0,38.74788494077834,0.0,5.499153976311337,0.0,0.0,0.0,0.0,2.199661590524535,0.0
M20191_43907601002,3.644444444444445,0.0,2.4,0.0,0.0,0.0,0.0,0.0,2.1333333333333333,0.0,...,0.0,42.4,0.0,5.955555555555556,0.0,0.0,0.0,0.0,4.266666666666667,0.0
M20191_43907601003,4.5602605863192185,0.0,10.206297502714442,0.0,0.0,0.0,0.0,0.0,7.709011943539632,0.0,...,0.0,39.52225841476656,0.0,3.691639522258415,0.0,0.0,0.0,0.0,2.3887079261672097,0.0
