# Polls Date Parsing 

Because there is no reliable way to know exactly where the year changes, you should check that the last year corresponds to reality. To check the last year, look at the survey documents in the poll column on the website.

In [None]:
import matplotlib.pyplot as plt
import matplotlib.style as st
import numpy as np
import pandas as pd
import regex as re
import seaborn as sns
from matplotlib import cm

## Functions

In [None]:
def get_indices_to_split_at(df):
    """
    Find where the year changes, returns a list with the indices.

    The first number found in each ``Date`` entry is read as a month. A row is
    a split point (the last row of its year group) when its month is more than
    one below the month of the next row, e.g. 1 followed by 12. A candidate is
    discarded when the month two rows further equals its own month, since the
    jump is then an isolated out-of-order row and not a year change.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string ``Date`` column whose entries start with a month
        number.

    Returns
    -------
    list of int
        Row positions (0-based) after which a new year group starts. Row 0 is
        never returned.
    """
    # Work on positions so the result matches the positional slicing that is
    # applied to the frame afterwards.
    months = df["Date"].str.extract(r"(\d+)", expand=False).reset_index(drop=True)
    if months.empty:
        return []

    # Row 0 is never a split point, so its month only has to be convertible to
    # an integer; the placeholder also covers a first row without any digits.
    months = months.astype(object)
    months.iloc[0] = 10
    months = months.astype(int)

    # Month of the following row; the last row is compared against 0 and can
    # therefore never produce a negative difference.
    next_months = months.shift(-1, fill_value=0)
    months_diff = months - next_months

    # A difference of exactly -1 is tolerated as a slightly unordered row.
    potential_indices = months_diff.index[months_diff < -1]

    last_position = len(months) - 1
    indices = []
    for idx in potential_indices:
        if idx == 0:
            continue
        two_ahead = idx + 2
        # Without a row two positions ahead there is nothing to contradict the
        # jump, so the candidate is kept.
        if two_ahead <= last_position and months.iloc[idx] == months.iloc[two_ahead]:
            continue
        indices.append(idx)

    return indices


def split_at_indices(df, indices):
    """
    Split the dataset based on the indices, return a list of dataframes.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to cut into consecutive pieces.
    indices : list of int
        Ascending row positions; each one is the last row of a piece.

    Returns
    -------
    list of pandas.DataFrame
        ``len(indices) + 1`` independent copies that together cover every row
        of ``df`` in order. With no indices the list holds a single copy of
        the whole frame.
    """
    splitted = []
    previous = 0

    for idx in indices:
        splitted.append(df.iloc[previous : idx + 1].copy())
        previous = idx + 1

    # Whatever follows the last split point (or the whole frame when there is
    # no split point at all) forms the final piece.
    splitted.append(df.iloc[previous:].copy())

    return splitted


def complete_date(df, year):
    """
    Parse the date range that they give us into start date and end date.
    The format is datetime, so that we can sort by date.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string ``Date`` column holding "start-end" ranges.
    year : int
        Year prepended to the start of every range.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with two extra datetime columns, ``Date_start`` and
        ``Date_end``. The input frame is left untouched.
    """
    df_temp = df.copy()

    # Split each range at the dash and drop the blanks around it so that the
    # strings handed to the datetime parser are clean.
    bounds = df_temp["Date"].str.split("-")
    dates_start = bounds.str[0].astype(str).str.strip()
    dates_end = bounds.str[1].astype(str).str.strip()

    prefix = str(year) + "/"
    date_start = (prefix + dates_start).astype("datetime64[ns]")
    date_end = (prefix + dates_end).astype("datetime64[ns]")

    # A range such as "12/28 - 1/3" finishes in the following year; without
    # this correction the end date would fall before the start date.
    date_end = date_end.where(
        date_end >= date_start, date_end + pd.DateOffset(years=1)
    )

    df_temp["Date_start"] = date_start
    df_temp["Date_end"] = date_end

    return df_temp


def process_date(df, year):
    """
    Parse the date column into start date and end date and assign
    the years based on the months order. It is advised to check the last year
    for correctness.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string ``Date`` column.
    year : int
        Year given to the first group of rows; every following group gets the
        year before the previous one.

    Returns
    -------
    pandas.DataFrame
        All rows with ``Date_start`` and ``Date_end`` columns added, sorted by
        ``Date_start`` with a fresh 0-based index.
    """
    indices = get_indices_to_split_at(df)
    splitted = split_at_indices(df, indices)
    if not splitted:
        # No year change found: the whole frame belongs to ``year``.
        splitted = [df]

    # year, year - 1, ... : one year per group, counting backwards.
    group_years = range(year, year - len(splitted), -1)
    splitted_processed = [
        complete_date(group, group_year)
        for group, group_year in zip(splitted, group_years)
    ]

    df_processed = pd.concat(splitted_processed)
    return df_processed.sort_values("Date_start").reset_index(drop=True)

## Data loading

In [None]:
# Paths use forward slashes so they resolve on Windows and POSIX alike
# (backslashes inside a normal string also form invalid escapes such as "\S").
# The "Unnamed: 0" column is dropped right after loading
# (presumably a saved index column -- TODO confirm against the scraper).
df_tab_2012 = pd.read_csv("Data/Scrapped/polls_2012.csv").drop(columns="Unnamed: 0")
df_tab_2016 = pd.read_csv("Data/Scrapped/polls_2016.csv").drop(columns="Unnamed: 0")
df_tab_2020 = pd.read_csv("Data/Scrapped/polls_2020.csv").drop(columns="Unnamed: 0")

## Parse dates

In [None]:
# Parse the 2012 polls; 2012 is the year assigned to the first group of rows.
# The returned frame is not stored in a variable.
process_date(df_tab_2012, 2012)

In [None]:
# Parse the 2016 polls; 2016 is the year assigned to the first group of rows.
# The returned frame is not stored in a variable.
process_date(df_tab_2016, 2016)

In [None]:
# Parse the 2020 polls; 2020 is the year assigned to the first group of rows.
# The returned frame is not stored in a variable.
process_date(df_tab_2020, 2020)