# Mini-projet - Elections
*Basé sur Hands-on Data Preprocessing, R. Jafari, 2022*

**Ce TP est noté. Merci de lire attentivement le fichier instructions.pdf avant de commencer**

Nom étudiant 1: **Gombas**

Prénom étudiant 1: **Owen**

Nom étudiant 2: **Darmanger**

Prénom étudiant 2: **David**

## Préambule

### Imports

In [None]:
# packages standards
import pandas as pd
import os
import numpy as np
from matplotlib import pyplot as plt
from typing import List, Dict, Tuple, Callable, Any
import re

In [None]:
# packages spécifiques
import matplotlib.image as mpimg
import seaborn as sns
import glob

### Data

In [None]:
# Root data directory, given as a path relative to the working directory.
DATA_FOLDER = os.path.join(".", "data")

# Raw input files (should not be changed): INPUT
RAW_FOLDER = os.path.join(DATA_FOLDER, "raw")

# Preprocessed files (result of processing the raw ones): OUTPUT
PREPROCESSED_FOLDER = os.path.join(DATA_FOLDER, "preprocessed")

# Media files used to illustrate the notebooks' layout
MEDIA_FOLDER = os.path.join(DATA_FOLDER, "media")

# Folder for the exploration notebooks
EXPLORATION_FOLDER = os.path.join(DATA_FOLDER, "exploration")

In [None]:
# list of the paths of every CSV file found in the preprocessed folder
# NOTE(review): glob.glob does not guarantee any particular ordering, yet a
# later cell picks the votes file by position (files[2]) — confirm the order
# is stable on the machine running this notebook.
files = glob.glob(os.path.join(PREPROCESSED_FOLDER, "*.csv"))
files

In [None]:
def read_csv(filename: str) -> pd.DataFrame:
    """Load a CSV file into a DataFrame, reading the ``fips`` column as text.

    Forcing ``fips`` to ``str`` keeps the codes exactly as written in the
    file instead of letting pandas parse them as numbers.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        The parsed DataFrame.
    """
    column_types = {"fips": str}
    frame = pd.read_csv(filename, dtype=column_types)
    return frame

In [None]:
def numeric_df(df: pd.DataFrame) -> pd.DataFrame:
    """Return the subset of ``df`` made of its numeric columns only.

    Args:
        df: Frame to filter.

    Returns:
        A DataFrame holding the columns whose dtype is a NumPy number type;
        the rows are left untouched.
    """
    numeric_only = df.select_dtypes(include=[np.number])
    return numeric_only

In [None]:
def show_na(df: pd.DataFrame) -> int:
    """Report the missing values of ``df`` and count the rows affected.

    Prints the null count of every column that has at least one null, then
    prints each row containing a null as a sequence of ``column: value | ``
    pairs, and finally draws a bar chart of the null count of every column.

    Args:
        df: Frame to inspect.

    Returns:
        The number of rows holding at least one null value. When the frame
        has no null at all, prints ``"No null values"`` and returns ``0``
        without plotting anything.
    """
    # Compute the null mask once: it feeds the per-column counts, the row
    # filter and the final plot.
    null_mask = df.isnull()
    null_counts = null_mask.sum()
    nulls = null_counts[null_counts > 0].to_dict()

    if not nulls:
        print("No null values")
        return 0

    for key, value in nulls.items():
        print(f"{key}: {value}")

    # Rows with at least one null value.
    nulls_df = df[null_mask.any(axis=1)]

    # Positional access, so rows are visited one by one even when index
    # labels are duplicated.
    for position in range(len(nulls_df)):
        row = nulls_df.iloc[position].to_dict()
        print("".join(f"{key}: {value} | " for key, value in row.items()))

    plt.figure(figsize=(30, 10))
    null_counts.plot(kind="bar")

    return len(nulls_df)

In [None]:
# The votes table is read from position 2 of `files`; every other CSV becomes
# one of the data tables.
# NOTE(review): this depends on the order of `files` — confirm that index 2
# really is the votes file on the machine running the notebook.
votes = read_csv(files[2])
datas_dfs = [read_csv(file) for idx, file in enumerate(files) if idx != 2]

# Merge everything into one dataset

In [None]:
def merge_dfs_on_fips(dfs: List[pd.DataFrame]):
    """Outer-merge a list of frames, left to right, on their ``fips`` column.

    Non-key columns present on both sides keep their name on the left side
    and receive a ``_y`` suffix on the right side.

    NOTE(review): with three or more frames sharing the same non-key column,
    a later merge would produce a second ``<column>_y``; recent pandas
    versions are expected to reject such duplicate names — confirm against
    the installed version.

    Args:
        dfs: Frames to merge; must contain at least one element, each with a
            ``fips`` column.

    Returns:
        The merged DataFrame (the first frame itself when ``dfs`` has a
        single element).
    """
    merged = dfs[0]
    for frame in dfs[1:]:
        merged = merged.merge(frame, on="fips", how="outer", suffixes=("", "_y"))
    return merged

In [None]:
# For each preprocessed CSV, draw a bar chart of the number of distinct values
# per column; the title shows the file path and the shape of the frame.
for file in files:
    df = read_csv(file)
    fig = plt.figure(figsize=(30, 10))
    plt.title(f"{file} {df.shape}")

    # Source https://datavizpyr.com/how-to-annotate-bars-in-barplot-with-matplotlib-in-python/
    splot = sns.barplot(x=df.columns, y=df.nunique())
    # Annotate every bar with its height, centred horizontally and shifted
    # 9 points above the top of the bar.
    for p in splot.patches:
        splot.annotate(
            p.get_height(), 
            (p.get_x() + p.get_width() / 2., p.get_height()), 
            ha="center",
            va="center", 
            xytext = (0, 9), 
            textcoords = "offset points"
        )

    # Column names are long: rotate the tick labels so they stay readable.
    plt.xticks(rotation=90)
    plt.show()

In [None]:
# Outer-merge every table of datas_dfs on "fips" into a single frame, then
# display it.
df = merge_dfs_on_fips(datas_dfs)
df

In [None]:
# Show the full list of column names of the merged frame.
df.columns.to_list()

# Drop columns with duplicated values
- fips,
- state,
- area_name,
- 2013_urban_influence_code,
- percent_of_adults_with_a_high_school_diploma_only_1980,
- percent_of_adults_completing_some_college_or_associates_degree_2000,
- bachelors_degree_or_higher_2015_19,
- percent_of_adults_with_less_than_a_high_school_diploma_2015_19,
- percent_of_adults_with_a_high_school_diploma_only_2015_19,
- percent_of_adults_completing_some_college_or_associates_degree_2015_19,
- percent_of_adults_with_a_bachelors_degree_or_higher_2015_19,
- **state_y**,
- **area_name_y**,
- urban_influence_code_2013,
- economic_typology_2015,
- international_mig_2019,
- net_mig_2010,
- net_mig_2019,
- residual_2010,
- residual_2011,
- residual_2012,
- residual_2013,
- residual_2016,
- residual_2019,
- gq_estimates_2019,
- r_birth_2019,
- r_death_2019,
- r_natural_inc_2019,
- r_international_mig_2011,
- r_international_mig_2019,
- r_net_mig_2011,
- r_net_mig_2012,
- r_net_mig_2013,
- r_net_mig_2014,
- r_net_mig_2015,
- r_net_mig_2016,
- r_net_mig_2017,
- r_net_mig_2018,
- r_net_mig_2019,
- **state_y**,
- **area_name_y**,
- urban_influence_code_2013_y,
- ci90ub517_2019,
- ci90ub517p_2019,
- ci90ubinc_2019,
- **state_y**,
- **area_name_y**,
- urban_influence_code_2013_y,
- metro_2013,
- unemployment_rate_2019,
- unemployed_2020,
- unemployment_rate_2020,
- med_hh_income_percent_of_state_total_2019

In [None]:
# Remove the "_y"-suffixed copies of state and area_name.
df = df.drop(labels=["state_y", "area_name_y"], axis="columns")

In [None]:
# Print and plot the missing values of the merged frame.
show_na(df)

In [None]:
# Display the merged frame.
df

# Clustering

In [None]:
# ...