# Mini-projet - Elections
*Basé sur Hands-on Data Preprocessing, R. Jafari, 2022*

**Ce TP est noté. Merci de lire attentivement le fichier instructions.pdf avant de commencer**

Nom étudiant 1: **Gombas**

Prénom étudiant 1: **Owen**

Nom étudiant 2: **Darmanger**

Prénom étudiant 2: **David**

## Préambule

### Imports

In [None]:
# packages standards
import pandas as pd
import os
import numpy as np
from matplotlib import pyplot as plt
from typing import List, Dict, Tuple, Callable, Any
import re

In [None]:
# packages spécifiques
import matplotlib.image as mpimg
import seaborn as sns
import glob

### Data

In [None]:
# Root "data" folder, given as a path relative to the current directory.
DATA_FOLDER = os.path.join(".", "data")

# Raw folder (should not be modified): INPUT.
RAW_FOLDER = os.path.join(DATA_FOLDER, "raw")

# Preprocessed folder (result of processing the raw data): OUTPUT.
PREPROCESSED_FOLDER = os.path.join(DATA_FOLDER, "preprocessed")

# Media folder holding the illustrations used for the notebooks' layout.
MEDIA_FOLDER = os.path.join(DATA_FOLDER, "media")

# Exploration folder for the exploration notebooks.
EXPLORATION_FOLDER = os.path.join(DATA_FOLDER, "exploration")

In [None]:
# List the CSV files of the preprocessed folder, leaving out every path that
# starts with the "clean_" prefix inside that folder.
clean_prefix = os.path.join(PREPROCESSED_FOLDER, "clean_")
csv_paths = glob.glob(os.path.join(PREPROCESSED_FOLDER, "*.csv"))
files = [path for path in csv_paths if not path.startswith(clean_prefix)]
files

In [None]:
def read_csv(file: str):
    """Load a CSV file into a DataFrame, reading the ``fips`` column as text.

    Args:
        file: Path (or file-like object) handed to ``pandas.read_csv``.

    Returns:
        The loaded DataFrame; ``fips`` values are kept as strings instead of
        being parsed as numbers.
    """
    column_types = {"fips": str}
    frame = pd.read_csv(file, dtype=column_types)
    return frame

In [None]:
def numerical_df(df: pd.DataFrame):
    """Return the sub-frame of ``df`` made of its numeric columns only.

    Args:
        df: Any DataFrame.

    Returns:
        A DataFrame restricted to the columns whose dtype is a NumPy number.
    """
    numeric_kinds = np.number
    numeric_only = df.select_dtypes(include=numeric_kinds)
    return numeric_only

# Affichage des heatmaps des données non nettoyées

In [None]:
def show_heatmap(df: pd.DataFrame, title: str = "Correlation Heatmap", annot: bool = False):
    """Plot the correlation heatmap of the numeric columns of ``df``.

    Only numeric columns enter the correlation matrix. Text columns (such as
    an identifier read as ``str``) are left out explicitly, because calling
    ``DataFrame.corr`` on them may, depending on the pandas version, raise or
    coerce them to numbers.

    Args:
        df: Data whose pairwise column correlations are displayed.
        title: Title drawn above the figure.
        annot: When True, the correlation value is written in each cell.
    """
    corr = df.select_dtypes(include=np.number).corr()
    # The matrix is symmetric: mask the diagonal and the upper triangle so each
    # pair of columns is drawn once.
    mask = np.triu(np.ones_like(corr, dtype=bool))
    plt.figure(figsize=(30, 20))
    plt.title(title, fontsize=20)
    # Fixed [-1, 1] colour scale centred on 0 so figures are comparable.
    sns.heatmap(corr, mask=mask, vmin=-1, vmax=1, annot=annot, center=0)
    plt.show()

In [None]:
def remove_correlated_columns(df: pd.DataFrame, threshold: float = 0.9):
    """Drop numeric columns that are strongly correlated with a later column.

    The correlation matrix is computed on the numeric columns only.
    Non-numeric columns (for example an identifier read as text) never take
    part in the comparison and are always kept in the result.

    Args:
        df: Input frame; it is not modified.
        threshold: A column is dropped when its absolute correlation with at
            least one column located after it is strictly greater than this
            value.

    Returns:
        A new DataFrame without the dropped columns.
    """
    corr = df.select_dtypes(include=np.number).corr()
    # Hide the diagonal and the upper triangle: what is left in a given column
    # are its correlations with the columns that come after it, so each pair
    # is examined once and the last column of a correlated pair is kept.
    upper = np.triu(np.ones_like(corr, dtype=bool))
    below_diagonal = corr.mask(upper)
    correlated_columns = [
        column
        for column in below_diagonal.columns
        if (below_diagonal[column].abs() > threshold).any()
    ]
    return df.drop(columns=correlated_columns)

In [None]:
# Draw one correlation heatmap per listed CSV file, titled with its base name.
for file in files:
    title = os.path.basename(file)
    df = read_csv(file)
    show_heatmap(df, title=title)

# Nettoyage des données et affichage des heatmaps sur les données nettoyées

In [None]:
# For each listed CSV: drop the columns correlated above 0.8, plot the
# annotated heatmap of what remains, then save the reduced table.
# NOTE(review): the output keeps the input's base name inside
# PREPROCESSED_FOLDER; if `files` was listed from that same folder, each source
# CSV is overwritten in place and re-running this cell works on already
# reduced data — confirm this is intended.
for file in files:
    output_path = os.path.join(PREPROCESSED_FOLDER, os.path.basename(file))
    df = remove_correlated_columns(read_csv(file), threshold=0.8)
    show_heatmap(df, title=os.path.basename(file), annot=True)
    df.to_csv(output_path, index=False)