# Numerical checks

This notebook uses machine learning libraries — Hugging Face (specifically the `transformers` library) and scikit-learn (in particular the Isolation Forest algorithm) — to run numerical checks on the data.

## Setup and installations

In [0]:
# Install the third-party packages used by this notebook into the kernel's environment.
# transformers and scikit-learn are imported directly in the next cell; torch and
# sentencepiece are presumably runtime dependencies of the Hugging Face pipeline — TODO confirm.
# NOTE(review): no versions are pinned, so results may change between runs as packages update.
%pip install transformers
%pip install torch
%pip install sentencepiece
%pip install scikit-learn
# NOTE(review): %restart_python looks like a Databricks-specific magic that restarts the
# Python process so the freshly installed packages are picked up — confirm the target runtime.
%restart_python

In [0]:
from transformers import pipeline
from sklearn.ensemble import IsolationForest
import pandas as pd
from pyspark.sql.functions import col
from pyspark.sql import SparkSession

## Read in data from the publication of interest, check columns and data types

In [0]:
# Load the publication CSV into a DataFrame, then preview its first five rows.
df = pd.read_csv(filepath_or_buffer="data/202324_national_char_data_revised.csv")
df.head(n=5)

In [0]:
# Show the dtype pandas inferred for each column of the loaded frame.
df.dtypes

## Convert columns to numeric where possible

In [0]:
# Coerce every column exactly once; values that cannot be parsed as numbers become NaN.
# The result is reused both to detect convertible columns and to fill df_numeric,
# so pd.to_numeric is not run a second time on the same data.
coerced_all = df.apply(pd.to_numeric, errors='coerce')

# A column counts as convertible when at least one of its values parses as a number.
# The loop variable is deliberately not called `col`, so it cannot be confused with
# `col` imported from pyspark.sql.functions in the imports cell.
convertible_cols = [
    column_name for column_name in df.columns
    if coerced_all[column_name].notna().any()
]

# Work on a copy so the raw frame `df` stays untouched for later comparison.
# NOTE(review): in a convertible column every non-numeric entry is replaced by NaN —
# confirm this is the intended handling for columns that mix text and numbers.
df_numeric = df.copy()
df_numeric[convertible_cols] = coerced_all[convertible_cols]

print("Numeric columns:")
print(df_numeric.select_dtypes(include='number').columns)


In [0]:
# Preview the first rows of the frame after the numeric conversion.
df_numeric.head()

## Use Isolation Forest to look through columns for outliers