In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
from neo4j import GraphDatabase

In [None]:
# Echo every top-level expression of a cell, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Remove pandas' display limits on columns, rows, and column width.
for display_option in ('display.max_columns', 'display.max_rows', 'display.max_colwidth'):
    pd.set_option(display_option, None)

In [None]:
# Read the NORS csv file. low_memory=False makes pandas parse the file in one
# pass instead of in chunks, which avoids mixed-dtype inference within a column.
df = pd.read_csv('../data/NORS_20250114.csv', low_memory=False)

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Print the column dtypes and non-null counts.
df.info()

In [None]:
# Keep only the rows whose Year is 2009 or later.
df = df[df["Year"].ge(2009)]

In [None]:
def show_missing(df):
    """
    Build a per-column summary of absent data in a dataframe.

    Three kinds of absence are counted separately: true nulls
    (``isnull``), blank strings (``''`` or ``' '``) and the literal
    text ``'nan'`` / ``'NaN'``. Every count is also reported as a
    percentage of the number of rows.

    Returns a dataframe indexed by the columns of ``df`` with the columns
    num_missing, missing_percentage, num_empty, empty_percentage,
    nan_count and nan_percentage.
    """
    row_total = df.shape[0]

    def count_either(first, second):
        # Per-column number of cells equal to one of the two literals.
        return pd.Series(((df == first) | (df == second)).sum())

    def percent_of_rows(counts):
        return (counts / row_total) * 100

    nulls = df.isnull().sum()
    blanks = count_either(' ', '')
    nan_text = count_either('nan', 'NaN')

    return pd.DataFrame({
        'num_missing': nulls,
        'missing_percentage': percent_of_rows(nulls),
        'num_empty': blanks,
        'empty_percentage': percent_of_rows(blanks),
        'nan_count': nan_text,
        'nan_percentage': percent_of_rows(nan_text),
    })

In [None]:
# Missing-value summary for the frame after the Year >= 2009 filter.
show_missing(df)

In [None]:
# Summary statistics from pandas' default describe().
df.describe()

In [None]:
# (rows, columns) of the year-filtered frame.
df.shape

In [None]:
# Keep the rows that have an 'Animal Type' value, then preview the subset and its size.
rows_with_animal = df["Animal Type"].notna()
animal_df = df[rows_with_animal]
animal_df.head()
animal_df.shape

In [None]:
# Missing-value summary restricted to the rows that have an animal type.
show_missing(animal_df)

In [None]:
# Rows per year in the animal subset; dropna=False also tallies a missing Year.
animal_df['Year'].value_counts(dropna=False)

In [None]:
# Year >= 2009 slice of the animal subset, followed by a preview and its size.
from_2009_on = animal_df["Year"].ge(2009)
animal_df_20 = animal_df[from_2009_on]
animal_df_20.head()
animal_df_20.shape

In [None]:
def get_values(df, columns):
    """
    Print the value counts (NaN included) of each requested column.

    For every label in ``columns`` the output is the label, a divider
    line, the ``value_counts(dropna=False)`` of that column and a blank
    gap. Nothing is returned.
    """
    divider = '====================================='
    for label in columns:
        # Header first, so it is already printed if the column lookup fails.
        print(label, divider, sep='\n')
        print(df[label].value_counts(dropna=False), end='\n\n\n')

def show_values(df, param):
    """
    Print value counts for every column of a dataframe or for a subset.

    Parameters
    ----------
    df : dataframe whose columns are inspected.
    param : the string ``'all'`` to report every column of ``df``, or an
        iterable of column labels (list, tuple, Index, array) to report
        only those.

    Output is produced by ``get_values``; nothing is returned.
    """
    # Only compare against the sentinel when ``param`` is a string. An Index
    # or ndarray compared with == gives an element-wise array, and using that
    # as an ``if`` condition raises ValueError (ambiguous truth value).
    wants_all = isinstance(param, str) and param == 'all'
    get_values(df, df.columns if wants_all else param)

In [None]:
# Value distributions of three columns in the filtered animal subset.
show_values(animal_df_20, ['Primary Mode', 'Etiology', 'Animal Type'])

In [None]:
# Narrow the filtered animal subset to three columns; note that this rebinds df.
selected_columns = ['Etiology', 'Illnesses', 'Animal Type']
df = animal_df_20[selected_columns]

In [None]:
# Shape after narrowing to the three selected columns.
df.shape

In [None]:
# Show the column labels that were kept.
df.columns

In [None]:
# Value counts before the ';'-separated cells are split.
show_values(df, ['Etiology', 'Animal Type'])

In [None]:
# Split the 'Animal Type' column by ';' and explode to one row per animal type.
# Each piece is stripped afterwards so that cells written as "a; b" do not leave
# " b" behind as a separate category in the value counts. The original row index
# is kept, so exploded rows share the index label of the row they came from.
exploded_df = (
    df.assign(**{'Animal Type': lambda frame: frame['Animal Type'].str.split(';')})
    .explode('Animal Type')
    .assign(**{'Animal Type': lambda frame: frame['Animal Type'].str.strip()})
)

In [None]:
# Shape after exploding 'Animal Type'.
exploded_df.shape

In [None]:
# Show the column labels of the exploded frame.
exploded_df.columns

In [None]:
# Split the 'Etiology' column by ';' and explode to one row per etiology.
# The values are read from exploded_df itself rather than from df: that removes
# the dependence on index alignment with another frame and makes the cell safe
# to re-run (re-attaching the unsplit lists from df would duplicate rows).
# Each piece is stripped so that "a; b" does not leave " b" as its own category.
exploded_df = (
    exploded_df.assign(**{'Etiology': lambda frame: frame['Etiology'].str.split(';')})
    .explode('Etiology')
    .assign(**{'Etiology': lambda frame: frame['Etiology'].str.strip()})
)

In [None]:
# Rebind df to an independent copy of the exploded frame.
df = exploded_df.copy()

In [None]:
# Value counts after both columns have been split and exploded.
show_values(df, ['Etiology', 'Animal Type'])

In [None]:
# Shape after both explodes.
df.shape