## Data Understanding

In [1]:
# Import Libraries
import pandas as pd
import numpy as np

In [2]:
# Load crash_data.json (path relative to the working directory) into a DataFrame
df_accidents = pd.read_json(path_or_buf='crash_data.json')

In [3]:
class DataUnderstanding:
    """This class gives the general quick overview of a Dataframe"""
    
    def __init__(self, dataframe):
        self.df = dataframe
        
    def print_shape(self, title="Record"):
        """This function gives us the shape of the data"""
        print(title.upper())
        print("-"*len(title))
        print("There are", self.df.shape[0], "rows")
        print("There are", self.df.shape[1], "columns")
        print("\n")
        
    def information(self, title="Info"):
        """This function gives us the descriptive information of the data"""
        print(title.upper())
        print("-"*len(title))
        print(self.df.info())
        print("\n")
        
    def duplicate_values(self, title="Duplicate Values"):
        """This function gives us the number of duplicates in the data"""
        print(title.upper())
        print("-"*len(title))
        try:
            print(f"There are {self.df.duplicated().sum()} duplicated values in this dataset")
        except:
            print("There was a Type Error: One column has a 'dict' values")
        print("\n")
        
    def null_values(self, title="Null Values"):
        """This function gives us the number of null values in the data"""
        print(title.upper())
        print("-"*len(title))
        isnulls = self.df.isnull().sum()
        print(f"There are {isnulls.sum()} null values in this dataset")
        print("\n")
        
        print(f"There are {len(isnulls[isnulls > 0])} columns with null values while {len(isnulls[isnulls == 0])}"\
             +" do not have null values")
        print("\n")
        
        print("Null Values per column")
        print(isnulls[isnulls > 0])
        print("\n")        
        
    def unique_values(self, title="Unique Values"):
        """This function gives us the unique and nunique values in the data"""
        print(title.upper())
        print("-"*len(title))
        for col in self.df.columns:
            print(col)
            print("-"*len(col))
            try:
                print(f"Number of unique values: {self.df[col].nunique()}")
            except:
                print("Error - This column has a type error")
            print("\n")
        
        
    def run_all(self):
        self.print_shape()
        self.information()
        self.duplicate_values()
        self.null_values()
        self.unique_values()


In [4]:
# Wrap the accidents dataframe in the overview helper, then print every section
data = DataUnderstanding(dataframe=df_accidents)
data.run_all()

RECORD
------
There are 1000 rows
There are 44 columns


INFO
----
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 44 columns):
 #   Column                        Non-Null Count  Dtype         
---  ------                        --------------  -----         
 0   report_number                 1000 non-null   object        
 1   local_case_number             1000 non-null   int64         
 2   agency_name                   1000 non-null   object        
 3   acrs_report_type              1000 non-null   object        
 4   crash_date_time               1000 non-null   datetime64[ns]
 5   number_of_lanes               1000 non-null   object        
 6   distance_unit                 1000 non-null   object        
 7   off_road_description          133 non-null    object        
 8   at_fault                      1000 non-null   object        
 9   collision_type                1000 non-null   object        
 10  weather                       