In [183]:
from dataclasses import dataclass
from abc import ABC, abstractmethod
import pandas as pd
from typing import List

@dataclass
class StructTypeInfo:
    datatypes: List[str]
    frequency: List[int]
    columns: List[List[str]]

    def to_dataframe(self) -> pd.DataFrame:
        data = {
            "datatypes": self.datatypes,
            "frequency": self.frequency,
            "columns": self.columns,
        }

        return pd.DataFrame(data)

class Eda(ABC):
    @abstractmethod
    def structtype_info(self, df: pd.DataFrame) -> pd.DataFrame:
        """The struct type of the dataframe"""

    @abstractmethod
    def completness_info(self, df: pd.DataFrame) -> pd.DataFrame:
        """the completness of the dataframe"""

    @abstractmethod
    def highcardinality_info(self, df: pd.DataFrame) -> List[dict]:
        """the completness of the dataframe"""

    @abstractmethod
    def feature_info(self, df: pd.DataFrame) -> pd.DataFrame:
        """Information about object features in the dataset"""

class PreprocessEda(Eda):
    def __init__(self):
        self.structinfo = None

    def structtype_info(self, df: pd.DataFrame) -> StructTypeInfo:
        datatypes = pd.value_counts(df.dtypes).keys().tolist()
        frequency = pd.value_counts(df.dtypes).values.tolist()
        columns = [df.select_dtypes(include=key).columns.tolist() for key in pd.value_counts(df.dtypes).keys()]
        self.structinfo = StructTypeInfo(datatypes, frequency, columns)
        return self.structinfo
    
    def completness_info(self, df: pd.DataFrame) -> pd.DataFrame:
        return (df.isnull().sum()/df.shape[0]).sort_values(ascending=False)
    
    def highcardinality_info(self, df: pd.DataFrame) -> List[dict]:
        result = [
            {
                "column": column,
                "nunique %": df.loc[~df[column].isnull(), column].nunique() / df.loc[~df[column].isnull(), column].shape[0]
            }
            if df.loc[~df[column].isnull(), column].nunique() != 0
            else {"column": column, "nunique %": 1.00}
            for column in df.columns
        ]

        result = sorted(result, key=lambda x: x["nunique %"], reverse=True)

        return result
    
    def structure_validation(self, df: pd.DataFrame, type: str) -> pd.DataFrame:
        if self.structinfo is None:
            struct = self.structtype_info(df)
            struct = struct.to_dataframe()
        else:
            struct = self.structinfo.to_dataframe()

        if type == "object":
            struct = df[struct.loc[struct["datatypes"] == "object"]["columns"].values.tolist()[0]]
        elif type == "int":
            struct = df[struct.loc[struct["datatypes"] == "int"]["columns"].values.tolist()[0]]
        elif type == "float": 
            struct = df[struct.loc[struct["datatypes"] == "float"]["columns"].values.tolist()[0]]
        else:
            raise ValueError("type is not defined")
        
        return struct
    
    def feature_info(self, df: pd.DataFrame, type: str) -> pd.DataFrame:
        struct = self.structure_validation(df, type)

        object = pd.DataFrame({
            "Features": struct.nunique().keys(),
            "Frequency": struct.nunique().values
        }).sort_values(ascending=False, by="Frequency")
        
        return object
    
    def missinginformation_info(self, df:pd.DataFrame, type: str) -> pd.DataFrame:
        struct = self.structure_validation(df, type)
        
        object = pd.DataFrame({
            "Features": struct.isnull().sum().keys(),
            "Missing": struct.isnull().sum().values / struct.shape[0]
        }).sort_values(ascending=False, by="Missing")

        return object

In [184]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
import seaborn as sns
import matplotlib.pyplot as plt

pd.set_option("display.max_columns", None)

df_counties = pd.read_csv("../data/counties.csv", sep=";")
df_crashpoverty = pd.read_csv("../data/crashes_poverty.csv", sep=";")
df_vehicles = pd.read_csv("../data/vehicles.csv")

eda = PreprocessEda()
scaler = MinMaxScaler()
label_encoder = LabelEncoder()

In [185]:
eda = PreprocessEda()

In [187]:
struct_vehicles = eda.structtype_info(df_vehicles).to_dataframe()

Unnamed: 0,datatypes,frequency,columns
0,object,15,"[region, manufacturer, model, condition, cylin..."
1,float64,5,"[year, odometer, county, lat, long]"
2,int64,1,[price]


In [191]:
df_vehicles[struct_vehicles.loc[struct_vehicles["datatypes"] == "object"]["columns"].values.tolist()[0]]

Unnamed: 0,region,manufacturer,model,condition,cylinders,fuel,title_status,transmission,vin,drive,size,type,paint_color,description,state
0,palm springs,volkswagen,jetta gls,excellent,4 cylinders,gas,clean,automatic,WVWSK61J62W209823,,,wagon,,This 2002 Volkswagen Jetta GLS Wagon is simply...,ca
1,salem,chevrolet,silverado 2500 crew cab,,8 cylinders,gas,clean,automatic,1GCHK23U94F144317,4wd,,pickup,white,We want to let you know up front that Synergiz...,or
2,long island,nissan,xterra,excellent,,gas,clean,automatic,5N1AN0NWXBC502031,,,,,Year : 2011 \t\t Make : NISSAN \t\t\tMode...,ny
3,amarillo,chevrolet,malibu,,4 cylinders,gas,clean,automatic,,fwd,mid-size,sedan,red,Use this link to see more information on this ...,tx
4,inland empire,cadillac,cts-v,excellent,8 cylinders,gas,clean,automatic,,rwd,mid-size,sedan,silver,"2009 CTS-V, FAST! Awesome car, super clean, we...",ca
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26983,mcallen / edinburg,chevrolet,tahoe lt,fair,8 cylinders,gas,clean,automatic,,rwd,full-size,SUV,grey,"Selling a 2007 chevy tahoe lt 5.3 lt V8 210,00...",tx
26984,western IL,ford,explorer,,,gas,clean,automatic,1FM5K7F89KGB42455,fwd,,,silver,Blackwell Motors address: 1001 Highway K Bo...,il
26985,ft myers / SW florida,chevrolet,corvette,,,gas,clean,automatic,,,,,,"Good condition, reply to post to see, nights a...",fl
26986,tucson,dodge,grand caravan,,,gas,clean,automatic,,,,,,"2005 Dodge grand caravan, $3,800, title clean,...",az


In [192]:
struct = df_vehicles[struct_vehicles.loc[struct_vehicles["datatypes"] == "object"]["columns"].values.tolist()[0]]

In [197]:
struct.duplicated

Unnamed: 0,region,manufacturer,model,condition,cylinders,fuel,title_status,transmission,vin,drive,size,type,paint_color,description,state
0,palm springs,volkswagen,jetta gls,excellent,4 cylinders,gas,clean,automatic,WVWSK61J62W209823,,,wagon,,This 2002 Volkswagen Jetta GLS Wagon is simply...,ca
1,salem,chevrolet,silverado 2500 crew cab,,8 cylinders,gas,clean,automatic,1GCHK23U94F144317,4wd,,pickup,white,We want to let you know up front that Synergiz...,or
2,long island,nissan,xterra,excellent,,gas,clean,automatic,5N1AN0NWXBC502031,,,,,Year : 2011 \t\t Make : NISSAN \t\t\tMode...,ny
3,amarillo,chevrolet,malibu,,4 cylinders,gas,clean,automatic,,fwd,mid-size,sedan,red,Use this link to see more information on this ...,tx
4,inland empire,cadillac,cts-v,excellent,8 cylinders,gas,clean,automatic,,rwd,mid-size,sedan,silver,"2009 CTS-V, FAST! Awesome car, super clean, we...",ca
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26983,mcallen / edinburg,chevrolet,tahoe lt,fair,8 cylinders,gas,clean,automatic,,rwd,full-size,SUV,grey,"Selling a 2007 chevy tahoe lt 5.3 lt V8 210,00...",tx
26984,western IL,ford,explorer,,,gas,clean,automatic,1FM5K7F89KGB42455,fwd,,,silver,Blackwell Motors address: 1001 Highway K Bo...,il
26985,ft myers / SW florida,chevrolet,corvette,,,gas,clean,automatic,,,,,,"Good condition, reply to post to see, nights a...",fl
26986,tucson,dodge,grand caravan,,,gas,clean,automatic,,,,,,"2005 Dodge grand caravan, $3,800, title clean,...",az


In [168]:


struct.isnull().sum().keys()
struct.isnull().sum().values / struct.shape[0]

array([0.00000000e+00, 4.42418853e-02, 1.56736327e-02, 4.31451015e-01,
       4.18926930e-01, 6.22498888e-03, 5.07633022e-03, 8.74462724e-03,
       4.02845709e-01, 2.96279828e-01, 6.90751445e-01, 2.65340151e-01,
       3.25700311e-01, 1.11160516e-04, 0.00000000e+00])

In [160]:
struct.shape[0]

26988

In [157]:
struct["region"].isnull().sum()

0