In [48]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from lightgbm import LGBMRegressor, LGBMClassifier


def cols_to_impute(df):
    cols = []
    for col in df.columns:
        if df[col].isnull().sum() != 0:
            cols.append(col)
    return cols


def complete_columns(df):
    cols = []
    for col in df.columns:
        if df[col].isnull().sum() == 0:
            cols.append(col)
    return cols


def missing_indices(df):
    indices = {}
    for col in cols_to_impute(df):
        indices[col] = df[df[col].isnull()].index.tolist()
    return indices

def main(path, exclude=None):
    df = pd.read_csv(path)
    if exclude != None:
        df.drop(exclude, axis=1, inplace=True)
    numeric_cols = df.select_dtypes(include='number').columns
    categorical_cols = df.select_dtypes(exclude='number').columns
    print(categorical_cols)

    missing_cols = cols_to_impute(df)
    complete_cols = complete_columns(df)

    numeric_missing_cols = list(set(missing_cols) & set(numeric_cols))
    categorical_missing_cols = list(
        set(missing_cols) & set(categorical_cols))
    missing_cols = [categorical_missing_cols, numeric_missing_cols]

    df_numeric = df.select_dtypes(include='number')
    print(df_numeric.shape)
    df_categoric = df.select_dtypes(exclude='number').astype('category')
    display(df_categoric.info())

    # Train and Test df
    train_df_numeric = df_numeric.dropna()
    train_df_categoric = df_categoric.dropna()

    pred = {}
    for x in missing_cols:
        for i, target_column in enumerate(x):
            print(f'target column: {target_column}')

            if x == numeric_missing_cols:
                imputer = LGBMRegressor(n_jobs=-1)
                df_to_impute = df_numeric
                train_df = train_df_numeric
            if x == categorical_missing_cols:
                imputer = LGBMClassifier(n_jobs=-1)
                df_to_impute = df_categoric
                train_df = train_df_categoric
                display(train_df.info())

            X_train = train_df.drop(columns=[target_column])
            y_train = train_df[[target_column]]

            test_df = df_to_impute[df_to_impute[target_column].isnull()]
            X_test = test_df.drop(columns=[target_column])

            print(f'Fitting {i+1}/{len(x)} columns')

            imputer.fit(X_train, y_train,
                categorical_feature=categorical_cols.to_list())

            print(f'{i+1}/{len(x)} columns fitted')
            pred[target_column] = imputer.predict(X_test)
            
            for i, index in enumerate(missing_indices(df_to_impute)[target_column]):
                df_to_impute.loc[index, target_column] = pred[target_column][i]
        if x == numeric_missing_cols:
            df_numeric = df_to_impute
        if x == categorical_missing_cols:
            df_categoric = df_to_impute

    return df_numeric.info(), df_categoric.info()


# main('data\df.csv', 'lgbm')

In [81]:
df1 = pd.read_csv(('data\df.csv')).drop(
    ['time', 'desc'], axis=1)
# display(var1)
# display(np.count_nonzero(var1))

In [86]:
possible_cat = []
for col in df1.select_dtypes(include='number').columns:
    unique_count = np.count_nonzero(df1[col].unique())
    if unique_count < 15:
        possible_cat.append(col)
possible_cat

['qtr', 'down', 'GoalToGo', 'FirstDown']

In [92]:
def find_cat(df, unique_count_lim=15):
    possible_cat = []
    for col in df.select_dtypes(include='number').columns:
        unique_count = np.count_nonzero(df1[col].unique())
        if unique_count < unique_count_lim:
            possible_cat.append(col)
    return possible_cat
var1 = find_cat(df1) + [1]
var1

['qtr', 'down', 'GoalToGo', 'FirstDown', 1]

In [47]:
main('data\df.csv', exclude=['time', 'desc'])

Index(['SideofField', 'posteam', 'DefensiveTeam'], dtype='object')
(407688, 12)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 407688 entries, 0 to 407687
Data columns (total 3 columns):
 #   Column         Non-Null Count   Dtype   
---  ------         --------------   -----   
 0   SideofField    407160 non-null  category
 1   posteam        382696 non-null  category
 2   DefensiveTeam  382696 non-null  category
dtypes: category(3)
memory usage: 1.2 MB


None

target column: DefensiveTeam
<class 'pandas.core.frame.DataFrame'>
Index: 382696 entries, 0 to 407687
Data columns (total 3 columns):
 #   Column         Non-Null Count   Dtype   
---  ------         --------------   -----   
 0   SideofField    382696 non-null  category
 1   posteam        382696 non-null  category
 2   DefensiveTeam  382696 non-null  category
dtypes: category(3)
memory usage: 4.0 MB


None

Fitting 1/3 columns


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


TypeError: Wrong type(str) or unknown name(DefensiveTeam) in categorical_feature

In [111]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from lightgbm import LGBMRegressor, LGBMClassifier


def cols_to_impute(df):
    cols = []
    for col in df.columns:
        if df[col].isnull().sum() != 0:
            cols.append(col)
    return cols


def complete_columns(df):
    cols = []
    for col in df.columns:
        if df[col].isnull().sum() == 0:
            cols.append(col)
    return cols


def missing_indices(df):
    indices = {}
    for col in cols_to_impute(df):
        indices[col] = df[df[col].isnull()].index.tolist()
    return indices


def find_cat(df, unique_count_lim=15):
    possible_cat = []
    for col in df.select_dtypes(include='number').columns:
        unique_count = np.count_nonzero(df1[col].unique())
        if unique_count < unique_count_lim:
            possible_cat.append(col)
    return possible_cat


def main(path, exclude=None):
    df = pd.read_csv(path)
    if exclude != None:
        df.drop(exclude, axis=1, inplace=True)
    
    cat_cols = df.select_dtypes(exclude='number').columns.to_list()
    cat_cols += find_cat(df)
    df[cat_cols] = df[cat_cols].astype('category')
    # display(df.info())

    missing_cols = cols_to_impute(df)

    pred = {}
    for i, target_column in enumerate(missing_cols):
        print(f'target column: {target_column}')

        # select imputer
        if target_column in cat_cols:
            imputer = LGBMClassifier(n_jobs=-1, verbose=-1)
        else:
            imputer = LGBMRegressor(n_jobs=-1, verbose=-1)

        # split trainset testset
        train_df = df.dropna()
        test_df = df[df[target_column].isnull()]
        X_train = train_df.drop(columns=[target_column])
        y_train = train_df[target_column]
        X_test = test_df.drop(columns=[target_column])

        # fitting
        # print(f'Fitting {i+1}/{len(missing_cols)} columns')
        imputer.fit(X_train, y_train)
        print(f'{i+1}/{len(missing_cols)} columns fitted')

        # prediction
        pred[target_column] = imputer.predict(X_test)

        # fill na
        for i, index in enumerate(missing_indices(df)[target_column]):
            df.loc[index, target_column] = pred[target_column][i]

    return df

In [112]:
df_imp = main('data\df.csv', exclude=['time', 'desc'])

target column: down
1/10 columns fitted
target column: TimeSecs
2/10 columns fitted
target column: PlayTimeDiff
3/10 columns fitted
target column: SideofField
4/10 columns fitted
target column: yrdln
5/10 columns fitted
target column: yrdline100
6/10 columns fitted
target column: GoalToGo
7/10 columns fitted
target column: FirstDown
8/10 columns fitted
target column: posteam
9/10 columns fitted
target column: DefensiveTeam
10/10 columns fitted


In [117]:
df_imp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 407688 entries, 0 to 407687
Data columns (total 15 columns):
 #   Column         Non-Null Count   Dtype   
---  ------         --------------   -----   
 0   Drive          407688 non-null  int64   
 1   qtr            407688 non-null  category
 2   down           407688 non-null  category
 3   TimeUnder      407688 non-null  int64   
 4   TimeSecs       407688 non-null  float64 
 5   PlayTimeDiff   407688 non-null  float64 
 6   SideofField    407688 non-null  category
 7   yrdln          407688 non-null  float64 
 8   yrdline100     407688 non-null  float64 
 9   ydstogo        407688 non-null  int64   
 10  ydsnet         407688 non-null  int64   
 11  GoalToGo       407688 non-null  category
 12  FirstDown      407688 non-null  category
 13  posteam        407688 non-null  category
 14  DefensiveTeam  407688 non-null  category
dtypes: category(7), float64(4), int64(4)
memory usage: 27.6 MB


In [116]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 407688 entries, 0 to 407687
Data columns (total 15 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Drive          407688 non-null  int64  
 1   qtr            407688 non-null  int64  
 2   down           346534 non-null  float64
 3   TimeUnder      407688 non-null  int64  
 4   TimeSecs       407464 non-null  float64
 5   PlayTimeDiff   407244 non-null  float64
 6   SideofField    407160 non-null  object 
 7   yrdln          406848 non-null  float64
 8   yrdline100     406848 non-null  float64
 9   ydstogo        407688 non-null  int64  
 10  ydsnet         407688 non-null  int64  
 11  GoalToGo       406848 non-null  float64
 12  FirstDown      378877 non-null  float64
 13  posteam        382696 non-null  object 
 14  DefensiveTeam  382696 non-null  object 
dtypes: float64(7), int64(5), object(3)
memory usage: 46.7+ MB
