In [288]:

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split


In [289]:

def fill_cols(df, null_limit=300, null_flag="tbd"):
    
    encoder = LabelEncoder()
           
    null_ct = df.isnull().sum()
    null_drop = null_ct[null_ct <= null_limit].index
    null_fill = null_ct.index.difference(null_drop)
    
    df = df.dropna(subset=null_drop)
    
    for col in null_fill:
        if df[col].dtype != "float64":
            # get indices with non-null values
            not_null = df[col].notna() 
             
            # replace null-flagged strings with null
            df.loc[df[col].values == null_flag, col] = None
            
            # try to convert col to numeric, else use label encoder
            try:
                df.loc[not_null, col]= pd.to_numeric(df[col], errors="raise")
            except ValueError:
                df.loc[not_null, col] = encoder.fit_transform(df.loc[not_null, col])

        # fill in missing values with column mean
        df.loc[df[col].isna(), col] = df[col].mean(axis=0)
        
    for col in df.select_dtypes(exclude='float').columns:
        df.loc[:, col] = encoder.fit_transform(df.loc[:, col]).astype('float64')
        
    return df


In [290]:

df = pd.read_csv('data\\Video_Games.csv')
df.head()


Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating
0,Wii Sports,Wii,2006.0,Sports,Nintendo,41.36,28.96,3.77,8.45,82.53,76.0,51.0,8.0,322.0,Nintendo,E
1,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24,,,,,,
2,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.68,12.76,3.79,3.29,35.52,82.0,73.0,8.3,709.0,Nintendo,E
3,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.61,10.93,3.28,2.95,32.77,80.0,73.0,8.0,192.0,Nintendo,E
4,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37,,,,,,


In [291]:

df = fill_cols(df)
df.info()


<class 'pandas.core.frame.DataFrame'>
Index: 16416 entries, 0 to 16718
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Name             16416 non-null  object 
 1   Platform         16416 non-null  object 
 2   Year_of_Release  16416 non-null  float64
 3   Genre            16416 non-null  object 
 4   Publisher        16416 non-null  object 
 5   NA_Sales         16416 non-null  float64
 6   EU_Sales         16416 non-null  float64
 7   JP_Sales         16416 non-null  float64
 8   Other_Sales      16416 non-null  float64
 9   Global_Sales     16416 non-null  float64
 10  Critic_Score     16416 non-null  float64
 11  Critic_Count     16416 non-null  float64
 12  User_Score       16416 non-null  float64
 13  User_Count       16416 non-null  float64
 14  Developer        16416 non-null  object 
 15  Rating           16416 non-null  object 
dtypes: float64(10), object(6)
memory usage: 2.1+ MB
