In [52]:
#import necessary libraries
from sklearn.ensemble import RandomForestRegressor
from pandas.api.types import is_string_dtype, is_numeric_dtype
from IPython.display import display
from sklearn import metrics
import numpy as np
import pandas as pd
import math

PATH = "D:/PY/nba_salary/player_info.csv"   #path for reading csv file
df = pd.read_csv(f"{PATH}",low_memory= False)
df["gender"]


0        F
1        F
2        F
3        F
4        M
      ... 
995      M
996      M
997      M
998      F
999    NaN
Name: gender, Length: 1000, dtype: object

In [53]:
#categorize the data
def train_cats(df):
    for n,c in df.items():
        if is_string_dtype(c):
            df[n] = c.astype("category").cat.as_ordered()

def apply_cats(df,train):
    for n,c in df.items():
        if train[n].dtype == "category":
            df[n] = pd.Categorical(c,categories=train[n].cat.categories,ordered=True)

train_cats(df)
df["gender"].cat.codes

df["gender"].cat.categories

df["gender"].cat.set_categories(["M","F"],ordered=True,inplace=True)
df["gender"].cat.codes

def numericalize(df,col,name):
    if not is_numeric_dtype(col):
        df[name] = col.cat.codes + 1

numericalize(df,df["gender"],"gender")

df["gender"]


  res = method(*args, **kwargs)


0      2
1      2
2      2
3      2
4      1
      ..
995    1
996    1
997    1
998    2
999    0
Name: gender, Length: 1000, dtype: int8

In [75]:
#fixing the missing values
def fix_missing(df,col,name,nan_dict,is_train):
    if is_train:
        if is_numeric_dtype(col):
            if pd.isnull(col).sum():
                df[name+"_na"]=pd.isnull(col)
                nan_dict[name] = col.median()
                df[name] = col.fillna((pd.Series([nan_dict[name]])))
    
    else:
        if is_numeric_dtype(col):
            if name in nan_dict:
                df[name+"_na"]=pd.isnull(col)
                df[name] = col.fillna((pd.Series([nan_dict[name]])))
            else:
                df[name] = col.fillna((pd.Series(df[name]).median()))

In [76]:
#merge the numericalize and fix missing function
# add a column for keeping null data both train and validation 
def proc_df(df,y_fld,nan_dict=None,is_train=True):
    
    df = df.copy()
    
    y = df[y_fld].values
    df.drop(y_fld,axis=1,inplace=True)

    if nan_dict is None:
        nan_dict={}

    for n,c in df.items():
        fix_missing(df,c,n,nan_dict,is_train)
        numericalize(df,c,n)

    if is_train:
        return df,y,nan_dict
    return df,y

In [77]:
#split data for train  and validation
def split_train_value(df,n):
    return df[:n].copy(),df[n:].copy()

n_valid = 200
n_train = len(df)-n_valid
raw_train,raw_valid = split_train_value(df,n_train)

In [78]:
#create traina and validation data
x_train,y_train, nas = proc_df(raw_train, 'salary')
x_valid,y_valid = proc_df(raw_valid,"salary",nan_dict=nas,is_train=False)


In [79]:
#apply model
m = RandomForestRegressor(n_estimators=30,n_jobs=-1)
m.fit(x_train,y_train)
m.score(x_train,y_train)

ValueError: Input X contains NaN.
RandomForestRegressor does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [None]:
#results
def rmse(x,y):
    return math.sqrt(((x-2)**2).mean())
def print_score(m):
    print(f"RMSE of train set: {rmse(m.predict(x_train),y_train)}")
    print(f"RMSE of valid set: {rmse(m.predict(x_valid),y_valid)}")
    print(f"R^2 of train set: {m.score(x_train,y_train)}")
    print(f"R^2 of valid set: {m.score(x_valid,y_valid)}")

print_score(m)