In [10]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.calibration import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import (
    mean_squared_error,
    r2_score,
    mean_absolute_error,
    mean_absolute_percentage_error,
)

# Load the data
data = sns.load_dataset("titanic")

data.drop("deck", axis=1, inplace=True)
columns_to_encode = [
    "sex",
    "embarked",
    "class",
    "alone",
    "adult_male",
    "who",
    "embark_town",
    "alive",
]

# Dictionary to store both categories and their encoded mappings
encoding_info_dict = {}

for col in columns_to_encode:
    le = LabelEncoder()

    # Fit and transform
    data[col] = le.fit_transform(data[col])

    # Create mapping dictionary
    category_to_number = dict(zip(le.classes_, range(len(le.classes_))))
    number_to_category = dict(zip(range(len(le.classes_)), le.classes_))

    # Store comprehensive encoding information
    encoding_info_dict[col] = {
        "categories": le.classes_,
        "category_to_number": category_to_number,
        "number_to_category": number_to_category,
    }


df_with_mising_values = data[data["age"].isnull()]
df_with_mising_values
df_without_mising_values = data.dropna()
df_without_mising_values


X = df_without_mising_values.drop("age", axis=1)
y = df_without_mising_values["age"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

rf_model = RandomForestRegressor(n_estimators=1000, random_state=42)
rf_model.fit(X_train, y_train)

y_pred = rf_model.predict(X_test)
print("Root mean_squared_error", np.sqrt(mean_squared_error(y_test, y_pred)))
print("r2_score", (r2_score(y_test, y_pred)))
print("mean_absolute_error", (mean_absolute_error(y_test, y_pred)))
print(
    "mean_absolute_percentage_error", (mean_absolute_percentage_error(y_test, y_pred))
)

Root mean_squared_error 11.127124182032897
r2_score 0.33220017616292374
mean_absolute_error 8.707152235084173
mean_absolute_percentage_error 0.4083347810017931


In [11]:
df_with_mising_values.isnull().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         0
class            0
who              0
adult_male       0
embark_town      0
alive            0
alone            0
dtype: int64

In [13]:
y_pred = rf_model.predict(df_with_mising_values.drop(["age"], axis=1))
len(y_pred)

177

In [15]:
len(df_with_mising_values["age"])
df_with_mising_values["age"] = y_pred

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_with_mising_values["age"] = y_pred


In [17]:
df_with_mising_values.isnull().sum()

survived       0
pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           0
embarked       0
class          0
who            0
adult_male     0
embark_town    0
alive          0
alone          0
dtype: int64

In [18]:
data_cleaned = pd.concat([df_without_mising_values, df_with_mising_values], axis=0)

In [21]:
# data_cleaned.to_csv("titanic_data_cleaned.csv", index=False)
data_cleaned

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,embark_town,alive,alone
0,0,3,1,22.000000,1,0,7.2500,2,2,1,1,2,0,0
1,1,1,0,38.000000,1,0,71.2833,0,0,2,0,0,1,0
2,1,3,0,26.000000,0,0,7.9250,2,2,2,0,2,1,1
3,1,1,0,35.000000,1,0,53.1000,2,0,2,0,2,1,0
4,0,3,1,35.000000,0,0,8.0500,2,2,1,1,2,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
859,0,3,1,27.307609,0,0,7.2292,0,2,1,1,0,0,1
863,0,3,0,25.551054,8,2,69.5500,2,2,2,0,2,0,0
868,0,3,1,24.986880,0,0,9.5000,2,2,1,1,2,0,1
878,0,3,1,26.762896,0,0,7.8958,2,2,1,1,2,0,1
