In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Load dataset
file_path = 'C:\\Users\\lenovo\\Downloads\\Car Ownership.csv'  # Ganti dengan path file Anda
data = pd.read_csv(file_path)

# Periksa apakah ada nilai yang hilang (NaN) dalam dataset
print("Missing values:\n", data.isnull().sum())

# Mengisi nilai hilang pada kolom numerik dengan rata-rata
numerical_cols = data.select_dtypes(include=['float64', 'int64']).columns
data[numerical_cols] = data[numerical_cols].fillna(data[numerical_cols].mean())

# Mengisi nilai hilang pada kolom kategorikal dengan modus
categorical_cols = data.select_dtypes(include=['object']).columns
data[categorical_cols] = data[categorical_cols].fillna(data[categorical_cols].mode().iloc[0])

# Salin dataset untuk pemrosesan
df = data.copy()

# Target variable
target = 'Car'

# Encode target variable (Yes -> 1, No -> 0)
label_encoder = LabelEncoder()
df[target] = label_encoder.fit_transform(df[target])

# One-hot encoding for categorical columns, kecuali kolom target
df = pd.get_dummies(df, columns=categorical_cols.drop(target, errors='ignore'), drop_first=True)

# Pastikan kolom numerik ada dalam format numerik dan tidak ada simbol atau teks yang mengganggu
for col in numerical_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Normalisasi kolom numerik
scaler = StandardScaler()
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

# Definisikan fitur dan target
X = df.drop(columns=[target])
y = df[target]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a logistic regression model
logistic_model = LogisticRegression(random_state=42)
logistic_model.fit(X_train, y_train)

# Predict on the test set
y_pred = logistic_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")
print("\nClassification Report:")
print(report)


Missing values:
 Occupation              10
Monthly Income          13
Credit Score            40
Years of Employment     43
Finance Status          23
Finance History         28
Car                     20
Number of Children     125
dtype: int64
Accuracy: 0.89

Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.84      0.85        38
           1       0.90      0.92      0.91        62

    accuracy                           0.89       100
   macro avg       0.88      0.88      0.88       100
weighted avg       0.89      0.89      0.89       100

