In [52]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score

In [54]:
income_df = pd.read_csv('/home/student/IngajiDemo/dataset.csv')

In [142]:
# print(income_df.isna().sum())

In [55]:
columns_to_delete = [
    'Region',
    'Agricultural_Household_indicator',
    'Bread_and_Cereals_Expenditure',
    'Total_Rice_Expenditure',
    'Meat_Expenditure',
    'Total_Fish_and__marine_products_Expenditure',
    'Fruit_Expenditure',
    'Vegetables_Expenditure',
    'Restaurant_and_hotels_Expenditure',
    'Alcoholic_Beverages_Expenditure',
    'Tobacco_Expenditure',
    'Clothing,_Footwear_and_Other_Wear_Expenditure',
    'Special_Occasions_Expenditure',
    'Household_Head_Age',
    'Household_Head_Marital_Status',
    'Household_Head_Highest_Grade_Completed',
    'Household_Head_Job_or_Business_Indicator',
    'Household_Head_Occupation',
    'Total_number_of_family_members_employed',
    'Type_of_Building/House',
    'Type_of_Roof',
    'Type_of_Walls',
    'House_Floor_Area',
    'House_Age',
    'Number_of_bedrooms',
    'Toilet_Facilities',
    'Main_Source_of_Water_Supply',
    'Number_of_Television',
    'Number_of_CD/VCD/DVD',
    'Household_Head_Class_of_Worker',
    'Type_of_Household',
    'Number_of_Component/Stereo_set', 
    'Number_of_Refrigerator/Freezer',
    'Number_of_Washing_Machine',
    'Number_of_Airconditioner',
    'Number_of_Car,_Jeep,_Van',
    'Number_of_Landline/wireless_telephones',
    'Number_of_Cellular_phone',
    'Number_of_Personal_Computer',
    'Number_of_Stove_with_Oven/Gas_Range',
    'Number_of_Motorized_Banca',
    'Number_of_Motorcycle/Tricycle',     
]

In [56]:
income_df = income_df.drop(columns=columns_to_delete )

In [57]:
print(income_df.columns)

Index(['Total_Household_Income', 'Total_Food_Expenditure',
       'Main_Source_of_Income', 'Housing_and_water_Expenditure',
       'Imputed_House_Rental_Value', 'Medical_Care_Expenditure',
       'Transportation_Expenditure', 'Communication_Expenditure',
       'Education_Expenditure', 'Miscellaneous_Goods_and_Services_Expenditure',
       'Crop_Farming_and_Gardening_expenses',
       'Total_Income_from_Entrepreneurial_Acitivites', 'Household_Head_Sex',
       'Total_Number_of_Family_members',
       'Members_with_age_less_than_5_year_old',
       'Members_with_age_5_-_17_years_old', 'Tenure_Status', 'Electricity'],
      dtype='object')


In [58]:
# Defining my categorical columns
categorical_columns = ['Main_Source_of_Income', 'Tenure_Status']

In [59]:
#Extract categorical columns from the dataframe
#Here we extract the columns with object datatype as they are the categorical columns
categorical_columns = income_df.select_dtypes(include=['object']).columns.tolist()

#Initialize OneHotEncoder
encoder = OneHotEncoder(sparse_output=False)

# Apply one-hot encoding to the categorical columns
one_hot_encoded = encoder.fit_transform(income_df[categorical_columns])

#Create a DataFrame with the one-hot encoded columns
#We use get_feature_names_out() to get the column names for the encoded data
one_hot_df = pd.DataFrame(one_hot_encoded, columns=encoder.get_feature_names_out(categorical_columns))

# Concatenate the one-hot encoded dataframe with the original dataframe
df_encoded = pd.concat([income_df, one_hot_df], axis=1)

# Drop the original categorical columns
df_encoded = df_encoded.drop(categorical_columns, axis=1)

# Display the resulting dataframe
print(f"Encoded Employee data : \n{df_encoded}")

Encoded Employee data : 
       Total_Household_Income  Total_Food_Expenditure  \
0                      480332                  117848   
1                      198235                   67766   
2                       82785                   61609   
3                      107589                   78189   
4                      189322                   94625   
...                       ...                     ...   
41539                  119773                   44875   
41540                  137320                   31157   
41541                  133171                   45882   
41542                  129500                   81416   
41543                  128598                   78195   

       Housing_and_water_Expenditure  Imputed_House_Rental_Value  \
0                              63636                       30000   
1                              41370                       27000   
2                              14340                        7200   
3                 

In [25]:
print(income_df.columns)

Index(['Total_Household_Income', 'Total_Food_Expenditure',
       'Main_Source_of_Income', 'Housing_and_water_Expenditure',
       'Imputed_House_Rental_Value', 'Medical_Care_Expenditure',
       'Transportation_Expenditure', 'Communication_Expenditure',
       'Education_Expenditure', 'Miscellaneous_Goods_and_Services_Expenditure',
       'Crop_Farming_and_Gardening_expenses',
       'Total_Income_from_Entrepreneurial_Acitivites', 'Household_Head_Sex',
       'Total_Number_of_Family_members',
       'Members_with_age_less_than_5_year_old',
       'Members_with_age_5_-_17_years_old', 'Tenure_Status', 'Electricity'],
      dtype='object')


In [41]:
income_df['Creditworthiness'] = np.where(
    (income_df['Total_Household_Income'] > 100000) &
    (income_df['Total_Food_Expenditure'] < 50000) &
    (income_df['Total_Number_of_Family_members'] <= 4),
    1,
    0
)

In [42]:
# Define features and target
features = ['Total_Household_Income', 'Total_Food_Expenditure', 'Main_Source_of_Income',
             'Housing_and_water_Expenditure', 'Imputed_House_Rental_Value', 'Medical_Care_Expenditure',
             'Transportation_Expenditure', 'Communication_Expenditure', 'Education_Expenditure',
             'Miscellaneous_Goods_and_Services_Expenditure', 'Crop_Farming_and_Gardening_expenses',
             'Total_Income_from_Entrepreneurial_Acitivites', 'Household_Head_Sex', 'Total_Number_of_Family_members',
             'Members_with_age_less_than_5_year_old', 'Members_with_age_5_-_17_years_old', 'Tenure_Status']

In [44]:
# Extract features and target variable
X = income_df[features]
y = income_df['Creditworthiness']

In [50]:
# Identify categorical columns
categorical_features = ['Main_Source_of_Income', 'Tenure_Status']
numerical_features = [
    'Total_Household_Income', 'Total_Food_Expenditure', 'Housing_and_water_Expenditure', 
    'Imputed_House_Rental_Value', 'Medical_Care_Expenditure', 'Transportation_Expenditure', 
    'Communication_Expenditure', 'Education_Expenditure', 'Miscellaneous_Goods_and_Services_Expenditure', 
    'Crop_Farming_and_Gardening_expenses', 'Total_Income_from_Entrepreneurial_Acitivites', 
    'Total_Number_of_Family_members', 'Members_with_age_less_than_5_year_old', 
    'Members_with_age_5_-_17_years_old']

In [51]:
# Create preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(), categorical_features)
    ]
)

NameError: name 'StandardScaler' is not defined

In [47]:
# Create and fit the pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', GradientBoostingClassifier())
])

In [48]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [49]:
# Define the pipeline with preprocessing and model
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', GradientBoostingClassifier())
])

# Fit the model
model_pipeline.fit(X_train, y_train)

# Make predictions
y_pred = model_pipeline.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

ValueError: could not convert string to float: 'Male'

In [39]:
# Predict and evaluate the model
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred) * 100  # Convert to percentage

ValueError: could not convert string to float: 'Male'

In [17]:
print(f'Accuracy: {accuracy:.2f}%')

NameError: name 'accuracy' is not defined