In [135]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

In [136]:
income_df = pd.read_csv('/home/student/IngajiDemo/dataset.csv')

In [137]:
# print(income_df.isna().sum())

In [138]:
income_df['Household_Head_Occupation'] = income_df.groupby('Household_Head_Highest_Grade_Completed')['Household_Head_Occupation'].transform(lambda x: x.ffill().bfill())

In [80]:
# print(income_df.isna().sum())

In [139]:
income_df['Household_Head_Class_of_Worker'] = income_df.groupby('Household_Head_Occupation')['Household_Head_Class_of_Worker'].transform(lambda x: x.ffill().bfill())

In [140]:
income_df['Total_Number_of_Family_members'] = income_df.groupby('Type_of_Household')['Total_Number_of_Family_members'].transform(lambda x: x.ffill().bfill())

In [14]:
# print(income_df.isna().sum())

## Column Description
1. Total_Household_Income: Total income of the household, which is crucial for assessing repayment capacity.
2. Region: Geographical location, which may influence income levels and access to resources.
3. Total_Food_Expenditure: Indicates spending habits and financial management skills, important for understanding financial stability.
4. Main_Source_of_Income: Identifies primary income source, affecting economic stability and predictability of income.
5. Agricultural_Household_Indicator: Whether the household is primarily engaged in agriculture, relevant for understanding risks associated with agricultural loans.
6. Bread_and_Cereals_Expenditure: Spending on staples; high expenditure may indicate financial strain or food security issues.
7. Total_Rice_Expenditure: Specific food expenditure, important in rice-dependent regions for assessing basic needs coverage.
8. Meat_Expenditure: Indicates dietary habits and potential disposable income.
9. Total_Fish_and_Marine_Products_Expenditure: Reflects dietary diversity and potential economic stability.
10. Fruit_Expenditure: Another measure of dietary habits, can indicate health and economic conditions.
11. Vegetables_Expenditure: Similar to fruit expenditure, reflecting spending habits and nutritional status.
12. Restaurant_and_Hotels_Expenditure: Indicates discretionary spending; lower spending could signify economic hardship.
13. Alcoholic_Beverages_Expenditure: Spending on alcohol may reflect lifestyle choices affecting financial management.
14. Tobacco_Expenditure: Spending on tobacco can indicate financial priorities and health-related costs.
15. Clothing, Footwear, and Other Wear Expenditure: A measure of consumer spending reflecting lifestyle and disposable income.
16. Housing_and_Water_Expenditure: Vital for understanding housing stability, which impacts overall creditworthiness.
17. Imputed_House_Rental_Value: Represents potential income from owning property, relevant for asset valuation.
18. Medical_Care_Expenditure: Indicates financial burden from healthcare, affecting available resources for loan repayment.
19. Transportation_Expenditure: Reflects mobility and potential for economic activity, impacting income generation.
20. Communication_Expenditure: Vital for assessing connectivity and access to information, which can affect business opportunities.
21. Education_Expenditure: Higher spending may correlate with better future income potential, influencing loan repayment prospects.
22. Miscellaneous_Goods_and_Services_Expenditure: Other expenditures can indicate overall financial health.
23. Special_Occasions_Expenditure: Reflects cultural practices and potential financial stress during celebrations.
24. Crop_Farming_and_Gardening_expenses: Directly relevant for assessing investment in agriculture, crucial for loan evaluation.
25. Total_Income_from_Entrepreneurial_Activities: Additional income source, important for total income assessment and repayment capacity.
26. Household_Head_Sex: Gender may influence household income dynamics and access to credit.
27. Household_Head_Age: Age may correlate with experience and financial stability.
28. Household_Head_Marital_Status: Marital status can affect household stability and income generation.
29. Household_Head_Highest_Grade_Completed: Education level can influence income potential and loan eligibility.
30. Household_Head_Job_or_Business_Indicator: Employment status directly impacts financial stability.
31. Household_Head_Occupation: Specific job type can indicate income levels and financial security.
32. Household_Head_Class_of_Worker: Employment classification affects income stability.
33. Type_of_Household: Structure of the household can impact financial dynamics and resource allocation.
34. Total_Number_of_Family_members: Family size can affect income distribution and financial burden.
35. Members_with_age_less_than_5_year_old: Younger dependents can indicate higher household expenses.
36. Members_with_age_5_-_17_years_old: Children in this age range may reflect education-related expenditures.
37. Total_number_of_family_members_employed: Employment levels can provide insights into household income stability.
38. Type_of_Building/House: Type of dwelling can reflect economic status and potential asset value.
39. Type_of_Roof: Indicates quality of housing, which can correlate with wealth.
40. Type_of_Walls: Similar to roof type, reflects household stability and asset quality.
41. House_Floor_Area: Larger areas may indicate better economic conditions and asset stability.
42. House_Age: Older houses may require more maintenance, impacting financial health.
43. Number_of_bedrooms: Indicates household size and living conditions.
44. Tenure_Status: Owning vs. renting impacts financial stability and asset value.
45. Toilet_Facilities: Access to sanitation can reflect overall living standards.
46. Electricity: Access to electricity indicates economic development and potential for productivity.
47. Main_Source_of_Water_Supply: Water access impacts living conditions and agricultural viability.
48. Number_of_Television: Ownership may reflect disposable income.
49. Number_of_CD/VCD/DVD: Similar to television ownership, reflects spending on entertainment.
50. Number_of_Component/Stereo_set: Indicates lifestyle and economic status.
52. Number_of_Washing_Machine: Ownership reflects financial capability and lifestyle
53. Number_of_Airconditioner: Indicates economic status and comfort level.
54. Number_of_Car,_Jeep,_Van: Vehicle ownership signifies economic capability and mobility.
55. Number_of_Landline/wireless_telephones: Communication access affects connectivity and business potential.
56. Number_of_Cellular_phone: Smartphone ownership reflects economic status and communication access.
57. Number_of_Personal_Computer: Indicates technological access, affecting education and economic opportunities.
58. Number_of_Stove_with_Oven/Gas_Range: Kitchen appliances can indicate household economic status.
59. Number_of_Motorized_Banca: Relevant for rural economies; reflects livelihood and mobility.
60. Number_of_Motorcycle/Tricycle: Transportation availability can impact access to markets and services.

In [141]:
income_df.rename(columns={'Total_Food_Expenditure': 'total_food_expenditure', 
                          'Bread_and_Cereals_Expenditure': 'bread_and_cereals_expenditure',
                          'Total_Rice_Expenditure': 'total_rice_expenditure',
                          'Meat_Expenditure': 'meat_expenditure',
                          'Total_Fish_and__marine_products_Expenditure': 'total_fish_and__marine_products_expenditure',
                          'Fruit_Expenditure': 'fruit_expenditure',
                          'Vegetables_Expenditure': 'vegetables_expenditure',
                          'Restaurant_and_hotels_Expenditure': 'restaurant_and_hotels_expenditure',
                          'Alcoholic_Beverages_Expenditure': 'alcoholic_beverages_expenditure',
                          'Tobacco_Expenditure': 'tobacco_expenditure',
                          'Clothing,_Footwear_and_Other_Wear_Expenditure': 'clothing_footwear_and_other_wear_expenditure',
                          'Housing_and_water_Expenditure': 'housing_and_water_expenditure',
                          'Medical_Care_Expenditure': 'medical_care_expenditure',
                          'Transportation_Expenditure': 'transportation_expenditure',
                          'Communication_Expenditure': 'communication_expenditure',
                          'Education_Expenditure': 'education_expenditure',
                          'Miscellaneous_Goods_and_Services_Expenditure': 'miscellaneous_goods_and_services_expenditure',
                          'Special_Occasions_Expenditure': 'special_occasions_expenditure',
                          'Main_Source_of_Income': 'main_source_of_income',
                          'Household_Head_Highest_Grade_Completed': 'household_head_highest_grade_completed',
                          'Household_Head_Occupation': 'household_head_occupation',
                          'Tenure_Status': 'tenure_status',
                          'Type_of_Building/House': 'type_of_building/house',
                          'Type_of_Household': 'type_of_household',
                          'Household_Head_Class_of_Worker': 'household_head_class_of_worker',
                          'Total_Household_Income': 'total_household_income',
                          'Total_Expenditures': 'total_expenditures',
                          'Crop_Farming_and_Gardening_expenses': 'crop_farming_and_gardening_expenses'
                         }, inplace=True)

In [142]:
# Define expenditure columns
expenditure_columns = [
    'total_food_expenditure', 'bread_and_cereals_expenditure', 'total_rice_expenditure',
    'meat_expenditure', 'total_fish_and__marine_products_expenditure', 'fruit_expenditure',
    'vegetables_expenditure', 'restaurant_and_hotels_expenditure', 'alcoholic_beverages_expenditure',
    'tobacco_expenditure', 'clothing_footwear_and_other_wear_expenditure',
    'housing_and_water_expenditure', 'medical_care_expenditure', 'transportation_expenditure',
    'communication_expenditure', 'education_expenditure', 'miscellaneous_goods_and_services_expenditure',
    'special_occasions_expenditure', 'crop_farming_and_gardening_expenses'
]

In [143]:
# Filter for farmers' occupations
farmer_occupations = [
    'Farmers', 'Agricultural laborers', 'Rice farmers', 'Crop Farming and Gardening',
    'Livestock farmers', 'Agricultural technicians', 'Farmhands and laborers', 'Hog raising farmers'
]

In [144]:
farmers_df = income_df[income_df['household_head_occupation'].isin(farmer_occupations)].copy()

In [145]:
farmers_df['total_expenditures'] = farmers_df[expenditure_columns].sum(axis=1)

In [146]:
farmers_df['remaining_income'] = farmers_df['total_household_income'] - farmers_df['total_expenditures']

In [147]:
# Assign credit scores
def assign_credit_score(row):
    if row['remaining_income'] > 50000:
        return 750
    elif row['remaining_income'] > 20000:
        return 600
    elif row['remaining_income'] > 5000:
        return 500
    else:
        return 400

In [164]:
farmers_df['credit_score'] = farmers_df.apply(assign_credit_score, axis=1)
# farmers_df.head(50)

In [133]:
# Print DataFrame info to diagnose data types
# print("DataFrame Info:\n", farmers_df.info())

In [149]:
# Check each numeric column for non-numeric values
numeric_columns = ['total_household_income', 'total_expenditures', 'remaining_income']
for col in numeric_columns:
    print(f"Unique values in {col}:")
    print(farmers_df[col].unique())

 # Identify non-numeric entries
    non_numeric = farmers_df[~farmers_df[col].apply(lambda x: isinstance(x, (int, float)))][col]
    if not non_numeric.empty:
        print(f"Non-numeric entries in {col}:")
        print(non_numeric)


Unique values in total_household_income:
[ 82785 107589 198621 ... 137320 133171 129500]
Unique values in total_expenditures:
[178294 223671 365287 ... 168937 189125 311099]
Unique values in remaining_income:
[ -95509 -116082 -166666 ...  -31617  -55954 -181599]


In [150]:
# Convert numeric columns, coercing errors to NaN
for col in numeric_columns:
    farmers_df[col] = pd.to_numeric(farmers_df[col], errors='coerce')

In [151]:
print("NaN counts in numeric columns:\n", farmers_df[numeric_columns].isnull().sum())

NaN counts in numeric columns:
 total_household_income    0
total_expenditures        0
remaining_income          0
dtype: int64


In [152]:
# Prepare features and target variable
categorical_columns = [
    'main_source_of_income', 'household_head_highest_grade_completed', 'household_head_occupation', 
    'tenure_status', 'type_of_building/house', 'type_of_household', 'household_head_class_of_worker'
]

In [160]:
# Encode categorical variables
le = LabelEncoder()
for col in categorical_columns:
    farmers_df[col] = le.fit_transform(farmers_df[col])
# farmers_df.head(20)

In [161]:
# Define features and target variable
X = farmers_df.drop(columns=['credit_score', 'remaining_income', 'total_expenditures'], errors='ignore')
y = farmers_df['credit_score']

In [162]:
# Check if target variable y exists
if 'credit_score' in farmers_df.columns:
    print("Unique credit scores in target variable:", y.unique())
else:
    print("credit_score not found.")

Unique credit scores in target variable: [400 750 500 600]


In [156]:
# Check if target variable y exists
if 'credit_score' in farmers_df.columns:
    print("Unique credit scores in target variable:", y.unique())
else:
    print("credit_score not found.")

Unique credit scores in target variable: [400 750 500 600]


In [157]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [158]:
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:")
print(report)

ValueError: could not convert string to float: 'Caraga'