In [142]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report


## 1. BUSINESS UNDERSTANDING

## 2. DATA UNDERSTANDING

### 1. Loading Data

In [143]:
# Load the datasets
train_features = pd.read_csv(r"C:\Users\David\Documents\PHASE 4 PROJECT\PHASE 5 FINAL PROJECT\train_features.csv")
test_features = pd.read_csv(r"C:\Users\David\Documents\PHASE 4 PROJECT\PHASE 5 FINAL PROJECT\test_features.csv")
train_labels = pd.read_csv(r"C:\Users\David\Documents\PHASE 4 PROJECT\PHASE 5 FINAL PROJECT\train_labels.csv")

# Check the first few rows of each dataframe
print(train_features.head())
print(test_features.head())
print(train_labels.head())



    uid    age_03     urban_03  married_03  n_mar_03    edu_gru_03  \
0  aace       NaN          NaN         NaN       NaN           NaN   
1  aanz       NaN          NaN         NaN       NaN           NaN   
2  aape       NaN          NaN         NaN       NaN           NaN   
3  aard  1. 50–59  1. 100,000+  3. Widowed       1.0  3. 7–9 years   
4  ablr       NaN          NaN         NaN       NaN           NaN   

  n_living_child_03  migration_03 glob_hlth_03  adl_dress_03  ...  \
0               NaN           NaN          NaN           NaN  ...   
1               NaN           NaN          NaN           NaN  ...   
2               NaN           NaN          NaN           NaN  ...   
3         1. 1 or 2           0.0      4. Fair           0.0  ...   
4               NaN           NaN          NaN           NaN  ...   

            rrelgimp_12            rrfcntx_m_12              rsocact_m_12  \
0  2.somewhat important                 9.Never                   9.Never   
1      1.v

#### a) View Summary Statistics

In [144]:
print(train_features.describe())
print(test_features.describe())
print(train_labels.describe())

          n_mar_03  migration_03  adl_dress_03  adl_walk_03  adl_bath_03  \
count  2222.000000   2241.000000   2105.000000  2235.000000  2235.000000   
mean      1.134113      0.099063      0.041805     0.017002     0.007159   
std       0.482953      0.298813      0.200191     0.129308     0.084325   
min       0.000000      0.000000      0.000000     0.000000     0.000000   
25%       1.000000      0.000000      0.000000     0.000000     0.000000   
50%       1.000000      0.000000      0.000000     0.000000     0.000000   
75%       1.000000      0.000000      0.000000     0.000000     0.000000   
max       5.000000      1.000000      1.000000     1.000000     1.000000   

        adl_eat_03   adl_bed_03  adl_toilet_03     n_adl_03  iadl_money_03  \
count  2234.000000  2235.000000    2235.000000  2234.000000    2105.000000   
mean      0.004476     0.026398       0.013423     0.068487       0.005226   
std       0.066770     0.160352       0.115102     0.392793       0.072117   
min

#### b) Checking and Handling Missing Values


In [145]:
# Find missing values
print(train_features.isnull().sum())
print(test_features.isnull().sum())


uid              0
age_03        1036
urban_03      1034
married_03    1034
n_mar_03      1054
              ... 
a21_12        3234
a22_12        3240
a33b_12       3234
a34_12        1164
j11_12          75
Length: 184, dtype: int64
uid             0
age_03        249
urban_03      249
married_03    249
n_mar_03      251
             ... 
a21_12        808
a22_12        809
a33b_12       808
a34_12        277
j11_12         24
Length: 184, dtype: int64


There are so many missing values in this dataset. I will proceed with handling the missing values

In [146]:
# Drop columns with more than 40% missing values in both datasets
threshold = 0.4
train_features_cleaned = train_features.loc[:, train_features.isnull().mean() < threshold]
test_features_cleaned = test_features.loc[:, test_features.isnull().mean() < threshold]



In [147]:
# Impute missing values for numerical features
for col in train_features_cleaned.select_dtypes(include=['float64', 'int64']).columns:
    train_features_cleaned[col].fillna(train_features_cleaned[col].mean(), inplace=True)

for col in test_features_cleaned.select_dtypes(include=['float64', 'int64']).columns:
    test_features_cleaned[col].fillna(test_features_cleaned[col].mean(), inplace=True)

# Impute missing values for categorical features
for col in train_features_cleaned.select_dtypes(include=['object']).columns:
    train_features_cleaned[col].fillna(train_features_cleaned[col].mode()[0], inplace=True)

for col in test_features_cleaned.select_dtypes(include=['object']).columns:
    test_features_cleaned[col].fillna(test_features_cleaned[col].mode()[0], inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(


In [148]:
print(train_features.dtypes)


uid            object
age_03         object
urban_03       object
married_03     object
n_mar_03      float64
               ...   
a21_12        float64
a22_12         object
a33b_12        object
a34_12         object
j11_12         object
Length: 184, dtype: object


Identify Columns for Conversion

In [149]:
# List of object columns
object_columns = train_features.select_dtypes(include=['object']).columns
print("Object Columns:\n", object_columns)


Object Columns:
 Index(['uid', 'age_03', 'urban_03', 'married_03', 'edu_gru_03',
       'n_living_child_03', 'glob_hlth_03', 'bmi_03', 'decis_famil_03',
       'employment_03', 'age_12', 'urban_12', 'married_12', 'edu_gru_12',
       'n_living_child_12', 'glob_hlth_12', 'bmi_12', 'decis_famil_12',
       'decis_personal_12', 'employment_12', 'satis_ideal_12',
       'satis_excel_12', 'satis_fine_12', 'cosas_imp_12', 'wouldnt_change_12',
       'memory_12', 'ragender', 'rameduc_m', 'rafeduc_m', 'sgender_03',
       'rjlocc_m_03', 'rjobend_reason_03', 'rrelgimp_03', 'sgender_12',
       'rjlocc_m_12', 'rjobend_reason_12', 'rrelgimp_12', 'rrfcntx_m_12',
       'rsocact_m_12', 'rrelgwk_12', 'a22_12', 'a33b_12', 'a34_12', 'j11_12'],
      dtype='object')


Identify Categorical Columns

In [150]:
# Check unique values in some categorical columns
for col in ['age_03', 'urban_03', 'married_03']:
    print(f"{col} unique values:\n", train_features[col].unique())


age_03 unique values:
 [nan '1. 50–59' '3. 70–79' '2. 60–69' '0. 49 or younger' '4. 80+']
urban_03 unique values:
 [nan '1. 100,000+' '0. <100,000']
married_03 unique values:
 [nan '3. Widowed' '1. Married or in civil union' '4. Single'
 '2. Separated or divorced']


Convert Categorical Columns

I will use label encoder

In [151]:
from sklearn.preprocessing import LabelEncoder

# Create a LabelEncoder instance
label_encoders = {}

# Convert categorical columns to numeric
categorical_columns = ['age_03', 'urban_03', 'married_03', 'edu_gru_03', 'employment_03']  

for col in categorical_columns:
    le = LabelEncoder()
    train_features[col] = le.fit_transform(train_features[col].astype(str))  # Convert to string first 
    label_encoders[col] = le  # Save the encoder for inverse transformation later


##### Convert Relevant Numeric Columns

Use pd.to_numeric() for columns that should be numeric.

In [152]:
# Identify columns that should be numeric
numeric_columns = ['n_mar_03', 'bmi_03'] 

for col in numeric_columns:
    train_features[col] = pd.to_numeric(train_features[col], errors='coerce')


Check and fill or drop NaN values as needed

In [153]:
# Check for NaN values
print(train_features.isnull().sum())


uid              0
age_03           0
urban_03         0
married_03       0
n_mar_03      1054
              ... 
a21_12        3234
a22_12        3240
a33b_12       3234
a34_12        1164
j11_12          75
Length: 184, dtype: int64


In [154]:
# Fill NaNs for numeric columns
for col in numeric_columns:
    train_features[col].fillna(train_features[col].mean(), inplace=True)

# drop rows with NaN values for categorical columns 
train_features.dropna(subset=categorical_columns, inplace=True)

In [155]:
#check the data types again
print(train_features.dtypes)


uid            object
age_03          int32
urban_03        int32
married_03      int32
n_mar_03      float64
               ...   
a21_12        float64
a22_12         object
a33b_12        object
a34_12         object
j11_12         object
Length: 184, dtype: object


In [156]:
# Identify remaining object columns
object_columns = train_features.select_dtypes(include=['object']).columns
print("Remaining Object Columns:\n", object_columns)


Remaining Object Columns:
 Index(['uid', 'n_living_child_03', 'glob_hlth_03', 'decis_famil_03', 'age_12',
       'urban_12', 'married_12', 'edu_gru_12', 'n_living_child_12',
       'glob_hlth_12', 'bmi_12', 'decis_famil_12', 'decis_personal_12',
       'employment_12', 'satis_ideal_12', 'satis_excel_12', 'satis_fine_12',
       'cosas_imp_12', 'wouldnt_change_12', 'memory_12', 'ragender',
       'rameduc_m', 'rafeduc_m', 'sgender_03', 'rjlocc_m_03',
       'rjobend_reason_03', 'rrelgimp_03', 'sgender_12', 'rjlocc_m_12',
       'rjobend_reason_12', 'rrelgimp_12', 'rrfcntx_m_12', 'rsocact_m_12',
       'rrelgwk_12', 'a22_12', 'a33b_12', 'a34_12', 'j11_12'],
      dtype='object')


In [157]:

# Convert remaining categorical columns
for col in object_columns:
    le = LabelEncoder()
    train_features[col] = le.fit_transform(train_features[col].astype(str))  


In [158]:
# Check for NaN values
nan_counts = train_features.isnull().sum()
print("NaN Counts:\n", nan_counts[nan_counts > 0])


NaN Counts:
 migration_03       1035
adl_dress_03       1171
adl_walk_03        1041
adl_bath_03        1041
adl_eat_03         1042
                   ... 
hinc_cap_12          89
rinc_pension_12      89
sinc_pension_12    1185
a16a_12            3252
a21_12             3234
Length: 140, dtype: int64


In [159]:
 #drop specific columns that have too many missing values
train_features.drop(columns=['a16a_12', 'a21_12'], inplace=True) 

In [160]:
# Fill numerical columns with mean 
numerical_cols = train_features.select_dtypes(include=['float64', 'int64', 'int32']).columns
for col in numerical_cols:
    train_features[col].fillna(train_features[col].mean(), inplace=True)  

# Fill categorical columns with mode
categorical_cols = train_features.select_dtypes(include=['object']).columns
for col in categorical_cols:
    train_features[col].fillna(train_features[col].mode()[0], inplace=True)  


In [161]:
#check the data again to ensure there are no remaining missing values
print(train_features.isnull().sum().sum())  


3276


In [162]:
# Count of missing values in each column
missing_counts = train_features.isnull().sum()
missing_columns = missing_counts[missing_counts > 0]

print("Columns with Missing Values:\n", missing_columns)


Columns with Missing Values:
 bmi_03    3276
dtype: int64


In [163]:
# Drop the bmi_03 column
train_features.drop(columns=['bmi_03'], inplace=True)

# Check to ensure it has been dropped
print("Columns after dropping bmi_03:", train_features.columns)


Columns after dropping bmi_03: Index(['uid', 'age_03', 'urban_03', 'married_03', 'n_mar_03', 'edu_gru_03',
       'n_living_child_03', 'migration_03', 'glob_hlth_03', 'adl_dress_03',
       ...
       'rinc_pension_12', 'sinc_pension_12', 'rrelgimp_12', 'rrfcntx_m_12',
       'rsocact_m_12', 'rrelgwk_12', 'a22_12', 'a33b_12', 'a34_12', 'j11_12'],
      dtype='object', length=181)


In [164]:
# Check for NaN values
print(train_features.isnull().sum().sum())  # Should be 0 if all missing values are handled


0


All missing values have been handled

In [165]:
# Align the test features with the training features
test_features_aligned = test_features_cleaned.reindex(columns=train_features_cleaned.columns, fill_value=0)


### 3. Define Labels

In [167]:
# Load the training labels
train_labels = pd.read_csv(r"C:\Users\David\Documents\PHASE 4 PROJECT\PHASE 5 FINAL PROJECT\train_labels.csv")  
train_labels.head()


Unnamed: 0,uid,year,composite_score
0,aace,2021,175
1,aanz,2021,206
2,aape,2016,161
3,aape,2021,144
4,aard,2021,104


#### a) Check the Shape of the Datasets

In [169]:
print("Training Features Shape:", train_features.shape)
print("Training Labels Shape:", train_labels.shape)  


Training Features Shape: (3276, 181)
Training Labels Shape: (4343, 3)


##### b) Align Features and Labels

In [170]:
print(train_features.dtypes)
print(train_labels.dtypes)


uid             int32
age_03          int32
urban_03        int32
married_03      int32
n_mar_03      float64
               ...   
rrelgwk_12      int32
a22_12          int32
a33b_12         int32
a34_12          int32
j11_12          int32
Length: 181, dtype: object
uid                object
year                int64
composite_score     int64
dtype: object


##### 5) Check for Duplicates

Inspect both datasets for duplicate rows that may cause discrepancies.

In [175]:
print("Duplicate rows in features:", train_features.duplicated().sum())
print("Duplicate rows in labels:", train_labels.duplicated().sum())


Duplicate rows in features: 0
Duplicate rows in labels: 0


There are no duplicate rows in both the features and labels

### 3. Exploratory Data Analysis (EDA)

- Visualize Data: Use visualizations to understand the distributions of key features and their relationships with the target variable