In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from imblearn.over_sampling import RandomOverSampler
from scipy.stats import kruskal
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split, cross_val_score, KFold

In [None]:
# Load the training split and show its dimensions together with the first rows.
df_train = pd.read_csv('train.csv')
display(df_train.shape, df_train.head())

In [None]:
# Load the test split and show its dimensions together with the first rows.
df_test = pd.read_csv('test.csv')
display(df_test.shape, df_test.head())

In [None]:
# Inspect the dtype pandas inferred for the label column.
df_train['Target'].dtype

## 1. Identify the output variable.

In [None]:
# The output variable is 'Target'. It is treated here as an ordinal variable:
# each value represents a poverty level, ordered by severity.
# 1: Extreme Poverty
# 2: Moderate Poverty
# 3: Vulnerable Households
# 4: Non Vulnerable Households

# Distinct label values actually present in the training data.
df_train['Target'].unique()

## 2. Understand the type of data.

In [None]:
# (rows, columns) of the training frame.
df_train.shape

In [None]:
# Summary of the frame's index, column dtypes and memory usage.
df_train.info()

In [None]:
# Descriptive statistics of the training columns.
df_train.describe()

In [None]:
# Full list of column names in the training frame.
df_train.columns

In [None]:
# Names of the columns stored with object dtype.
df_train.select_dtypes(include=['object']).columns

In [None]:
# First rows of 'dependency', 'edjefe' and 'edjefa' in the training frame.
df_train.loc[:, ['dependency', 'edjefe', 'edjefa']].head()

In [None]:
# First rows of 'dependency', 'edjefe' and 'edjefa' in the test frame.
df_test.loc[:, ['dependency', 'edjefe', 'edjefa']].head()

In [None]:
# Recode the literal answers 'yes' / 'no' as 1 / 0 and store the three
# affected columns as float64, in both the training and the test frame.
mapping = {'yes': 1, 'no': 0}

for frame in (df_train, df_test):
    for col in ['dependency', 'edjefa', 'edjefe']:
        recoded = frame[col].replace(mapping)
        frame[col] = recoded.astype(np.float64)

In [None]:
# Same three training columns again, to inspect them after the yes/no recoding.
df_train.loc[:, ['dependency', 'edjefe', 'edjefa']].head()

In [None]:
# Same three test columns again, to inspect them after the yes/no recoding.
df_test.loc[:, ['dependency', 'edjefe', 'edjefa']].head()

## 3. Check if there are any biases in your dataset.

In [None]:
# Number of rows per poverty level, printed as a table and drawn as a bar chart.
print(df_train.groupby('Target')['Target'].count())
sns.countplot(data=df_train, x='Target')

# The label distribution is clearly imbalanced (this is the "bias" the section refers to).

In [None]:
# Balance the classes by randomly duplicating rows of the under-represented
# 'Target' levels.
# A fixed random_state makes the resampled frame reproducible across
# Restart & Run All; without it every run draws different duplicates.
# NOTE(review): resampling happens before the train/validation split and
# before cross-validation, so copies of one row can land on both sides of a
# split and inflate the reported scores. Consider resampling only the
# training portion.
X_over, y_over = RandomOverSampler(random_state=42).fit_resample(
    df_train.drop('Target', axis=1),
    df_train['Target'],
)

In [None]:
# Re-attach the resampled labels to the resampled features as the last column.
df_train = X_over.assign(Target=y_over)

In [None]:
# Rows per poverty level after oversampling, printed and plotted.
print(df_train.groupby('Target')['Target'].count())
sns.countplot(data=df_train, x='Target')

In [None]:
# The 'idhogar' column, used below as the household grouping key.
df_train['idhogar']

## 4. Check whether all members of the house have the same poverty level.

In [None]:
# Number of distinct non-null poverty levels recorded within each household.
household_groups = df_train.groupby('idhogar')
levels_per_household = household_groups['Target'].nunique()

# More than one distinct level: the members disagree.
household_diff_poverty_level = int((levels_per_household > 1).sum())
# Zero or one distinct level: counted as "same poverty level".
household_same_poverty_level = int((levels_per_household <= 1).sum())



In [None]:
# Report how many households agree / disagree on the poverty level.
print(f'Total number of households with same poverty level = {household_same_poverty_level}')
print(f'Total number of households with different poverty levels = {household_diff_poverty_level}')

## 5. Check if there is a house without a family head.

In [None]:
# Count the households in which no row is flagged as the family head
# (parentesco1 == 1), listing each such household as it is found.
household_groups = df_train.groupby('idhogar')

num_households_with_no_family_head = 0

for household_id, household_data in household_groups:
    if not (household_data['parentesco1'] == 1).any():
        num_households_with_no_family_head += 1
        print(f"Household {household_id} has no family head.")

print('\n')
# The counter tracks households WITHOUT a head; the label previously said "with".
print(f'Total number of households without a family head = {num_households_with_no_family_head}')

In [None]:
# Number of distinct household ids in the training frame.
num_households = df_train['idhogar'].nunique()
print(f"There are {num_households} households in the dataset.")

In [None]:
# Distinct values of the family-head flag.
df_train['parentesco1'].unique()

## 6. Set the poverty level of the members and the head of the house same in a family.

In [None]:
# Build a household id -> poverty level lookup from the head-of-household rows.
head_rows = df_train.loc[df_train['parentesco1'] == 1, ['idhogar', 'Target']]
head_poverty_levels = head_rows.set_index('idhogar')['Target'].to_dict()

# Give every member the head's level. Households with no head row have no
# entry in the lookup, so map() leaves their Target as NaN.
df_train['Target'] = df_train['idhogar'].map(head_poverty_levels)

## 7. Count how many null values are existing in columns.

In [None]:
# Null count per column in the test frame.
df_test.isna().sum()

In [None]:
# Training columns that contain at least one null.
df_train.columns[df_train.isna().any()]

In [None]:
# Raw look at the 'v2a1' column.
df_train['v2a1']

In [None]:
# Per-column overview of the training frame: null count, dtype and number of
# distinct values, one row per column.
df_info = pd.DataFrame(columns=['Name of Col', 'Num of Null', 'Dtype', 'Num of Unique'])

for i, column_name in enumerate(df_train.columns):
    column = df_train[column_name]
    df_info.loc[i] = [column_name, column.isna().sum(), column.dtypes, column.nunique()]

df_info

In [None]:
# Keep only the overview rows for columns that have nulls.
df_info[df_info['Num of Null'] > 0]

In [None]:
# Share of missing values per column, as a whole-number percentage.
null_counts = df_train.isna().sum()
missing_val_per = null_counts.div(df_train.shape[0]).mul(100).round()

# Features whose (rounded) missing share exceeds 50%.
missing_val_per[missing_val_per > 50]

## 7b. Create a strategy to handle the missing values

In [None]:
# 'v18q1' is the number of tablets in a household; a NaN here is read as "0 tablets".
df_train['v18q1'].unique()

In [None]:
# Replace the missing values with 0.
# The filled column is assigned back instead of calling fillna(inplace=True)
# on the column selection: the in-place form mutates an intermediate object,
# which pandas warns about and which leaves the frame unchanged under
# Copy-on-Write.
for data in [df_train, df_test]:
    data['v18q1'] = data['v18q1'].fillna(0)

In [None]:
# Distinct values of 'rez_esc' (including NaN, if any).
df_train['rez_esc'].unique()

In [None]:
# Number of missing 'rez_esc' values in the training frame.
df_train['rez_esc'].isna().sum()

In [None]:
# Fill missing 'rez_esc' with 0 in both frames.
# Assign the result back rather than using fillna(inplace=True) on the column
# selection, which pandas warns about and which does not update the frame
# under Copy-on-Write.
for data in [df_train, df_test]:
    data['rez_esc'] = data['rez_esc'].fillna(0)

In [None]:
# Distinct values of 'v2a1' (including NaN, if any).
df_train['v2a1'].unique()

`v2a1` is the monthly rent payment. It looks related to the `tipovivi*` columns:<br><br>
tipovivi1 = own and fully paid house<br>
tipovivi2 = own, paying in installments<br>
tipovivi3 = rented<br>
tipovivi4 = precarious<br>
tipovivi5 = other(assigned, borrowed)<br>

In [None]:
# First 10 rows with a missing 'v2a1', shown next to the tipovivi* flags.
data = df_train[df_train['v2a1'].isnull()].head(10)
data.loc[:,['v2a1', 'tipovivi1', 'tipovivi2', 'tipovivi3', 'tipovivi4', 'tipovivi5']]

In the 10 rows sampled above, the NaN values in 'v2a1' coincide with 'tipovivi1' (house owned and fully paid).<br>
We therefore replace them with 0, because if the house is owned<br>
no rent has to be paid.

In [None]:
# Fill missing 'v2a1' with 0 in both frames.
# Assign the result back rather than using fillna(inplace=True) on the column
# selection, which pandas warns about and which does not update the frame
# under Copy-on-Write.
for data in [df_train, df_test]:
    data['v2a1'] = data['v2a1'].fillna(0)

In [None]:
# Distinct values of 'meaneduc' (including NaN, if any).
df_train['meaneduc'].unique()

In [None]:
# Mean-impute 'meaneduc'.
# The imputer is fitted on the training frame only and that same training mean
# is applied to the test frame. Re-fitting on the test frame (as before) would
# fill each split with a different value and use test statistics in
# preprocessing.
meaneduc_imputer = SimpleImputer(strategy='mean')

# ravel(): the imputer returns an (n, 1) array; the column expects 1-D values.
df_train['meaneduc'] = meaneduc_imputer.fit_transform(df_train[['meaneduc']]).ravel()
df_test['meaneduc'] = meaneduc_imputer.transform(df_test[['meaneduc']]).ravel()

In [None]:
# Distinct values of 'SQBmeaned' (including NaN, if any).
df_train['SQBmeaned'].unique()

In [None]:
# Mean-impute 'SQBmeaned'.
# Fit on the training frame only and reuse that mean for the test frame, so
# both splits are filled with the same value and no test statistics are used.
SQBmeaned_imputer = SimpleImputer(strategy='mean')

# ravel(): the imputer returns an (n, 1) array; the column expects 1-D values.
df_train['SQBmeaned'] = SQBmeaned_imputer.fit_transform(df_train[['SQBmeaned']]).ravel()
df_test['SQBmeaned'] = SQBmeaned_imputer.transform(df_test[['SQBmeaned']]).ravel()

dropping the squared columns

In [None]:
# Squared / derived columns to discard from both frames.
cols = [
    'SQBescolari', 'SQBage', 'SQBhogar_total', 'SQBedjefe', 'SQBhogar_nin',
    'SQBovercrowding', 'SQBdependency', 'SQBmeaned', 'agesq',
]

df_train, df_test = (frame.drop(columns=cols) for frame in (df_train, df_test))

## 7c. Check whether the columns v2a1 and rez_esc are significantly important;<br> if they are not, drop them, since they have a large number of missing values.

In [None]:
def dist_plot(feature, title, xlabel):
    """Plot a 50-bin histogram with a KDE overlay for one feature.

    Parameters
    ----------
    feature
        Values to plot; the calls in this notebook pass a DataFrame column.
    title : str
        Title placed above the plot.
    xlabel : str
        Label for the x axis.
    """
    # Draw on an explicit 10x10 axes. The earlier version opened a figure with
    # plt.figure() and then called the figure-level sns.displot(), which
    # creates its own figure, so the first one was rendered empty.
    fig, ax = plt.subplots(figsize=(10, 10))

    sns.histplot(feature, kde=True, bins=50, ax=ax)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    plt.show()

In [None]:
# Distribution of 'v2a1' in the training frame.
dist_plot(df_train['v2a1'], 'Distribution of Monthly Rent Payment', 'Monthly Rent Payment')

In [None]:
# Distribution of 'rez_esc' in the training frame.
dist_plot(df_train['rez_esc'], 'Distribution of Years Behind in School', 'Years Behind in School')

### The data for both 'v2a1' and 'rez_esc' are not normally distributed, so we opt for the Kruskal-Wallis test.

In [None]:
# Kruskal-Wallis H-test: does the distribution of each feature differ across
# the four poverty levels?
y = df_train['Target']


def kruskal_by_target(feature):
    """Run Kruskal-Wallis on column `feature`, one sample per Target level 1-4.

    Returns the (statistic, p-value) result of scipy.stats.kruskal.
    """
    # Each sample is a 1-D Series. The earlier code passed single-column
    # DataFrames (2-D), which is not the one-dimensional sample kruskal is
    # documented to compare.
    samples = [df_train.loc[y == level, feature] for level in (1, 2, 3, 4)]
    return kruskal(*samples)


H1, pval1 = kruskal_by_target('v2a1')
H2, pval2 = kruskal_by_target('rez_esc')

# Report both features with the same wording, at the 5% significance level.
for feature, pval in (('v2a1', pval1), ('rez_esc', pval2)):
    if pval < 0.05:
        print(f'The Kruskal-Wallis test for {feature} is statistically significant. Reject the null hypothesis. ')
    else:
        print(f'The Kruskal-Wallis test for {feature} is not statistically significant. Fail to reject the null hypothesis.')


## 8. Remove null value rows of the target variable.

In [None]:
# Drop the rows whose 'Target' is NaN, then display the remaining frame.
df_train = df_train.dropna(subset=['Target'])
df_train

In [None]:
# Total number of nulls left in the frame (only the last expression of a cell
# is displayed, so this value and the next one are computed but not shown).
df_train.isna().sum().sum()
# Columns that still have missing values.
df_train.columns[df_train.isna().any()]

# Distinct values of 'meaneduc' after imputation; this is the cell's output.
df_train['meaneduc'].unique()

In [None]:
# Total number of nulls left in the training frame.
df_train.isna().sum().sum()

## 9. Predict the accuracy using random forest classifier.

In [None]:
# Columns that are still stored with object dtype.
df_train.select_dtypes(include='object').columns

In [None]:
# Pairwise correlation matrix of the numeric columns.
corr = df_train.corr(numeric_only=True)

In [None]:
# Keep only the strict upper triangle of the correlation matrix (k=1 excludes
# the diagonal); every other cell becomes NaN, so each pair appears once.
upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))

In [None]:
# Flag every column whose absolute correlation with any column listed before
# it in the matrix exceeds 0.95.
dropped_features = [
    name
    for name in upper.columns
    if (upper[name].abs() > 0.95).any()
]

dropped_features

we are also dropping 'male', 'Id' and 'idhogar' because they are not useful for our model

In [None]:
# Also exclude 'Id', 'idhogar' and 'male'.
# A plain loop replaces the list comprehension that was run only for its side
# effects, and the membership check keeps the cell idempotent: re-running it
# no longer appends the same names again.
for name in ['Id', 'idhogar', 'male']:
    if name not in dropped_features:
        dropped_features.append(name)

dropped_features

In [None]:
# Modelling copies of both frames without the flagged columns.
df_train_copy, df_test_copy = (
    frame.drop(columns=dropped_features) for frame in (df_train, df_test)
)

In [None]:
# Features and label for modelling.
X = df_train_copy.drop(['Target'], axis=1)
y = df_train_copy['Target']

# One seed for the split and the forest, so the value printed below is always
# the one that was actually used.
RANDOM_STATE = 42

# Hold out 20% of the rows for validation.
train_data, val_data, train_target, val_target = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

rfc = RandomForestClassifier(n_estimators=150, random_state=RANDOM_STATE, max_depth=50, min_samples_leaf=4, max_features='sqrt')
rfc.fit(train_data, train_target)

train_score = rfc.score(train_data, train_target)
val_score = rfc.score(val_data, val_target)

# "Test" in this message is the held-out validation split, not test.csv.
# The RS field previously printed `i`, a stale loop index left over from an
# earlier cell, and a leftover conditional repeated the same line.
print("Test {} Train {} RS {}".format(val_score, train_score, RANDOM_STATE))

In [None]:
# Predicted labels for the validation split.
val_predict = rfc.predict(val_data)

In [None]:
# Confusion matrix of true vs. predicted labels on the validation split.
confusion_matrix(val_target, val_predict)

In [None]:
# Per-class classification metrics on the validation split.
print(classification_report(val_target, val_predict))

### Predictions on the test data

In [None]:
# Predict with exactly the columns the model was fitted on, in the same order.
# Selecting X.columns also keeps this cell re-runnable: once 'Target' has been
# added to df_test_copy, passing the whole frame would hand the model an
# extra column.
test_predict = rfc.predict(df_test_copy[X.columns])
df_test_copy['Target'] = test_predict

## 10. Check the accuracy using a random forest with cross-validation.

In [None]:
# 10-fold cross-validated accuracy of the same random-forest configuration.
# The scores are computed once and reused for the mean; previously the whole
# cross-validation was run a second time just to average it.
# NOTE(review): X still contains the oversampled duplicate rows, so copies of
# one row can fall into both the training and the held-out fold; treat these
# scores as optimistic.
kfold = KFold(n_splits=10, random_state=142, shuffle=True)
cv_scores = cross_val_score(rfc, X, y, cv=kfold, scoring='accuracy')

print(cv_scores)
print(cv_scores.mean())