In [2]:
import pandas as pd

In [3]:
df = pd.read_csv("data/train.csv")

# 1. Target are per household

In [5]:
target_per_household = df.groupby(['idhogar'])['Target'].nunique()

no_target = len(target_per_household.loc[target_per_household == 0])
unique_target = len(target_per_household.loc[target_per_household == 1])
more_targets = len(target_per_household.loc[target_per_household > 1])
more_targets_perc = more_targets / (no_target + unique_target + more_targets)

print("No per household: {}".format(no_target))
print("1 target per household: {}".format(unique_target))
print("More targets per household: {} or {:.1f}%" .format(more_targets, more_targets_perc * 100))

No per household: 0
1 target per household: 2903
More targets per household: 85 or 2.8%


As in the competition title, "Household Poverty Level Prediction", we will consider the Target per household, and define the other as **outliers** that we will in a first time **delete**.

Kaggle discussion, mention to clean the data using the household value in caseof discrepency: https://www.kaggle.com/c/costa-rican-household-poverty-prediction/discussion/61403

# 2. Categorical features

In [28]:
categorical_features = df.columns.tolist()
for feature in df.describe().columns:
    categorical_features.remove(feature)

# Just for saving them
numerical_features = df.columns.tolist()
for categorical_feature in categorical_features:
    numerical_features.remove(categorical_feature)
    
categorical_features

['Id', 'idhogar', 'dependency', 'edjefe', 'edjefa']

In [12]:
df[categorical_features].head()

Unnamed: 0,Id,idhogar,dependency,edjefe,edjefa
0,ID_279628684,21eb7fcc1,no,10,no
1,ID_f29eb3ddd,0e5d7a658,8,12,no
2,ID_68de51c94,2c7317ea8,8,no,11
3,ID_d671db89c,2b58d945f,yes,11,no
4,ID_d56d6f5f5,2b58d945f,yes,11,no


Features ID, those will obviously not beeing predictive (or will overfit), so we can ignore:
- Id
- idhogar

Other categorical features:
- **dependency**': Dependency rate. We can use its squraed feature **SQBdependency**.
- **edjefe**, years of education of male head of household. We can use its squared feature **SQBedjefe**
- **edjefa**, years of education of female head of household.

In [14]:
df[['edjefe', 'SQBedjefe']].head()

Unnamed: 0,edjefe,SQBedjefe
0,10,100
1,12,144
2,no,0
3,11,121
4,11,121


In [16]:
df[['edjefe', 'SQBedjefe']].head()

Unnamed: 0,edjefe,SQBedjefe
0,10,100
1,12,144
2,no,0
3,11,121
4,11,121


# 3. Empty values in numerical features

In [17]:
print("Number of observations {}".format(len(df)))

Number of observations 9557


In [21]:
features_with_null = df.isna().sum().sort_values(ascending=False)
features_with_null = features_with_null.loc[features_with_null > 0]
feature_names_with_null = features_with_null.index.tolist()

features_with_null

rez_esc      7928
v18q1        7342
v2a1         6860
meaneduc        5
SQBmeaned       5
dtype: int64

* rez_esc      7928 null values for Years behind in school. Too much null values: unusable
* v2a1         6860 null values for Monthly rent payment. Unusable.

* v18q1        7342 null values for number of tablets household owns. Unusable but summing **v18q** by household may help.

* meaneduc        5 null values for average years of education for adults. We may fullfill those values.
* SQBmeaned       5 null values for square of the mean years of education of adults. We may fullfill those values.

# 4. Feature selection with RandomForest

In [35]:
selectable_features = numerical_features.copy()
selectable_features.remove('Target')
for feature in feature_names_with_null:
    selectable_features.remove(feature)

X = df[selectable_features]
y = df.Target

In [37]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=112, test_size=0.2)

In [38]:
# from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
clf= RandomForestClassifier()
clf.fit(X_train, y_train)

sorted(zip(X.columns, clf.feature_importances_ * 100), key=lambda x: -x[1])

[('SQBedjefe', 3.1738779558617964),
 ('SQBescolari', 3.1080936779177093),
 ('SQBdependency', 3.0549420719404408),
 ('SQBovercrowding', 2.922785569188261),
 ('qmobilephone', 2.804497025812769),
 ('escolari', 2.641310138377618),
 ('SQBage', 2.6192385217816443),
 ('SQBhogar_nin', 2.5603483789128876),
 ('agesq', 2.5476068769762614),
 ('hogar_nin', 2.5289363570864096),
 ('overcrowding', 2.4631057182236247),
 ('age', 2.382853394978264),
 ('rooms', 2.3097185423369404),
 ('r4t2', 2.018727640018228),
 ('r4h2', 1.9090608573386556),
 ('r4m3', 1.8664462248387432),
 ('r4h3', 1.7717784310300164),
 ('cielorazo', 1.6814540234984985),
 ('bedrooms', 1.5963041304152152),
 ('hogar_adul', 1.520240712732043),
 ('r4m2', 1.3840545959798576),
 ('paredblolad', 1.3773482308953038),
 ('r4m1', 1.3634778502601774),
 ('r4t1', 1.3164900585489878),
 ('r4t3', 1.2180905602347234),
 ('v18q', 1.2177644424785852),
 ('energcocinar2', 1.2076817902622963),
 ('tamviv', 1.1821583827230668),
 ('tamhog', 1.1586422259271574),
 ('r

# 5. First evaluation
How to resist?

In [39]:
selected_features = ['SQBedjefe', 'SQBdependency', 'overcrowding', 'qmobilephone', 'SQBage', 'rooms', 'SQBhogar_nin']

X_train_4predict = X_train[selected_features]
predictor = RandomForestClassifier()
predictor.fit(X_train_4predict, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [40]:
X_test_4predict = X_test[selected_features]
y_predict = predictor.predict(X_test_4predict)

In [42]:
from sklearn.metrics import precision_recall_fscore_support as score

precision, recall, fscore, support = score(y_test, y_predict)

print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))
print('support: {}'.format(support))

precision: [0.62406015 0.71565495 0.60606061 0.85020243]
recall: [0.58041958 0.63456091 0.57613169 0.89514066]
fscore: [0.60144928 0.67267267 0.5907173  0.87209302]
support: [ 143  353  243 1173]


In [58]:
from sklearn.metrics import f1_score
f1_score(y_test, y_predict, average='macro')

0.6842330677172161

# 5. Predicting and sending

In [45]:
df_eval = pd.read_csv("data/test.csv")

In [51]:
X_eval = df_eval[selected_features]
df_eval['Target'] = predictor.predict(X_eval)

In [57]:
df_eval[['Id', 'Target']].to_csv("data/out.csv", index=False)

Kaggle gave me a result of 0.349  on this first try, which show a huge overfitt, which is also normal with a model like random forest.

I also ranked 82 / 106.