In [57]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [17]:
# Load the pre-processed table and the saved array of numeric column names.
processed_dataset = pd.read_csv('processed_dataset.csv')
numeric_features = np.load('numeric_features.npy')

# Keep the label as a one-column DataFrame before it is removed from the features.
target = processed_dataset.loc[:, ["target"]]

# Remove the label and "Total Household Income" from the feature table, then
# build the one-hot encoded (sparse) version of the remaining columns in memory.
processed_dataset = processed_dataset.drop(columns=["target", "Total Household Income"])
encoded_dataset = pd.get_dummies(processed_dataset, sparse=True)

## Data reduction

### Factor analysis of mixed data (FAMD)

In [4]:
import prince

In [4]:
# Factor analysis of mixed data on the full feature table, keeping 6 components.
famd = prince.FAMD(n_components=6, n_iter=3, copy=True, check_input=True, engine='auto', random_state=42)
# fit() must receive the data; calling it without arguments raises a TypeError.
famd = famd.fit(processed_dataset)
corr = famd.column_correlations(processed_dataset)
# For every component, pick the feature with the largest (signed) correlation.
# NOTE(review): idxmax ignores strongly negative correlations - confirm that is intended.
columns = corr.idxmax()
# Show the correlations of those selected features with all 6 components.
corr.loc[columns, range(6)]

Unnamed: 0,0,1,2,3,4,5
Total Food Expenditure,0.759984,0.47463,0.19089,0.220258,0.253304,-0.011398
Total Number of Family members,0.172079,0.77743,0.046792,0.561884,0.523693,0.354891
Number of Airconditioner,0.596956,0.011561,0.425061,-0.174133,0.027311,-0.335515
Total Number of Family members,0.172079,0.77743,0.046792,0.561884,0.523693,0.354891
Total Number of Family members,0.172079,0.77743,0.046792,0.561884,0.523693,0.354891
Agricultural Household indicator,-0.060387,0.221194,-0.143347,-0.666846,0.50625,0.778338


### PCA

In [38]:
# 6-component PCA restricted to the numeric columns of the feature table.
pca_p = prince.PCA(
    n_components=6,
    n_iter=3,
    copy=True,
    check_input=True,
    engine='auto',
    random_state=42,
).fit(processed_dataset[numeric_features])

# Project the numeric columns onto the fitted components.
reduced_data = pca_p.transform(processed_dataset[numeric_features])

# Correlation of every numeric column with every component; for each component
# select the column whose (signed) correlation is the largest and display those rows.
corr = pca_p.column_correlations(processed_dataset[numeric_features])
columns = corr.idxmax()
corr.loc[columns, range(6)]

Unnamed: 0,0,1,2,3,4,5
Total Food Expenditure,0.842704,0.352396,0.027802,-0.149604,0.012247,0.132561
Total Number of Family members,0.239667,0.814827,0.071334,0.027213,0.199304,-0.181753
Agricultural Household indicator,-0.007081,-0.002611,0.421454,-0.226829,0.06838,0.118673
Crop Farming and Gardening expenses,0.056014,0.068605,0.228613,0.531361,-0.17924,0.196338
Household Head Age,0.081501,-0.266032,-0.234476,0.225488,0.454918,0.401465
Alcoholic Beverages Expenditure,0.199135,0.160082,-0.121955,-0.112601,-0.212697,0.458982


## Classifier

In [46]:
from sklearn.model_selection import train_test_split

# Both splits use the same proportion and seed.
split_kwargs = dict(test_size=0.2, random_state=20)

# First hold out 20% of the numeric features / labels as the test set ...
x_train, x_test, y_train, y_test = train_test_split(
    processed_dataset[numeric_features],
    target,
    **split_kwargs,
)
# ... then hold out 20% of the remaining training rows as the validation set.
x_train, x_validation, y_train, y_validation = train_test_split(
    x_train,
    y_train,
    **split_kwargs,
)

In [60]:
# Class frequencies of the label in each split, printed in the order
# train -> validation -> test to check that the classes stay balanced.
train, validation, test = (
    split_labels.value_counts()
    for split_labels in (y_train, y_validation, y_test)
)
print(train, validation, test, sep="\n")

target
2         6688
3         6666
0         6648
1         6586
dtype: int64
target
1         1709
0         1688
3         1662
2         1588
dtype: int64
target
2         2110
1         2090
3         2058
0         2051
dtype: int64


### Numeric classifier

#### LogisticRegression

##### No reduction

In [47]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate

# Multinomial, L2-penalised logistic regression on the unreduced numeric features.
clf = LogisticRegression(
    penalty='l2',
    solver='saga',
    multi_class='multinomial',
    max_iter=250,
    random_state=20,
)

# 5-fold cross-validation on the training split only; the printed "Test score"
# is the mean over the held-out folds, not the score on x_test.
result = cross_validate(clf, x_train, y_train.to_numpy().ravel(), cv=5)
print("Test score: ", np.mean(result['test_score']))

Test score:  0.6926804774371418


In [48]:
# Refit the classifier on the whole training split, then report its score
# on the train, validation and test splits (in that order).
clf = clf.fit(x_train, y_train.values.ravel())

for split_label, split_x, split_y in (
    ("Train score: ", x_train, y_train),
    ("Validation score: ", x_validation, y_validation),
    ("Test score: ", x_test, y_test),
):
    result = clf.score(split_x, split_y)
    print(split_label, result)

Train score:  0.6996389348578306
Validation score:  0.691740634872875
Test score:  0.6968347574918763


##### Reduction

In [52]:
# Sweep the number of PCA components from 1 to 29. For each setting the PCA is
# fitted on the training split only, all three splits are projected with it, and
# a fresh logistic regression is trained on the projected training data.
# `results` collects one [train, validation, test] score triple per setting.
results = []
for n in range(1, 30):
    pca_p = prince.PCA(
        n_components=n,
        n_iter=1000,
        copy=True,
        check_input=True,
        engine='auto',
        random_state=42,
    ).fit(x_train)

    # Project every split with the PCA fitted on the training data.
    reduced_x_train = pca_p.transform(x_train)
    reduced_x_validation = pca_p.transform(x_validation)
    reduced_x_test = pca_p.transform(x_test)

    clf = LogisticRegression(
        penalty='l2',
        solver='saga',
        multi_class='multinomial',
        max_iter=1000,
        random_state=20,
    ).fit(reduced_x_train, y_train.values.ravel())

    result_train = clf.score(reduced_x_train, y_train)
    result_validation = clf.score(reduced_x_validation, y_validation)
    result_test = clf.score(reduced_x_test, y_test)

    print(f"#{n}\n")
    print("Train score: ", result_train)
    print("Validation score: ", result_validation)
    print("Test score: ", result_test)
    results.append([result_train, result_validation, result_test])

#1

Train score:  0.6802316834662253
Validation score:  0.6836166691740635
Test score:  0.6756529064869419
#2

Train score:  0.6818113434632165
Validation score:  0.6843688882202498
Test score:  0.6752918522084487
#3

Train score:  0.6938092372498872
Validation score:  0.7021212577102453
Test score:  0.6839571548922855
#4

Train score:  0.6982849405746954
Validation score:  0.7088912291259215
Test score:  0.6950294860994103
#5

Train score:  0.6977583872423649
Validation score:  0.7084398976982097
Test score:  0.6950294860994103
#6

Train score:  0.7043403038964947
Validation score:  0.7052805777042275
Test score:  0.7008063545553015
#7

Train score:  0.7081013991274259
Validation score:  0.7093425605536332
Test score:  0.7044168973402335
#8

Train score:  0.717128027681661
Validation score:  0.7195727395817663
Test score:  0.7145264171380431
#9

Train score:  0.7236723333834812
Validation score:  0.723634722431172
Test score:  0.7225899626910579
#10

Train score:  0.7250639386189258
V

In [None]:
% matplotlib inline
plt.rcParams["figure.figsize"] = (12,6)

fig, ax = plt.subplots()
xi = np.arange(1, 30, step=1)
y = np.cumsum(pca.explained_variance_ratio_)

plt.ylim(0.0,1.1)
plt.plot(xi, y, marker='o', linestyle='--', color='b')

plt.xlabel('Number of Components')
plt.xticks(np.arange(0, 11, step=1)) #change from 0-based array index to 1-based human-readable label
plt.ylabel('Cumulative variance (%)')
plt.title('The number of components needed to explain variance')

plt.axhline(y=0.95, color='r', linestyle='-')
plt.text(0.5, 0.85, '95% cut-off threshold', color = 'red', fontsize=16)

ax.grid(axis='x')
plt.show()

In [40]:
# Train and score a logistic regression on the PCA-projected features.
# `clf_reduced`, `reduced_y_train` and `reduced_y_test` were not defined anywhere
# in the notebook, so the cell failed on a fresh kernel. The estimator is now
# created here, and the labels of the matching splits are used: PCA only
# transforms the features, so y_train / y_test stay row-aligned with
# reduced_x_train / reduced_x_test.
# NOTE(review): the recorded output of this cell may stem from an earlier kernel
# state (possibly a split of `reduced_data`) - confirm the intended inputs.
clf_reduced = LogisticRegression(penalty='l2', random_state=20, max_iter=1000, multi_class='multinomial', solver='saga')
clf_reduced = clf_reduced.fit(reduced_x_train, y_train.values.ravel())
result = clf_reduced.score(reduced_x_train, y_train)
print("Train score: ", result)
result = clf_reduced.score(reduced_x_test, y_test)
print("Test score: ", result)

Train score:  0.6991424702873477
Test score:  0.6949091346732459


In [9]:
from sklearn.tree import DecisionTreeClassifier
# Seeded like the other estimators in this notebook (random_state=20) so that
# repeated runs build the same tree.
dtc = DecisionTreeClassifier(random_state=20)