In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_predict
from sklearn.preprocessing import OneHotEncoder, RobustScaler, MinMaxScaler, StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score, accuracy_score, precision_recall_curve, roc_curve, roc_auc_score
import numpy as np
from pprint import pprint

In [None]:
# Load the raw data set and strip stray whitespace from the column labels.
# The rename is chained (no `inplace`) so re-running the cell always starts
# from the file contents.
incomes = pd.read_csv('income_evaluation.csv').rename(columns=lambda x: x.strip())

# Preview the first rows; kept as the last expression so the notebook renders it.
incomes.head()

: 

In [None]:
def _one_decimal(value):
    """Render a single summary statistic with one digit after the decimal point."""
    return f"{value:0.1f}"


# Summary statistics of the numeric columns, shown as fixed-point strings.
incomes.describe().apply(lambda stats: stats.map(_one_decimal))

: 

In [None]:
# Display the column labels of the loaded frame.
incomes.columns

: 

In [None]:
numeric_cols = ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']

# Arrange one box plot per numeric column on a grid that is exactly as large as
# needed (three plots per row), so the panels fill the whole figure.
n_plot_cols = 3
n_plot_rows = -(-len(numeric_cols) // n_plot_cols)  # ceiling division
fig, axes = plt.subplots(n_plot_rows, n_plot_cols, figsize=(12, 6), squeeze=False)
flat_axes = axes.ravel()

for ax, column in zip(flat_axes, numeric_cols):
    sns.boxplot(x=incomes[column], ax=ax)

# Hide grid slots that have no column to show.
for ax in flat_axes[len(numeric_cols):]:
    ax.set_visible(False)

fig.suptitle('Wykresy pudełkowe dla zmiennych liczbowych')
fig.tight_layout()
plt.show()

: 

In [None]:
# Count the missing values of every numeric column and show them as a
# one-column table.
numeric_part = incomes.select_dtypes('number')
null_counts = numeric_part.isna().sum().to_frame(name='Null Counts')
null_counts

# plt.bar(null_counts.index, null_counts['Null Counts'], color='dimgray')
# plt.xticks(rotation=45, ha='right')
# plt.xlabel('Columns')
# plt.ylabel('Brakujące wartości')
# plt.title('Brakujące wartości w zbiorze danych')
# for i, value in enumerate(null_counts['Null Counts']):
#     plt.text(i, value + 1, str(value), ha='center', va='bottom', fontsize=10)
# plt.show()

: 

In [None]:
# Normalise the column labels (idempotent: already-stripped labels are unchanged).
incomes = incomes.rename(columns=lambda x: x.strip())
categories = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country', 'income']


def _strip_text(value):
    """Strip surrounding whitespace from strings; return any other value unchanged."""
    return value.strip() if isinstance(value, str) else value


# Strip whitespace from every text cell. Only `str` values are touched, so
# integer, float and missing cells pass through untouched.
# NOTE(review): the text columns are deliberately left as plain strings rather
# than cast to `category` before stripping — whether such a cast survives an
# element-wise strip appears to depend on the pandas version, so it is omitted
# to keep the dtypes deterministic. Confirm no later step needs `category`.
incomes = incomes.apply(lambda col: col.map(_strip_text))

: 

In [None]:
# Show the distinct values found in each categorical column.
for column in categories:
    distinct_values = set(incomes[column])
    print(column, distinct_values)

: 

In [None]:
# "?" marks an unknown value in these columns: treat it as missing and fill it
# with the most frequent value of the column.
for column in ['workclass', 'occupation', 'native-country']:
    # `np.nan` (lower case) is the spelling available in every NumPy release.
    # The result is assigned back explicitly instead of using `inplace=True`
    # on a column selection, which may operate on a temporary copy.
    cleaned = incomes[column].replace("?", np.nan)
    incomes[column] = cleaned.fillna(cleaned.mode()[0])

: 

In [None]:
# Combine capital gain and loss into one net column, then drop the columns that
# are no longer used.
net_capital = incomes['capital-gain'].sub(incomes['capital-loss'])
incomes = incomes.assign(additional_money=net_capital).drop(
    columns=['capital-gain', 'capital-loss', 'education-num', 'relationship']
)
incomes.head()

: 

In [None]:
# Box plot of the net capital column.
fig, ax = plt.subplots()
sns.boxplot(x=incomes['additional_money'], ax=ax)
fig.suptitle('Wykres pudełkowy dla zmiennej additional_money')
fig.tight_layout()
plt.show()

: 

In [None]:
# Highlight the outliers of 'additional_money' on a scatter plot.
plt.clf()
money = incomes['additional_money']
outliers = money[money > 40000]  # values above 40000 are drawn in red

plt.scatter(money.index, money, label='Dane mieszczące się w normie')
plt.scatter(outliers.index, outliers, color='red', label='Wartości odstające')

# Hide the x-axis ticks and label the y axis.
plt.xticks([])
plt.ylabel('additional_money')
# Title and legend.
plt.title('Wartości odstające dla zmiennej additional_money')
plt.legend()

# Render the figure.
plt.show()

: 

In [None]:
# Remove every record whose additional_money value is greater than 40000 (this step is currently disabled)
# incomes = incomes[incomes['additional_money'] <= 40000]

: 

In [None]:
# Box plot of 'additional_money', drawn on an explicit figure/axes pair.
money_fig, money_ax = plt.subplots()
sns.boxplot(x=incomes['additional_money'], ax=money_ax)
money_fig.suptitle('Wykres pudełkowy dla zmiennej additional_money')
money_fig.tight_layout()
plt.show()

: 

In [None]:
# Count plot of income class split by sex.
# The hue order is fixed explicitly because the legend labels below are
# hard-coded in the order (male, female); an inferred hue order could silently
# swap them.
# NOTE(review): assumes the stripped `sex` values are exactly 'Male' and
# 'Female' — confirm against the category values printed earlier.
ctplot = sns.countplot(data=incomes, x='income', hue='sex', hue_order=['Male', 'Female'])

ctplot.set_xlabel('Dochód')
ctplot.set_ylabel('Liczba osób')
ctplot.set_title('Rozkład dochodu w zależności od płci')
ctplot.set_ylim(0, 17000)

ctplot.legend(title='Płeć', labels=['Mężczyzna', 'Kobieta'])

# Share of each sex within every income class, in percent.
occurrences = incomes.groupby(['income', 'sex']).size()
percentages = occurrences.groupby(level=0, group_keys=False).apply(lambda x: 100 * x / float(x.sum()))

print(percentages)

: 

In [None]:
def adjust_income(income: str):
    """Encode an income label as a binary flag.

    Returns 1 when the label is exactly '>50K'; returns 0 for '<=50K' and for
    any other value.
    """
    return 1 if income == '>50K' else 0

# Build the correlation input on a temporary frame so `incomes` itself is never
# modified: the binary income flag exists only for this plot, and a failure
# part-way through the cell cannot leave a stray helper column behind.
# `.astype(int)` guarantees the flag is picked up as a numeric column.
corr_input = incomes.assign(income_adj=incomes['income'].apply(adjust_income).astype(int))
numeric_cols = corr_input.select_dtypes('number').columns
correlation_matrix = corr_input[numeric_cols].corr()

sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f", linewidths=.5)
plt.title('Macierz korelacji')
plt.show()

: 

In [None]:
def change_education(education: str):
    """Collapse fine-grained education labels into broader groups.

    School grades up to 12th (and Preschool) become "Primary", "Bachelors" and
    "Some-college" become "Bachelors", the two associate degrees become
    "Associate". Any other value is returned unchanged.
    """
    groups = (
        (("10th", "11th", "12th", "1st-4th", "5th-6th", "7th-8th", "9th", "Preschool"), "Primary"),
        (("Bachelors", "Some-college"), "Bachelors"),
        (("Assoc-acdm", "Assoc-voc"), "Associate"),
    )
    for members, label in groups:
        if education in members:
            return label
    return education
# Replace the education column with its grouped labels and show which labels remain.
education_grouped = incomes['education'].apply(change_education)
incomes['education'] = education_grouped
set(education_grouped)


: 

In [None]:
# Number of rows per (grouped) education level.
education_counts = incomes['education'].value_counts()

# Pie chart of the share of each education level.
fig, ax = plt.subplots(figsize=(8, 8))
ax.pie(
    education_counts,
    labels=education_counts.index,
    autopct='%1.1f%%',
    startangle=140,
    colors=plt.cm.tab20.colors,
)
ax.set_title('Procentowy udział poszczególnych poziomów wykształcenia')
plt.show()

: 

In [None]:
training_cols = ['age', 'workclass', 'education', 'race', 'sex', 'hours-per-week', 'native-country', 'additional_money', 'income', 'fnlwgt']
training_incomes = incomes[training_cols]
categorical_cols = ['workclass', 'race', 'sex', 'native-country', 'education']
numeric = ['age', 'hours-per-week', 'additional_money', 'fnlwgt']

# One-hot encode the categorical columns. The encoder is left at its default
# (sparse) output and densified with `.toarray()`, which avoids the keyword for
# dense output that was renamed between scikit-learn releases.
encoder = OneHotEncoder()
encoded_values = encoder.fit_transform(training_incomes[categorical_cols]).toarray()
encoded_cols = pd.DataFrame(
    encoded_values,
    columns=encoder.get_feature_names_out(categorical_cols),
    # Reuse the source index so the column-wise concat below stays row-aligned
    # even if rows were filtered out of `incomes` beforehand.
    index=training_incomes.index,
)
training_incomes = pd.concat([training_incomes.drop(columns=categorical_cols), encoded_cols], axis=1)

X = training_incomes.drop('income', axis=1)
y = incomes['income']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
# Work on explicit copies so the scaled values are written to independent frames.
X_train, X_test = X_train.copy(), X_test.copy()

scaler = RobustScaler()
# scaler = StandardScaler()
# scaler = MinMaxScaler()

# Fit the scaler on the training rows only and reuse its statistics for the
# test rows, so no information from the test split leaks into preprocessing.
X_train[numeric] = scaler.fit_transform(X_train[numeric])
X_test[numeric] = scaler.transform(X_test[numeric])

: 

In [None]:
# First forest: 100 trees without bootstrapping, at most 100 leaves per tree.
rand_for_1 = RandomForestClassifier(
    bootstrap=False,
    max_features="sqrt",
    max_leaf_nodes=100,
    n_estimators=100,
    random_state=42,
)
rand_for_1.fit(X_train, y_train)

# Score of the fitted model on the held-out split.
rand_for_1.score(X_test, y_test)

: 

In [None]:
# Print a heading (Polish: "parameters currently used for model rand_for_1")
# followed by the full parameter dictionary of the first forest.
print('Obecnie używane parametry dla modelu rand_for_1:\n')
pprint(rand_for_1.get_params())

: 

In [None]:
# Predict the held-out rows with the first forest and tabulate the confusion matrix.
y_pred_1 = rand_for_1.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred_1)
cm

: 

In [None]:
# Precision of the first forest, with "<=50K" treated as the positive class.
precision_score(y_true=y_test, y_pred=y_pred_1, pos_label="<=50K")

: 

In [None]:
# Recall of the first forest, with "<=50K" treated as the positive class.
# Uses `y_pred_1` (the first forest's predictions); a plain `y_pred` is not
# defined yet at this point of a top-to-bottom run.
recall_score(y_test, y_pred_1, average="binary", pos_label="<=50K")

: 

In [None]:
# F1 score of the first forest, with "<=50K" treated as the positive class.
# Uses `y_pred_1` (the first forest's predictions); a plain `y_pred` is not
# defined yet at this point of a top-to-bottom run.
f1_score(y_test, y_pred_1, pos_label="<=50K")

: 

In [None]:
# Second forest: 500 trees, at most 160 leaves per tree, other settings default.
rand_for_2 = RandomForestClassifier(
    max_leaf_nodes=160,
    n_estimators=500,
    random_state=42,
)
rand_for_2.fit(X_train, y_train)

# Score of the fitted model on the held-out split.
rand_for_2.score(X_test, y_test)

: 

In [None]:
# Print a heading (Polish: "parameters currently used for model rand_for_2")
# followed by the full parameter dictionary of the second forest.
print('Obecnie używane parametry dla modelu rand_for_2:\n')
pprint(rand_for_2.get_params())

: 

In [None]:
# Base estimator for the hyper-parameter search; seeded like the other forests
# in this notebook so repeated runs give the same result.
rfc = RandomForestClassifier(random_state=42)

# Candidate values for the exhaustive search.
# `max_features` lists 'sqrt' and 'log2': the former 'auto' option meant the
# same as 'sqrt' for classifiers and is rejected by recent scikit-learn releases.
my_param_grid = {
    'bootstrap': [True, False],
    'n_estimators': [250, 500],
    'max_features': ['sqrt', 'log2'],
    'min_samples_leaf': [2, 4],
    'min_samples_split': [5, 10]
}
# 3-fold cross-validated search using all available cores, with verbose progress output.
gs = GridSearchCV(estimator=rfc, param_grid=my_param_grid, cv=3, n_jobs=-1, verbose=3)

: 

In [None]:
# Run the grid search. The following cells read `gs.best_params_` and
# `gs.best_estimator_`, which only exist on a fitted search object.
# NOTE: this trains many forests and can take a while.
gs.fit(X_train, y_train)

: 

In [None]:
# Show the best parameter combination found by the grid search
# (available only after `gs` has been fitted).
gs.best_params_

: 

In [None]:
# Keep a handle on the best estimator found by the grid search
# (available only after `gs` has been fitted).
best_grid = gs.best_estimator_

: 

In [None]:
# Importances come from `rand_for_2`, the forest whose settings are quoted in
# the plot title below.
feature_importances = rand_for_2.feature_importances_

# Table of features ordered from most to least important. The sorted frame is
# assigned back so the bar plot is drawn in that order.
feature_importance_df = (
    pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances})
    .sort_values("Importance", ascending=False)
    .reset_index(drop=True)
)

# Plot the feature importances
plt.figure(figsize=(15, 10))
sns.barplot(x='Importance', y='Feature', data=feature_importance_df, palette='viridis')
plt.title('Istotność zmiennych w modelu Random Forest Classifier (n_estimators=500, random_state=42, max_leaf_nodes=160)')
plt.show()

: 

In [None]:
# Predict the held-out rows with the second forest (`rand_for_2`; no model named
# plain `rand_for` is defined in this notebook) and tabulate the confusion matrix.
y_pred = rand_for_2.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
cm

: 

In [None]:
# Precision for the predictions in `y_pred`, with "<=50K" as the positive class.
precision_score(y_true=y_test, y_pred=y_pred, pos_label="<=50K")

: 

In [None]:
# Recall for the predictions in `y_pred`, with "<=50K" as the positive class.
recall_score(y_true=y_test, y_pred=y_pred, pos_label="<=50K", average="binary")

: 

In [None]:
# F1 score for the predictions in `y_pred`, with "<=50K" as the positive class.
f1_score(y_true=y_test, y_pred=y_pred, pos_label="<=50K")

: 

In [None]:
# ROC inputs for the second forest (`rand_for_2`; no model named plain
# `rand_for` is defined in this notebook), with ">50K" as the positive class.
positive_label = ">50K"
# Look the probability column up by label instead of hard-coding its position.
positive_col = list(rand_for_2.classes_).index(positive_label)
y_pred_proba = rand_for_2.predict_proba(X_test)[:, positive_col]

fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba, pos_label=positive_label)
# NOTE(review): `roc_auc_score` is given the string labels directly; this relies
# on it treating ">50K" as the positive class — confirm against its documentation.
roc_auc = roc_auc_score(y_test, y_pred_proba)

: 

In [None]:
# Draw the model's ROC curve on its own axes.
fig, ax = plt.subplots()
ax.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
# Diagonal reference line where TPR equals FPR.
ax.plot([0, 1], [0, 1], 'k--', label='Random classifier')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve')
ax.legend(loc="lower right")
plt.show()

: 

In [None]:
# Out-of-fold class probabilities for the second forest (`rand_for_2`; no model
# named plain `rand_for` is defined in this notebook).
y_probas_forest = cross_val_predict(rand_for_2, X_train, y_train, cv=3,
                                    method="predict_proba")
# NOTE(review): column 1 is taken as the ">50K" probability, which assumes the
# probability columns follow the sorted class labels — confirm.
y_scores_forest = y_probas_forest[:, 1]

# The targets are strings, so the positive class must be named explicitly;
# otherwise precision_recall_curve raises a ValueError.
precisions_forest, recalls_forest, thresholds_forest = precision_recall_curve(
    y_train, y_scores_forest, pos_label=">50K")

ValueError: y_true takes value in {'<=50K', '>50K'} and pos_label is not specified: either make y_true take value in {0, 1} or {-1, 1} or pass pos_label explicitly.