# Import dependencies

In [None]:
# To mute annoying warnings in notebook
import warnings

# For Data science
import pandas as pd
import numpy as np
import missingno as msno

# For graph
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn import (
    model_selection,
    tree,
    linear_model,
    metrics,
)

# For modules
from sources import (
    check_is_na,
    get_category_encoded,
    get_heatmap,
    get_dataframe_scaled,
    three_sigma_cleared,
)

# Silence every warning category for the rest of the session.
# NOTE(review): this also hides pandas / scikit-learn diagnostics such as
# convergence or copy warnings — consider narrowing the filter.
warnings.filterwarnings("ignore")

# Getting data, observations
Get dataset

In [None]:
# Column names: fifteen anonymised attributes A1..A15 followed by the label
column_names = [f"A{number}" for number in range(1, 16)] + ["Target"]

# Load the raw file; the names are passed explicitly, so pandas treats
# the first line of the file as data rather than as a header
data = pd.read_csv(
    filepath_or_buffer="../data/crx.data",
    sep=",",
    names=column_names,
)

In [None]:
# Get info about data: column dtypes, non-null counts and memory usage
data.info()

In [None]:
# Summary statistics; with default arguments only numeric columns are described
data.describe()

Get columns by data type.

In [None]:
# Split the raw frame into three sub-frames, one per dtype family
# NOTE(review): integer_columns is not referenced again in the visible notebook
integer_columns = data.select_dtypes(include="int64")
float_columns = data.select_dtypes(include="float64")
categorical_columns = data.select_dtypes(include="object")

Replace `?` symbols with NaN.

In [None]:
# Replace the "?" placeholder with NaN so that missing values can be dropped later.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0; rebinding the
# name (instead of inplace=True) keeps the cell safe to re-run.
categorical_columns = categorical_columns.replace(to_replace="?", value=np.nan)

Concatenate subsets to clear them.

In [None]:
# Sub-frames to join: the float columns first, then the object columns
# NOTE(review): integer_columns is left out here — confirm that is intended
frames = [float_columns, categorical_columns]

# Put them side by side (column-wise concatenation on the shared row index)
df = pd.concat(objs=frames, axis="columns")

Drop rows with Nans.

In [None]:
# Keep only the rows in which every column has a value
dropped_df = df.dropna(axis=0, how="any")

# Check the result for NaNs with the project helper
check_is_na(dropped_df)

In [None]:
# Change A2 type from object to numeric.
# assign() builds a new frame instead of writing into the result of dropna(),
# which can trigger pandas' SettingWithCopyWarning, and the cell stays re-runnable.
dropped_df = dropped_df.assign(A2=pd.to_numeric(dropped_df["A2"]))

In [None]:
# Get diagram with missing values (nullity matrix of the cleaned frame);
# the trailing ";" suppresses the repr of the returned object
msno.matrix(dropped_df);

There are really no missing values.

Scale continuous data (the cell below is commented out; scaling is applied later, in the regression section).

In [None]:
# # Scale dataset
# scaled_df = get_dataframe_scaled(
#     dataset=dropped_df,
#     omit_feature_names=['Target', 'A1', 'A4', 'A5', 'A6', 'A7', 'A9', 'A10', 'A12', 'A13', 'A14']
# )

In [None]:
# Box plots of the three continuous features before outlier removal
sns.boxplot([dropped_df[name] for name in ("A2", "A3", "A8")]);

At least feature 'A8' has outliers. Let's clean the data.

In [None]:
# Ask the project helper to discard observations lying further than three
# sigma from the mean in any of the continuous features
cleared_df = three_sigma_cleared(
    sigmas=3,
    feature_names=["A2", "A3", "A8"],
    dataset=dropped_df,
)

In [None]:
# Box plots of the same three features after outlier removal
sns.boxplot([cleared_df[name] for name in ("A2", "A3", "A8")]);

Now there are no values outside the 3-sigma threshold.

Encode categorical columns.

In [None]:
# Encode only the columns that are still textual in the frame being encoded.
# The list is derived from cleared_df itself rather than from the earlier
# `categorical_columns` snapshot: A2 was converted to a numeric dtype after that
# snapshot was taken, so it is continuous and must not be label-encoded.
# The target is excluded by name instead of by position ([:-1]).
# NOTE(review): assumes three_sigma_cleared preserves column dtypes — confirm.
columns_to_encode = cleared_df.select_dtypes(include=["object"]).columns.drop(
    "Target", errors="ignore"
)

# Encode categorical features
encoded_df = get_category_encoded(
    dataset=cleared_df,
    category_names=columns_to_encode,
    encoder_type="LabelEncoder",
)

# Get Tree classification
Split dataset.

In [None]:
# Separate the label from the predictors
y = encoded_df["Target"]
X = encoded_df.drop(columns=["Target"])

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X,
    y,
    random_state=100,
    test_size=0.3,
)

Get classifier.

In [None]:
# Configure a depth-limited decision tree that splits on Gini impurity
tree_classifier = tree.DecisionTreeClassifier(
    criterion="gini",
    max_depth=6,
    random_state=42,
)

# Train it on the training subset (";" hides the estimator repr)
tree_classifier.fit(X_train, y_train);

In [None]:
# Get Gini Importance as a metric: the fitted tree exposes one
# impurity-based importance value per input feature
importance_array = tree_classifier.feature_importances_

Here, Gini importance (mean decrease in impurity) scores each feature by summing over the splits that use the feature (across all trees, for an ensemble), each weighted in proportion to the number of samples it splits.

In [None]:
# Wrap the importances in a single-column frame
importance_df = pd.DataFrame({"importance": importance_array})

# Add the positional number of every feature as its own column
importance_df["feature"] = list(range(len(importance_df)))

# Bar chart: one bar per feature position
sns.barplot(x="feature", y="importance", data=importance_df);

There is obviously one dominant feature in the dataset. Let's find it.

In [None]:
# Row label of the largest importance (idxmax returns the first one on ties);
# importance_df has a default integer index, so the label is also the position
important_feature_index = importance_df["importance"].idxmax()

# Largest importance value itself
max_importance = importance_df.loc[important_feature_index, "importance"]

# Map the position to a name using the columns the tree was fitted on.
# encoded_df also contains "Target", so its column positions only line up with
# the feature positions while "Target" happens to be the last column.
main_feature = X_train.columns[important_feature_index]

print(f"Maximum of importance {max_importance:.2f} has feature: {main_feature}")

In [None]:
# Show tree: draw the fitted classifier's structure
# (the trailing ";" hides the list of annotation objects that plot_tree returns)
tree.plot_tree(tree_classifier);

The tree looks amazing).

Make prediction.

In [None]:
# Get predicted values: class labels the tree assigns to the held-out test subset
y_predicted_tree = tree_classifier.predict(X_test)

In [None]:
# Put the true and the predicted labels of the test subset side by side
data_to_compare = pd.DataFrame(
    {"values from test": y_test, "predicted values": y_predicted_tree}
)

# Count how often each class occurs among the true and the predicted labels.
# A count plot is used because a bar plot of the row index against the label
# would show the mean row index per class, not the number of observations.
sns.countplot(
    data=pd.melt(data_to_compare),
    x="value",
    hue="variable",
)

plt.title("Distribution of observations in test dataset and predicted dataset");

The prediction looks good. Let's compute metrics.

In [None]:
# Accuracy on the training subset — shown only to gauge overfitting
tree_train_accuracy = metrics.accuracy_score(
    y_train, tree_classifier.predict(X_train)
)

# Accuracy on the held-out test subset — the figure that reflects how the
# tree performs on data it has not seen during fitting
y_pred = tree_classifier.predict(X_test)
tree_accuracy = metrics.accuracy_score(y_test, y_pred)

print(f"Accuracy of tree is: {tree_accuracy:.3f} (train: {tree_train_accuracy:.3f})")

# Regression

Regularisation by encoding and scaling

In [None]:
# Label-encode the target column as well, via the same project helper
encoded_df = get_category_encoded(
    encoder_type="LabelEncoder",
    category_names=["Target"],
    dataset=encoded_df,
)

In [None]:
# Scale features with the project helper; "Target" is passed as a feature to omit.
# NOTE(review): scaled_df is only used for the heatmap that follows — the
# regression further down is fitted on the unscaled encoded_df. Confirm intent.
scaled_df = get_dataframe_scaled(dataset=encoded_df, omit_feature_names=["Target"]);

In [None]:
# Get new heatmap for the scaled frame; the second argument is the string
# "encoded_dataset" (presumably a title or file label — verify against the helper)
get_heatmap(scaled_df, "encoded_dataset")

Features A4 and A5 are correlated with each other.

Deal with correlating features

In [None]:
# Merge the correlated pair A4 / A5 into one combined feature and drop the parts.
# The guard makes the cell idempotent: on a re-run A4 and A5 are already gone,
# and the unguarded version would raise KeyError. assign() + drop() build a new
# frame instead of mutating in place; the new column is still appended last.
if {"A4", "A5"}.issubset(encoded_df.columns):
    encoded_df = encoded_df.assign(
        **{"A4+A5": encoded_df["A4"] + encoded_df["A5"]}
    ).drop(columns=["A4", "A5"])

# Get new heatmap
get_heatmap(encoded_df, "encoded_dataset")

Looks good)

In [None]:
# Separate the label from the predictors
y = encoded_df["Target"]
X = encoded_df.drop(columns=["Target"])

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X,
    y,
    random_state=100,
    test_size=0.3,
)

In [None]:
# Create a logistic-regression classifier with default settings and train it
regression_model = linear_model.LogisticRegression().fit(X_train, y_train)

# Transposed coefficient matrix of the fitted model
coefficients = regression_model.coef_.transpose()

coefficients

In [None]:
# Wrap the coefficients in a frame and number the rows in a "feature" column
coefficient_df = pd.DataFrame(coefficients, columns=["coefficients"]).assign(
    feature=lambda frame: list(range(len(frame)))
)

# Bar chart: one bar per feature position
sns.barplot(x="feature", y="coefficients", data=coefficient_df);

It looks like there is one main feature. Let's find it.

In [None]:
# Row label of the coefficient with the largest magnitude: in a linear model the
# absolute value measures the influence, the sign only gives its direction.
# (The previous code took .min() while reporting a "maximum".)
important_feature_index = coefficient_df["coefficients"].abs().idxmax()

# Signed value of that coefficient
max_coefficient = coefficient_df.loc[important_feature_index, "coefficients"]
max_index = max_coefficient  # previous variable name, kept for any later cell

# Look the name up in the matrix the model was fitted on. encoded_df cannot be
# used here: its "Target" column sits before "A4+A5" and shifts the positions.
main_feature = X_train.columns[important_feature_index]

# Report the coefficient itself (not the tree's max_importance from earlier)
print(
    f"Largest-magnitude coefficient {max_coefficient:.2f} has feature: {main_feature}"
)

In [None]:
# Get predicted values: class labels the regression model assigns to the test subset
y_predicted_regression = regression_model.predict(X_test)

In [None]:
# Density of the true labels in the test subset (outline only)
sns.kdeplot(
    y_test,
    fill=False,
    color="r",
    label="test subset",
)

# Density of the labels predicted for the same subset (filled)
sns.kdeplot(
    y_predicted_regression,
    fill=True,
    color="b",
    label="predicted",
)

# Title (duplicated "and" removed) and legend
plt.title("Distribution of observations in test dataset and predicted dataset")
plt.legend();

In [None]:
# Accuracy on the training subset — shown only to gauge overfitting
regression_train_accuracy = metrics.accuracy_score(
    y_train, regression_model.predict(X_train)
)

# Accuracy on the held-out test subset. y_predicted_regression keeps holding
# the test predictions instead of being overwritten with train predictions.
y_predicted_regression = regression_model.predict(X_test)
regression_accuracy = metrics.accuracy_score(y_test, y_predicted_regression)

print(
    f"Accuracy of regression is: {regression_accuracy:.3f} "
    f"(train: {regression_train_accuracy:.3f})"
)