# Libraries

In [None]:
import pandas as pd
import numpy as np
from collections import Counter
import seaborn as sns  #for data visualization operations
import matplotlib.pyplot as plt  #for data visualization operations
import plotly.express as px #for scatter 3d
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler # for encoding and standardization
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from prettytable import PrettyTable
%matplotlib inline

# to ignore warnings
import warnings
warnings.filterwarnings("ignore")

# Load DataSet

In [None]:
# Read the original smoking dataset; `df` is an independent working copy
# of `data` that the exploratory cells below operate on.
pathAndFileName = '../data/smoking.csv'
data = pd.read_csv(pathAndFileName)
df = data.copy()

# Preview the first ten rows with a green colour gradient.
df.head(10).style.background_gradient(cmap="Greens")


In [None]:
# Pieces used to build the paths of the competition CSV files.
TRAIN_FILE = 'train'
TEST_FILE = 'test'
PATH_DATA = '../data/'
EXTENSION_FILE = '.csv'

# Load the competition train and test sets.
comp_data_df_train = pd.read_csv(f"{PATH_DATA}{TRAIN_FILE}{EXTENSION_FILE}")
df_test = pd.read_csv(f"{PATH_DATA}{TEST_FILE}{EXTENSION_FILE}")

# Preview the first ten training rows with a green colour gradient.
comp_data_df_train.head(10).style.background_gradient(cmap="Greens")


In [None]:
# Drop the identifier column from the competition data, and from the
# original dataset drop the identifier plus the columns listed below so
# that both frames can be concatenated later.
comp_data_df_train = comp_data_df_train.drop(columns='id')
df_test = df_test.drop(columns='id')
data = data.drop(columns=['ID', 'gender', 'oral', 'tartar'])

# Report the resulting shapes (a bare `data.shape` in the middle of a cell
# is never displayed, so only the prints are kept).
print("competition train shape:", comp_data_df_train.shape)
print("original train shape:", data.shape)

In [None]:
# Combine the competition training data with the original dataset.
# ignore_index=True gives the result a fresh unique index; without it both
# sources keep their own row labels and the combined index has duplicates,
# which label-aligned operations (e.g. seaborn plots with `hue`) can reject.
df_train = pd.concat([comp_data_df_train, data], ignore_index=True)
print("combined train shape:", df_train.shape)

# Remove exact duplicate rows and renumber the remaining ones.
df_train = df_train.drop_duplicates().reset_index(drop=True)
print("combined train shape without duplicates:", df_train.shape)

# Missing values per column (printed explicitly: expressions in the middle
# of a cell are not displayed by the notebook).
print("missing values (train):", df_train.isna().sum(), sep="\n")
print("missing values (test):", df_test.isna().sum(), sep="\n")

# Friendlier column names.
# NOTE(review): this mapping is only defined here; no visible cell applies it.
new_col_names = {
    "systolic": "systolic bp",
    "relaxation": "diastolic bp",
    "Gtp": "GGT",
    "dental caries": "dental cavities",
}


In [None]:
# Remove the ID column from the working copy and list the remaining columns.
df = df.drop(columns="ID")
df.columns



In [None]:
# Count the missing values in each column
df.isnull().sum()

In [None]:
# (rows, columns) before duplicate rows are removed
df.shape

In [None]:
# Drop rows that are exact duplicates of an earlier row
df = df.drop_duplicates()

In [None]:
# (rows, columns) after duplicate rows are removed
df.shape

# Removing and Checking Outliers

In [None]:
# Preview the first ten rows of the working copy with a green colour gradient.
df.head(10).style.background_gradient(cmap="Greens")

In [None]:
# Columns included in the box plot (and in the density plots further down).
numeric_cols = [
    'age', 'height(cm)', 'weight(kg)', 'waist(cm)',
    'eyesight(left)', 'eyesight(right)',
    'hearing(left)', 'hearing(right)',
    'systolic', 'relaxation', 'fasting blood sugar',
    'Cholesterol', 'triglyceride', 'HDL', 'LDL', 'hemoglobin',
    'Urine protein', 'AST', 'ALT', 'Gtp', 'dental caries',
]

# One wide figure holding a box per column.
fig, ax = plt.subplots(figsize=(28, 8))
df.boxplot(column=numeric_cols, ax=ax)
ax.set_title('Boxplot of Numerical Columns')
plt.show()

# Plotting each column


In [None]:
# Draw one kernel-density plot per column listed in numeric_cols.
sns.set_style("whitegrid")  # the style is the same for every plot, so set it once
for column_name in numeric_cols:
    sns.kdeplot(data=df[column_name])
    plt.xlabel("Values")
    plt.title(f"Density plot of {column_name}")
    plt.show()

# Distribution of Smokers vs Non-Smokers

In [None]:
# Bar chart with the number of rows per value of the 'smoking' column.
plt.figure(figsize=(10, 6))
count_axes = sns.countplot(x='smoking', data=data)
count_axes.set_title('Distribution of Smokers vs Non-Smokers')
count_axes.set_xlabel('Smoker')
count_axes.set_ylabel('Count')
plt.show()

In [None]:
# ----------------------------------------------------------------------------------- #
# Visually explore the distribution of each feature and how it relates to the
# target variable 'smoking', to spot patterns and differences between the
# two groups.
# ----------------------------------------------------------------------------------- #
target_counts = df_train['smoking'].value_counts()

# value_counts() orders classes by frequency, so the labels are derived from
# the index rather than hard-coded in a fixed order.
# NOTE(review): assumes 'smoking' is coded 1 = smoker, 0 = non-smoker — confirm.
target_names = {0: 'Non-smoking', 1: 'Smoking'}
pie_labels = [target_names.get(value, str(value)) for value in target_counts.index]
plt.pie(target_counts, labels=pie_labels, autopct='%1.1f%%', textprops={'fontsize': 14});

# Histogram grid: four plots per row, with as many rows as the columns need.
num_cols = len(df_train.columns)
grid_cols = 4
grid_rows = -(-num_cols // grid_cols)  # ceiling division
plt.figure(figsize=(16, grid_rows * 3))
for i, col in enumerate(df_train.columns):
    plt.subplot(grid_rows, grid_cols, i + 1)
    sns.histplot(x=col, hue='smoking', data=df_train, bins=50)
    plt.title(f'{col} Distribution')
# Laying out once after all axes exist is enough.
plt.tight_layout()
plt.show()



##### Here we use the interquartile range (IQR) to detect outliers so that they can be removed

In [None]:
def detect_outliers(df, min_outlier_occurrences, columns):
    """Return the index labels of rows that are IQR outliers in several columns.

    A value is an outlier for its column when it lies more than 1.5 * IQR
    below the 25th percentile or above the 75th percentile of that column
    (NaN values are ignored when the percentiles are computed).

    Parameters
    ----------
    df : DataFrame containing the columns to inspect.
    min_outlier_occurrences : minimum number of columns in which a row must
        be an outlier for it to be returned.
    columns : iterable of column labels to check.

    Returns
    -------
    list of index labels, in the order in which they were first flagged.
    """
    hits_per_row = Counter()
    for name in columns:
        values = df[name]
        lower_quartile = np.nanpercentile(values, 25)
        upper_quartile = np.nanpercentile(values, 75)
        fence = 1.5 * (upper_quartile - lower_quartile)

        too_low = values < lower_quartile - fence
        too_high = values > upper_quartile + fence
        # Each flagged row gets one more hit for this column.
        hits_per_row.update(values[too_low | too_high].index)

    return [
        label
        for label, hits in hits_per_row.items()
        if hits >= min_outlier_occurrences
    ]


# Flag rows that are outliers in at least five float/int columns, then drop them.
outlier_rows = detect_outliers(
    df,
    5,
    df.select_dtypes(["float", "int"]).columns,
)
df = df.drop(index=outlier_rows)

In [None]:
# Missing values per column after the outlier rows were dropped
df.isnull().sum()

In [None]:
# (rows, columns) after the outlier rows were dropped
df.shape

In [None]:
# Summary statistics of the numeric columns
df.describe()

In [None]:
# Column dtypes and non-null counts
df.info()

In [None]:
# Pie chart of the number of rows per gender; the second slice is pulled out slightly.
gender_counts = df.groupby('gender').size()
gender_counts.plot(
    kind='pie',
    explode=[0, 0.1],
    autopct='%1.1f%%',
    shadow=True,
    colors=["pink", "gray"],
    title="Gender",
)

In [None]:
# Bar chart of the number of rows per 'tartar' value. The x axis shows the
# tartar categories, so it is labelled accordingly (it was labelled 'Age').
df.groupby('tartar').size().plot(kind='bar', title="Tartar", rot=0, xlabel='Tartar', ylabel='Count')

In [None]:
# 2x3 grid of scatter plots; every panel colours points by 'smoking' and
# sizes them by 'gender'.
fig, axes = plt.subplots(2, 3, figsize=(20, 12))
axes = axes.flatten()

# (x column, y column, panel title) for each of the six panels.
scatter_specs = [
    ("relaxation", "hemoglobin", "Relationship between 'Relaxation' and 'hemoglobin'"),
    ("systolic", "Cholesterol", "Relationship between 'Systolic' and 'cholesterol'"),
    ("LDL", "Urine protein", "Relationship between 'LDL' and 'urine protein'"),
    ("HDL", "serum creatinine", "Relationship between 'HDL' and 'serum creatinine'"),
    # This panel plots weight(kg) against Gtp; its title previously said 'GTP' and 'age'.
    ("weight(kg)", "Gtp", "Relationship between 'weight(kg)' and 'GTP'"),
    ("AST", "fasting blood sugar", "Relationship between 'AST' and 'fasting blood sugar'"),
]

for axis, (x_col, y_col, panel_title) in zip(axes, scatter_specs):
    sns.scatterplot(
        ax=axis,
        x=x_col,
        y=y_col,
        hue="smoking",
        size="gender",
        sizes=(20, 100),
        legend="full",
        data=df,
    ).set(title=panel_title)

In [None]:
# Histogram (12 bins) of every numeric column of df, without legends.
df.hist(bins=12, figsize=(20, 20), legend=False);

In [None]:
# 2x2 grid of bar charts with the value frequencies of four columns.
fig, axes = plt.subplots(2, 2, figsize=(12, 12))
axes = axes.flatten()

# (column, panel title) for each panel, in grid order.
bar_specs = [
    ("gender", "Frequency of classes of the 'gender' variable"),
    ("tartar", "Frequency of cases of tartar"),
    ("dental caries", "Frequency of cases of dental caries"),
    ("smoking", "Frequency of classes of the 'smoking' variable"),
]

for axis, (column, panel_title) in zip(axes, bar_specs):
    counts = df[column].value_counts()
    sns.barplot(
        ax=axis,
        x=counts.index,
        y=counts.values,
        saturation=1,
    ).set(title=panel_title)

In [None]:
# Interactive 3D scatter of hemoglobin / age / Cholesterol, coloured by 'smoking'.
axis_columns = {"x": "hemoglobin", "y": "age", "z": "Cholesterol"}
fig = px.scatter_3d(df, color="smoking", **axis_columns)
fig.show();

# DATA PROCESSING

In [None]:
# Replace the values of the categorical columns and of the target with
# integer codes. Each column is encoded exactly once (previously every
# fit_transform was run twice and the first result thrown away).
lbe = LabelEncoder()
for column in ["gender", "tartar", "oral", "smoking"]:
    # fit_transform refits the encoder, so after the loop `lbe` holds the
    # classes of the last column ('smoking'), as before.
    df[column] = lbe.fit_transform(df[column])

In [None]:
# Preview the first rows after label encoding
df.head()

In [None]:
# Target (label): the 'smoking' column.
y = df["smoking"]

# Features: every other column of df.
x = df.drop(columns="smoking")

In [None]:
# Hold out 20% of the rows for testing; shuffling is seeded for repeatability.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=1,
)

# Data Scaling

In [None]:
# Standardize the features. The scaler is fitted on the training split only
# and the same transformation is then applied to the test split.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

# Classification Models

## Support Vector Machine (SVM) Model 

In [None]:
# Train an RBF-kernel SVM on the training split and evaluate it on the test split.
svm = SVC(kernel='rbf')
svm.fit(x_train, y_train)
y_predict_SVM = svm.predict(x_test)

# Per-class report followed by the overall accuracy as a percentage.
print(classification_report(y_test, y_predict_SVM))
accuracy_Score_SVM = metrics.accuracy_score(y_test, y_predict_SVM)
print(f'SVM model accuracy is: {accuracy_Score_SVM * 100:.2f}%')

### Confusion Matrix for SVM

In [None]:
# Confusion matrix of the SVM predictions, printed and drawn as a heat map.
cm = metrics.confusion_matrix(y_test, y_predict_SVM)
print('Confusion Matrix for SVM :\n', cm, '\n')

fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(cm, cmap=plt.cm.Blues, alpha=0.5)
# Write every cell's count at the centre of the cell.
for (row, col), count in np.ndenumerate(cm):
    ax.text(x=col, y=row, s=count, va='center', ha='center', size='xx-large')

ax.set_xlabel('Predictions', fontsize=18)
ax.set_ylabel('Actuals', fontsize=18)
ax.set_title('Confusion Matrix of SVM', fontsize=15)
plt.show()

## Random Forest Classifier  Model 

In [None]:
# Train a 500-tree random forest on the training split and evaluate it on the test split.
models = RandomForestClassifier(n_estimators=500)
models.fit(x_train, y_train)
y_predict_random = models.predict(x_test)

# Per-class report followed by the overall accuracy as a percentage.
print(classification_report(y_test, y_predict_random))
accuracy_Score_random = metrics.accuracy_score(y_test, y_predict_random)
print(f'RandomForest model accuracy is: {accuracy_Score_random * 100:.2f}%')

### Confusion Matrix for Random Forest Classifier

In [None]:
# Confusion matrix of the random-forest predictions, printed and drawn as a heat map.
cm = metrics.confusion_matrix(y_test, y_predict_random)
print('Confusion Matrix for Random Forest Classifier :\n', cm, '\n')

fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(cm, cmap=plt.cm.Blues, alpha=0.5)
# Write every cell's count at the centre of the cell.
for (row, col), count in np.ndenumerate(cm):
    ax.text(x=col, y=row, s=count, va='center', ha='center', size='xx-large')

ax.set_xlabel('Predictions', fontsize=18)
ax.set_ylabel('Actuals', fontsize=18)
ax.set_title('Confusion Matrix of Random Forest Classifier', fontsize=15)
plt.show()

### XGBoost model

In [None]:
# Train a 600-round XGBoost classifier on the training split and evaluate it on the test split.
xgb_model = XGBClassifier(n_estimators=600)
xgb_model.fit(x_train, y_train)
pred_xgb = xgb_model.predict(x_test)

# Per-class report followed by the overall accuracy as a percentage.
print(classification_report(y_test, pred_xgb))
accuracy_Score_xgb = metrics.accuracy_score(y_test, pred_xgb)
print(f'XGBoosting model accuracy is: {accuracy_Score_xgb * 100:.2f}%')

### Confusion Matrix for XGBoost model

In [None]:
# Confusion matrix of the XGBoost predictions, printed and drawn as a heat map.
cm = metrics.confusion_matrix(y_test, pred_xgb)
print('Confusion Matrix for XGBoosting Classifier :\n', cm, '\n')

fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(cm, cmap=plt.cm.Blues, alpha=0.5)
# Write every cell's count at the centre of the cell.
for (row, col), count in np.ndenumerate(cm):
    ax.text(x=col, y=row, s=count, va='center', ha='center', size='xx-large')

ax.set_xlabel('Predictions', fontsize=18)
ax.set_ylabel('Actuals', fontsize=18)
ax.set_title('Confusion Matrix of XGBoosting Classifier', fontsize=15)
plt.show()

### Logistic Regression

In [None]:
C_list = [0.001, 0.01, 0.1, 1, 10]
overall_scores = []
for C in C_list:
    print(f'Trying C={C}')
    model = LogisticRegression(solver='lbfgs', C=C, random_state=SEED, max_iter=1000)
    fold_score = fit_model_with_skf(X_train, y_train, model)
    fold_score.append(C)
    overall_scores.append(fold_score)

In [None]:
for mean, std, C in overall_scores:
    print(f'C={C: <5} mean: {mean:.4f}, std: +- {std:.4f}')

In [None]:
log_reg_final = LogisticRegression(solver='lbfgs', C=1, random_state=SEED, max_iter=1000)

### Confussion Matrix for Logistic Regression

In [None]:
cm = metrics.confusion_matrix(y_test, pred_xgb)
print('Confusion Matrix for Logistic Regression :\n', cm, '\n')
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(cm, cmap=plt.cm.Blues, alpha=0.5)
for i in range(cm.shape[0]):
    for j in range(cm.shape[1]):
        ax.text(x=j, y=i,s=cm[i, j], va='center', ha='center', size='xx-large')

### ROC Curve

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score

# ROC analysis needs a continuous score per sample. Hard 0/1 predictions
# yield a single operating point and an understated AUC, so the predicted
# probability of the positive class is used instead.
xgb_scores = xgb_model.predict_proba(x_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, xgb_scores)
auc = roc_auc_score(y_test, xgb_scores)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % auc)
plt.plot([0, 1], [0, 1], 'k--')  # Diagonal line = chance level
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC)')
plt.legend(loc="lower right")
plt.show()

### Feature Importance

In [None]:
# Importance score of every feature according to the fitted XGBoost model,
# paired with the feature names taken from the feature DataFrame.
importance_scores = xgb_model.feature_importances_
feature_names = x.columns
feature_importance = dict(zip(feature_names, importance_scores))

# Text listing, most important feature first.
sorted_features = sorted(
    feature_importance.items(),
    key=lambda item: item[1],
    reverse=True,
)
for feature, importance in sorted_features:
    print(f"{feature}: {importance}")

# Horizontal bar chart, one bar per feature in the original column order.
bar_positions = range(len(importance_scores))
plt.figure(figsize=(10, 6))
plt.barh(bar_positions, importance_scores, align='center')
plt.yticks(range(len(feature_names)), feature_names)
plt.xlabel('Feature Importance')
plt.ylabel('Features')
plt.title('Feature Importance Scores')
plt.show()

In [None]:
# Summary table with the test accuracy of every model.
# The table has its own name so it does not overwrite the feature
# DataFrame `x`, which earlier cells still need when they are re-run.
results_table = PrettyTable()
print('\n')
print("Comparison of Models Results")
results_table.field_names = ["Model", "Accuracy"]

# (display name, fitted estimator); all accuracies are rounded to three
# decimals so the rows are comparable.
fitted_models = [
    ("Random Forest", models),
    ("SVM", svm),
    ("XGBoost", xgb_model),
    ("Logistic Regression", log_reg_final),
]
for model_name, estimator in fitted_models:
    test_accuracy = accuracy_score(y_test, estimator.predict(x_test))
    results_table.add_row([model_name, round(test_accuracy, 3)])

print(results_table)
print('\n')