<a href="https://www.kaggle.com/code/kitsewio/social-anxiety-acc-93?scriptVersionId=229629890" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# 📌If the notebook seemed useful to you, please do not forget to vote!❤️


I considered two approaches to the solution. In the first, I treated it as a classification problem, and in the second, as a regression problem. In the second case, I predicted the anxiety level as a continuous value and rounded it to the nearest integer. This approach showed a better result than the classification approach.

Because of the class imbalance, it is difficult for the model to distinguish between closely related classes. During the experiments, I therefore also decided to merge the levels of social anxiety into 2 classes - 0: the first 5 levels, 1: the last 5 levels. The implementation can be seen in the "Second variant" block.

# Importing libraries

In [None]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from scipy.stats import uniform, randint
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, accuracy_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from scipy.stats import uniform, randint
from sklearn.metrics import make_scorer
import xgboost as xgb

# EDA

In [None]:
# Load the anxiety dataset from the Kaggle input directory into a DataFrame.
data = pd.read_csv("/kaggle/input/social-anxiety-dataset/enhanced_anxiety_dataset.csv")

In [None]:
# Preview the first rows of the raw data.
data.head()

In [None]:
# Count the missing values in each column.
data.isnull().sum()

In [None]:
# List the distinct values of the "Occupation" column.
data["Occupation"].unique()

# Data visualization

In [None]:
# Bar chart of the number of rows per age, with ages in ascending order.
age_counts = data['Age'].value_counts().sort_index()

fig, ax = plt.subplots(figsize=(10, 6))
age_counts.plot(kind='bar', color='skyblue', ax=ax)
ax.set_title('Age visualisation')
ax.set_xlabel('Age')
ax.set_ylabel('Count')
# Tilt the age labels so neighbouring ticks do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45)
ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()


In [None]:
# Count plot of the target column to show how the anxiety levels are distributed.
fig, ax = plt.subplots(figsize=(5, 5))
sns.countplot(data=data, x="Anxiety Level (1-10)", palette="Set1", ax=ax)
ax.set_title("Distribution of anxiety Level")
plt.show()

There is a strong imbalance visible here.

In [None]:
# Count plot of the anxiety levels, with one group of bars per gender.
fig, ax = plt.subplots(figsize=(15, 7))
sns.countplot(
    data=data,
    x='Gender',
    hue='Anxiety Level (1-10)',
    palette='Set1',
    ax=ax,
)
ax.set_title("Distribution of anxiety Level by Gender")
ax.legend()
plt.show()

In [None]:
# Preview the first rows again before dropping the categorical columns.
data.head()

Removing the categorical columns to build a correlation matrix.

In [None]:
# Columns that hold categorical values; they are left out of the
# correlation matrix.
objects = [
    'Gender',
    'Occupation',
    'Smoking',
    'Family History of Anxiety',
    'Dizziness',
    'Recent Major Life Event',
    'Medication',
]
df = data.drop(columns=objects)

df.head()

In [None]:
# Pairwise correlations of the remaining columns, drawn as an annotated heatmap.
correlation_matrix = df.corr()

fig, ax = plt.subplots(figsize=(12, 12))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap="coolwarm",
    fmt=".2f",
    ax=ax,
)
ax.set_title("Correlation matrix")
plt.show()

# Encoder *get_dummies*

The values of the columns *'Gender'* and *'Occupation'* affect the value of "social anxiety", so I one-hot encode them with the **get_dummies** method.

In [None]:
# One-hot encode the two nominal columns; every other column is kept as-is.
one_hot_columns = ['Gender', 'Occupation']
data_dum = pd.get_dummies(data, columns=one_hot_columns)

data_dum

In [None]:
# Cast the target column to integers (a no-op if the column is absent).
if 'Anxiety Level (1-10)' in data_dum.columns:
    data_dum['Anxiety Level (1-10)'] = data_dum['Anxiety Level (1-10)'].astype(int)

data_dum

Shifting the anxiety level by -1 (from 1-10 to 0-9), since the classifier only works with class labels starting from 0.

In [None]:
# Shift the target down by one (a no-op if the column is absent).
# NOTE: re-running this cell shifts the values again.
if 'Anxiety Level (1-10)' in data_dum.columns:
    data_dum['Anxiety Level (1-10)'] = data_dum['Anxiety Level (1-10)'] - 1

data_dum

# Label Encoder

In [None]:
# Replace every remaining object-dtype column with integer codes.
# The encoder is re-fitted for each column, so codes are per-column.
label_encoder = LabelEncoder()

object_columns = [
    name for name in data_dum.columns if data_dum[name].dtype == 'object'
]
for name in object_columns:
    data_dum[name] = label_encoder.fit_transform(data_dum[name])

data_dum

In [None]:
# Convert the boolean columns (e.g. the one-hot indicators) to integers.
bool_columns = [
    name for name in data_dum.columns if data_dum[name].dtype == 'bool'
]
data_dum = data_dum.astype({name: int for name in bool_columns})

data_dum

In [None]:
# Separate the label column from the feature matrix.
target = "Anxiety Level (1-10)"

y = data_dum[target]
X = data_dum.drop(columns=target)

In [None]:
# Hold out 20% of the rows as a test set; random_state fixes the shuffle
# so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# XGBoost Classifier

Automatic hyperparameter selection

In [None]:
# Randomized hyperparameter search for the 10-class XGBoost classifier.
#
# The estimator is kept single-threaded (n_jobs=1) because RandomizedSearchCV
# below already runs the candidate fits in parallel on every core
# (n_jobs=-1). Letting both levels use all cores oversubscribes the CPU and
# slows the search down through thread contention.
model_class = xgb.XGBClassifier(
    objective="multi:softmax",
    num_class=10,
    random_state=42,
    n_jobs=1
)

# scipy's uniform(loc, scale) samples from [loc, loc + scale];
# randint(low, high) samples integers from low to high - 1.
params = {
    "colsample_bytree": uniform(0.6, 0.4),   # 0.6 .. 1.0
    "gamma": uniform(0, 0.5),                # 0 .. 0.5
    "learning_rate": uniform(0.03, 0.3),     # 0.03 .. 0.33
    "max_depth": randint(2, 6),              # 2 .. 5
    "n_estimators": randint(100, 150),       # 100 .. 149
    "subsample": uniform(0.6, 0.4),          # 0.6 .. 1.0
    "reg_alpha": uniform(0, 1),              # 0 .. 1
    "reg_lambda": uniform(0, 1)              # 0 .. 1
}

# 200 sampled candidates, each scored by 3-fold cross-validated accuracy.
search = RandomizedSearchCV(
    model_class,
    param_distributions=params,
    n_iter=200,
    cv=3,
    scoring="accuracy",
    verbose=1,
    random_state=42,
    n_jobs=-1
)

search.fit(X_train, y_train)

print("Best parameters:", search.best_params_)
print("Best accuracy:", search.best_score_)

In [None]:
# Hyperparameters hard-coded for the final classifier, so the model can be
# rebuilt without re-running the search.
best_params = dict(
    colsample_bytree=0.919942102378926,
    gamma=0.21260675223461672,
    learning_rate=0.03674079249603522,
    max_depth=3,
    n_estimators=108,
    reg_alpha=0.5416342146608669,
    reg_lambda=0.6334782198261473,
    subsample=0.7031550741732809,
)

classifier_settings = dict(
    objective="multi:softmax",
    random_state=42,
    n_jobs=-1,
)
final_model_class = xgb.XGBClassifier(**classifier_settings, **best_params)

final_model_class.fit(X_train, y_train)

In [None]:
# Predict the held-out rows and report the share of exact matches.
y_pred_c = final_model_class.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_c)
print("Accuracy:", accuracy)

In [None]:
# Confusion matrix of true vs. predicted classes for the classifier.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred_c)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()
plt.show()

# XGBoost Regression

Automatic hyperparameter selection

In [None]:
# These duplicate the imports at the top of the notebook; they are kept so
# this cell still runs if it is executed on its own.
from scipy.stats import uniform, randint
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer
import xgboost as xgb
import numpy as np

# Randomized hyperparameter search for the XGBoost regressor, which treats
# the anxiety level as a continuous value.
#
# The estimator is kept single-threaded (n_jobs=1) because RandomizedSearchCV
# below already runs the candidate fits in parallel on every core
# (n_jobs=-1). Letting both levels use all cores oversubscribes the CPU and
# slows the search down through thread contention.
model_reg = xgb.XGBRegressor(
    objective="reg:squaredlogerror",
    random_state=42,
    n_jobs=1
)

# scipy's uniform(loc, scale) samples from [loc, loc + scale];
# randint(low, high) samples integers from low to high - 1.
params = {
    "colsample_bytree": uniform(0.6, 0.4),   # 0.6 .. 1.0
    "gamma": uniform(0, 0.5),                # 0 .. 0.5
    "learning_rate": uniform(0.03, 0.3),     # 0.03 .. 0.33
    "max_depth": randint(2, 6),              # 2 .. 5
    "n_estimators": randint(100, 150),       # 100 .. 149
    "subsample": uniform(0.6, 0.4),          # 0.6 .. 1.0
    "reg_alpha": uniform(0, 1),              # 0 .. 1
    "reg_lambda": uniform(0, 1)              # 0 .. 1
}

# 200 sampled candidates, each scored by 3-fold cross-validated MSE.
search = RandomizedSearchCV(
    model_reg,
    param_distributions=params,
    n_iter=200,
    cv=3,
    scoring="neg_mean_squared_error",
    verbose=1,
    random_state=42,
    n_jobs=-1
)

search.fit(X_train, y_train)

print("Best parameters:", search.best_params_)
# The scorer returns the negated MSE, so flip the sign for display.
print("Best MSE:", -search.best_score_)

In [None]:
# Hyperparameters hard-coded for the final regressor, so the model can be
# rebuilt without re-running the search.
best_params = dict(
    colsample_bytree=0.7874772639179881,
    gamma=0.2074097511688326,
    learning_rate=0.11202212157921186,
    max_depth=5,
    n_estimators=105,
    reg_alpha=0.691714397168912,
    reg_lambda=0.5343462750294631,
    subsample=0.8999642997879886,
)

regressor_settings = dict(
    objective="reg:squaredlogerror",
    random_state=42,
    n_jobs=-1,
)
final_model_reg = xgb.XGBRegressor(**regressor_settings, **best_params)

final_model_reg.fit(X_train, y_train)

In [None]:
# Continuous predictions of the regressor for the held-out rows.
y_pred_reg = final_model_reg.predict(X_test)
y_pred_reg

In [None]:
# Turn the continuous predictions into class labels: round to the nearest
# integer, then clip to the valid label range 0..9 (the shifted anxiety
# levels). Without the clip, a prediction that rounds outside that range
# would add a label that does not exist to the metrics and the confusion
# matrix.
y_pred_round = np.clip(np.round(y_pred_reg), 0, 9).astype(int)
y_pred_round

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.metrics import accuracy_score

# Score the rounded predictions with both regression metrics and accuracy.
mse, mae, r2, accuracy = (
    metric(y_test, y_pred_round)
    for metric in (mean_squared_error, mean_absolute_error, r2_score, accuracy_score)
)

for label, value in (("MSE:", mse), ("MAE:", mae), ("R²:", r2), ("Accuracy:", accuracy)):
    print(label, value)

In [None]:
# Confusion matrix of true vs. rounded predicted levels for the regressor.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred_round)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()
plt.show()

In [None]:
# Side-by-side table of the raw prediction, the true label and the rounded
# prediction, for eyeballing individual rows.
data_pred = dict(
    y_pred=y_pred_reg,
    y_true=y_test,
    y_fin=y_pred_round,
)

df_check = pd.DataFrame(data_pred)
df_check.head(20)

In [None]:
# Show the column dtypes and non-null counts of the comparison table.
df_check.info()

In [None]:
# Histogram of the residuals (true label minus raw regression output).
errors = df_check['y_true'].sub(df_check['y_pred'])

fig, ax = plt.subplots()
ax.hist(errors, bins=20, edgecolor='black')
ax.set_xlabel("Error pred")
ax.set_ylabel("Count")
ax.set_title("Error distribution")
plt.show()

# Second variant anxiety levels

In [None]:
# List the distinct target values before merging them into two classes.
data_dum['Anxiety Level (1-10)'].unique()

In [None]:
# Build the binary-target copy of the data: shifted levels 0..4 become
# class 0, levels 5..9 become class 1, and any other value is left as-is.
df_copy = data_dum.copy()

levels = df_copy['Anxiety Level (1-10)']
is_low = levels.between(0, 4)
is_high = levels.between(5, 9)
df_copy['Anxiety Level (1-10)'] = levels.mask(is_low, 0).mask(is_high, 1)

df_copy = df_copy.rename(columns={'Anxiety Level (1-10)': 'Anxiety Level (0-1)'})
df_copy['Anxiety Level (0-1)'].unique()

In [None]:
# Separate the binary label column from the feature matrix.
target = "Anxiety Level (0-1)"

y = df_copy[target]
X = df_copy.drop(columns=target)

In [None]:
# Hold out 20% of the rows as a test set for the binary target;
# random_state fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# XGBoost Regression

In [None]:
# Randomized hyperparameter search for the XGBoost regressor on the binary
# (0/1) target.
#
# The estimator is kept single-threaded (n_jobs=1) because RandomizedSearchCV
# below already runs the candidate fits in parallel on every core
# (n_jobs=-1). Letting both levels use all cores oversubscribes the CPU and
# slows the search down through thread contention.
model_reg = xgb.XGBRegressor(
    objective="reg:squaredlogerror",
    random_state=42,
    n_jobs=1
)

# scipy's uniform(loc, scale) samples from [loc, loc + scale];
# randint(low, high) samples integers from low to high - 1.
params = {
    "colsample_bytree": uniform(0.6, 0.4),   # 0.6 .. 1.0
    "gamma": uniform(0, 0.5),                # 0 .. 0.5
    "learning_rate": uniform(0.03, 0.3),     # 0.03 .. 0.33
    "max_depth": randint(2, 6),              # 2 .. 5
    "n_estimators": randint(100, 150),       # 100 .. 149
    "subsample": uniform(0.6, 0.4),          # 0.6 .. 1.0
    "reg_alpha": uniform(0, 1),              # 0 .. 1
    "reg_lambda": uniform(0, 1)              # 0 .. 1
}

# 200 sampled candidates, each scored by 3-fold cross-validated MSE.
search = RandomizedSearchCV(
    model_reg,
    param_distributions=params,
    n_iter=200,
    cv=3,
    scoring="neg_mean_squared_error",
    verbose=1,
    random_state=42,
    n_jobs=-1
)

search.fit(X_train, y_train)

print("Best parameters:", search.best_params_)
# The scorer returns the negated MSE, so flip the sign for display.
print("Best MSE:", -search.best_score_)

In [None]:
# Hyperparameters hard-coded for the final binary-target regressor, so the
# model can be rebuilt without re-running the search.
best_params = dict(
    colsample_bytree=0.6636982953117058,
    gamma=0.1018506246381371,
    learning_rate=0.057349347121857505,
    max_depth=5,
    n_estimators=114,
    reg_alpha=0.45369363460528267,
    reg_lambda=0.5303543937282561,
    subsample=0.6071447569913277,
)

regressor_settings = dict(
    objective="reg:squaredlogerror",
    random_state=42,
    n_jobs=-1,
)
final_model_reg = xgb.XGBRegressor(**regressor_settings, **best_params)

final_model_reg.fit(X_train, y_train)

In [None]:
# Continuous predictions of the binary-target regressor for the held-out rows.
y_pred_reg = final_model_reg.predict(X_test)
y_pred_reg

In [None]:
# Turn the continuous predictions into class labels: round to the nearest
# integer, then clip to the two valid labels 0 and 1. Without the clip, a
# prediction that rounds outside that range would add a label that does not
# exist to the metrics and the confusion matrix.
y_pred_round = np.clip(np.round(y_pred_reg), 0, 1).astype(int)
y_pred_round

In [None]:
# Score the rounded binary predictions with regression metrics and accuracy.
mse, mae, r2, accuracy = (
    metric(y_test, y_pred_round)
    for metric in (mean_squared_error, mean_absolute_error, r2_score, accuracy_score)
)

for label, value in (("MSE:", mse), ("MAE:", mae), ("R²:", r2), ("Accuracy:", accuracy)):
    print(label, value)

In [None]:
# Confusion matrix of true vs. rounded predicted classes for the binary target.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred_round)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()
plt.show()