## Simple machine learning notebook

In [226]:
# cell setup and imports
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score,\
 recall_score, f1_score, roc_auc_score, roc_curve, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, label_binarize
import matplotlib.pyplot as plt

In [227]:
# data loading and exploration 
# you can use this link to download https://www.kaggle.com/datasets/adilshamim8/social-media-addiction-vs-relationships
df = pd.read_csv('StudentsSocialMediaAddiction.csv')
df.info()
df.describe()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 705 entries, 0 to 704
Data columns (total 13 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Student_ID                    705 non-null    int64  
 1   Age                           705 non-null    int64  
 2   Gender                        705 non-null    object 
 3   Academic_Level                705 non-null    object 
 4   Country                       705 non-null    object 
 5   Avg_Daily_Usage_Hours         705 non-null    float64
 6   Most_Used_Platform            705 non-null    object 
 7   Affects_Academic_Performance  705 non-null    object 
 8   Sleep_Hours_Per_Night         705 non-null    float64
 9   Mental_Health_Score           705 non-null    int64  
 10  Relationship_Status           705 non-null    object 
 11  Conflicts_Over_Social_Media   705 non-null    int64  
 12  Addicted_Score                705 non-null    int64  
dtypes: fl

Unnamed: 0,Student_ID,Age,Avg_Daily_Usage_Hours,Sleep_Hours_Per_Night,Mental_Health_Score,Conflicts_Over_Social_Media,Addicted_Score
count,705.0,705.0,705.0,705.0,705.0,705.0,705.0
mean,353.0,20.659574,4.918723,6.868936,6.22695,2.849645,6.436879
std,203.660256,1.399217,1.257395,1.126848,1.105055,0.957968,1.587165
min,1.0,18.0,1.5,3.8,4.0,0.0,2.0
25%,177.0,19.0,4.1,6.0,5.0,2.0,5.0
50%,353.0,21.0,4.8,6.9,6.0,3.0,7.0
75%,529.0,22.0,5.8,7.7,7.0,4.0,8.0
max,705.0,24.0,8.5,9.6,9.0,5.0,9.0


In [228]:
df.head()

Unnamed: 0,Student_ID,Age,Gender,Academic_Level,Country,Avg_Daily_Usage_Hours,Most_Used_Platform,Affects_Academic_Performance,Sleep_Hours_Per_Night,Mental_Health_Score,Relationship_Status,Conflicts_Over_Social_Media,Addicted_Score
0,1,19,Female,Undergraduate,Bangladesh,5.2,Instagram,Yes,6.5,6,In Relationship,3,8
1,2,22,Male,Graduate,India,2.1,Twitter,No,7.5,8,Single,0,3
2,3,20,Female,Undergraduate,USA,6.0,TikTok,Yes,5.0,5,Complicated,4,9
3,4,18,Male,High School,UK,3.0,YouTube,No,7.0,7,Single,1,4
4,5,21,Male,Graduate,Canada,4.5,Facebook,Yes,6.0,6,In Relationship,2,7


In [229]:
# feature engineering
df_copy = df


In [230]:
# identify category columns
cat_cols = df.select_dtypes(include=['object']).columns

In [231]:
# one hot encode them
for col in cat_cols:
    df_copy[col] = LabelEncoder().fit_transform(df_copy[col])

In [232]:
df['Age'] = df['Age'].astype('int')
df['Gender'] = df['Gender'].astype('category')
df['Gender'] = df['Gender'].cat.codes

df['Academic_Level'] = df['Academic_Level'].astype('category')
df['Academic_Level'] = df['Academic_Level'].cat.codes

df['Most_Used_Platform'] = df['Most_Used_Platform'].astype('category')
df['Most_Used_Platform'] = df['Most_Used_Platform'].cat.codes

df['Affects_Academic_Performance'] = df['Affects_Academic_Performance'].astype('category')
df['Affects_Academic_Performance'] = df['Affects_Academic_Performance'].cat.codes

df['Relationship_Status'] = df['Relationship_Status'].astype('category')
df['Relationship_Status'] = df['Relationship_Status'].cat.codes
df.head()

Unnamed: 0,Student_ID,Age,Gender,Academic_Level,Country,Avg_Daily_Usage_Hours,Most_Used_Platform,Affects_Academic_Performance,Sleep_Hours_Per_Night,Mental_Health_Score,Relationship_Status,Conflicts_Over_Social_Media,Addicted_Score
0,1,19,0,2,10,5.2,1,1,6.5,6,1,3,8
1,2,22,1,0,39,2.1,7,0,7.5,8,2,0,3
2,3,20,0,2,102,6.0,6,1,5.0,5,0,4,9
3,4,18,1,1,101,3.0,11,0,7.0,7,2,1,4
4,5,21,1,0,18,4.5,0,1,6.0,6,1,2,7


In [233]:
def create_feature(df):
    affected_count = (df['Affects_Academic_Performance'] == 1).sum()
    total_count = len(df)
    percent_affected = (affected_count / total_count) * 100
    df['Percent_Affected'] = round(percent_affected, 1)
    return df

In [234]:
print(df.isnull().sum())


Student_ID                      0
Age                             0
Gender                          0
Academic_Level                  0
Country                         0
Avg_Daily_Usage_Hours           0
Most_Used_Platform              0
Affects_Academic_Performance    0
Sleep_Hours_Per_Night           0
Mental_Health_Score             0
Relationship_Status             0
Conflicts_Over_Social_Media     0
Addicted_Score                  0
dtype: int64


In [235]:
# or we can see any col is null 
print(df[df.columns[df.isnull().any()]])

Empty DataFrame
Columns: []
Index: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, ...]

[705 rows x 0 columns]


In [236]:
df = create_feature(df)

In [237]:
df.head()

Unnamed: 0,Student_ID,Age,Gender,Academic_Level,Country,Avg_Daily_Usage_Hours,Most_Used_Platform,Affects_Academic_Performance,Sleep_Hours_Per_Night,Mental_Health_Score,Relationship_Status,Conflicts_Over_Social_Media,Addicted_Score,Percent_Affected
0,1,19,0,2,10,5.2,1,1,6.5,6,1,3,8,64.3
1,2,22,1,0,39,2.1,7,0,7.5,8,2,0,3,64.3
2,3,20,0,2,102,6.0,6,1,5.0,5,0,4,9,64.3
3,4,18,1,1,101,3.0,11,0,7.0,7,2,1,4,64.3
4,5,21,1,0,18,4.5,0,1,6.0,6,1,2,7,64.3


In [238]:
# model training and evaluation
X = df.drop(['Addicted_Score'], axis=1)

In [239]:
y = df['Addicted_Score']

In [240]:
# One-hot encode them
X_encoded = pd.get_dummies(X, columns=cat_cols)

In [241]:
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

In [242]:
# train the model
model = RandomForestClassifier()

In [243]:
model.fit(X_train, y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [244]:
# evaluation metrics
y_pred = model.predict(X_test)

In [245]:
y_pred_train = model.predict(X_train)

In [246]:
y_pred_proba = model.predict_proba(X_test)

In [247]:
y_pred_proba_train = model.predict_proba(X_train)

In [248]:
y_pred, y_pred_train, y_pred_proba, y_pred_proba_train

(array([5, 7, 5, 7, 4, 9, 3, 4, 5, 7, 8, 8, 9, 7, 4, 5, 5, 7, 7, 8, 5, 8,
        8, 9, 4, 5, 7, 7, 5, 7, 9, 7, 7, 6, 8, 7, 7, 7, 7, 5, 8, 5, 7, 7,
        8, 5, 9, 6, 5, 7, 7, 6, 4, 5, 7, 8, 6, 7, 8, 9, 4, 5, 6, 7, 6, 5,
        8, 5, 8, 8, 7, 7, 5, 5, 4, 7, 5, 9, 7, 7, 7, 5, 8, 9, 8, 5, 7, 5,
        8, 6, 5, 9, 8, 7, 4, 8, 5, 5, 7, 4, 5, 3, 5, 7, 5, 5, 8, 5, 8, 5,
        7, 5, 5, 7, 8, 5, 8, 4, 4, 8, 7, 7, 9, 7, 7, 7, 8, 4, 4, 6, 8, 8,
        4, 7, 7, 8, 4, 7, 8, 4, 5]),
 array([5, 8, 8, 9, 4, 6, 8, 6, 7, 7, 8, 9, 8, 5, 7, 9, 5, 7, 9, 7, 7, 8,
        7, 8, 8, 7, 4, 7, 5, 9, 8, 7, 7, 9, 4, 7, 4, 5, 7, 6, 8, 8, 8, 7,
        8, 4, 7, 4, 9, 7, 5, 7, 9, 5, 5, 6, 8, 9, 7, 5, 5, 6, 4, 7, 8, 3,
        7, 5, 8, 6, 8, 9, 6, 8, 7, 7, 7, 5, 4, 6, 9, 7, 8, 7, 8, 4, 5, 6,
        8, 3, 7, 9, 8, 7, 7, 5, 4, 4, 4, 7, 8, 8, 7, 6, 4, 3, 7, 8, 4, 8,
        7, 5, 7, 8, 5, 5, 7, 5, 6, 5, 8, 6, 9, 5, 7, 5, 9, 7, 7, 6, 4, 5,
        5, 4, 9, 7, 5, 7, 8, 8, 5, 8, 5, 6, 7, 7, 5, 8, 9, 8, 5, 6, 4, 7,
 

In [249]:
accuracy = model.score(X_test, y_test)

In [250]:
accuracy

0.9858156028368794

In [251]:
scores = cross_val_score(model, X_encoded, y, cv=5)



In [252]:
# Each value in the list represents the accuracy score for one fold in 5-fold cross-validation
print(scores)

[0.82978723 0.89361702 0.95035461 0.93617021 0.82269504]


In [253]:
print("Mean accuracy:", scores.mean())


Mean accuracy: 0.8865248226950355


In [254]:
from sklearn.linear_model import Lasso, Ridge, ElasticNet

model = ElasticNet(alpha=0.1, l1_ratio=0.7)   # Elastic Net

In [255]:
model.fit(X_train, y_train)

0,1,2
,alpha,0.1
,l1_ratio,0.7
,fit_intercept,True
,precompute,False
,max_iter,1000
,copy_X,True
,tol,0.0001
,warm_start,False
,positive,False
,random_state,


In [256]:
model.score(X_test, y_test)

0.9490024897182955

In [257]:
model = Lasso(alpha=0.1)   # L1 regularization

In [258]:
model.fit(X_train, y_train)

0,1,2
,alpha,0.1
,fit_intercept,True
,precompute,False
,copy_X,True
,max_iter,1000
,tol,0.0001
,warm_start,False
,positive,False
,random_state,
,selection,'cyclic'


In [259]:
model.score(X_test, y_test)

0.9452489591991577

In [260]:
model = Ridge(alpha=1.0)   # L2 regularization

In [261]:
model.fit(X_train, y_train)

0,1,2
,alpha,1.0
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,


In [262]:
model.score(X_test, y_test)

0.9771131105619747

In [263]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()   

In [264]:
model.fit(X_train, y_train)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [265]:
model.score(X_test, y_test)

0.9620251530481048

In [266]:
from sklearn.tree import DecisionTreeClassifier

model = DecisionTreeClassifier(max_depth=5) 

In [267]:
model.fit(X_train, y_train)

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,5
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [268]:
model.score(X_test, y_test)

0.9574468085106383

In [269]:
# ensemble method
import xgboost as xgb
from xgboost import XGBClassifier


In [270]:
print(X_train.shape)  # should be (n, m)
print(y_train.shape)  # should be (n,)


(564, 139)
(564,)


In [271]:
le = LabelEncoder()
y_encoded = le.fit_transform(df['Addicted_Score'])
print(len(y_encoded))  # must be 564

705


In [272]:
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
print(len(y_encoded))

705


In [273]:
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2)


In [274]:
print(len(X_train), len(y_train))  # both must be 564
print(len(X_test), len(y_test))  # both must be 141

564 564
141 141


In [275]:
model = XGBClassifier()

In [276]:
model.fit(X_train, y_train)  # Correct


0,1,2
,objective,'multi:softprob'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [277]:
model.score(X_test, y_test)  # Correct

0.9645390070921985

In [279]:
# True labels and predicted labels
cm = confusion_matrix(y_test, y_pred)

# Display as DataFrame for better readability
cm_df = pd.DataFrame(cm, index=[f"Actual {i}" for i in range(cm.shape[0])],
                        columns=[f"Predicted {i}" for i in range(cm.shape[1])])

print(cm_df)

          Predicted 0  Predicted 1  Predicted 2  Predicted 3  Predicted 4  \
Actual 0            0            0            0            1            1   
Actual 1            0            0            1            1            1   
Actual 2            0            0            0            3            5   
Actual 3            0            0            0            3            6   
Actual 4            0            0            1            3           10   
Actual 5            0            0            0            3            6   
Actual 6            0            0            0            2            6   
Actual 7            0            0            0            0            0   
Actual 8            0            0            0            0            0   

          Predicted 5  Predicted 6  Predicted 7  Predicted 8  
Actual 0            1            0            1            0  
Actual 1            1            6            3            0  
Actual 2            2            6      

In [285]:
le = LabelEncoder()
le.fit(df_copy['Addicted_Score'])

In [286]:
for idx, label in enumerate(le.classes_):
    print(f"{idx} → {label}")


0 → 2
1 → 3
2 → 4
3 → 5
4 → 6
5 → 7
6 → 8
7 → 9


In [280]:
# Select target class (e.g., class 3)
target_class = 3

# Convert to binary: 1 if class is target, else 0
y_test_binary = (y_test == target_class).astype(int)
y_pred_binary = (y_pred == target_class).astype(int)

# Compute binary confusion matrix
tn, fp, fn, tp = confusion_matrix(y_test_binary, y_pred_binary).ravel()

print(f"Class {target_class} → TP: {tp}, FP: {fp}, TN: {tn}, FN: {fn}")

Class 3 → TP: 0, FP: 2, TN: 116, FN: 23


In [None]:
from sklearn.preprocessing import label_binarize

# Use the exact classes seen during training
classes = np.unique(y_train)  # Or np.unique(y)

# Binarize test labels using same class list
y_test_bin = label_binarize(y_test, classes=classes)

# Predict class probabilities
y_proba = model.predict_proba(X_test)

# Now compute AUC
auc_score = roc_auc_score(y_test_bin, y_proba, multi_class='ovr', average='weighted')
print(f"AUC (OVR): {auc_score:.2f}")


In [None]:
# Accuracy
acc = accuracy_score(y_test, y_pred)

# Precision
prec = precision_score(y_test, y_pred, average='weighted')

# Recall
rec = recall_score(y_test, y_pred, average='weighted')

# F1 Score
f1 = f1_score(y_test, y_pred, average='weighted')

# Print all
print(f"Accuracy: {acc:.2f}")
print(f"Precision: {prec:.2f}")
print(f"Recall: {rec:.2f}")
print(f"F1 Score: {f1:.2f}")
print(f"AUC: {auc_score:.2f}")

In [None]:
plt.figure(figsize=(6, 4))
plt.plot(fpr, tpr, label=f'AUC = {auc_score:.2f}')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.grid()
plt.show()
