In [3]:
import pandas as pd


df = pd.read_csv("./Datasets/reddit_depression_dataset.csv")

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2470778 entries, 0 to 2470777
Data columns (total 8 columns):
 #   Column        Dtype  
---  ------        -----  
 0   Unnamed: 0    object 
 1   subreddit     object 
 2   title         object 
 3   body          object 
 4   upvotes       float64
 5   created_utc   float64
 6   num_comments  float64
 7   label         float64
dtypes: float64(4), object(4)
memory usage: 150.8+ MB


  df = pd.read_csv("./Datasets/reddit_depression_dataset.csv")


In [4]:
# Check for missing values in 'label' column
print(df['label'].isna().sum())

# Option 1: Fill NaNs with a specific value (e.g., 0)
df['label'].fillna(0, inplace=True)

# Option 2: Drop rows with NaN values in 'label'
df.dropna(subset=['label'], inplace=True)

# Convert 'label' to int after handling missing values
df['label'] = df['label'].astype(int)

106


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['label'].fillna(0, inplace=True)


In [5]:
df['text'] = df['title'] + ' ' + df['body']


# Split data into features (X) and target (y)
# Combine 'title' and 'body' into one column and handle NaN values
df['text'] = df['title'].fillna('') + ' ' + df['body'].fillna('')

# Check for any remaining NaN in the 'text' column (just in case)
df['text'].fillna('', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['text'].fillna('', inplace=True)


In [6]:
# Ensure 'label' has no missing values and is of integer type
df['label'].fillna(0, inplace=True)
df['label'] = df['label'].astype(int)

# Combine 'title' and 'body' into 'text'
df['text'] = df['title'].fillna('') + ' ' + df['body'].fillna('')

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['label'].fillna(0, inplace=True)


In [7]:
df.head()

Unnamed: 0.1,Unnamed: 0,subreddit,title,body,upvotes,created_utc,num_comments,label,text
0,47951,DeepThoughts,Deep thoughts underdog,"Only when we start considering ourselves, the ...",4.0,1405309000.0,,0,Deep thoughts underdog Only when we start cons...
1,47952,DeepThoughts,"I like this sub, there's only two posts yet I ...",Anyway: Human Morality is a joke so long as th...,4.0,1410568000.0,1.0,0,"I like this sub, there's only two posts yet I ..."
2,47957,DeepThoughts,Rebirth!,Hello. \nI am the new guy in charge here (Besi...,6.0,1416458000.0,1.0,0,Rebirth! Hello. \nI am the new guy in charge h...
3,47959,DeepThoughts,"""I want to be like water. I want to slip throu...",,25.0,1416512000.0,2.0,0,"""I want to be like water. I want to slip throu..."
4,47960,DeepThoughts,Who am I?,You could take any one cell in my body and kil...,5.0,1416516000.0,4.0,0,Who am I? You could take any one cell in my bo...


In [8]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X = vectorizer.fit_transform(df['text'])
y = df['label']


In [10]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)


In [11]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)


In [12]:
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'max_depth': 6,
    'eta': 0.1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'random_state': 42
}


In [13]:
evals = [(dtrain, 'train'), (dtest, 'eval')]

bst = xgb.train(
    params,
    dtrain,
    num_boost_round=1000,
    evals=evals,
    early_stopping_rounds=10,
    verbose_eval=10
)


[0]	train-auc:0.85833	eval-auc:0.85861
[10]	train-auc:0.91009	eval-auc:0.90976
[20]	train-auc:0.92185	eval-auc:0.92138
[30]	train-auc:0.93013	eval-auc:0.92942
[40]	train-auc:0.93603	eval-auc:0.93523
[50]	train-auc:0.94079	eval-auc:0.94002
[60]	train-auc:0.94408	eval-auc:0.94327
[70]	train-auc:0.94689	eval-auc:0.94595
[80]	train-auc:0.94899	eval-auc:0.94808
[90]	train-auc:0.95103	eval-auc:0.95002
[100]	train-auc:0.95267	eval-auc:0.95164
[110]	train-auc:0.95409	eval-auc:0.95300
[120]	train-auc:0.95535	eval-auc:0.95420
[130]	train-auc:0.95646	eval-auc:0.95528
[140]	train-auc:0.95749	eval-auc:0.95626
[150]	train-auc:0.95848	eval-auc:0.95717
[160]	train-auc:0.95930	eval-auc:0.95795
[170]	train-auc:0.96003	eval-auc:0.95862
[180]	train-auc:0.96075	eval-auc:0.95928
[190]	train-auc:0.96147	eval-auc:0.95993
[200]	train-auc:0.96209	eval-auc:0.96051
[210]	train-auc:0.96269	eval-auc:0.96110
[220]	train-auc:0.96322	eval-auc:0.96161
[230]	train-auc:0.96372	eval-auc:0.96204
[240]	train-auc:0.96425	eva

In [14]:
y_pred_prob = bst.predict(dtest)
y_pred = (y_pred_prob > 0.5).astype(int)


In [15]:
print(classification_report(y_test, y_pred))
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)


              precision    recall  f1-score   support

           0       0.95      0.97      0.96    398074
           1       0.88      0.79      0.84     96082

    accuracy                           0.94    494156
   macro avg       0.92      0.88      0.90    494156
weighted avg       0.94      0.94      0.94    494156

Confusion Matrix:
 [[388044  10030]
 [ 19779  76303]]


In [16]:
bst.save_model("xgboost_model.json")

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Assuming your original text data is in df['text']
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_transformed = vectorizer.fit_transform(df['text'])

# Convert the sparse matrix to a dense array
X_dense = X_transformed.toarray()

# Create a DataFrame with the processed features
processed_df = pd.DataFrame(X_dense)

# Optionally, add the target column (if needed)
processed_df['label'] = df['label'].values

# Save to CSV
processed_df.to_csv('preprocessed_reddit_data.csv', index=False)


NameError: name 'df' is not defined