# Gradient Boosting

In [12]:
import pandas as pd

df = pd.read_csv("../data/reddit_depression_dataset_cleaned.csv")

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1987011 entries, 0 to 1987010
Data columns (total 13 columns):
 #   Column                                      Dtype  
---  ------                                      -----  
 0   Unnamed: 0                                  int64  
 1   label                                       float64
 2   date                                        object 
 3   upvotes                                     float64
 4   num_comments                                float64
 5   combined_text                               object 
 6   tokenized_text                              object 
 7   alphanum_text                               object 
 8   stopword_removed_text                       object 
 9   stemmed_text                                object 
 10  non_stopword_removed_stemmed_text           object 
 11  combined_stemmed_text                       object 
 12  combined_non_stopword_removed_stemmed_text  object 
dtypes: float64(3), int64(1), ob

In [13]:
df.head()

Unnamed: 0.1,Unnamed: 0,label,date,upvotes,num_comments,combined_text,tokenized_text,alphanum_text,stopword_removed_text,stemmed_text,non_stopword_removed_stemmed_text,combined_stemmed_text,combined_non_stopword_removed_stemmed_text
0,0,0.0,2014-07-14 03:35:09,4.0,0.0,Deep thoughts underdog Only when we start cons...,"['deep', 'thoughts', 'underdog', 'only', 'when...","['deep', 'thoughts', 'underdog', 'only', 'when...","['deep', 'thoughts', 'underdog', 'start', '99'...","['deep', 'thought', 'underdog', 'start', '99',...","['deep', 'thought', 'underdog', 'onli', 'when'...",deep thought underdog start 99 underdog start ...,deep thought underdog onli when we start consi...
1,1,0.0,2014-09-13 00:31:19,4.0,1.0,"I like this sub, there's only two posts yet I ...","['i', 'like', 'this', 'sub', ',', ""there's"", '...","['i', 'like', 'this', 'sub', 'only', 'two', 'p...","['posts', 'coming', 'human', 'morality', 'joke...","['post', 'come', 'human', 'moral', 'joke', 'lo...","['i', 'like', 'thi', 'sub', 'onli', 'two', 'po...",post come human moral joke long abscenc hope d...,i like thi sub onli two post yet i keep come b...
2,2,0.0,2014-11-20 04:31:58,6.0,1.0,Rebirth! Hello. \nI am the new guy in charge h...,"['rebirth', '!', 'hello', '.', 'i', 'am', 'the...","['rebirth', 'hello', 'i', 'am', 'the', 'new', ...","['rebirth', 'guy', 'charge', 'thegood', 'ofc',...","['rebirth', 'guy', 'charg', 'thegood', 'ofc', ...","['rebirth', 'hello', 'i', 'am', 'the', 'new', ...",rebirth guy charg thegood ofc bring weirdpinea...,rebirth hello i am the new guy in charg here b...
3,3,0.0,2014-11-20 19:38:05,25.0,2.0,"""I want to be like water. I want to slip throu...","['""', 'i', 'want', 'to', 'be', 'like', 'water'...","['i', 'want', 'to', 'be', 'like', 'water', 'i'...","['water', 'slip', 'fingers', 'hold', 'ship', '...","['water', 'slip', 'finger', 'hold', 'ship', 'm...","['i', 'want', 'to', 'be', 'like', 'water', 'i'...",water slip finger hold ship michel william,i want to be like water i want to slip through...
4,5,0.0,2014-11-22 19:17:39,8.0,23.0,What is the limit of the knowledge and power a...,"['what', 'is', 'the', 'limit', 'of', 'the', 'k...","['what', 'is', 'the', 'limit', 'of', 'the', 'k...","['limit', 'knowledge', 'power', 'human', 'pers...","['limit', 'knowledg', 'power', 'human', 'perso...","['what', 'is', 'the', 'limit', 'of', 'the', 'k...",limit knowledg power human person infinit grow,what is the limit of the knowledg and power a ...


In [18]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

In [15]:
y = df['label']

In [19]:
# Fill NaN values in the text column with an empty string
df['combined_non_stopword_removed_stemmed_text'].fillna('', inplace=True)


In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X = vectorizer.fit_transform(df['combined_non_stopword_removed_stemmed_text'])


In [21]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)


In [22]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

params = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'max_depth': 6,
    'eta': 0.1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'random_state': 42
}


In [23]:
evals = [(dtrain, 'train'), (dtest, 'eval')]

bst = xgb.train(
    params,
    dtrain,
    num_boost_round=1000,
    evals=evals,
    early_stopping_rounds=10,
    verbose_eval=10
)

[0]	train-auc:0.82149	eval-auc:0.82203
[10]	train-auc:0.88120	eval-auc:0.87986
[20]	train-auc:0.89893	eval-auc:0.89686
[30]	train-auc:0.91111	eval-auc:0.90855
[40]	train-auc:0.92034	eval-auc:0.91774
[50]	train-auc:0.92610	eval-auc:0.92325
[60]	train-auc:0.93038	eval-auc:0.92747
[70]	train-auc:0.93417	eval-auc:0.93118
[80]	train-auc:0.93678	eval-auc:0.93363
[90]	train-auc:0.93958	eval-auc:0.93637
[100]	train-auc:0.94167	eval-auc:0.93839
[110]	train-auc:0.94353	eval-auc:0.94022
[120]	train-auc:0.94517	eval-auc:0.94178
[130]	train-auc:0.94653	eval-auc:0.94308
[140]	train-auc:0.94786	eval-auc:0.94431
[150]	train-auc:0.94904	eval-auc:0.94538
[160]	train-auc:0.95010	eval-auc:0.94638
[170]	train-auc:0.95102	eval-auc:0.94727
[180]	train-auc:0.95192	eval-auc:0.94812
[190]	train-auc:0.95273	eval-auc:0.94886
[200]	train-auc:0.95347	eval-auc:0.94956
[210]	train-auc:0.95427	eval-auc:0.95027
[220]	train-auc:0.95498	eval-auc:0.95094
[230]	train-auc:0.95557	eval-auc:0.95151
[240]	train-auc:0.95620	eva

In [24]:
y_pred_prob = bst.predict(dtest)
y_pred = (y_pred_prob > 0.5).astype(int)


In [25]:
print(classification_report(y_test, y_pred))
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)


              precision    recall  f1-score   support

         0.0       0.96      0.98      0.97    352331
         1.0       0.84      0.67      0.75     45072

    accuracy                           0.95    397403
   macro avg       0.90      0.83      0.86    397403
weighted avg       0.95      0.95      0.95    397403

Confusion Matrix:
 [[346555   5776]
 [ 14756  30316]]
