In [16]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix





In [9]:
import warnings
warnings.filterwarnings('ignore')

In [17]:
df.head()

Unnamed: 0,Tweet,Suicide
0,making some lunch,Not Suicide post
1,@Alexia You want his money.,Not Suicide post
2,@dizzyhrvy that crap took me forever to put to...,Potential Suicide post
3,@jnaylor #kiwitweets Hey Jer! Since when did y...,Not Suicide post
4,Trying out &quot;Delicious Library 2&quot; wit...,Not Suicide post


In [18]:
df.describe()

Unnamed: 0,Tweet,Suicide
count,1785,1787
unique,1777,2
top,Became as hot as the persistent days resting h...,Not Suicide post
freq,2,1127


In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1787 entries, 0 to 1786
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Tweet    1785 non-null   object
 1   Suicide  1787 non-null   object
dtypes: object(2)
memory usage: 28.1+ KB


In [20]:
#data cleaning and preprocessing

data_cleaned = df.dropna(subset=['Tweet'])

In [21]:
data_cleaned['Suicide'] = data_cleaned['Suicide'].str.strip().str.lower()


In [22]:
import re

def clean_text(text):

    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'\@\w+|\#', '', text)
    text = re.sub(r'[^A-Za-z\s]', '', text)

    text = text.lower().strip()
    return text

data_cleaned['Tweet'] = data_cleaned['Tweet'].apply(clean_text)

In [23]:
data_cleaned.info()
data_cleaned.head()

<class 'pandas.core.frame.DataFrame'>
Index: 1785 entries, 0 to 1786
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Tweet    1785 non-null   object
 1   Suicide  1785 non-null   object
dtypes: object(2)
memory usage: 41.8+ KB


Unnamed: 0,Tweet,Suicide
0,making some lunch,not suicide post
1,you want his money,not suicide post
2,that crap took me forever to put together im g...,potential suicide post
3,kiwitweets hey jer since when did you start tw...,not suicide post
4,trying out quotdelicious library quot with mix...,not suicide post


In [24]:
data_cleaned.to_csv('cleaned_dataset.csv', index=False)


In [25]:
df = pd.read_csv('/Users/daniyalrosli/Suicidal-Tweet-Detection-on-X/cleaned_dataset.csv')

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the vectorizer
vectorizer = TfidfVectorizer(max_features=1000)

# Fit and transform the text data
X = vectorizer.fit_transform(data_cleaned['Tweet'])

# Convert to DataFrame for better readability
X_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

# Display the first few rows of the vectorized data
X_df.head()

Unnamed: 0,able,about,absolutely,accident,account,actually,add,adding,afraid,after,...,yo,you,youll,young,your,youre,yourself,youtube,youve,yr
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.325384,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.200976,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.382871,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
# Ensure 'Processed_Tweet' column exists
data_cleaned['Processed_Tweet'] = data_cleaned['Tweet']

#Training and testing
X = data_cleaned['Processed_Tweet']
Y = data_cleaned['Suicide']

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)



In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

model = LogisticRegression()
model.fit(X_train_tfidf, Y_train)

y_pred = model.predict(X_test_tfidf)


In [30]:
# Model evaluation

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

accuracy = accuracy_score(Y_test, y_pred)

report = classification_report(Y_test, y_pred)

conf_matrix = confusion_matrix(Y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Classification Report:\n{report}")
print(f"Confusion Matrix:\n{conf_matrix}")

Accuracy: 0.8711484593837535
Classification Report:
                        precision    recall  f1-score   support

      not suicide post       0.85      0.96      0.90       216
potential suicide post       0.93      0.73      0.82       141

              accuracy                           0.87       357
             macro avg       0.89      0.85      0.86       357
          weighted avg       0.88      0.87      0.87       357

Confusion Matrix:
[[208   8]
 [ 38 103]]


In [31]:
# Decision Tree Classifier
dt_model = DecisionTreeClassifier()
dt_model.fit(X_train_tfidf, Y_train)
dt_y_pred = dt_model.predict(X_test_tfidf)

# Random Forest Classifier
rf_model = RandomForestClassifier()
rf_model.fit(X_train_tfidf, Y_train)
rf_y_pred = rf_model.predict(X_test_tfidf)

# Support Vector Classifier
svc_model = SVC()
svc_model.fit(X_train_tfidf, Y_train)
svc_y_pred = svc_model.predict(X_test_tfidf)

# Evaluate the models
dt_accuracy = accuracy_score(Y_test, dt_y_pred)
rf_accuracy = accuracy_score(Y_test, rf_y_pred)
svc_accuracy = accuracy_score(Y_test, svc_y_pred)

dt_report = classification_report(Y_test, dt_y_pred)
rf_report = classification_report(Y_test, rf_y_pred)
svc_report = classification_report(Y_test, svc_y_pred)

dt_conf_matrix = confusion_matrix(Y_test, dt_y_pred)
rf_conf_matrix = confusion_matrix(Y_test, rf_y_pred)
svc_conf_matrix = confusion_matrix(Y_test, svc_y_pred)

print(f"Decision Tree Classifier Accuracy: {dt_accuracy}")
print(f"Decision Tree Classifier Report:\n{dt_report}")
print(f"Decision Tree Classifier Confusion Matrix:\n{dt_conf_matrix}")

print(f"Random Forest Classifier Accuracy: {rf_accuracy}")
print(f"Random Forest Classifier Report:\n{rf_report}")
print(f"Random Forest Classifier Confusion Matrix:\n{rf_conf_matrix}")

print(f"SVC Accuracy: {svc_accuracy}")
print(f"SVC Report:\n{svc_report}")
print(f"SVC Confusion Matrix:\n{svc_conf_matrix}")

Decision Tree Classifier Accuracy: 0.8543417366946778
Decision Tree Classifier Report:
                        precision    recall  f1-score   support

      not suicide post       0.85      0.92      0.88       216
potential suicide post       0.86      0.76      0.80       141

              accuracy                           0.85       357
             macro avg       0.85      0.84      0.84       357
          weighted avg       0.85      0.85      0.85       357

Decision Tree Classifier Confusion Matrix:
[[198  18]
 [ 34 107]]
Random Forest Classifier Accuracy: 0.8879551820728291
Random Forest Classifier Report:
                        precision    recall  f1-score   support

      not suicide post       0.85      1.00      0.91       216
potential suicide post       0.99      0.72      0.84       141

              accuracy                           0.89       357
             macro avg       0.92      0.86      0.88       357
          weighted avg       0.90      0.89      0.

In [None]:
# sentiment analysis

from textblob import TextBlob

def analyze_sentiment(tweet):
    analysis = TextBlob(tweet)
    if analysis.sentiment.polarity > 0:
        return 'positive'
    elif analysis.sentiment.polarity == 0:
        return 'neutral'
    else:
        return 'negative'

data_cleaned['Sentiment'] = data_cleaned['Tweet'].apply(analyze_sentiment)
data_cleaned[['Tweet', 'Sentiment']].head()

In [None]:
import seaborn as sns

#Visualize the sentiment distribution

# Plot the sentiment distribution
plt.figure(figsize=(10, 6))
sns.countplot(x='Sentiment', data=data_cleaned)
plt.title('Sentiment Distribution')
plt.xlabel('Sentiment')
plt.ylabel('Count')
plt.show()

In [35]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Preprocess the data using TF-IDF with a limited vocabulary size
vectorizer = TfidfVectorizer(max_features=300)
X_train = vectorizer.fit_transform(data_cleaned['Tweet'])

# Split into training and testing sets again (adjust based on previous splits)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_train, data_cleaned['Suicide'], test_size=0.2, random_state=42)

# Fit a simpler model such as Naive Bayes
from sklearn.naive_bayes import MultinomialNB
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

y_pred = nb_model.predict(X_test)

# Evaluate the model performance
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred, target_names=["Non-Suicidal", "Potentially Suicidal"]))

                      precision    recall  f1-score   support

        Non-Suicidal       0.83      0.95      0.89       216
Potentially Suicidal       0.91      0.71      0.80       141

            accuracy                           0.86       357
           macro avg       0.87      0.83      0.84       357
        weighted avg       0.86      0.86      0.85       357

