In [19]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import *
from nltk.stem import WordNetLemmatizer
from nltk.stem import SnowballStemmer
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
import gensim
from gensim import corpora, models
import string
from sklearn.feature_extraction.text import CountVectorizer
import ast
from nrclex import NRCLex
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn import svm 
from sklearn import metrics
from sklearn.metrics import confusion_matrix, accuracy_score
from matplotlib.colors import ListedColormap
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm



In [8]:
# Read the preprocessed posts from disk into the working DataFrame.
df = pd.read_csv(filepath_or_buffer="preprocessed_data_new_negation.csv")

In [9]:
# The token lists are stored in the CSV as their string repr; turn each one
# back into a real Python list.
df["lemmatized_processed_text"] = df["lemmatized_processed_text"].apply(ast.literal_eval)

In [76]:
def Negation(sentence):
    '''
    Replace each negated word with a WordNet antonym and drop the negation token.

    For every token that directly follows 'not' or "n't", the antonyms of all
    of its WordNet synsets are collected. The antonym whose first synset is
    most dissimilar (1 - Wu-Palmer similarity) to the token's first synset
    replaces the token, and the preceding negation token is removed. Tokens
    without a usable antonym are left untouched, together with their negation.

    Input: Tokenized sentence (List of words). The list is modified in place;
           any empty-string tokens are removed from it as well.
    Output: Tokenized sentence with negation handled (the same List object)
    '''
    negation_tokens = ('not', "n't")

    # Start at 1: the first token has no predecessor, and index -1 would wrap
    # around and pair the first word with the LAST token of the sentence.
    for i in range(1, len(sentence)):
        if sentence[i - 1] not in negation_tokens:
            continue

        # Look the word up once, before it is possibly replaced, so every
        # candidate antonym is compared against the original word.
        word_synsets = wordnet.synsets(sentence[i])
        if not word_synsets:
            continue
        reference_synset = word_synsets[0]

        # Gather candidate antonyms across all senses of the word.
        antonyms = []
        for syn in word_synsets:
            for lemma in syn.lemmas():
                lemma_antonyms = lemma.antonyms()
                if lemma_antonyms:
                    candidate = lemma_antonyms[0].name()
                    if candidate not in antonyms:
                        antonyms.append(candidate)

        best_antonym = None
        max_dissimilarity = 0
        for ant in antonyms:
            ant_synsets = wordnet.synsets(ant)
            if not ant_synsets:
                # No synset to compare against; skip instead of raising IndexError.
                continue
            similarity = reference_synset.wup_similarity(ant_synsets[0])
            if not isinstance(similarity, (int, float)):
                # Similarity is undefined for this pair; do not reuse the
                # score of a previously examined antonym.
                continue
            dissimilarity = 1 - similarity
            if dissimilarity > max_dissimilarity:
                max_dissimilarity = dissimilarity
                best_antonym = ant

        if best_antonym is not None:
            sentence[i] = best_antonym
            # Mark the negation token for removal; it is dropped after the loop
            # so that indices stay stable while iterating.
            sentence[i - 1] = ''

    # Remove the blanked-out tokens in a single pass, keeping the same list object.
    sentence[:] = [token for token in sentence if token != '']
    return sentence

In [7]:
# Run the WordNet-based negation handling over every tokenized post.
df["lemmatized_processed_text"] = df["lemmatized_processed_text"].map(Negation)

NameError: name 'Negation' is not defined

# Get NRCLex emotions

In [10]:
# Collapse each token list into one space-separated string.
df['clean_lemmatized_processed_text'] = [
    ' '.join(str(token) for token in tokens)
    for tokens in df['lemmatized_processed_text']
]

In [None]:
# Store the NRCLex affect-frequency dict of every post in an 'emotions' column.
df['emotions'] = df['clean_lemmatized_processed_text'].map(
    lambda text: NRCLex(text).affect_frequencies
)

In [None]:
# Replace the dict-valued 'emotions' column with one column per dict key.
df = pd.concat(
    [df.drop(columns=['emotions']), df['emotions'].apply(pd.Series)],
    axis="columns",
)


In [None]:
# Missing values (e.g. emotion keys absent for a post) become 0.
df = df.fillna(value=0)

In [None]:
# Binary target: 1 for the 'suicide' label, 0 for everything else.
df['class'] = df['class'].map(lambda label: 1 if label == 'suicide' else 0)


In [None]:
# Using TF-IDF: vectorize the space-joined lemmatized text.
tfidf_vectorizer = TfidfVectorizer()
tfidf = tfidf_vectorizer.fit_transform(df.clean_lemmatized_processed_text)
# NOTE: .toarray() turns the sparse TF-IDF matrix into a dense array, which
# can be very large for a big corpus/vocabulary.
tfidf_feat = pd.DataFrame(tfidf.toarray())
# Give the TF-IDF columns string labels ('0', '1', ...). With the default
# integer labels, joining the string-named emotion columns later yields a
# frame with mixed int/str column names, which recent scikit-learn versions
# refuse when fitting an estimator on a DataFrame.
tfidf_feat.columns = tfidf_feat.columns.astype(str)

In [None]:
# Attach the NRCLex emotion scores and the target label to the TF-IDF
# features (index-aligned join).
final_dataset = tfidf_feat.join(
    other=df.loc[
        :,
        [
            "fear", "anger", "anticip", "trust", "surprise", "positive",
            "negative", "sadness", "disgust", "joy", "anticipation", "class",
        ],
    ]
)


In [None]:
# Quick look at the first rows of the assembled feature table.
final_dataset.head(n=5)

In [None]:
# Hold out 25% of the rows for testing; every column except 'class' is a feature.
X_train, X_test, y_train, y_test = train_test_split(
    final_dataset.loc[:, ~final_dataset.columns.isin(['class'])],
    final_dataset["class"],
    random_state=0,
    test_size=0.25,
)



# SVM

In [None]:
# Fit a support vector classifier with a linear kernel on the training split.
clf = svm.SVC(kernel='linear')
clf.fit(X=X_train, y=y_train)

In [None]:
# Predict labels for the held-out test split with the SVM.
y_pred = clf.predict(X=X_test)


In [None]:
# Per-class precision / recall / F1 for the SVM predictions.
print(
    metrics.classification_report(
        y_true=y_test,
        y_pred=y_pred,
        target_names=['Non-Suicide', 'Suicide'],
    )
)


In [None]:
# Evaluation: confusion matrix of the SVM predictions
categories = ['Non-suicide', 'Suicide']
# Names of the four confusion-matrix cells
labels = ['True Neg','False Pos','False Neg','True Pos']
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

# Draw the matrix as an annotated heatmap
sns.heatmap(
    data=cm,
    annot=True,
    fmt='g',
    cmap='Blues',
    xticklabels=categories,
    yticklabels=categories,
)

# Title and axis labels
plt.title('Confusion matrix')
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.show()

# Evaluation: overall accuracy, as a percentage
print("SVM Accuracy Score -> ", accuracy_score(y_pred, y_test)*100)

# Logistic regression

In [None]:
# Train a logistic regression classifier (at most 200 solver iterations) on
# the training split.
logreg = LogisticRegression(max_iter=200).fit(X_train, y_train)
# Predict the labels of the held-out split.
y_pred_logreg = logreg.predict(X=X_test)

In [None]:
# Per-class precision / recall / F1 for the logistic regression predictions.
print(
    metrics.classification_report(
        y_true=y_test,
        y_pred=y_pred_logreg,
        target_names=['Non-Suicide', 'Suicide'],
    )
)


In [None]:
# Overall accuracy of the logistic regression model, as a percentage.
print(
    "Logistic Regression Accuracy Score -> ",
    accuracy_score(y_pred_logreg, y_test)*100,
)


In [None]:
# Evaluation: confusion matrix of the logistic regression predictions
categories = ['Non-suicide', 'Suicide']
# Names of the four confusion-matrix cells
labels = ['True Neg','False Pos','False Neg','True Pos']
cm = confusion_matrix(y_true=y_test, y_pred=y_pred_logreg)

# Draw the matrix as an annotated heatmap
sns.heatmap(
    data=cm,
    annot=True,
    fmt='g',
    cmap='Reds',
    xticklabels=categories,
    yticklabels=categories,
)

# Title and axis labels
plt.title('Confusion matrix')
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.show()

# Naive Bayes

In [None]:
# Multinomial Naive Bayes classifier, fitted on the training split
naive_bayes_classifier = MultinomialNB().fit(X_train, y_train)
# Labels predicted for the held-out split
y_pred_nb = naive_bayes_classifier.predict(X=X_test)

In [None]:
# Per-class precision / recall / F1 for the Naive Bayes predictions.
print(
    metrics.classification_report(
        y_true=y_test,
        y_pred=y_pred_nb,
        target_names=['Non-Suicide', 'Suicide'],
    )
)


In [None]:
# Overall accuracy of the Naive Bayes model, as a percentage.
print(
    "Naive Bayes Accuracy Score -> ",
    accuracy_score(y_pred_nb, y_test)*100,
)


In [None]:
# Evaluation: Confusion Matrix for the Naive Bayes predictions.
# Uses y_pred_nb; the previous version plotted the logistic regression
# predictions (y_pred_logreg) under the Naive Bayes heading.
cm = confusion_matrix(y_test, y_pred_nb)
# Define labels for the confusion matrix
labels = ['True Neg','False Pos','False Neg','True Pos']
categories = ['Non-suicide', 'Suicide']

# Create heatmap
sns.heatmap(cm, annot=True, cmap='Greens', xticklabels=categories, yticklabels=categories, fmt='g')

# Add labels to the plot
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.title('Confusion matrix')
plt.show()

# Obtaining polarity scores from VADER lexicon

In [10]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import *
from nltk.stem import WordNetLemmatizer
from nltk.stem import SnowballStemmer
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
import gensim
from gensim import corpora, models
import string
from sklearn.feature_extraction.text import CountVectorizer
import ast
from nrclex import NRCLex
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn import svm 
from sklearn import metrics
from sklearn.metrics import confusion_matrix, accuracy_score
from matplotlib.colors import ListedColormap
import seaborn as sns
import matplotlib.pyplot as plt
import text2emotion as te
import emoji

In [28]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer


In [31]:
# Reload the preprocessed posts so the VADER section starts from a clean frame.
df = pd.read_csv(filepath_or_buffer="preprocessed_data_new_negation.csv")

In [32]:
# Convert the stringified token lists from the CSV back into Python lists.
df["lemmatized_processed_text"] = df["lemmatized_processed_text"].apply(ast.literal_eval)

In [33]:
# Collapse each token list into one space-separated string.
df['clean_lemmatized_processed_text'] = [
    ' '.join(str(token) for token in tokens)
    for tokens in df['lemmatized_processed_text']
]

In [35]:
from tqdm import tqdm

# `progress_apply` only exists on pandas objects after tqdm has registered it;
# do that here so the cell does not depend on state left over from an earlier
# kernel session.
tqdm.pandas()

# Score every post with VADER; each result is a dict of polarity scores.
analyzer = SentimentIntensityAnalyzer()
df["emotions"] = df['clean_lemmatized_processed_text'].progress_apply(lambda x: analyzer.polarity_scores(x))

100%|██████████| 232074/232074 [07:23<00:00, 523.55it/s]


In [37]:
# Replace the dict-valued 'emotions' column with one column per polarity score.
df = pd.concat(
    [df.drop(columns=['emotions']), df['emotions'].apply(pd.Series)],
    axis="columns",
)
