# Logistic Regression (Logit)
This notebook applies a logistic regression (logit) model to text classification in order to detect suicidal text, using averaged Word2Vec and GloVe word embeddings as document features.

In [None]:
# Mount Google Drive at /content/drive so the project folder used below is reachable.
# force_remount=True is passed so the mount is redone even when one already exists.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import os

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, recall_score, accuracy_score, f1_score, precision_score
from collections import Counter
from prettytable import PrettyTable

In [None]:
# Seed passed as random_state to train_test_split so the split is reproducible
SEED = 4222
# NOTE(review): EPOCHS is not referenced in any cell visible in this notebook — confirm it is still needed
EPOCHS = 5

In [None]:
# Change to your own directory
# All 'Data/...' paths used further down are relative to this project folder.
# NOTE(review): on failure this only prints a message and execution continues in
# the current directory, so the later relative-path reads will probably fail.
try: 
    os.chdir("/content/drive/MyDrive/BT4222 Project")
    print("Directory changed")
except OSError:
    print("Error: Can't change the Current Working Directory")

Directory changed


In [None]:
# Load dataset: the raw 'text' column is discarded in favour of the
# pre-cleaned one (renamed to 'text'), and the class names become 1 / 0
classes = {"suicide": 1, "non-suicide": 0}
suicide_detection_df = (
    pd.read_csv('Data/suicide_detection_final_cleaned.csv', header=0)
    .drop(columns=['text'])
    .rename(columns={"cleaned_text": "text"})
    .replace({"class": classes})
)
suicide_detection_df.head()

Unnamed: 0,class,text
0,1,sex wife threaten suicide recently leave wife ...
1,0,weird not affect compliment come know real lif...
2,0,finally hear bad year swear fucking god annoying
3,1,need help just help cry hard
4,1,end tonight not anymore quit


In [None]:
# Split dataset into train and test sets (80% / 20%), stratified on the class label.
# No separate validation set is created here.
train_text, test_text, train_labels, test_labels = train_test_split(suicide_detection_df['text'], suicide_detection_df['class'],
                                                                    random_state=SEED,
                                                                    test_size=0.2,
                                                                    stratify=suicide_detection_df['class'])

## Import vocab

In [None]:
# load doc into memory
def load_doc(filename, encoding='utf-8'):
    """Read an entire text file and return its contents as a single string.

    Args:
        filename: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale settings.

    Returns:
        The complete file contents as a ``str``.

    Raises:
        OSError: If the file cannot be opened.
    """
    # the context manager closes the handle even if read() raises
    with open(filename, 'r', encoding=encoding) as file:
        return file.read()

# load the vocabulary: whitespace-separated tokens from the file,
# de-duplicated into a set
vocab_filename = 'Data/vocab.txt'
vocab = set(load_doc(vocab_filename).split())

## Import Embeddings

In [None]:
# load embedding as a dict
def load_embedding(filename, skip_header=True, encoding='utf-8'):
    """Load a whitespace-delimited text embedding file into a dict.

    Every data line is expected to hold a word followed by the components of
    its vector, separated by whitespace.

    Args:
        filename: Path of the embedding text file.
        skip_header: When True (the default, matching the previous behaviour)
            the first line of the file is discarded. Pass False for files
            that start directly with a word vector.
        encoding: Text encoding used to decode the file (UTF-8 by default).

    Returns:
        dict mapping each word (str) to its vector as a float32 numpy array.
        If a word occurs on several lines, the last occurrence wins.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a vector component cannot be parsed as a float.
    """
    embedding = dict()
    # stream the file line by line instead of materialising it with readlines();
    # the context manager also guarantees the handle is closed on error
    with open(filename, 'r', encoding=encoding) as file:
        if skip_header:
            next(file, None)  # tolerate a completely empty file
        for line in file:
            parts = line.split()
            if not parts:
                # blank line (e.g. trailing newline): nothing to index
                continue
            # key is string word, value is numpy array for vector
            embedding[parts[0]] = np.asarray(parts[1:], dtype='float32')
    return embedding

### Removing out-of-vocabulary words

In [None]:
# clean each line
def clean_line(line, vocab):
  """Split `line` on whitespace and keep only the tokens found in `vocab`.

  The surviving tokens (original order, duplicates preserved) are returned
  wrapped in a one-element list, i.e. ``[[token, ...]]``.
  """
  kept = []
  for token in line.split():
    if token in vocab:
      kept.append(token)
  return [kept]

# clean entire dataset
def process_lines(data, vocab):
  """Run clean_line over every entry of `data` and gather the token lists.

  clean_line returns a one-element list per entry, so the result holds one
  token list per input entry, in input order.
  """
  return [tokens for entry in data for tokens in clean_line(entry, vocab)]

### Document Vector function

In [None]:
def document_vector(doc, embeddings):
    """Create a document vector by averaging the vectors of its words.

    Words that have no entry in `embeddings` are ignored.

    Args:
        doc: Iterable of words (tokens).
        embeddings: Mapping from word to its embedding vector.

    Returns:
        The element-wise mean of the vectors of the words found in
        `embeddings`. If none of the words is found (including an empty
        `doc`), a zero vector shaped like the stored embeddings is returned
        instead of the NaN scalar that averaging an empty list would give;
        when `embeddings` itself is empty the result is a zero-length array.
    """
    # single pass: look the vector up only for words that have one
    vectors = [embeddings[word] for word in doc if word in embeddings]
    if not vectors:
        # np.mean([]) would emit a RuntimeWarning and return NaN, which breaks
        # downstream estimators; fall back to a well-shaped zero vector
        sample = next(iter(embeddings.values()), None)
        if sample is None:
            return np.zeros(0, dtype='float32')
        return np.zeros_like(np.asarray(sample))
    return np.mean(vectors, axis=0)

In [None]:
# function for all the data
def all_documents(df, labels_ori, embeddings):
  """Build document vectors and the matching labels for a set of documents.

  Args:
    df: Sequence of tokenised documents (each a list of words), indexed by
      position.
    labels_ori: Labels aligned positionally with `df`; must expose `.values`
      (for example a pandas Series).
    embeddings: Mapping from word to embedding vector.

  Returns:
    A tuple ``(vec, labels)`` of two equally long lists: the document vectors
    and the labels of the documents that were kept. Documents that contain no
    word present in `embeddings` (empty documents included) are dropped from
    both lists.
  """
  # positional label access, independent of the labels' own index
  label_values = labels_ori.values
  vec = list()
  labels = list()
  for i in range(len(df)):
    # A document without any embeddable word cannot be averaged into a
    # meaningful vector (an empty mean is NaN), so skip it together with its
    # label. This also covers the empty-document case.
    if not any(word in embeddings for word in df[i]):
      continue
    vec.append(document_vector(df[i], embeddings))
    labels.append(label_values[i])
  return vec, labels

## Word2Vec

In [None]:
# Load the Word2Vec vectors as a {word: float32 vector} dict
word2vec = load_embedding('Data/embedding_word2vec.txt')

In [None]:
# Keep only in-vocabulary tokens, then average the Word2Vec vectors per document.
# The *_new label lists hold the labels of the documents all_documents kept.
train_clean = process_lines(train_text, vocab)
test_clean = process_lines(test_text, vocab)
train_vec, train_labels_new = all_documents(train_clean, train_labels,word2vec)
test_vec, test_labels_new = all_documents(test_clean, test_labels, word2vec)

In [None]:
# With the default max_iter=100 the lbfgs solver stopped before converging on
# these features ("TOTAL NO. of ITERATIONS REACHED LIMIT"), so give it more
# iterations to reach the optimum.
lr = LogisticRegression(max_iter=1000)
lr.fit(train_vec, train_labels_new)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
# In-sample check: score the model on the same Word2Vec vectors it was fitted on
y_train_pred = lr.predict(train_vec)
print('Training set accuracy %s' % accuracy_score(train_labels_new, y_train_pred))
print(classification_report(train_labels_new, y_train_pred))

Training set accuracy 0.9103104308195029
              precision    recall  f1-score   support

           0       0.93      0.93      0.93     85664
           1       0.88      0.88      0.88     53884

    accuracy                           0.91    139548
   macro avg       0.91      0.91      0.91    139548
weighted avg       0.91      0.91      0.91    139548



In [None]:
# Evaluate the Word2Vec model on the held-out test vectors
y_test_pred = lr.predict(test_vec)
print('Test set accuracy %s' % accuracy_score(test_labels_new, y_test_pred))
print(classification_report(test_labels_new, y_test_pred))

Test set accuracy 0.9110716128662347
              precision    recall  f1-score   support

           0       0.93      0.93      0.93     21411
           1       0.88      0.89      0.89     13471

    accuracy                           0.91     34882
   macro avg       0.91      0.91      0.91     34882
weighted avg       0.91      0.91      0.91     34882



In [None]:
# Store the Word2Vec test metrics under their own names for the summary table;
# y_test_pred is reassigned later in the GloVe section.
word2vec_test_accuracy_score = accuracy_score(test_labels_new, y_test_pred)
word2vec_test_precision_score = precision_score(test_labels_new, y_test_pred)
word2vec_test_recall_score = recall_score(test_labels_new, y_test_pred)
word2vec_test_f1_score = f1_score(test_labels_new, y_test_pred)

## GloVe

In [None]:
# load glove embedding from file
# NOTE(review): load_embedding discards the first line of the file — confirm this
# GloVe file starts with a header line, otherwise its first word vector is lost
raw_embedding_glove = load_embedding('Data/glove_twitter_27B_200d.txt')

In [None]:
# Same pipeline as for Word2Vec, except that the token filter is the set of
# words present in the GloVe embedding (its dict keys) rather than `vocab`.
train_clean_glove = process_lines(train_text, raw_embedding_glove.keys())
test_clean_glove = process_lines(test_text, raw_embedding_glove.keys())
train_vec_glove, train_labels_glove_new = all_documents(train_clean_glove, train_labels, raw_embedding_glove)
test_vec_glove, test_labels_glove_new = all_documents(test_clean_glove, test_labels, raw_embedding_glove)

In [None]:
# NOTE: this rebinds `lr`, replacing the Word2Vec model fitted earlier.
# With the default max_iter=100 the lbfgs solver stopped before converging on
# these features ("TOTAL NO. of ITERATIONS REACHED LIMIT"), so give it more
# iterations to reach the optimum.
lr = LogisticRegression(max_iter=1000)
lr.fit(train_vec_glove, train_labels_glove_new)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
# In-sample check: score the GloVe model on the vectors it was fitted on
y_train_pred = lr.predict(train_vec_glove)
print('Training set accuracy %s' % accuracy_score(train_labels_glove_new, y_train_pred))
print(classification_report(train_labels_glove_new, y_train_pred))

Training set accuracy 0.8778762203776181
              precision    recall  f1-score   support

           0       0.90      0.90      0.90     85634
           1       0.84      0.84      0.84     53872

    accuracy                           0.88    139506
   macro avg       0.87      0.87      0.87    139506
weighted avg       0.88      0.88      0.88    139506



In [None]:
# Evaluate the GloVe model on the held-out test vectors.
# The printed label used to read "Training set accuracy" although the score
# is computed on the test split.
y_test_pred = lr.predict(test_vec_glove)
print('Test set accuracy %s' % accuracy_score(test_labels_glove_new, y_test_pred))
print(classification_report(test_labels_glove_new, y_test_pred))

Training set accuracy 0.8773866177398085
              precision    recall  f1-score   support

           0       0.90      0.90      0.90     21410
           1       0.84      0.84      0.84     13472

    accuracy                           0.88     34882
   macro avg       0.87      0.87      0.87     34882
weighted avg       0.88      0.88      0.88     34882



In [None]:
# Store the GloVe test metrics under their own names for the summary table
glove_test_accuracy_score = accuracy_score(test_labels_glove_new, y_test_pred)
glove_test_precision_score = precision_score(test_labels_glove_new, y_test_pred)
glove_test_recall_score = recall_score(test_labels_glove_new, y_test_pred)
glove_test_f1_score = f1_score(test_labels_glove_new, y_test_pred)

## Summary

In [None]:
# Summarise the test-set metrics of both embedding variants in a single table
table = PrettyTable()
table.field_names = ['Model - Logistic Regression', 'Accuracy', 'Precision', 'Recall', 'F1 Score']

# one entry per model: (row label, metrics in column order)
summary_scores = [
    ('Word2Vec', (word2vec_test_accuracy_score,
                  word2vec_test_precision_score,
                  word2vec_test_recall_score,
                  word2vec_test_f1_score)),
    ('GloVe', (glove_test_accuracy_score,
               glove_test_precision_score,
               glove_test_recall_score,
               glove_test_f1_score)),
]
for model_name, scores in summary_scores:
    # every metric is rendered with four decimal places
    table.add_row([model_name] + [format(score, '.4f') for score in scores])
print(table)

+-----------------------------+----------+-----------+--------+----------+
| Model - Logistic Regression | Accuracy | Precision | Recall | F1 Score |
+-----------------------------+----------+-----------+--------+----------+
|           Word2Vec          |  0.9111  |   0.8832  | 0.8870 |  0.8851  |
|            GloVe            |  0.8774  |   0.8394  | 0.8440 |  0.8417  |
+-----------------------------+----------+-----------+--------+----------+
