In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
import datetime

In [None]:
# Read the pre-filtered training data into a DataFrame.
# Adjust the path below to wherever the CSV is stored on your machine.
training_csv_path = "L:\\ML-Assignment\\training_data.csv"
df = pd.read_csv(training_csv_path)


In [None]:
# Cast the text and label columns to pandas' "string" dtype.
# NOTE(review): astype returns a new DataFrame and the result is not assigned here,
# so `df` itself keeps its original dtypes and this cell only displays the converted
# copy. Confirm whether `df = df.astype(...)` was intended before changing it: the
# later groupby("class") is indexed by group position, which a dtype change could shift.
df.astype({"joined_text" : "string" , "class" : "string" })

In [None]:
# Print the compute devices TensorFlow reports for this machine.
from tensorflow.python.client import device_lib

local_devices = device_lib.list_local_devices()
print(local_devices)

In [None]:
# Partition the rows by their "class" value. Each entry of `splits` is a
# (class value, sub-frame) pair, in the order groupby yields them.
splits = list(df.groupby("class"))

# Groups at positions 0, 1, 2 and 4 are used; position 3 is not.
# NOTE(review): these positions depend on which class values occur in the CSV and
# on their groupby order — confirm against the data.
negative, positive1, positive2, suicidal = (splits[i][1] for i in (0, 1, 2, 4))

# The two positive groups are stacked row-wise into a single frame.
positive = pd.concat([positive1, positive2], axis=0)



In [None]:
# Shuffle the positive rows and keep the result. DataFrame.sample returns a new
# frame, so without the assignment the shuffled copy was only displayed and
# `positive` stayed in its original order.
positive = positive.sample(frac = 1)

In [None]:
# Draw an equal-sized random subset of rows from each class.
import random as rand  # no longer used in this cell; retained in case other cells reference `rand`

SAMPLES_PER_CLASS = 50000


def _sample_rows(frame, n=SAMPLES_PER_CLASS):
    """Return up to `n` rows of `frame`, drawn at random without replacement.

    This replaces slicing 50000 consecutive rows from a random start offset:
    when the offset landed near or beyond the end of the frame the slice came
    back short (or empty) without any error, and the rows taken were a
    contiguous run rather than a random selection.

    `n` is capped at len(frame) so a class with fewer than `n` rows yields all
    of its rows instead of raising.
    """
    return frame.sample(n=min(n, len(frame)))


new_positive = _sample_rows(positive)
new_negative = _sample_rows(negative)
new_suicidal = _sample_rows(suicidal)

In [None]:
# Stack the three per-class samples into one frame, then shuffle the rows:
# sample(frac=1) returns every row, in random order.
class_samples = [new_positive, new_negative, new_suicidal]
df_concat = pd.concat(class_samples, axis=0).sample(frac=1)

In [None]:
# Display the combined, shuffled sample as this cell's output for a visual check.
df_concat

In [None]:
# Split the texts and their labels into random train and test subsets.
# No test_size is passed, so scikit-learn's default 3:1 train:test ratio applies;
# random_state=0 fixes the shuffle so the split is repeatable.
from sklearn.model_selection import train_test_split

texts = df_concat["joined_text"]
raw_labels = df_concat["class"]
x_train, x_test, y_train, y_test = train_test_split(texts, raw_labels, random_state=0)

In [None]:
# Convert each pandas Series into a single-column 2-D NumPy array of byte strings.
# np.bytes_ replaces its former alias np.string_, which was removed in NumPy 2.0.
# The earlier np.asarray(...).astype(...) wrappers converted to the dtype the array
# already had, so they are dropped.
x_train = x_train.to_frame().to_numpy(dtype=np.bytes_)
x_test = x_test.to_frame().to_numpy(dtype=np.bytes_)
y_train = y_train.to_frame().to_numpy(dtype=np.bytes_)
y_test = y_test.to_frame().to_numpy(dtype=np.bytes_)


# One-hot encode the class labels: one row per label, one column per class (3 classes).
def create_label(y):
    """Convert class labels into one-hot rows.

    Parameters
    ----------
    y : array-like of bytes or str
        Labels, each one of "0", "4" or "suicide". May be 1-D or a
        single-column 2-D array; it is flattened before encoding.

    Returns
    -------
    numpy.ndarray
        Integer array of shape (number of labels, 3) where
        "0" -> [1, 0, 0], "4" -> [0, 1, 0] and "suicide" -> [0, 0, 1].
        Empty input yields an array of shape (0, 3).

    Raises
    ------
    ValueError
        If a label is not one of the three known classes. Skipping such a
        label would make the result shorter than the input and shift every
        following label against its feature row.
    """
    column_of = {"0": 0, "4": 1, "suicide": 2}

    labels = np.asarray(y).reshape(-1)
    # Allocate the whole result once instead of growing it with np.append per row.
    result = np.zeros((labels.size, len(column_of)), dtype=int)

    for row, raw in enumerate(labels):
        # Elements of a bytes array are bytes subclasses; normalise everything to str.
        key = raw.decode("utf-8", "replace") if isinstance(raw, bytes) else str(raw)
        if key not in column_of:
            raise ValueError(
                "unknown class label {!r} at position {}".format(key, row)
            )
        result[row, column_of[key]] = 1

    return result


# Build the one-hot label arrays for the training split and then the test split.
new_y_train, new_y_test = (create_label(split_labels) for split_labels in (y_train, y_test))

                

In [None]:
# Show the dimensions of the test feature array.
x_test.shape

In [None]:
# Spot check: print the encoded label at index 30, then the raw label it came from.
for label_array in (new_y_train, y_train):
    print(label_array[30])

In [None]:
from tensorflow.keras import Input
from tensorflow.keras.models import Sequential, load_model 
from tensorflow.keras.layers import Dense, TextVectorization, LSTM, Dropout
from sklearn.metrics import accuracy_score

In [None]:
# Build the classifier: one raw text string in, three class probabilities out.
model = Sequential(name = "Nice_model")
model.add(Input(shape=(1,), dtype = tf.string))

# TextVectorization converts each text into a numeric vector: whitespace-split
# tokens, unigrams plus bigrams, weighted by TF-IDF.
vectorize_layer = TextVectorization(split = "whitespace",
                                    ngrams=(1,2),
                                    output_mode = "tf_idf")

# Learn the vocabulary and IDF weights from the training texts only. Adapting on
# all of df_concat also used the rows held out for testing, which lets test-set
# statistics leak into the features.
vectorize_layer.adapt(x_train)


# Add the layers to the model.
model.add(vectorize_layer)

# The Dropout below used to carry input_shape=(128,). It is not the model's first
# layer and its input comes from the vectorizer, so that argument is removed.
model.add(Dropout(0.2))
model.add(Dense(units = 128, activation = "relu", name = "L1"))
model.add(Dropout(0.2))
model.add(Dense(units = 128, activation = "relu", name = "L2"))
model.add(Dropout(0.2))
model.add(Dense(units = 64, activation = "relu", name = "L3"))
model.add(Dropout(0.2))
# Each sample belongs to exactly one of three classes (one-hot labels), so the
# output is a softmax trained with categorical cross-entropy. Independent sigmoids
# with binary cross-entropy treat the task as three unrelated yes/no questions and
# produce scores that do not sum to 1, although they are reported as percentages.
model.add(Dense(units = 3, activation = "softmax", name = "Output"))

# metrics is passed as a list, the form compile() documents.
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics = ["accuracy"])

print("OK")

In [None]:
# Print a summary of the model's layers.
model.summary()

In [None]:
# Evaluate the model on the test split and print the first two returned values
# (presumably the loss followed by the accuracy metric set at compile time).
# NOTE(review): in notebook order this cell comes before model.fit(), so on a
# fresh run it scores the untrained weights — confirm that is intended.
score = model.evaluate(x_test, new_y_test)

for position in (0, 1):
    print(score[position])

In [None]:
# Train the model, writing TensorBoard logs to a directory named after the start time.
# Adjust the log location below to wherever you want the logs stored.
run_stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
log_dir = "L:\\ML-Assignment\\Logs\\Logs" + run_stamp
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir)

# The test split doubles as the validation data reported after each epoch.
fit_options = dict(
    epochs=4,
    batch_size=64,
    callbacks=[tensorboard_callback],
    verbose=1,
    validation_data=(x_test, new_y_test),
)
history = model.fit(x_train, new_y_train, **fit_options)


In [None]:
%reload_ext tensorboard
%tensorboard --logdir logs/fit

In [None]:
# Print the raw training labels, then their one-hot encoded counterpart.
for label_view in (y_train, new_y_train):
    print(label_view)

In [None]:
# Save the trained model to disk.
# Change the model number at the end of the path for each new run so an earlier
# saved model is not overwritten, and adjust the directory to suit your machine.
model_dir = "L:\\ML-Assignment\\Model\\Model_07"
model.save(model_dir)

In [None]:
# Remove the in-memory model now that it has been saved; the next cell reloads it
# from model_dir.
del model

In [None]:
# Load the model back from the directory stored in model_dir and rebind `model` to it.
model = load_model(model_dir)

print("Loaded")

In [None]:
#clean the input like how we cleaned the data for the model 
import contractions
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import numpy as np 


def predict(model, input, stopword = stopwords):
    """Clean one piece of text and classify it with `model`.

    Cleaning steps, in order: lowercase; strip @mentions; strip every character
    that is neither a word character nor whitespace; split on spaces; drop
    non-ASCII characters; expand contractions; tokenize; drop all-digit tokens;
    join the tokens back with single spaces.

    Parameters
    ----------
    model : object with a ``predict`` method
        Called with a one-element array of byte strings.
    input : str
        Raw text to classify.
    stopword : optional
        Accepted for compatibility but not used; stop-word removal is disabled.

    Returns
    -------
    tuple
        ``(result, y, filtered_text)``: the message for the highest-scoring
        output, the raw value returned by ``model.predict``, and the cleaned
        text that was fed to the model. ``result`` is an empty string when none
        of the first three scores equals the maximum.
    """
    text = input.lower()
    # Raw strings keep \s and \w as regex escapes; "\s" in a plain string literal
    # is an invalid Python escape sequence and triggers a warning.
    text = re.sub(r"@[^\s]+", '', text)
    text = re.sub(r'[^\w\s]', '', text)

    tokens = text.split(" ")
    tokens = [token.encode("ascii", "ignore").decode() for token in tokens]
    tokens = [contractions.fix(token) for token in tokens]
    tokens = word_tokenize(' '.join(tokens), language = 'english')
    tokens = [token for token in tokens if not token.isdigit()]
    filtered_text = ' '.join(tokens)

    # np.bytes_ replaces its former alias np.string_, which NumPy 2.0 removed.
    x_input = np.array([filtered_text], dtype = np.bytes_)
    y = model.predict(x_input)

    # Messages in output order; the first score equal to the maximum wins.
    messages = (
        "High probability of depression",
        "Low probabiltity of depression",
        "Suicidal thoughts",
    )
    scores = y[0]
    top_score = scores.max()
    result = next(
        (message for message, score in zip(messages, scores) if score == top_score),
        "",
    )

    return (result, y, filtered_text)

# predict() returns a tuple: (result message, raw model output, cleaned text)


In [None]:
# Interactive check: classify each line the user types until "quit" or "exit" is entered.
score_captions = ("Probability of depression: ", "No depression: ", "Suicidal thought: ")

while True:
    x = input("Enter Text: ")

    if x in ("quit", "exit"):
        break

    # y is (result message, raw model output, cleaned text).
    y = predict(model, x)
    print("Filtered Text: " + y[2])
    print("Model Result: " + y[0])
    # Row 0 of the raw output holds the scores for this text; show each as a percentage.
    for position, caption in enumerate(score_captions):
        print(caption, "{:.4f}%".format(y[1][0][position] * 100))


print("Completed.")