In [None]:
import pandas as pd
import re
from spellchecker import SpellChecker
import contractions
from empath import Empath
import readability
import syntok.segmenter as segmenter
from sentence_transformers import SentenceTransformer
import fasttext.util
import numpy as np
from transformer_embeddings import TransformerEmbeddings, mean_pooling
from huggingface_hub import login

In [None]:
# Load the train/test/dev CSVs and tag every row with the split it came from,
# so the three sets can be preprocessed together and told apart afterwards.
df_train = pd.read_csv('train.csv').assign(split='train')
df_test = pd.read_csv('test.csv').assign(split='test')
df_dev = pd.read_csv('dev.csv').assign(split='dev')

# ignore_index=True gives the combined frame a unique 0..n-1 index. Without it
# each split keeps its own 0-based labels, so the concatenated index contains
# duplicates and label-based selection (.loc) becomes ambiguous.
data = pd.concat([df_train, df_test, df_dev], ignore_index=True)

In [None]:
# Preview the first rows instead of rendering the whole frame.
data.head()

In [None]:
def tokenize(text):
    """Re-render ``text`` as space-separated tokens using syntok's segmenter.

    ``segmenter.analyze(text)`` is iterated as paragraphs -> sentences ->
    tokens. Token values are joined with a single space, sentences within a
    paragraph with one newline, and paragraphs with a blank line.
    """
    rendered_paragraphs = []
    for paragraph in segmenter.analyze(text):
        rendered_sentences = []
        for sentence in paragraph:
            rendered_sentences.append(' '.join(token.value for token in sentence))
        rendered_paragraphs.append('\n'.join(rendered_sentences))
    return '\n\n'.join(rendered_paragraphs)

In [None]:
def remove_urls(text):
    """Return ``text`` with every URL deleted.

    A URL here is any run of non-whitespace characters that starts with
    ``http://``, ``https://`` or ``www.``. Each match is replaced by the empty
    string, so the whitespace around it is left untouched.
    """
    return re.sub(r'https?://\S+|www\.\S+', '', text)

def remove_html(text):
    """Return ``text`` with every ``<...>`` span deleted.

    The match is non-greedy, so each tag is removed separately and the text
    between tags is kept. ``.`` does not match a newline here, so a ``<`` and
    ``>`` on different lines are not treated as one tag.
    """
    return re.sub('<.*?>', '', text)

# Strip URLs first, then any remaining <...> tags, from every post.
data['text'] = data['text'].apply(remove_urls).apply(remove_html)

# Chat-slang lookup table: one "ABBREVIATION=expansion" entry per line.
chat_words_str = """
AFAIK=As Far As I Know
AFK=Away From Keyboard
ASAP=As Soon As Possible
ATK=At The Keyboard
ATM=At The Moment
A3=Anytime, Anywhere, Anyplace
BAK=Back At Keyboard
BBL=Be Back Later
BBS=Be Back Soon
BFN=Bye For Now
B4N=Bye For Now
BRB=Be Right Back
BRT=Be Right There
BTW=By The Way
B4=Before
B4N=Bye For Now
B.S.=bullshit
B.S.,=bullshit,
BS=bullshit
BFF=Best Friends Forever
CU=See You
CUL8R=See You Later
CYA=See You
FAQ=Frequently Asked Questions
FC=Fingers Crossed
FWIW=For What It's Worth
FYI=For Your Information
GAL=Get A Life
GG=Good Game
GN=Good Night
GMTA=Great Minds Think Alike
GR8=Great!
G9=Genius
IC=I See
ICQ=I Seek you (also a chat program)
ILU=ILU: I Love You
IMHO=In My Honest/Humble Opinion
IMO=In My Opinion
IOW=In Other Words
IRL=In Real Life
KISS=Keep It Simple, Stupid
LDR=Long Distance Relationship
LMAO=Laugh My A.. Off
LOL=Laughing Out Loud
LTNS=Long Time No See
L8R=Later
MTE=My Thoughts Exactly
M8=Mate
NRN=No Reply Necessary
OIC=Oh I See
PITA=Pain In The A..
PRT=Party
PRW=Parents Are Watching
ROFL=Rolling On The Floor Laughing
ROFLOL=Rolling On The Floor Laughing Out Loud
ROTFLMAO=Rolling On The Floor Laughing My A.. Off
SK8=Skate
STATS=Your sex and age
ASL=Age, Sex, Location
THX=Thank You
TTFN=Ta-Ta For Now!
TTYL=Talk To You Later
U=You
U2=You Too
U4E=Yours For Ever
WB=Welcome Back
WTF=What The F...
WTG=Way To Go!
WUF=Where Are You From?
W8=Wait...
7K=Sick:-D Laugher
"""

# Parse the table into a dict (abbreviation -> expansion). When an abbreviation
# appears more than once, the last entry wins.
chat_words_map_dict = {}
for line in chat_words_str.split("\n"):
    # partition() splits on the first "=" only, so an expansion that itself
    # contains "=" is kept whole.
    cw, separator, cw_expanded = line.partition("=")
    if not separator:
        # Blank (or otherwise "="-less) line: nothing to record. Indexing the
        # result of split("=") here would raise IndexError instead.
        continue
    chat_words_map_dict[cw] = cw_expanded

# Despite the name this is a set (of the known abbreviations), used for
# membership tests when converting text.
chat_words_list = set(chat_words_map_dict)

def chat_words_conversion(text):
    """Replace known chat abbreviations in ``text`` with their expansions.

    The text is split on whitespace. A token whose upper-cased form is in
    ``chat_words_list`` is swapped for its ``chat_words_map_dict`` entry, and
    every other token is kept as-is. Tokens are re-joined with single spaces,
    so the original spacing and line breaks are not preserved.
    """
    return " ".join(
        chat_words_map_dict[token.upper()] if token.upper() in chat_words_list else token
        for token in text.split()
    )

# Expand chat abbreviations in every post.
data['text'] = data['text'].apply(chat_words_conversion)

def expand_contractions(text):
    """Expand contractions in ``text`` one whitespace-separated word at a time.

    Each word is passed to ``contractions.fix`` and the results are re-joined
    with single spaces, so the original spacing and line breaks are not kept.
    """
    return ' '.join(contractions.fix(word) for word in text.split())

# Expand contractions in every post.
data['text'] = data['text'].apply(expand_contractions)

spell = SpellChecker()
def correct_spellings(text):
    """Return ``text`` with words the spell checker flags replaced by its suggestion.

    The text is split on whitespace. Every word that appears in
    ``spell.unknown(words)`` is passed to ``spell.correction``; when no
    suggestion comes back (``None``) the original word is kept. Words are
    re-joined with single spaces, and the result is always a string.
    """
    words = text.split()  # split once and reuse for both the lookup and the loop
    misspelled_words = spell.unknown(words)
    # NOTE(review): spell.unknown() appears to return lower-cased words, in
    # which case tokens containing upper-case letters never match below and are
    # left uncorrected - confirm against the pyspellchecker docs before relying
    # on this for capitalised text.
    corrected_text = []
    for word in words:
        if word in misspelled_words:
            corrected_word = spell.correction(word)
            # Fall back to the original word when there is no suggestion.
            corrected_text.append(word if corrected_word is None else corrected_word)
        else:
            corrected_text.append(word)
    # Every element is a str at this point, so the join cannot fail. The former
    # bare `except` (print the text, implicitly return None) only risked
    # silently putting None into the column and has been removed.
    return " ".join(corrected_text)

# Spell-correct every post.
data['text'] = data['text'].apply(correct_spellings)

In [None]:
'''
Label encoding in this dataset:
0: severe depression
1: moderate depression
2: no depression
'''

In [None]:
# Write each post to txt/<pid>.txt (the txt/ directory must already exist).
for pid, text in zip(data['pid'], data['text']):
    # `with` closes the file even if the write fails, and the explicit encoding
    # keeps the output independent of the platform's default encoding.
    with open('txt/{}.txt'.format(pid), 'w', encoding='utf-8') as f:
        f.write(text)

In [None]:
# Load the per-file feature table. Presumably this is the output of running
# SEANCE on the exported txt/ files - TODO confirm where this CSV comes from.
data_seance = pd.read_csv('data_seance.csv')

In [None]:
# Recover each post's pid from its file name by dropping the extension.
# rsplit(..., n=1) removes only the final ".ext", so a pid that itself contains
# a dot stays intact (split('.')[0] would cut it at the first dot). The .str
# accessor also tolerates missing file names instead of raising.
data_seance['pid'] = data_seance['filename'].str.rsplit('.', n=1).str[0]
data_seance = data_seance.set_index('pid').drop(columns='filename')
# Index the posts by pid as well so both frames share the same key.
data = data.set_index('pid')

In [None]:
# Inner join on the index (pid on both sides): only posts that have a row in
# data_seance are kept.
data = data.join(data_seance, how='inner')

In [None]:
# Preview the first rows instead of rendering the whole frame.
data.head()

In [None]:
lexicon = Empath()

def get_empath_features(post):
    """Return the Empath lexicon's normalised analysis of ``post``.

    Thin wrapper around ``lexicon.analyze(post, normalize=True)``; the result
    is passed through unchanged.
    """
    empath_result = lexicon.analyze(post, normalize=True)
    return empath_result

# Analyse every post with Empath, then spread the per-post results into one
# 'emp_'-prefixed column per key.
empath_scores = data['text'].apply(get_empath_features)
empath_frame = pd.DataFrame(empath_scores.tolist()).add_prefix('emp_')

# reset_index() turns the current index back into a regular column and gives
# both frames the same 0..n-1 index, so the column-wise concat lines up row by row.
data = pd.concat([data.reset_index(), empath_frame], axis=1)

In [None]:
# Preview the first rows instead of rendering the whole frame.
data.head()

In [None]:
# Export the features gathered so far. The row index is written as well
# (index=False is not passed), so the file gets an extra unnamed first column.
data.to_csv('almostprocessed/data_preliwc.csv')

In [None]:
# Manual step: run the LIWC analysis outside this notebook (it was done on a Mac) before continuing.

In [None]:
# Load the LIWC results, discard the stray 'Unnamed: 0' column and make sure
# the labels are integers.
data_liwc = (
    pd.read_csv('almostprocessed/data_ltedi_liwc.csv')
    .drop(columns=['Unnamed: 0'])
    .astype({'labels': int})
)

In [None]:
# Preview the first rows instead of rendering the whole frame.
data_liwc.head()

In [None]:
# Continue with the LIWC-augmented frame. This binds a second name to the same
# object (no copy), so in-place changes made through `data` are also visible
# through `data_liwc`.
data = data_liwc

In [None]:
# Readability measures are computed on the tokenised form of each post; the
# helper column only exists for the duration of this cell.
data['tokenized'] = data['text'].apply(tokenize)
od = data['tokenized'].apply(
    lambda tokenized_text: readability.getmeasures(tokenized_text, lang='en', merge=True)
)
# Expand each per-post result into 'readability_'-prefixed columns.
od = od.apply(pd.Series).add_prefix('readability_')
data = pd.concat([data, od], axis=1).drop(columns=['tokenized'])

In [None]:
# Preview the first rows instead of rendering the whole frame.
data.head()

In [None]:
# List every column name as a NumPy array.
data.columns.to_numpy()

In [None]:
# Drop columns that contain no values at all.
data = data.dropna(axis=1, how='all')

In [None]:
# Remove a literal trailing ".1" from column names (right end only).
# str.rstrip('.1') is not suitable for this: its argument is a *set of
# characters*, so it strips every trailing '.' and '1' and would corrupt any
# name that merely ends in "1". An anchored regex removes only the exact suffix.
data.columns = data.columns.str.replace(r'\.1$', '', regex=True)

In [None]:
# Preview the first rows instead of rendering the whole frame.
data.head()

In [None]:
# embeddings

In [None]:
# Load the pretrained 'all-mpnet-base-v2' SentenceTransformer model.
model = SentenceTransformer('all-mpnet-base-v2')

In [None]:
# Encode every post with the sentence-transformer model. The texts are passed
# as a plain list rather than a pandas Series, so any positional indexing done
# inside encode() cannot be confused with the frame's index labels.
embeddings = model.encode(data['text'].tolist(), show_progress_bar=True)
# Store one embedding (as a Python list of floats) per row.
data['transformer_embeddings'] = embeddings.tolist()

In [None]:
# Load the pretrained English fastText model from a local binary file.
# The commented-out call below is presumably how that file was obtained
# originally - TODO confirm before relying on it.
# fasttext.util.download_model('en', if_exists='ignore')  # English
ft = fasttext.load_model('../pretrained-models/cc.en.300.bin')

In [None]:
def _fasttext_sentence_vector(text):
    """Return the fastText sentence vector of ``text`` as a plain Python list."""
    return ft.get_sentence_vector(text).tolist()

# Add an 'ft_embeddings' column holding the fastText sentence vector of each post.
data['ft_embeddings'] = data['text'].apply(_fasttext_sentence_vector)

In [None]:
# Save the full feature table to CSV without the row index.
# NOTE(review): the embedding columns hold Python lists, which CSV can only
# store as text - they will need parsing again after the file is read back.
data.to_csv('../processed/depression_ltedi.csv', index=False)

In [None]:
# Preview the first rows instead of rendering the whole frame.
data.head()

In [None]:
# Reload the saved features and reverse the label coding (0 <-> 2, 1 unchanged),
# storing the result as 'label' in place of 'labels'.
data = pd.read_csv('../processed/depression_ltedi.csv')
# This cell overwrites the file it reads. After the first run the file no
# longer has a 'labels' column, so the guard makes a re-run a no-op instead of
# a KeyError.
if 'labels' in data.columns:
    data['label'] = data['labels'].map({0: 2, 1: 1, 2: 0})
    data = data.drop(columns='labels')
    data.to_csv('../processed/depression_ltedi.csv', index=False)

In [None]:
# Reload the processed dataset from disk.
# NOTE(review): columns that held Python lists when the file was written (the
# embedding columns) come back from read_csv as strings - parse them before
# using them numerically.
data = pd.read_csv('../processed/depression_ltedi.csv')

In [None]:
# Preview the first rows instead of rendering the whole frame.
data.head()

In [None]:
# Load the 'all-distilroberta-v1' sentence-transformer (this rebinds `model`)
# and encode every post with it. The texts are passed as a plain list rather
# than a pandas Series, so any positional indexing done inside encode() cannot
# be confused with the frame's index labels.
model = SentenceTransformer('all-distilroberta-v1')
embeddings = model.encode(data['text'].tolist(), show_progress_bar=True)
# Store one embedding (as a Python list of floats) per row.
data['distil_roberta_embeddings'] = embeddings.tolist()

In [None]:
# Save the final table, including the distil-roberta embeddings column, to a
# separate CSV without the row index.
data.to_csv('../processed/dep_sign.csv', index=False)