In [None]:
# Funciones utiles
import sys
import os
import pickle

# Directory that load_pickle/save_pickle calls in this notebook read from and write to.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
#pickle_path = "/datos/ecampillo/jupyter/dl-notebooks/pickles"
pickle_path = "/datos/ecampillo/jupyter/dl-notebooks/newensemble/early-risk-ensemble/pickles"

def logger(message, debug_file="log.txt"):
    """Echo ``message`` to stdout and append it to ``debug_file``.

    Args:
        message: Object to log; it is rendered with ``print`` semantics.
        debug_file: Path of the log file, opened in append mode.
    """
    print(message)
    # Write straight to the file instead of temporarily rebinding sys.stdout:
    # if print() raised while stdout was swapped, stdout would stay pointed at
    # a closed file for the rest of the session.
    with open(debug_file, 'a') as log_file:
        print(message, file=log_file)
        
def save_pickle(filepath, filename, data):
    """Pickle ``data`` to ``<filepath>/<filename>``, creating the directory if needed.

    Args:
        filepath: Directory that will contain the pickle; created (including
            parents) when it does not exist yet.
        filename: Name of the pickle file inside ``filepath``.
        data: Any picklable object.
    """
    # exist_ok=True replaces the exists()-then-makedirs() pair, which could
    # fail if the directory appeared between the check and the creation.
    os.makedirs(filepath, exist_ok=True)
    target = os.path.join(filepath, filename)
    with open(target, 'wb') as data_file:
        pickle.dump(data, data_file)
        
def load_pickle(filepath, filename):
    """Return the object stored in the pickle file ``<filepath>/<filename>``.

    NOTE: unpickling can execute arbitrary code; only load files you trust.
    """
    with open(os.path.join(filepath, filename), 'rb') as pickled:
        return pickle.load(pickled)

def load_nssi_corpus(corpus_path="/datos/erisk/ml/data/nssicorpus.txt"):
    """Parse the NSSI lexicon file into a dict of entry lists, one per section.

    The file is read as text, every ``*`` is stripped, the four section titles
    are blanked out, and what remains is split on ``:`` so that each chunk
    holds the newline-separated entries of one section.

    Args:
        corpus_path: Location of the lexicon file. Defaults to the path that
            was previously hard-coded, so existing no-argument calls behave
            as before.

    Returns:
        dict mapping ``"methods"``, ``"terms"``, ``"instruments"`` and
        ``"reasons"`` (assigned in the order the chunks appear in the file)
        to the list of non-empty lines of the corresponding chunk.

    Raises:
        ValueError: if splitting on ``:`` yields no empty chunk to discard.
        IndexError: if the file yields more chunks than there are keys.
    """
    section_titles = ("Methods of NSSI", "NSSI Terms", "Instruments Used", "Reasons for NSSI")
    keys = ["methods", "terms", "instruments", "reasons"]

    with open(corpus_path, 'r') as file:
        text = file.read()

    text = text.replace('*', '')
    for title in section_titles:
        text = text.replace(title, '')

    chunks = text.split(':')
    # Once the first title is blanked, the text in front of the first ':' is
    # an empty string; discard it so the chunks line up with `keys`.
    chunks.remove('')

    new_nssi_corpus = {}
    for idx, chunk in enumerate(chunks):
        new_nssi_corpus[keys[idx]] = [entry for entry in chunk.split("\n") if entry != ""]

    return new_nssi_corpus

In [None]:
# Load the pickled per-user tables, then pull out the cleaned text of each split.
train_users, test_users = (
    load_pickle(pickle_path, split_file)
    for split_file in ("train_users.pkl", "test_users.pkl")
)
X_train, X_test = train_users["clean_text"], test_users["clean_text"]

# Labels are stored in their own pickles.
y_train, y_test = (
    load_pickle(pickle_path, label_file)
    for label_file in ("y_train.pkl", "y_test.pkl")
)

In [None]:
import re

import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# First-person singular pronouns, matched as whole words (case-sensitive).
reg = r'\bI\b|\bme\b|\bmine\b|\bmy\b|\bmyself\b'
sid = SentimentIntensityAnalyzer()
nssi_corpus = load_nssi_corpus()


def build_handcrafted_features(texts, stems, analyzer, lexicon):
    """Compute the hand-crafted feature table for one data split.

    Args:
        texts: Series of strings (one text per row).
        stems: Series whose values are sequences of string stems; they are
            joined with single spaces before the lexicon lookup.
        analyzer: Object exposing ``polarity_scores(text)['compound']``.
        lexicon: Mapping of feature name -> list of entries to count.

    Returns:
        DataFrame with the columns ``char_count``, ``word_count``,
        ``first_prons``, ``sentiment`` followed by one column per lexicon key,
        in that order.
    """
    feats = pd.DataFrame()
    # Text length in characters.
    feats['char_count'] = texts.map(len)
    # Whitespace-separated token count.
    feats['word_count'] = texts.map(lambda text: len(text.split()))
    # Number of first-person pronoun matches.
    feats['first_prons'] = texts.map(lambda text: len(re.findall(reg, text)))
    # VADER compound polarity, rounded to two decimals.
    feats['sentiment'] = texts.map(
        lambda text: round(analyzer.polarity_scores(text)['compound'], 2))

    # Join each row's stems once instead of once per lexicon entry.
    joined_stems = stems.map(' '.join)
    for key, values in lexicon.items():
        # str.count is a substring count, so multi-word entries can match, but
        # so can entries embedded in longer stems.
        feats[key] = joined_stems.map(
            lambda joined: sum(joined.count(word) for word in values))
    return feats


feats_train = build_handcrafted_features(X_train, train_users['stems'], sid, nssi_corpus)
feats_test = build_handcrafted_features(X_test, test_users['stems'], sid, nssi_corpus)

### Normalize

In [None]:
# Columns that are not divided by the text length.
normalize_exceptions = ['char_count', 'word_density']

# Training split: every remaining feature becomes a per-character rate.
# (`char_count` was computed as X_train.map(len).)
text_length = feats_train["char_count"]
norm_feats_train = (
    feats_train
    .drop(normalize_exceptions, axis=1, errors='ignore')
    .div(text_length, axis=0)
)

# Test split: same treatment, using the test texts' own lengths.
text_length = feats_test["char_count"]
norm_feats_test = (
    feats_test
    .drop(normalize_exceptions, axis=1, errors='ignore')
    .div(text_length, axis=0)
)

### Discretize

In [None]:
from sklearn.preprocessing import KBinsDiscretizer

def discretize_features(train_feats, test_feats, size=3, strategy='uniform', encode='ordinal'):
    """Bin every feature column into ``size`` intervals.

    The discretizer is fitted on ``train_feats`` only and the fitted bin edges
    are then applied to ``test_feats``.

    Args:
        train_feats: 2-D feature table used to fit the bins.
        test_feats: 2-D feature table transformed with the fitted bins.
        size: Number of bins per feature (``n_bins``).
        strategy: Binning strategy forwarded to ``KBinsDiscretizer``.
        encode: Output encoding forwarded to ``KBinsDiscretizer``. This used
            to be an undefined name, so every call raised NameError.
            ``'ordinal'`` is the default because it returns a dense array with
            one column per input feature.
            NOTE(review): confirm this is the encoding the experiments intended.

    Returns:
        ``(train, test)`` tuple with the transformed training and test features.
    """
    est = KBinsDiscretizer(n_bins=size, encode=encode, strategy=strategy)
    train = est.fit_transform(train_feats)
    test = est.transform(test_feats)

    return train, test

logger("Discretizing features")
# Ten bins per feature, fitted on the raw (non-normalized) training frame.
dis_feats_train, dis_feats_test = discretize_features(
    train_feats=feats_train,
    test_feats=feats_test,
    size=10,
)

### Save features

In [None]:
# Run this cell to pick the non-normalized features as the ones to be saved.
logger("Saving non-normalized features")
# Plain arrays of the raw feature frames; these are the objects the next cell pickles.
feats_train_save = feats_train.values
feats_test_save = feats_test.values

In [None]:
# Persist the selected feature arrays next to the other pickles.
save_pickle(filepath=pickle_path, filename="feats_train.pkl", data=feats_train_save)
save_pickle(filepath=pickle_path, filename="feats_test.pkl", data=feats_test_save)

### Functions to select features

In [None]:
def select_features(exclude_feats=None, normalize=False, discretize=False):
    """Return the (train, test) feature arrays for one experiment.

    Reads the notebook-level frames ``feats_train``/``feats_test`` (or
    ``norm_feats_train``/``norm_feats_test`` when ``normalize`` is true) and
    never modifies them, so experiments can be run in any order.

    Args:
        exclude_feats: Iterable of column names to leave out (default: none).
        normalize: Use the length-normalized frames instead of the raw ones.
        discretize: Pass the selected columns through ``discretize_features``
            (called with its default settings).

    Returns:
        ``(train_array, test_array)``.

    Raises:
        KeyError: if a name in ``exclude_feats`` is not a column of the frame.
    """
    # None instead of a mutable [] default.
    excluded = list(exclude_feats) if exclude_feats is not None else []

    feats_train_ret = norm_feats_train if normalize else feats_train
    feats_test_ret = norm_feats_test if normalize else feats_test

    # Drop on copies (no inplace): the shared frames stay intact for later
    # experiments. Dropping before discretizing keeps this a DataFrame
    # operation; the discretizer's output is an array with no .drop().
    feats_train_ret = feats_train_ret.drop(excluded, axis=1)
    feats_test_ret = feats_test_ret.drop(excluded, axis=1)

    if discretize:
        feats_train_ret, feats_test_ret = discretize_features(feats_train_ret, feats_test_ret)

    # DataFrames expose .values; discretizer output is already array-like.
    return (getattr(feats_train_ret, "values", feats_train_ret),
            getattr(feats_test_ret, "values", feats_test_ret))
        
    

### DL preprocessing

In [None]:
# Pickled embedding weights; the model builders below pass this as the weights of a non-trainable Embedding layer.
embedding_matrix = load_pickle(pickle_path, "embedding_matrix.pkl")

In [None]:
def define_cnn_model(loc_input_len):
    """Build and compile the two-input CNN classifier.

    One input takes token sequences of length ``maxlen``, the other takes
    ``loc_input_len`` hand-crafted features. The text goes through a frozen
    embedding, a 1-D convolution and global max pooling; the pooled vector is
    concatenated with the features and fed to a small dense head with a
    sigmoid output.

    NOTE(review): ``maxlen``, ``vocab_size`` and the Keras names used here are
    expected in the notebook namespace — confirm they are defined before this
    cell runs.

    Args:
        loc_input_len: Number of hand-crafted features per sample.

    Returns:
        The compiled model; inputs are ordered ``[text, features]``.
    """
    features_in = Input(shape=(loc_input_len,))
    tokens_in = Input(shape=(maxlen,))

    embedding_layer = Embedding(vocab_size, 100, weights=[embedding_matrix],
                                input_length=maxlen, trainable=False)
    embedded = embedding_layer(tokens_in)
    conv_features = Conv1D(64, 5, activation='relu')(embedded)
    pooled = GlobalMaxPooling1D()(conv_features)

    merged = concatenate([pooled, features_in])
    hidden = Dense(32, activation='relu')(merged)
    prediction = Dense(1, activation='sigmoid')(hidden)

    cnn = Model(inputs=[tokens_in, features_in], outputs=[prediction])
    cnn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
    return cnn

In [None]:
def define_lstm_model(loc_input_len):
    """Build and compile the two-input bidirectional-LSTM classifier.

    One input takes token sequences of length ``maxlen``, the other takes
    ``loc_input_len`` hand-crafted features. The text goes through a frozen
    embedding and a bidirectional LSTM; its output is concatenated with the
    features and fed to a small dense head with a sigmoid output.

    NOTE(review): ``maxlen``, ``vocab_size`` and the Keras names used here are
    expected in the notebook namespace — confirm they are defined before this
    cell runs.

    Args:
        loc_input_len: Number of hand-crafted features per sample.

    Returns:
        The compiled model; inputs are ordered ``[text, features]``.
    """
    features_in = Input(shape=(loc_input_len,))
    tokens_in = Input(shape=(maxlen,))

    embedding_layer = Embedding(vocab_size, 100, weights=[embedding_matrix],
                                input_length=maxlen, trainable=False)
    embedded = embedding_layer(tokens_in)
    sequence_summary = Bidirectional(LSTM(128))(embedded)

    merged = concatenate([sequence_summary, features_in])
    hidden = Dense(32, activation='relu')(merged)
    prediction = Dense(1, activation='sigmoid')(hidden)

    lstm = Model(inputs=[tokens_in, features_in], outputs=[prediction])
    lstm.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
    return lstm

In [None]:
def train_model(model_to_train, X_train, feats_train, y_train,
                batch_size=2, epochs=10, validation_split=0.2):
    """Fit a two-input model and return whatever its ``fit`` call returns.

    Args:
        model_to_train: Object exposing a Keras-style ``fit`` method.
        X_train: Text input, passed as the first model input.
        feats_train: Hand-crafted features, passed as the second model input.
        y_train: Training targets.
        batch_size: Samples per gradient update (default keeps the previous
            hard-coded value of 2).
        epochs: Number of training epochs (default 10, as before).
        validation_split: Fraction of the training data held out for
            validation (default 0.2, as before).

    Returns:
        The value returned by ``model_to_train.fit``.
    """
    history = model_to_train.fit(
        [X_train, feats_train],
        y_train,
        batch_size=batch_size,
        epochs=epochs,
        verbose=1,
        validation_split=validation_split,
        shuffle=True,
    )
    return history

### Eval functions

In [None]:
def evaluate_model(model, test_feats=None):
    """Evaluate ``model`` on the test split and log its metrics.

    Logs the loss and accuracy returned by ``model.evaluate``, then a
    classification report and a confusion matrix computed from
    ``model.predict``. The text input and the labels come from the
    notebook-level ``X_test`` and ``y_test``.

    Args:
        model: Compiled two-input model exposing ``evaluate`` and ``predict``.
        test_feats: Hand-crafted test features to feed as the second input.
            Pass the test array returned by ``select_features`` for the
            experiment, so the model is evaluated on the same feature set it
            was trained on. When omitted, the notebook-level ``feats_test``
            is used, which is the previous behaviour.
    """
    from sklearn.metrics import classification_report, confusion_matrix

    if test_feats is None:
        test_feats = feats_test
    inputs = [X_test, test_feats]

    score = model.evaluate(inputs, y_test, verbose=1)
    logger("Test Score: {}".format(score[0]))
    logger("Test Accuracy: {}".format(score[1]))

    y_pred = model.predict(inputs, batch_size=2, verbose=1)
    if y_pred.shape[-1] > 1:
        # Several output units: take the highest-scoring one as the label.
        y_pred_label = y_pred.argmax(axis=-1)
    else:
        # Single sigmoid unit: threshold at 0.5.
        y_pred_label = (y_pred > 0.5).astype('int32')

    logger(classification_report(y_test, y_pred_label))
    logger(confusion_matrix(y_test, y_pred_label))

## CNN Experiments

In [None]:
logger("Experiment 1: CNN, First person, sentiment analysis y nssi dictionary sin normalizar ni discretizar")

# Select features: all columns, raw values.
train_feats_new, test_feats_new = select_features()
# Width of the feature input = number of selected feature columns.
# (The original read the undefined name `feats_train_new`.)
model = define_cnn_model(train_feats_new.shape[1])
logger("Training")
history = train_model(model, X_train, train_feats_new, y_train)
logger("Evaluating")
evaluate_model(model)

In [None]:
# The message now says "sin discretizar", matching the configuration below (normalize only).
logger("Experiment 2: CNN, First person, sentiment analysis y nssi dictionary con normalizar sin discretizar")

# Select features: all normalized columns, no discretization.
train_feats_new, test_feats_new = select_features(normalize=True)
# Width of the feature input = number of selected feature columns.
# (The original read the undefined name `feats_train_new`.)
model = define_cnn_model(train_feats_new.shape[1])
logger("Training")
history = train_model(model, X_train, train_feats_new, y_train)
logger("Evaluating")
# NOTE(review): evaluate_model(model) evaluates on the global `feats_test`,
# not on `test_feats_new` — confirm this is intended.
evaluate_model(model)

In [None]:
logger("Experiment 3: CNN, First person, sentiment analysis y nssi dictionary sin normalizar con discretizar")

# Select features: all raw columns, discretized.
train_feats_new, test_feats_new = select_features(discretize=True)
# Width of the feature input = number of selected feature columns.
# (The original read the undefined name `feats_train_new`.)
model = define_cnn_model(train_feats_new.shape[1])
logger("Training")
history = train_model(model, X_train, train_feats_new, y_train)
logger("Evaluating")
# NOTE(review): evaluate_model(model) evaluates on the global `feats_test`,
# not on `test_feats_new` — confirm this is intended.
evaluate_model(model)

In [None]:
# NOTE(review): this cell has the same description and configuration as
# Experiment 3 — confirm whether a different setup was intended.
logger("Experiment 4: CNN, First person, sentiment analysis y nssi dictionary sin normalizar con discretizar")

# Select features: all raw columns, discretized.
train_feats_new, test_feats_new = select_features(discretize=True)
# Width of the feature input = number of selected feature columns.
# (The original read the undefined name `feats_train_new`.)
model = define_cnn_model(train_feats_new.shape[1])
logger("Training")
history = train_model(model, X_train, train_feats_new, y_train)
logger("Evaluating")
# NOTE(review): evaluate_model(model) evaluates on the global `feats_test`,
# not on `test_feats_new` — confirm this is intended.
evaluate_model(model)

In [None]:
logger("Experiment 5: CNN, First person, sentiment analysis y nssi dictionary con normalizar con discretizar")

# Select features: all normalized columns, discretized.
train_feats_new, test_feats_new = select_features(normalize=True, discretize=True)
# Width of the feature input = number of selected feature columns.
# (The original read the undefined name `feats_train_new`.)
model = define_cnn_model(train_feats_new.shape[1])
logger("Training")
history = train_model(model, X_train, train_feats_new, y_train)
logger("Evaluating")
# NOTE(review): evaluate_model(model) evaluates on the global `feats_test`,
# not on `test_feats_new` — confirm this is intended.
evaluate_model(model)

#### Solo first person

In [None]:
logger("Experiment 6: CNN, First person sin normalizar con discretizar")

# Select features: sentiment and the four lexicon columns excluded, discretized.
train_feats_new, test_feats_new = select_features(exclude_feats=["sentiment", "methods",
                                                                 "terms", "instruments", "reasons"],
                                                  discretize=True)
# Width of the feature input = number of selected feature columns.
# (The original read the undefined name `feats_train_new`.)
model = define_cnn_model(train_feats_new.shape[1])
logger("Training")
history = train_model(model, X_train, train_feats_new, y_train)
logger("Evaluating")
# NOTE(review): evaluate_model(model) evaluates on the global `feats_test`,
# not on `test_feats_new` — confirm this is intended.
evaluate_model(model)

In [None]:
logger("Experiment 7: CNN, First person con normalizar con discretizar")

# Select features: sentiment and the four lexicon columns excluded,
# normalized and discretized.
train_feats_new, test_feats_new = select_features(exclude_feats=["sentiment", "methods",
                                                                 "terms", "instruments", "reasons"],
                                                  discretize=True, normalize=True)
# Width of the feature input = number of selected feature columns.
# (The original read the undefined name `feats_train_new`.)
model = define_cnn_model(train_feats_new.shape[1])
logger("Training")
history = train_model(model, X_train, train_feats_new, y_train)
logger("Evaluating")
# NOTE(review): evaluate_model(model) evaluates on the global `feats_test`,
# not on `test_feats_new` — confirm this is intended.
evaluate_model(model)

#### First person y sentiment

In [None]:
logger("Experiment 8: CNN, First person y sentiment sin normalizar con discretizar")

# Select features: the four lexicon columns excluded, discretized.
train_feats_new, test_feats_new = select_features(exclude_feats=["methods", "terms", "instruments", "reasons"],
                                                  discretize=True)
# Width of the feature input = number of selected feature columns.
# (The original read the undefined name `feats_train_new`.)
model = define_cnn_model(train_feats_new.shape[1])
logger("Training")
history = train_model(model, X_train, train_feats_new, y_train)
logger("Evaluating")
# NOTE(review): evaluate_model(model) evaluates on the global `feats_test`,
# not on `test_feats_new` — confirm this is intended.
evaluate_model(model)

In [None]:
# NOTE(review): the label "Experiment 8" is also used by the preceding cell —
# consider renumbering so log entries are unambiguous.
logger("Experiment 8: CNN, First person y sentiment con normalizar sin discretizar")

# Select features: the four lexicon columns excluded, normalized.
train_feats_new, test_feats_new = select_features(exclude_feats=["methods", "terms", "instruments", "reasons"],
                                                  normalize=True)
# Width of the feature input = number of selected feature columns.
# (The original read the undefined name `feats_train_new`.)
model = define_cnn_model(train_feats_new.shape[1])
logger("Training")
history = train_model(model, X_train, train_feats_new, y_train)
logger("Evaluating")
# NOTE(review): evaluate_model(model) evaluates on the global `feats_test`,
# not on `test_feats_new` — confirm this is intended.
evaluate_model(model)

In [None]:
logger("Experiment 9: CNN, First person y sentiment con normalizar con discretizar")

# Select features: the four lexicon columns excluded, normalized and discretized.
train_feats_new, test_feats_new = select_features(exclude_feats=["methods", "terms", "instruments", "reasons"],
                                                  normalize=True, discretize=True)
# Width of the feature input = number of selected feature columns.
# (The original read the undefined name `feats_train_new`.)
model = define_cnn_model(train_feats_new.shape[1])
logger("Training")
history = train_model(model, X_train, train_feats_new, y_train)
logger("Evaluating")
# NOTE(review): evaluate_model(model) evaluates on the global `feats_test`,
# not on `test_feats_new` — confirm this is intended.
evaluate_model(model)