Prerrequisito: Preprocessing

### 0. Funciones, etc.

#### Parámetros

In [3]:
# --- Pipeline switches ---
# Load the oversampled training pickle instead of the plain one.
oversample = True
# Recompute the features; when False, previously pickled features are loaded.
calculate_feats = True

# --- Feature post-processing ---
# Divide the kept features by the text length (char_count).
normalize = True
# Bin the features with KBinsDiscretizer, using `discretize_size` bins.
discretize, discretize_size = True, 10

# Feature columns to leave out of the selected feature set, e.g.
# ["sentiment", "methods", "terms", "instruments", "reasons"]
exclude_feats = []

In [4]:
# imports
import pickle
from utils import *

In [6]:
def load_nssi_corpus(path="/datos/erisk/ml/data/nssicorpus.txt"):
    """Load the NSSI vocabulary file and group its entries by section.

    The text is processed as follows: every asterisk and the four section
    titles ("Methods of NSSI", "NSSI Terms", "Instruments Used",
    "Reasons for NSSI") are removed, the remainder is split on ':' and each
    resulting section is split into lines.

    Args:
        path: Location of the corpus text file. Defaults to the path this
            notebook has always read from.

    Returns:
        dict mapping "methods", "terms", "instruments" and "reasons" (assigned
        in file order) to the list of non-empty lines of each section.

    Raises:
        IndexError: If the file holds more colon-separated sections than
            there are known keys.
    """
    keys = ["methods", "terms", "instruments", "reasons"]
    headers = ("Methods of NSSI", "NSSI Terms", "Instruments Used", "Reasons for NSSI")

    with open(path, 'r') as file:
        text = file.read()

    text = text.replace('*', '')
    for header in headers:
        text = text.replace(header, '')

    sections = text.split(':')
    # The text before the first colon is only what is left of the first
    # header. Drop it when it is blank (not just when it is exactly ''), so a
    # leading blank line in the file no longer makes the load fail.
    if sections and sections[0].strip() == '':
        sections = sections[1:]

    if len(sections) > len(keys):
        raise IndexError(
            "NSSI corpus has %d sections but only %d keys are known"
            % (len(sections), len(keys))
        )

    return {
        key: [line for line in section.split("\n") if line != ""]
        for key, section in zip(keys, sections)
    }

In [7]:
import numpy
import tensorflow
import sys

# Seed NumPy's and TensorFlow's global random generators with the same fixed
# value (42), then log that this was done.
# NOTE(review): `logger` is not defined in this notebook; presumably it comes
# from the `from utils import *` star import - confirm.
from numpy.random import seed
seed(42)
tensorflow.random.set_seed(42) 
logger("Initialized numpy random and tensorflow random seed at 42")

Initialized numpy random and tensorflow random seed at 42


### 1. Loading preprocessed data

In [8]:
# Log the start of the feature-computation stage.
logger("Initializing Featurizing")

Initializing Featurizing


In [9]:
if calculate_feats:
    # The oversampling switch decides which training pickle is read; the
    # test pickle is always the same.
    train_users = load_pickle(
        pickle_path,
        "train_users_over.pkl" if oversample else "train_users.pkl",
    )
    test_users = load_pickle(pickle_path, "test_users.pkl")

    # The "clean_text" column is the input of the text-based features.
    X_train, X_test = train_users["clean_text"], test_users["clean_text"]

### 2. Calculating features

In [10]:
import pandas as pd

if calculate_feats:
    # Each frame gets two length features per text: the number of characters
    # and the number of whitespace-separated tokens.
    feats_train = pd.DataFrame()
    feats_train['char_count'] = X_train.map(len)
    feats_train['word_count'] = X_train.map(lambda text: len(text.split()))

    feats_test = pd.DataFrame()
    feats_test['char_count'] = X_test.map(len)
    feats_test['word_count'] = X_test.map(lambda text: len(text.split()))

In [11]:
import re
from nltk.sentiment.vader import SentimentIntensityAnalyzer

if calculate_feats:
    # First-person singular pronouns, matched as whole words (case-sensitive).
    reg = r'\bI\b|\bme\b|\bmine\b|\bmy\b|\bmyself\b'
    # VADER analyzer; only its 'compound' score is used, rounded to 2 decimals.
    sid = SentimentIntensityAnalyzer()

    # Training set: pronoun count, then sentiment.
    feats_train['first_prons'] = X_train.map(lambda text: len(re.findall(reg, text)))
    feats_train['sentiment'] = X_train.map(
        lambda text: round(sid.polarity_scores(text)['compound'], 2)
    )

    # Test set: same two features, same column order.
    feats_test['first_prons'] = X_test.map(lambda text: len(re.findall(reg, text)))
    feats_test['sentiment'] = X_test.map(
        lambda text: round(sid.polarity_scores(text)['compound'], 2)
    )




In [12]:
# The NSSI vocabulary is only needed when the features are recomputed.
if calculate_feats:
    nssi_corpus = load_nssi_corpus()

In [13]:
# NSSI dictionary features: one column per corpus section, holding the total
# number of occurrences of that section's entries in the user's joined stems.
# Occurrences are counted with str.count, i.e. as substrings, so an entry also
# matches inside a longer token.
if calculate_feats:
    # Join each user's stems once, instead of once per corpus word and
    # section; the resulting counts are unchanged.
    stems_train = train_users['stems'].map(lambda stems: ' '.join(stems))
    stems_test = test_users['stems'].map(lambda stems: ' '.join(stems))

    for key, values in nssi_corpus.items():
        feats_train[key] = stems_train.map(lambda text: sum(text.count(word) for word in values))
        feats_test[key] = stems_test.map(lambda text: sum(text.count(word) for word in values))

## Save or load calculated features

In [14]:
# Pickle the freshly computed feature frames (before any selection,
# normalisation or discretisation) under the "*_original.pkl" names.
if calculate_feats:
    save_pickle(pickle_path, "feats_train_original.pkl", feats_train)
    save_pickle(pickle_path, "feats_test_original.pkl", feats_test)

In [15]:
# Features were not recomputed in this run: read the "*_original.pkl" frames
# from the pickle directory instead.
if not calculate_feats:
    feats_train = load_pickle(pickle_path, "feats_train_original.pkl")
    feats_test = load_pickle(pickle_path, "feats_test_original.pkl")

#### Select features

In [16]:
def select_features(exclude_feats=None, normalize=False, discretize=False, discretize_size=10):
    """Build the train/test feature matrices from the notebook-level frames.

    Works on copies of the global ``feats_train`` / ``feats_test`` frames, so
    the originals are left untouched.

    Args:
        exclude_feats: Names of feature columns to leave out. ``None`` (the
            default) or an empty sequence keeps every column.
        normalize: When True, pass both frames through ``normalize_features``.
        discretize: When True, pass both frames through
            ``discretize_features``; otherwise the frames are converted to
            arrays with ``.values``.
        discretize_size: Number of bins handed to ``discretize_features``.

    Returns:
        Tuple ``(train, test)``: the outputs of ``discretize_features`` when
        ``discretize`` is True, otherwise the ``.values`` of both frames.

    Raises:
        KeyError: If a name in ``exclude_feats`` is not a column of the
            feature frames.
    """
    excluded = [] if exclude_feats is None else list(exclude_feats)

    feats_train_ret = feats_train.copy()
    feats_test_ret = feats_test.copy()

    # Fail early on unknown names, as dropping them would have done.
    for frame in (feats_train_ret, feats_test_ret):
        unknown = [feat for feat in excluded if feat not in frame.columns]
        if unknown:
            raise KeyError("Unknown feature(s) to exclude: %s" % unknown)

    # Normalise before dropping: normalize_features reads "char_count" as its
    # denominator, so excluding that column first would make it fail. Each
    # column is normalised independently, so the order does not change the
    # values of the columns that are kept.
    if normalize:
        feats_train_ret = normalize_features(feats_train_ret)
        feats_test_ret = normalize_features(feats_test_ret)

    # Drop the excluded columns that are still present (normalisation may
    # already have removed some of them).
    feats_train_ret = feats_train_ret.drop(
        [feat for feat in excluded if feat in feats_train_ret.columns], axis=1)
    feats_test_ret = feats_test_ret.drop(
        [feat for feat in excluded if feat in feats_test_ret.columns], axis=1)

    if discretize:
        feats_train_ret, feats_test_ret = discretize_features(
            feats_train_ret, feats_test_ret, size=discretize_size)
    else:
        feats_train_ret = feats_train_ret.values
        feats_test_ret = feats_test_ret.values

    return feats_train_ret, feats_test_ret

#### Normalizing

In [17]:
# Columns that are never divided by the text length.
normalize_exceptions = ['char_count', 'word_density']

def normalize_features(feats):
    """Divide every non-exempt feature by the text length of its row.

    Args:
        feats: DataFrame of features; it must contain a "char_count" column,
            which is used as the denominator. The frame is not modified.

    Returns:
        A new DataFrame holding ``feature / char_count`` for every column not
        listed in ``normalize_exceptions``. Rows whose "char_count" is 0 are
        set to 0 instead of NaN/inf.

    Raises:
        KeyError: If ``feats`` has no "char_count" column.

    NOTE(review): columns listed in ``normalize_exceptions`` are left out of
    the result rather than passed through unchanged - confirm that dropping
    them (in particular "char_count") is intended.
    """
    text_length = feats["char_count"]
    kept = [feature for feature in feats.columns if feature not in normalize_exceptions]

    # Row-wise division of all kept columns at once.
    norm_feats = feats[kept].div(text_length, axis=0)

    # An empty text would give 0/0 = NaN (or x/0 = inf); use 0 for those rows
    # so later steps never receive non-finite values. A positional boolean
    # array is used so duplicated index labels cannot interfere.
    empty_text = (text_length == 0).values
    if empty_text.any():
        norm_feats.loc[empty_text, :] = 0.0

    return norm_feats

#### Discretizing

In [18]:
from sklearn.preprocessing import KBinsDiscretizer

def discretize_features(train_feats, test_feats, size=10, strategy='kmeans', encode='onehot-dense'):
    """Bin the train and test features with a single KBinsDiscretizer.

    Args:
        train_feats: Training features; the discretizer is fitted on these.
        test_feats: Test features; only transformed with the fitted discretizer.
        size: Passed to KBinsDiscretizer as ``n_bins``.
        strategy: Passed to KBinsDiscretizer as ``strategy``.
        encode: Passed to KBinsDiscretizer as ``encode``.

    Returns:
        Tuple ``(train, test)`` with the transformed training and test features.
    """
    binner = KBinsDiscretizer(n_bins=size, encode=encode, strategy=strategy)
    # Fit on the training features only; the same fitted discretizer then
    # transforms both sets.
    binner.fit(train_feats)
    return binner.transform(train_feats), binner.transform(test_feats)

### Selecting features

In [19]:
# Build the final train/test feature matrices, driven by the switches set in
# the parameters cell at the top of the notebook.
train_feats_save, test_feats_save = select_features(normalize=normalize, discretize=discretize, 
                                                   exclude_feats=exclude_feats, discretize_size=discretize_size)

  "replaced with 0." % jj)


### 3. Saving selected features

In [20]:
# Pickle the selected feature matrices under the "feats_*.pkl" names.
save_pickle(pickle_path, "feats_train.pkl", train_feats_save)
save_pickle(pickle_path, "feats_test.pkl", test_feats_save)