In [None]:
#!pip install -q pyarrow

# Load dataset

In [None]:
'''
Authors: Daniel M. Low
License: See license in github repository
'''

import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import datetime

# UTC timestamp used to tag outputs of this run. datetime.utcnow() is deprecated since
# Python 3.12, so an aware UTC datetime is used instead; the formatted string is identical.
ts = datetime.datetime.now(datetime.timezone.utc).strftime('%y-%m-%dT%H-%M-%S')

# Show every column when displaying wide DataFrames.
pd.set_option("display.max_columns", None)
# pd.options.display.width = 0


# os.chdir(os.path.dirname(__file__)) # Set working directory to current file

# Set to True when running on Google Colab (data is then read from the mounted Drive).
on_colab = False

if on_colab:
  from google.colab import drive
  project_name = 'project_name'
  drive.mount('/content/drive')
  input_dir = f'/content/drive/MyDrive/datum/{project_name}/data/input/'
  output_dir = f'/content/drive/MyDrive/datum/{project_name}/data/output/'
else:
  input_dir = './data/'
  output_dir = './data/output/'

# Make sure the output folder exists before any cell writes to it.
os.makedirs(output_dir, exist_ok=True)



In [None]:
# Config
balance = True # balance training set by downsampling
task = 'classification' # 'classification' or 'regression'
# Binary classification target: 'suicidal_desire' (nonsuicidal vs. suicidal desire) or
# 'imminent_risk' (suicidal desire vs. imminent risk). This used to be commented out, which
# made the cell fail with a NameError on a fresh kernel.
# NOTE(review): 'imminent_risk' is assumed from the commented-out value — confirm.
target = 'imminent_risk'
normalize_lexicon = True



if task == 'classification':
	dv = 'suicide_ladder_classification'
	if target == 'suicidal_desire':
		balance_values = ['nonsuicidal','suicidal_desire']
	elif target == 'imminent_risk':
		balance_values = ['suicidal_desire','imminent_risk']
	else:
		# Fail loudly instead of leaving balance_values undefined (or stale from an earlier run).
		raise ValueError(f"unknown target: {target!r} (expected 'suicidal_desire' or 'imminent_risk')")
	smallest_value = 'imminent_risk'
	# Number of conversations sampled per class when balancing the training set.
	# NOTE(review): presumably the size of the smallest class in the training split — confirm.
	n = 1893

elif task == 'regression':

	# config
	dv = 'suicide_ladder_a'
	balance_values = [1,2,3]
	smallest_value = 3
	# n for regression is derived from the training metadata in the balancing section below.

else:
	raise ValueError(f"unknown task: {task!r} (expected 'classification' or 'regression')")


In [None]:

def generate_feature_importance_df(trained_model, model_name, feature_names, xgboost_method = 'weight', model_name_in_pipeline = 'estimator', lgbm_method='split'):
	'''
	Build a feature importance table for a fitted model wrapped in an sklearn-style pipeline.

	Supports linear models exposing .coef_, LightGBM models and XGBoost models. The fitted
	object can be the pipeline itself (has .named_steps) or a search object such as
	GridSearchCV, in which case its .best_estimator_ pipeline is used.

	trained_model: fitted pipeline, or fitted search object wrapping a pipeline
	model_name: str, one of the model names handled below
	feature_names: list-like of feature names, in the column order used for training
	xgboost_method: str, importance type passed to Booster.get_score, there are a few options:
		https://xgboost.readthedocs.io/en/stable/python/python_api.html#xgboost.Booster.get_score
	model_name_in_pipeline: str, name of the pipeline step holding the model
		(for linear models a step called 'model' is looked up first, as before)
	lgbm_method: str, currently unused: LightGBM tables always contain both 'split' and
		'gain' and are sorted by 'gain'

	Returns a DataFrame:
		linear models: columns ['Importance', 'Feature', 'Coef.'], sorted by absolute coefficient
		LightGBM: columns ['feature', 'split', 'gain'], sorted by gain
		XGBoost: columns ['Importance', 'Feature', <xgboost_method capitalized>]
	or None (with a warning) when model_name is not supported.
	'''
	# Local import: the notebook never imports warnings at the top, so the fallback branch
	# below used to raise a NameError instead of warning.
	import warnings

	def _final_estimator(step_names):
		# Return the first pipeline step found among step_names, unwrapping a search object if needed.
		pipeline = trained_model if hasattr(trained_model, 'named_steps') else trained_model.best_estimator_
		for step_name in step_names:
			if step_name in pipeline.named_steps:
				return pipeline.named_steps[step_name]
		raise KeyError(f'none of the steps {list(step_names)} found in pipeline steps {list(pipeline.named_steps)}')

	#  Feature importance using coefficients for linear models
	if model_name in ['SGDRegressor', 'Ridge', 'Lasso', 'LogisticRegression', 'LinearSVC']:
		coef_values = np.asarray(_final_estimator(('model', model_name_in_pipeline)).coef_)
		# Binary classifiers store coefficients as a single row; flatten to one value per feature.
		if coef_values.ndim == 2 and 1 in coef_values.shape:
			coef_values = coef_values.ravel()
		if coef_values.ndim != 1:
			raise ValueError(f'expected one coefficient per feature, got coef_ with shape {coef_values.shape}')
		coefs = pd.DataFrame({'Coef.': coef_values}, index=feature_names)
		coefs['Abs. Coef.'] = coefs['Coef.'].abs()  # sort key: both positive and negative values are important
		coefs = coefs.sort_values('Abs. Coef.', ascending=False).reset_index() # feature names become a column
		coefs = coefs.drop(['Abs. Coef.'], axis=1)  # drop abs value, its job is done
		coefs.index += 1                            # rank for publication: 1st, 2nd, 3rd, ...
		coefs = coefs.reset_index()                 # turn rank into a column
		coefs.columns = ['Importance', 'Feature', 'Coef.']
		return coefs

	elif model_name in ['LGBMRegressor', 'LGBMClassifier']:
		booster = _final_estimator((model_name_in_pipeline,)).booster_
		feature_importance = pd.DataFrame({
			'feature': feature_names,
			'split': booster.feature_importance(importance_type='split'),
			'gain': booster.feature_importance(importance_type='gain'),
		})
		# Sort by gain
		return feature_importance.sort_values('gain', ascending=False)

	elif model_name in ['XGBRegressor', 'XGBClassifier']:
		# WARNING it will not return values for features that weren't used: if feature 3 wasn't used there will not be a f3 in the results
		scores = _final_estimator((model_name_in_pipeline,)).get_booster().get_score(importance_type=xgboost_method)
		score_column = xgboost_method.capitalize()
		if not scores:
			# No feature was used by the booster: return an empty table instead of failing on the sort.
			return pd.DataFrame(columns=['Importance', 'Feature', score_column])

		feature_importance = pd.DataFrame(list(scores.values()), index=list(scores.keys()))
		feature_importance = feature_importance.sort_values(0, ascending=False).reset_index()
		feature_importance.index += 1               # rank starting at 1
		feature_importance = feature_importance.reset_index()
		feature_importance.columns = ['Importance', 'Feature', score_column]

		# Translate default booster keys (f0, f1, ...) into the provided feature names.
		feature_name_mapping = {f'f{i}': feature_name_i for i, feature_name_i in enumerate(feature_names)}
		# Keys that are not in f<i> form (presumably real feature names already) are kept
		# as they are instead of being turned into NaN.
		feature_importance['Feature'] = feature_importance['Feature'].map(lambda key: feature_name_mapping.get(key, key))
		return feature_importance

	# Todo: add feature_importances_ for sklearn tree based models
	# https://scikit-learn.org/stable/auto_examples/ensemble/plot_forest_importances.html#feature-importance-based-on-mean-decrease-in-impurity
	else:
		warnings.warn(f'model not specificied for feature importance: {model_name}')
		return None


In [None]:

# LIWC-22 output columns, split into two groups by the notebook author.
# Non-semantic group: word counts, summary/linguistic categories and punctuation.
liwc_nonsemantic = (
	'WC WPS BigWords Dic Linguistic function pronoun ppron i we you shehe they ipron '
	'det article number prep auxverb adverb conj negate verb adj quantity '
	'AllPunc Period Comma QMark Exclam Apostro OtherP'
).split()

# Semantic group: summary variables and content categories.
liwc_semantic = (
	'Analytic Clout Authentic Tone '
	'Drives affiliation achieve power '
	'Cognition allnone cogproc insight cause discrep tentat certitude differ memory '
	'Affect tone_pos tone_neg emotion emo_pos emo_neg emo_anx emo_anger emo_sad swear '
	'Social socbehav prosocial polite conflict moral comm socrefs family friend female male '
	'Culture politic ethnicity tech '
	'Lifestyle leisure home work money relig '
	'Physical health illness wellness mental substances sexual food death '
	'need want acquire lack fulfill fatigue reward risk curiosity allure '
	'Perception attention motion space visual auditory feeling '
	'time focuspast focuspresent focusfuture '
	'Conversation netspeak assent nonflu filler'
).split()

# Skip loading data and extracting features and load below

# Or load data and extract

In [None]:

# Names of the train / validation / test sets, in that order.
set_names = ['train10_train_30perc' ,'train10_val_15perc','train10_test_15perc']

# Root folder of the datasets. Defaults to the original author's local path; set the
# CTL_DATASET_DIR environment variable to run the notebook on another machine.
dataset_dir = os.environ.get('CTL_DATASET_DIR', '/Users/danielmlow/data/ctl/input/datasets/')

sub_dir = 'train10_subset_30'

# Text: one parquet file of texter messages per set
train, val, test = (
	pd.read_parquet(os.path.join(dataset_dir, sub_dir, f'{set_name}_messages_texter.gzip'), engine='pyarrow')
	for set_name in set_names
)

# Metadata (i.e., target variables): one CSV per set
train_metadata, val_metadata, test_metadata = (
	pd.read_csv(os.path.join(dataset_dir, sub_dir, f'{set_name}_metadata.csv'))
	for set_name in set_names
)




In [None]:
# Suicide risk lexicon. should be able to import it
import sys
sys.path.append( './../../concept-tracker/')
from concept_tracker import lexicon

In [None]:
import pickle 
import random 


# Set to False to skip re-writing the pickle below.
run_this = True

# One entry per split: its set name, the message table and the metadata table.
dfs = {'train':{'name':set_names[0], 'messages':train, 'metadata':train_metadata},
        'val':{'name':set_names[1], 'messages':val, 'metadata':val_metadata},
       'test':{'name':set_names[2], 'messages':test, 'metadata':test_metadata},
               }


if run_this:
    dfs_pickle_path = f'./data/input/ctl/{sub_dir}_dfs.pkl'
    # Create the folder first so that a fresh checkout does not fail with FileNotFoundError.
    os.makedirs(os.path.dirname(dfs_pickle_path), exist_ok=True)
    with open(dfs_pickle_path, 'wb') as f:
        pickle.dump(dfs, f)

# Save dfs to extract features in lexicon.py script

# Create two datasets for classification: non-suicidal vs. non-imminent suicidal, suicidal vs. imminent risk

In [None]:



display(train_metadata['suicide_ladder_c'].value_counts())
display(test_metadata['suicide_ladder_c'].value_counts())


def _ladder_label(ladder_value):
	# Map a 'suicide_ladder_c' value to its class name. Anything that is not 1, 2 or >= 4
	# (including NaN, for which every comparison is False) falls into 'suicidal_intent_capability'.
	if ladder_value == 1:
		return 'nonsuicidal'
	if ladder_value == 2:
		return 'suicidal_desire'
	if ladder_value >= 4:
		return 'imminent_risk'
	return 'suicidal_intent_capability'


for split in dfs.keys():
	df_metadata = dfs[split]['metadata']
	# The loop variable used to be called `n`, which silently overwrote the global
	# sample size `n` set in the config cell.
	df_metadata['suicide_ladder_classification'] = [
		_ladder_label(ladder_value) for ladder_value in df_metadata['suicide_ladder_c'].values
	]
	dfs[split]['metadata'] = df_metadata

	display(df_metadata['suicide_ladder_classification'].value_counts())




# Balanced training set (downsample)

In [None]:


# Recompute the task-dependent settings (same values as in the config cell) and, for
# regression, derive the per-class sample size from the training metadata.
if task == 'classification':
	dv = 'suicide_ladder_classification'
	if target == 'suicidal_desire':
		balance_values = ['nonsuicidal','suicidal_desire']
	elif target == 'imminent_risk':
		balance_values = ['suicidal_desire','imminent_risk']
	else:
		# Fail loudly instead of silently reusing balance_values from an earlier run.
		raise ValueError(f"unknown target: {target!r} (expected 'suicidal_desire' or 'imminent_risk')")
	smallest_value = 'imminent_risk'
	n = 1893

elif task == 'regression':

	# config
	dv = 'suicide_ladder_a'
	balance_values = [1,2,3]
	smallest_value = 3
	n = train_metadata[dv].value_counts()[smallest_value]

train_class_sizes = train_metadata[dv].value_counts()
display(train_class_sizes)

# n = n- 10 #(-10 just in case there are issues like NaNs)
print(n)

# Downsampling draws n rows per class without replacement, so every class needs at least n rows.
too_small = [value for value in balance_values if train_class_sizes.get(value, 0) < n]
if balance and too_small:
	raise ValueError(f'cannot sample n={n} training rows for {too_small}: classes are smaller than n')


# For every split: (optionally) downsample the training metadata to n conversations per class,
# attach the labels to the messages, join each conversation's messages into one document and
# save conversation_id / text / y as CSV. Class counts are collected for the distribution table.
dv_counts = []
dv_counts_perc = []
total_sample_size = 0
os.makedirs('./data/input/ctl/', exist_ok=True)  # folder the CSVs below are written to

for split in dfs.keys():
	print(split)
	print('====================')

	messages = dfs[split]['messages']
	metadata = dfs[split]['metadata']
	total_sample_size+=metadata.shape[0]
	print(messages.shape[0])
	print(messages['conversation_id'].unique().shape[0])

	if split == 'train' and balance:
		# Balance the training set: n conversations per class, reproducible through random_state.
		metadata_balanced = pd.concat(
			[metadata[metadata[dv]==value].sample(n=n, random_state = 42) for value in balance_values]
		)
		# (An unseeded reshuffle used to happen here; it was undone by this sort right away.)
		metadata_balanced = metadata_balanced.sort_values(by=['conversation_id']).reset_index(drop=True)

		dv_counts.append(metadata_balanced[dv].value_counts())
		dv_counts_perc.append(metadata_balanced[dv].value_counts(normalize = True))
		messages_dv = messages.merge(metadata_balanced, on = 'conversation_id', how='inner')
	else:
		if split == 'train':
			# Unbalanced training set. This path used to fail with a NameError because
			# messages_dv was never created. The metadata-level counts are still recorded so
			# that dv_counts keeps the layout the table cells expect (they drop the first column).
			dv_counts.append(metadata[dv].value_counts())
			dv_counts_perc.append(metadata[dv].value_counts(normalize = True))
		messages_dv = messages.merge(metadata, on = 'conversation_id', how='inner')

	messages_dv = messages_dv[~messages_dv[dv].isna()]
	print(messages_dv.shape[0])
	messages_dv = messages_dv.sort_values(by=['conversation_id', 'message_timestamp_utc'])

	if task == 'classification':
		# Keep only the two classes of the current binary problem.
		messages_dv = messages_dv[messages_dv[dv].isin(balance_values)]
		messages_dv = messages_dv.reset_index(drop=True)

	dfs[split]['metadata_messages'] = messages_dv

	convo_ids = []
	X = []
	y = []
	# One document per conversation. groupby(sort=False) visits conversations in order of
	# appearance, like iterating over unique(), without re-filtering the whole table each time.
	for convo_id, messages_dv_i in messages_dv.groupby('conversation_id', sort=False):
		labels_i = messages_dv_i[dv].unique()
		if len(labels_i)!=1:
			# Checked before appending anything, so X, y and convo_ids cannot get out of sync.
			raise ValueError(f'multiple values, fix: {labels_i} (conversation_id={convo_id})')

		# Strip surrounding spaces and end every message with punctuation before joining.
		# NOTE(review): endswith() is tested before stripping, so a message ending in ". "
		# gets a second period. Left unchanged to keep the generated text identical.
		messages_convo_i = [message.strip(' ') if message.endswith(('.', ',', ']', ')', '!','?', '>')) else message.strip(' ')+'.' for message in messages_dv_i['message'].tolist() ]
		X.append(' '.join(messages_convo_i)) # messages of 1 convo
		y.append(labels_i[0])
		convo_ids.append(convo_id)

	dfs[split]['X'] = X
	dfs[split]['y'] = y

	df_text = pd.DataFrame({'conversation_id': convo_ids,
							'text':X,
							'y':y
						})

	dfs[split]['df_text'] = df_text.copy()
	name = dfs[split]['name']

	if split == 'train':
		text_path = f'./data/input/ctl/{name}_text_y_balanced_{task}.csv'
	else:
		text_path = f'./data/input/ctl/{name}_text_y_{task}.csv'

	# Round trip through CSV, then drop rows whose label is missing or empty once re-read,
	# and save the cleaned table under the same name.
	df_text.to_csv(text_path)
	df_text = pd.read_csv(text_path, index_col = 0)
	df_text = df_text [~((df_text['y'].isna()) | (df_text['y']==''))]
	df_text.to_csv(text_path)

	dv_counts.append(df_text['y'].value_counts())
	dv_counts_perc.append(df_text['y'].value_counts(normalize=True))

print('total_sample_size', total_sample_size)
	

In [None]:
from collections import Counter

# Label counts and proportions per split. The proportions used to be computed and then
# discarded (a bare expression inside a loop is never displayed).
for split in dfs.keys():
	label_counts = Counter(dfs[split]['y'])
	print(label_counts)
	n_labels = len(dfs[split]['y'])
	print({label: np.round(count/n_labels,2) for label, count in label_counts.items()})

# Descriptive statistics

In [None]:

# dataset_name = 'train10_subset_30'

# for split in dfs.keys():
# 	df_text = dfs[split]['df_text'][['conversation_id', 'text', 'y']]
# 	df_text.to_csv(f'./data/input/ctl/{dataset_name}_{split}_text_y.csv', index = False)




In [None]:
if task == 'classification':

	# Build the count tables only once: after the first run dv_counts is already a DataFrame,
	# and concatenating it again would fail, so the cell can now be re-run safely.
	if isinstance(dv_counts, list):
		# The first column holds the metadata-level training counts; drop it so that the
		# remaining columns line up with the splits in dfs.
		dv_counts = pd.concat(dv_counts, axis=1).iloc[:,1:]
		dv_counts_perc = pd.concat(dv_counts_perc, axis=1).iloc[:,1:]
		dv_counts.columns = list(dfs.keys())
		dv_counts_perc.columns = list(dfs.keys())

	# print('downsampled sample size', dv_counts.sum().sum())

	# Each cell reads "<proportion> (<count>)".
	dv_distr = dv_counts_perc.round(2).astype(str).add(" (").add(dv_counts.astype(str)).add(")").sort_index()

	dv_distr = dv_distr.reindex(balance_values)
	dv_distr.index = [value.capitalize().replace('_', ' ') for value in balance_values]
	dv_distr.index.name = 'Suicide risk'

	dv_distr.columns = ['Training', 'Validation', "Test"]

	os.makedirs('./data/output/tables/', exist_ok=True)  # folder may not exist yet
	dv_distr.to_csv(f'./data/output/tables/distribution_dv_{dv}_{task}_{target}.csv', index= True)
	# A bare expression inside an if-block is not rendered by Jupyter, so display explicitly.
	display(dv_distr)

In [None]:
# Inspect the class values currently used for balancing and filtering.
balance_values

In [None]:
if task == 'regression':

	# Build the count tables only once: after the first run dv_counts is already a DataFrame,
	# and concatenating it again would fail, so the cell can now be re-run safely.
	if isinstance(dv_counts, list):
		# The first column holds the metadata-level training counts; drop it so that the
		# remaining columns line up with the splits in dfs.
		dv_counts = pd.concat(dv_counts, axis=1).iloc[:,1:]
		dv_counts_perc = pd.concat(dv_counts_perc, axis=1).iloc[:,1:]
		dv_counts.columns = list(dfs.keys())
		dv_counts_perc.columns = list(dfs.keys())

	print('downsampled sample size', dv_counts.sum().sum())

	# Each cell reads "<proportion> (<count>)". An earlier "<count> (<proportion>%)" variant
	# was assigned and immediately overwritten, so that dead assignment is gone.
	dv_distr = dv_counts_perc.round(2).astype(str).add(" (").add(dv_counts.astype(str)).add(")").sort_index()

	# Rows are sorted by the numeric label, so the three names are applied in ascending order.
	dv_distr.index = ['Low', 'Medium', 'High']
	dv_distr.index.name = 'Suicide risk'
	dv_distr.columns = ['Training', 'Validation', "Test"]

	os.makedirs('./data/output/tables/', exist_ok=True)  # folder may not exist yet
	dv_distr.to_csv(f'./data/output/tables/distribution_dv_{dv}_{task}.csv', index= True)
	# A bare expression inside an if-block is not rendered by Jupyter, so display explicitly.
	display(dv_distr)

In [None]:
# Display the distribution table built by whichever task-specific cell ran above.
dv_distr

In [None]:
# # Remove non IV columns from LIWC22
# for split in dfs.keys():
#     dfs[split]['liwc22'] = dfs[split]['liwc22'].drop(['Segment', 'conversation_id', 'message', 'Emoji'], axis=1)
#     if balance and split=='train':
#         dfs[split]['liwc22_balanced'] = dfs[split]['liwc22'].drop(['Segment', 'conversation_id', 'message', 'Emoji'], axis=1)
                    

# Extract liwc

## automated liwc: this didn't work, I used the desktop app

In [None]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Ryan L. Boyd
# 2022-03-17


run_this = False # We'll do it manually below
if run_this:
	# Example of calling the LIWC-22 command line interface (CLI) from Python: launch the CLI
	# application as a subprocess, then wait for that subprocess to finish. This is a very
	# crude example script, so please feel free to improve/innovate on it :)
	#
	# Make sure that you have the LIWC-22.exe GUI running — it is required for the CLI to function correctly :)
	#
	# (The `if` used to be indented under the assignment above and its body was not indented
	# deeper than the `if`, so the cell could not even be parsed.)
	import json
	import subprocess

	# ---------------------------------------------------------------
	# Folder with TXT files
	# ---------------------------------------------------------------
	inputFolderTXT = "C:/Users/Ryan/Datasets/TED - English Only - TXT Files/"
	outputLocation = "C:/Users/Ryan/Datasets/TED Talk TXT Files - Analyzed.csv"

	# This command will read texts from a folder, analyze them using the standard "Word Count" LIWC analysis,
	# then save our output to a specified location.
	cmd_to_execute = ["LIWC-22-cli",
					"--mode", "wc",
					"--input", inputFolderTXT,
					"--output", outputLocation]

	# Let's go ahead and run this analysis. The terminal shows a progress bar such as:
	#    [===================                     ] 47.75%; Number of Texts Analyzed: 1304; Total Words Analyzed: 2.62M
	subprocess.call(cmd_to_execute)

	# What if we want to process our texts using an older LIWC dictionary, or an external dictionary file?
	# We can specify whether we want to use the LIWC2001, LIWC2007, LIWC2015,
	# or LIWC22 dictionary with the --dictionary argument.
	liwcDict = "LIWC2015"

	# Alternatively, you can specify the absolute path to an external dictionary
	# file that you would like to use, and LIWC will load this dictionary for processing.
	#liwcDict = "C:/Users/Ryan/Dictionaries/Personal Values Dictionary.dicx"

	# Let's update our output location as well so that we don't overwrite our previous file.
	outputLocation = "C:/Users/Ryan/Datasets/TED Talk TXT Files - Analyzed (LIWC2015).csv"

	cmd_to_execute = ["LIWC-22-cli",
					"--mode", "wc",
					"--dictionary", liwcDict,
					"--input", inputFolderTXT,
					"--output", outputLocation]

	subprocess.call(cmd_to_execute)

	# ---------------------------------------------------------------
	# CSV file
	# ---------------------------------------------------------------
	# Now, let's do the same thing, but analyzing a CSV file full of the same texts.
	inputFileCSV = 'C:/Users/Ryan/Datasets/TED Talk - English Transcripts.csv'
	outputLocation = 'C:/Users/Ryan/Datasets/TED Talk CSV File - Analyzed.csv'

	# Since this is a CSV file, we want to include the indices of
	#     1) the columns that include the text identifiers (although this is not required, it makes our data easier to merge later)
	#     2) the columns that include the actual text that we want to analyze
	#
	# In this CSV file, the first column has the text identifiers, and the second column contains the text.
	# For more complex datasets, please use the --help argument with LIWC-22 to learn more about how to process your text.
	cmd_to_execute = ["LIWC-22-cli",
					"--mode", "wc",
					"--input", inputFileCSV,
					"--row-id-indices", "1",
					"--column-indices", "2",
					"--output", outputLocation]

	# Let's go ahead and run this analysis. When it finishes the terminal reports, e.g.:
	#    [========================================] 100.00%; Number of Rows Analyzed: 2737; Total Words Analyzed: 5.40M
	#    Done. Please examine results in C:\Users\Ryan\Datasets\TED Talk CSV File - Analyzed.csv
	subprocess.call(cmd_to_execute)

	# ---------------------------------------------------------------
	# Analyze a string
	# ---------------------------------------------------------------
	# What if we want to simply pass a string to the CLI for analysis? This is possible. As described on the
	# Help section of the liwc.app website, this is generally not recommended as it will not be very performant.
	#
	# Also, of serious importance! Most command lines/terminals have a limit on the length of any string that it
	# will parse. This means that you likely cannot analyze very long texts (e.g., like a long paper, speech,
	# or book) by passing the text directly into the console. Instead, you will likely need to process your
	# data directly from the disk instead.

	# The string that we would like to analyze.
	inputString = "This is some text that I would like to analyze. After it has finished, I will say \"Thank you, LIWC!\""

	# For this one, let's save our result as a newline-delimited json file (.ndjson)
	outputLocation = 'C:/Users/Ryan/Datasets/LIWC-22 Results from String.ndjson'

	cmd_to_execute = ["LIWC-22-cli",
					"--mode", "wc",
					"--input", "console",
					"--console-text", inputString,
					"--output", outputLocation]

	# Let's go ahead and run this analysis. The result is one JSON object per text with one key
	# per LIWC column, e.g. {"Segment": 1,"WC": 20,"Analytic": 3.8,"Clout": 40.06, ..., "OtherP": 10}
	subprocess.call(cmd_to_execute)

	# And, lastly — what if we want to get the output directly from the command line or terminal as a json string?
	inputString = "This is some text that I would like to analyze. After it has finished," \
				" we will get results in the console. Hooray!"
	outputLocation = "console"

	cmd_to_execute = ["LIWC-22-cli",
					"--mode", "wc",
					"--input", "console",
					"--console-text", inputString,
					"--output", outputLocation]

	# This will end up giving us a list, where each element is a line of output from the console.
	# shell=True was dropped: combined with an argument list it only runs the first element on
	# POSIX systems, and the other calls in this cell do not use a shell either.
	results = subprocess.check_output(cmd_to_execute).strip().splitlines()

	# In this case, the item that we want to parse from a json to a Python dictionary is in results[1], so we will
	# go right ahead and parse that to a dictionary now:
	results_json = json.loads(results[1])

## Manual

In [None]:
# Show the active task; it is part of the LIWC file names loaded below.
task

In [None]:
# Preview the text table of one split. NOTE: `split` is whatever value the last loop over the
# splits left behind (hidden state), not something set in this cell.
dfs[split]['df_text']

In [None]:
# Folder holding the LIWC-22 result files.
liwc_dir = './data/input/ctl/'

liwc_train = pd.read_csv(liwc_dir+f'train10_train_30perc_text_y_balanced_{task}_liwc22.csv', index_col = 0)
liwc_test = pd.read_csv(liwc_dir+f'train10_test_15perc_text_y_{task}_liwc22.csv', index_col = 0)

# Columns of the LIWC output that are identifiers, text or labels rather than features.
liwc_non_feature_columns = ['Segment', 'conversation_id', 'y', 'text', 'Emoji']

for split, df_i in (('train', liwc_train), ('test', liwc_test)):
    df_text = dfs[split]['df_text']
    # Keep only the conversations that are part of this split's text table.
    in_split = df_i['conversation_id'].isin(df_text['conversation_id'].unique())
    df_i = df_i[in_split]
    dfs[split]['liwc22_X'] = df_i.drop(columns=liwc_non_feature_columns)
    dfs[split]['liwc22_y'] = df_i['y'].values
                                   

# Extract Suicide Risk Lexicon

In [None]:

from concept_tracker.utils import lemmatizer # local script
import tqdm

In [None]:
from concept_tracker.lexicon import lemmatize_tokens
	


In [None]:
import dill
sys.path.append( './../../concept-tracker/') # TODO: replace with pip install construct-tracker
from concept_tracker import lexicon


def load_lexicon(path):
	'''
	Load a lexicon object that was serialized with dill.

	path: str, location of the pickle file
	Returns the deserialized object.

	NOTE: dill.load, like pickle, can execute arbitrary code while loading; only open
	files from a trusted source.
	'''
	# The context manager closes the file handle (it used to be left open).
	with open(path, "rb") as lexicon_file:
		return dill.load(lexicon_file)
# Load the pickled Suicide Risk Lexicon object; judging by the file name this is a calibrated,
# not yet validated snapshot — TODO confirm it is the intended version.
srl = load_lexicon("./data/input/lexicons/suicide_risk_lexicon_calibrated_unmatched_tokens_unvalidated_24-02-15T21-55-05.pickle")


In [None]:
# Inspect the lexicon's exact_match_n setting, which is passed on to lexicon.extract() below.
srl.exact_match_n



In [None]:



# Extract Suicide Risk Lexicon features for the train and test documents.
# Columns that are not used as features: identifiers, text, label, the word count and the
# "2" variants of two constructs.
srl_non_feature_columns = ['conversation_id', 'y', 'text', 'word_count', 'Direct self-injury 2', 'Relationship issues 2']

for split in ['train', 'test']:
	print('extracting', split)
	df_text = dfs[split]['df_text']
	docs = df_text['text'].values


	# srl = lemmatize_tokens(srl) # TODO: integrate this to class: self.lemmatize_tokens() adds tokens_lemmatized

	# Extract
	feature_vectors, matches_counter_d, matches_per_doc, matches_per_construct  = lexicon.extract(docs,
																						srl.constructs,normalize = True, return_matches=True,
																						add_lemmatized_lexicon=True, lemmatize_docs=False,
																						exact_match_n = srl.exact_match_n,exact_match_tokens = srl.exact_match_tokens)

	# As before, the extracted columns are also stored on the shared text table.
	df_text[feature_vectors.columns] = feature_vectors.values

	# Select the feature set from the columns extracted in THIS run. Dropping the known
	# non-feature columns from df_text would also keep any column another cell added to it.
	srl_feature_columns = [column for column in feature_vectors.columns if column not in srl_non_feature_columns]
	dfs[split]['srl_unvalidated'] = df_text[srl_feature_columns].copy()

	

In [None]:
# Preview the lexicon features of the split the extraction loop above ended on
# (`split` is the leftover loop variable).
dfs[split]['srl_unvalidated']

In [None]:
# Feature matrices (Suicide Risk Lexicon) and labels for the train and test splits.
# The validation split is currently not used:
# X_val = dfs['val']['srl_unvalidated']
# y_val = dfs['val']['y']
X_train, X_test = (dfs[split_name]['srl_unvalidated'] for split_name in ('train', 'test'))
y_train, y_test = (dfs[split_name]['y'] for split_name in ('train', 'test'))

# Sanity check: number of rows and number of labels per split should match.
for X_part, y_part in ((X_train, y_train), (X_test, y_test)):
	print(len(X_part), len(y_part))

# Suicide Risk Lexicon (only GPT-4 Turbo tokens) 

In [None]:
# Lexicon restricted to tokens whose source name contains 'gpt-4-1106-preview':
# {construct: {'tokens': sorted unique tokens}}
srl_gpt4 = {}

for construct in srl.constructs.keys():
    tokens_by_source = srl.constructs[construct]['tokens_metadata']
    gpt4_tokens = [
        token
        for source in tokens_by_source.keys()
        if 'gpt-4-1106-preview' in source
        for token in tokens_by_source[source]['tokens']
    ]
    srl_gpt4[construct] = {'tokens': list(np.unique(gpt4_tokens))}
    

In [None]:
# we'll consider the 2 version of two of these after editing either the definition or prompt_name
for construct_name in ['Direct self-injury', 'Relationship issues']:
    revised_name = f'{construct_name} 2'
    if revised_name in srl_gpt4:
        # Replace the original construct by its revised ("2") version.
        srl_gpt4[construct_name] = srl_gpt4.pop(revised_name).copy()
    elif construct_name not in srl_gpt4:
        # Neither version exists: the lexicon is not what this notebook expects.
        raise KeyError(revised_name)
    # else: already renamed by an earlier run of this cell, so re-running is a no-op


In [None]:
# List the construct names of the GPT-4-only lexicon.
list(srl_gpt4.keys())

In [None]:
from concept_tracker.utils import lemmatizer
# Add a 'tokens_lemmatized' entry (original tokens + their lemmatized forms, de-duplicated)
# to every construct of the GPT-4 Turbo lexicon.
for c in tqdm.tqdm(list(srl_gpt4.keys())):
	# Same list object as srl_gpt4[c]['tokens'] (no copy is made here).
	lexicon_tokens = srl_gpt4[c]['tokens']


	# If you add lemmatized and nonlemmatized you'll get double count in many cases ("plans" in doc will be matched by "plan" and "plans" in lexicon)
	lexicon_tokens_lemmatized = lemmatizer.spacy_lemmatizer(lexicon_tokens, language='en') # custom function
	# Each lemmatized entry is joined back into a single space-separated string.
	lexicon_tokens_lemmatized = [' '.join(n) for n in lexicon_tokens_lemmatized]
	# NOTE(review): `+=` extends the list in place, so srl_gpt4[c]['tokens'] also receives the
	# lemmatized forms (not de-duplicated). Confirm whether that is intended before changing it.
	lexicon_tokens += lexicon_tokens_lemmatized
	lexicon_tokens = list(np.unique(lexicon_tokens)) # unique set
	srl_gpt4[c]['tokens_lemmatized']=lexicon_tokens

In [None]:
# Extract lexicon features with the GPT-4 Turbo-only lexicon for each split.
for split in ['train', 'test']:
    # Work on a copy: writing the GPT-4 feature columns into the shared dfs[split]['df_text']
    # would overwrite the same-named columns that other feature sets built from that frame rely on.
    df_text = dfs[split]['df_text'].copy()
    docs = df_text['text'].values
    feature_vectors, matches_counter_d, matches_per_doc, matches_per_construct = lexicon.extract(
        docs,
        srl_gpt4,
        normalize=normalize_lexicon,
        exact_match_n=srl.exact_match_n,
    )
    # Positional assignment: row k of feature_vectors belongs to document k of df_text.
    df_text[feature_vectors.columns] = feature_vectors.values
    # Keep only feature columns (drop identifiers, label, raw text and word count).
    dfs[split]['SRL GPT-4 Turbo'] = df_text.drop(['conversation_id', 'y', 'text', 'word_count'], axis=1)

    

# TextDescriptives



In [None]:
# !pip install textdescriptives==2.7.3

In [None]:
# TextDescriptives columns kept as features. Commented-out entries are metrics that are
# available in the extracted table but deliberately excluded from the feature set.
td_columns = ['token_length_mean',
#  'token_length_median',
 'token_length_std',
 'sentence_length_mean',
#  'sentence_length_median',
 'sentence_length_std',
#  'syllables_per_token_mean',
#  'syllables_per_token_median',
#  'syllables_per_token_std',
 'n_tokens',
#  'n_unique_tokens',
#  'proportion_unique_tokens',
#  'n_characters',
 'n_sentences',
#  'first_order_coherence',
#  'second_order_coherence',
 'pos_prop_ADJ',
 'pos_prop_ADP',
 'pos_prop_ADV',
 'pos_prop_AUX',
 'pos_prop_CCONJ',
 'pos_prop_DET',
 'pos_prop_INTJ',
 'pos_prop_NOUN',
 'pos_prop_NUM',
 'pos_prop_PART',
 'pos_prop_PRON',
 'pos_prop_PROPN',
 'pos_prop_PUNCT',
 'pos_prop_SCONJ',
 'pos_prop_SYM',
 'pos_prop_VERB',
 'pos_prop_X',
#  'flesch_reading_ease',
#  'flesch_kincaid_grade',
#  'smog',
 'gunning_fog',
 'automated_readability_index',
#  'coleman_liau_index',
#  'lix',
#  'rix',
#  'entropy',
#  'perplexity',
#  'per_word_perplexity',
 'passed_quality_check',
#  'n_stop_words',
 'alpha_ratio',
 'mean_word_length',
#  'doc_length',
 'symbol_to_word_ratio_#',
 'proportion_ellipsis',
#  'proportion_bullet_points',
#  'contains_lorem ipsum',
#  'duplicate_line_chr_fraction',
#  'duplicate_paragraph_chr_fraction',
#  'duplicate_ngram_chr_fraction_5',
#  'duplicate_ngram_chr_fraction_6',
#  'duplicate_ngram_chr_fraction_7',
#  'duplicate_ngram_chr_fraction_8',
#  'duplicate_ngram_chr_fraction_9',
#  'duplicate_ngram_chr_fraction_10',
 'top_ngram_chr_fraction_2',
#  'top_ngram_chr_fraction_3',
#  'top_ngram_chr_fraction_4',
#  'oov_ratio',
 'dependency_distance_mean',
 'dependency_distance_std',
 'prop_adjacent_dependency_relation_mean',
 'prop_adjacent_dependency_relation_std']
# df_text[['y']+metrics.columns].corr(method='spearman')

In [None]:
import spacy
import textdescriptives as td
# load your favourite spacy model (remember to install it first using e.g. `python -m spacy download en_core_web_sm`)
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("textdescriptives/all")  #TODO: dont compute coherence, quality, etc.


# Compute TextDescriptives features per split and store them both on their own and
# joined to the (unvalidated) Suicide Risk Lexicon features.
for split in ['train', 'test']:
	print('extracting', split)
	df_text = dfs[split]['df_text']

	doc = nlp.pipe(df_text['text'])
	td_features = td.extract_df(doc, include_text=False, metrics =["descriptive_stats","readability", 'quality', 'pos_proportions', 'dependency_distance'])

	td_features = td_features[td_columns] # only keep td_columns

	assert td_features.shape[0] == df_text.shape[0]
	# Rows come back in document order; give them the index of df_text so that
	# the index-based join below pairs every document with its own metrics
	# even when df_text does not have a 0..n-1 index.
	td_features.index = df_text.index
	dfs[split]['text_descriptives'] = td_features.copy()

	# Build the combined feature set from the stored lexicon features rather than from
	# df_text: df_text is modified in place by other extraction cells, so its lexicon
	# columns are not guaranteed to still hold the 'srl_unvalidated' values.
	srl_features = dfs[split]['srl_unvalidated']
	assert srl_features.shape[0] == td_features.shape[0]
	dfs[split]['srl_unvalidated_text_descriptives'] = srl_features.join(td_features, how="left")



In [None]:
import seaborn as sns
# Spearman correlation between the TextDescriptives features and the label on the
# training set, shown as a clustered heatmap.
# `.copy()` gives an independent frame, so adding 'y' neither triggers
# SettingWithCopyWarning nor touches dfs['train']['text_descriptives'].
td_features = dfs['train']['text_descriptives'][td_columns].copy()

y_values = dfs['train']['y'].values
if not pd.api.types.is_numeric_dtype(y_values):
	# String labels cannot be rank-correlated; use integer category codes
	# (categories are ordered alphabetically, so only the sign of the correlation
	# is arbitrary for a binary label).
	y_values = pd.Categorical(y_values).codes
td_features['y'] = y_values

td_features_corr = td_features.corr(method='spearman')
# Constant columns give NaN correlations; fill them with the overall median so clustering works.
td_features_corr = td_features_corr.fillna(td_features_corr.median().median())

sns.set(font_scale=0.75)
sns.clustermap(td_features_corr)
# n = 5

# Adjust both x-tick and y-tick label sizes
# ax.set_xticklabels(ax.get_xticklabels(), fontsize=n)  # Set the fontsize for x-tick labels
# ax.set_yticklabels(ax.get_yticklabels(), fontsize=n)  # Set the fontsize for y-tick labels

# plt.show()

# Extract embeddings

Approximate runtime:

- 4,000 docs: ~8 min
- 1,000 docs: ~1.5 min


In [None]:
# !pip install tensorboard

In [None]:
# Sanity check: the number of documents in 'X' equals the number of rows of 'df_text'.
# Uses `split` as left over from the last loop that ran.
len(dfs[split]['X']) == dfs[split]['df_text'].shape[0]

In [None]:

run_this = True
# 25m for train set.
if run_this:
	import tensorboard  # NOTE(review): not used below; presumably needed by the environment - confirm before removing
	from sentence_transformers import SentenceTransformer, util
	embeddings_name = 'all-MiniLM-L6-v2'
	# Encode the documents with their sentence embeddings
	# a list of pre-trained sentence transformers
	# https://www.sbert.net/docs/pretrained_models.html
	# https://huggingface.co/models?library=sentence-transformers

	# all-MiniLM-L6-v2 is optimized for semantic similarity of paraphrases
	sentence_embedding_model = SentenceTransformer(embeddings_name)       # load embedding

	# Note: sbert truncates long inputs as it is meant for sentences; raise the limit to 500 tokens.
	sentence_embedding_model.max_seq_length = 500
	print(sentence_embedding_model.max_seq_length)

	for split in ['train', 'test']:
		# Raw embeddings are kept as a tensor under 'embeddings' ...
		embeddings = sentence_embedding_model.encode(dfs[split]['X'], convert_to_tensor=True,show_progress_bar=True)
		dfs[split]['embeddings'] = embeddings

		# ... and as a DataFrame (one zero-padded column per dimension) under the model name.
		# The tensor is moved to CPU first: pandas cannot build a frame from a tensor that lives on a GPU.
		embeddings_array = embeddings.detach().cpu().numpy()
		embedding_columns = [f'{embeddings_name}_{str(n).zfill(4)}' for n in range(embeddings_array.shape[1])]
		dfs[split][embeddings_name] = pd.DataFrame(embeddings_array, columns = embedding_columns)

	

# Save or load everything above

In [None]:
import pickle
run_this = False #True saves, False loads
# Single location of the cached splits + features for the current task.
features_path = f'./data/input/ctl/ctl_dfs_features_{task}.pkl'
if not run_this:
    # Only unpickle files you created yourself: pickle can execute arbitrary code on load.
    with open(features_path, 'rb') as f:
        dfs = pickle.load(f)
else:
    with open(features_path, 'wb') as f:
        pickle.dump(dfs, f)


# Models

In [None]:
import numpy as np
import random
import pandas as pd
import matplotlib.pyplot as plt
import os 
import re

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.preprocessing import MinMaxScaler

from lightgbm import LGBMClassifier # TODO: add
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import (
    auc,
    average_precision_score,
    precision_recall_curve,
    roc_auc_score,
    f1_score,
    confusion_matrix,
)
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.base import clone
import warnings
from sklearn.preprocessing import StandardScaler
# !pip install xgboost
# !pip install lightgbm==4.3.0
from lightgbm import LGBMRegressor
import string
from sklearn.linear_model import Lasso
# import contractions # TODO: add
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import sys
sys.path.insert(1,'./../../concept-tracker')
from concept_tracker.utils import metrics_report # local script

from scipy.stats import pearsonr, spearmanr
# from imblearn.pipeline import Pipeline as imb_Pipeline

# from imblearn.over_sampling import RandomOverSampler
import datetime

import nltk
nltk.download('stopwords')

In [None]:
# Folder that receives the model performance outputs.
output_dir = './data/output/'
output_dir_i = f'{output_dir}ml_performance/'
os.makedirs(output_dir_i, exist_ok=True)

In [None]:
# Entries available for the training split.
dfs['train'].keys()

In [None]:
ridge_alphas = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
ridge_alphas_toy = [0.1, 10]
def get_params(feature_vector,model_name = 'Ridge', toy=False):
	'''
	Return the hyperparameter search space for a pipeline built by get_pipelines().

	feature_vector: str, name of the feature set; 'tfidf' additionally tunes the vectorizer step
	model_name: str, one of 'LogisticRegression', 'Ridge', 'Lasso', 'LGBMRegressor',
		'LGBMClassifier', 'XGBRegressor', 'XGBClassifier'
	toy: bool, if True return a tiny grid for quick test runs (a warning is emitted)

	Returns a dict {'<step>__<parameter>': list of candidate values}.
	Raises ValueError for an unknown model_name.
	'''
	is_tfidf = feature_vector == 'tfidf'

	if model_name in ['LogisticRegression', 'Ridge', 'Lasso']:
		# Regularization strength is called C in LogisticRegression and alpha in Ridge/Lasso
		penalty_param = 'model__C' if model_name == 'LogisticRegression' else 'model__alpha'
		if is_tfidf:
			if toy:
				param_grid = {
					'vectorizer__max_features': [256, 512],
				}
			else:
				param_grid = {
					'vectorizer__max_features': [512,2048,None],
					penalty_param: ridge_alphas,
				}
		else:
			param_grid = {
				penalty_param: ridge_alphas_toy if toy else ridge_alphas,
			}

	elif model_name in ['LGBMRegressor', 'LGBMClassifier']:
		if toy:
			param_grid = {
				'model__max_depth': [10,20],
			}
		else:
			param_grid = {
				'model__num_leaves': [30,45,60],
				'model__colsample_bytree': [0.1, 0.5, 1],
				'model__max_depth': [0,5,15], # LightGBM: values <= 0 mean no depth limit
				'model__min_child_weight': [0.01, 0.001, 0.0001],
				'model__min_child_samples': [10, 20,40], #alias: min_data_in_leaf
			}
			if is_tfidf:
				# Previously this grid was always overwritten by the generic one (missing else),
				# so the vectorizer was never tuned. The key was also listed twice; Python keeps
				# the last value, which is the one used here.
				param_grid['vectorizer__max_features'] = [256, 512]

	elif model_name in ['XGBRegressor', 'XGBClassifier']:
		if toy:
			param_grid = {
				'model__max_depth': [10,20],
			}
		else:
			param_grid = {
				'model__colsample_bytree': [0.1, 0.5, 1],
				'model__max_depth': [5,15, None], # None leaves max_depth unset, i.e. XGBoost's own default
				'model__min_child_weight': [0.01, 0.001, 0.0001],
			}
			if is_tfidf:
				# Same fix as for LightGBM: keep the vectorizer in the search space for tf-idf
				param_grid['vectorizer__max_features'] = [256,2048,None]

	else:
		raise ValueError(f'No hyperparameter grid defined for model_name={model_name!r}')

	if toy:
		warnings.warn('WARNING, running toy version')

	return param_grid

from sklearn.impute import SimpleImputer
from xgboost import XGBRegressor
def get_pipelines(feature_vector, model_name = 'Ridge'):
	'''
	Build the sklearn Pipeline for one feature set / estimator combination.

	feature_vector: str, 'tfidf' puts the module-level `vectorizer` before the model;
		any other value uses median imputation followed by standardization
	model_name: str, name of an estimator class available in the notebook's global namespace

	Returns an unfitted Pipeline whose last step is named 'model'.
	'''
	# Look the estimator class up by name and fix its seed
	estimator_class = globals()[model_name]
	model = estimator_class()
	model.set_params(random_state = 123)

	if feature_vector != 'tfidf':
		steps = [
			('imputer', SimpleImputer(strategy='median')),
			('standardizer', StandardScaler()),
			('model', model),
		]
	else:
		steps = [
			('vectorizer', vectorizer),
			('model', model),
		]
	return Pipeline(steps)

In [None]:
from sklearn import metrics

In [None]:

def _feature_weights(model, feature_names, xgboost_method='weight'):
    '''Return one coefficient / importance value per entry of feature_names for a fitted model.'''
    if hasattr(model, 'coef_'):
        # Linear models
        return np.asarray(model.coef_).flatten()
    if hasattr(model, 'get_booster'):
        # XGBoost: get_score() returns {feature key: score} only for the features that were used
        booster = model.get_booster()
        scores = booster.get_score(importance_type=xgboost_method)
        weights = np.zeros(len(feature_names))
        if booster.feature_names is None:
            # Without stored names the keys are 'f<column index>'
            for key, score in scores.items():
                weights[int(key[1:])] = score
        else:
            position = {name: k for k, name in enumerate(feature_names)}
            for key, score in scores.items():
                if key in position:
                    weights[position[key]] = score
        return weights
    if hasattr(model, 'feature_importances_'):
        # Tree ensembles exposing the sklearn attribute (e.g., LightGBM)
        return np.asarray(model.feature_importances_, dtype=float)
    raise AttributeError(f'{type(model).__name__} exposes neither coefficients nor feature importances')


def tfidf_feature_importances(pipe, top_k = 100, savefig_path = '', model_name_in_pipeline = 'model', xgboost_method = 'weight' ):
    '''
    Plot and return the most important tf-idf features of a fitted pipeline.

    pipe: fitted sklearn Pipeline with a 'vectorizer' step, or a fitted search object
        (GridSearchCV, BayesSearchCV, ...) wrapping one, in which case best_estimator_ is used
    top_k: int, number of features shown in the bar plot
    savefig_path: str, path without extension; the figure is saved to savefig_path + '.png'.
        Nothing is saved when empty.
    model_name_in_pipeline: str, name of the estimator step
    xgboost_method: str, importance_type passed to the XGBoost booster's get_score()

    Returns a DataFrame with columns feature, value, abs_value and colors, sorted by abs_value (descending).
    '''
    # Search objects do not have named_steps themselves; unwrap them first
    fitted_pipeline = getattr(pipe, 'best_estimator_', pipe)
    feature_names = fitted_pipeline.named_steps["vectorizer"].get_feature_names_out()
    coefs = _feature_weights(fitted_pipeline.named_steps[model_name_in_pipeline], feature_names, xgboost_method)

    # Sort features by absolute value
    df = pd.DataFrame(zip(feature_names, coefs), columns=["feature", "value"])
    df["abs_value"] = df["value"].abs()
    df["colors"] = df["value"].apply(lambda x: "orange" if x > 0 else "dodgerblue")
    df = df.sort_values("abs_value", ascending=False)

    fig, ax = plt.subplots(1, 1, figsize=(3.5, 6))
    plt.style.use('default')
    sns.barplot(x="value",
                y="feature",
                data=df.head(top_k),
                hue="colors",
                palette={"orange": "orange", "dodgerblue": "dodgerblue"},  # draw each bar in the colour it is labelled with
                dodge=False,
                ax=ax)
    if ax.get_legend() is not None:
        ax.get_legend().remove()
    ax.tick_params(axis='y', labelsize=8)
    ax.set_title(f"Top {top_k} Features", fontsize=14)
    ax.set_xlabel("Coef", fontsize=12) # coefficient (linear model) or importance (tree model)
    ax.set_ylabel("Feature Name", fontsize=12)

    plt.tight_layout()
    if savefig_path:
        plt.savefig(savefig_path+'.png', dpi=300)
    plt.show()
    return df

In [None]:
# Local module with the reporting helpers. The placeholder assignment that preceded
# this import was dead code: the import rebinds the name immediately.
from concept_tracker.utils import metrics_report



In [None]:

from concept_tracker.utils.metrics_report import cm, classification_report#, regression_report

In [None]:
# tfidf 

from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

def nltk_lemmatize(text):
    '''Tokenize `text` with NLTK's word_tokenize and return the WordNet lemma of every token.'''
    lemmas = []
    for token in word_tokenize(text):
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

# Now, integrate this with TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# tfidf_vectorizer = TfidfVectorizer(tokenizer=nltk_lemmatize, stop_words='english')

from sklearn.linear_model import Ridge

def custom_tokenizer(string):
    '''Return the runs of word characters (regex \\w+) found in `string`, in order.'''
    return RegexpTokenizer(r'\w+').tokenize(string)

def tokenizer_remove_punctuation(text):
    '''
    Split `text` on runs of whitespace.

    Despite the name, punctuation is kept attached to the tokens; leading or
    trailing whitespace produces empty strings at the ends of the list.
    '''
    whitespace_run = re.compile("\\s+")
    return whitespace_run.split(text)

# Settings of the tf-idf vectorizer shared by all 'tfidf' pipelines.
tfidf_settings = {
    'min_df': 3,                # ignore terms that appear in fewer than 3 documents
    'ngram_range': (1, 2),      # unigrams and bigrams
    # No stop-word removal: the 'english' list (and the nltk one) would drop words such as 'just'
    'stop_words': None,
    'sublinear_tf': True,
    # Default word pattern plus !, ?, " and ' as standalone tokens
    'token_pattern': r"(?u)\b\w\w+\b|!|\?|\"|\'",
    'use_idf': True,
}
vectorizer = TfidfVectorizer(**tfidf_settings)

In [None]:
# import dill
# def load_lexicon(path):
# 	lexicon = dill.load(open(path, "rb"))
# 	return lexicon
# lexicon = load_lexicon("./../data/lexicons/suicide_risk_lexicon_gpt-4-1106-preview_dml_24-01-24T18-38-38.pickle")


In [None]:
# from collections import Counter
# [np.round(n/len(y_train),2) for n in dict(Counter(y_train)).values()]

In [None]:
from srl_constructs import constructs_in_order


def get_splits(feature_vector):
	'''
	Return the train and test data of one feature set from the global `dfs`.

	feature_vector: str, one of 'tfidf', 'liwc22', 'srl_unvalidated', 'SRL GPT-4 Turbo',
		'text_descriptives', 'srl_unvalidated_text_descriptives', 'all-MiniLM-L6-v2'

	Returns X_train, y_train, X_test, y_test (there is no validation split).
	Copies are returned, so callers can add or drop columns without changing what is stored in `dfs`.
	Raises ValueError for an unknown feature_vector.
	'''
	# feature set -> (key of the features, key of the labels) inside dfs[split]
	split_keys = {
		'tfidf': ('X', 'y'), # raw text
		'liwc22': ('liwc22_X', 'liwc22_y'),
		'srl_unvalidated': ('srl_unvalidated', 'y'),
		'SRL GPT-4 Turbo': ('SRL GPT-4 Turbo', 'y'),
		'text_descriptives': ('text_descriptives', 'y'),
		'srl_unvalidated_text_descriptives': ('srl_unvalidated_text_descriptives', 'y'),
		'all-MiniLM-L6-v2': ('all-MiniLM-L6-v2', 'y'),
	}
	if feature_vector not in split_keys:
		raise ValueError(f'Unknown feature_vector {feature_vector!r}; expected one of {list(split_keys)}')
	x_key, y_key = split_keys[feature_vector]

	def _detached(obj):
		# DataFrames, Series, arrays and lists all provide .copy(); anything else is returned as is
		return obj.copy() if hasattr(obj, 'copy') else obj

	X_train = dfs['train'][x_key]
	X_test = dfs['test'][x_key]
	if feature_vector == 'SRL GPT-4 Turbo':
		# Keep only the constructs, in their canonical order
		X_train = X_train[constructs_in_order]
		X_test = X_test[constructs_in_order]

	y_train = dfs['train'][y_key]
	y_test = dfs['test'][y_key]

	return _detached(X_train), _detached(y_train), _detached(X_test), _detached(y_test)

In [None]:

from itertools import product


parameters =   {'model__colsample_bytree': [1, 0.5, 0.1],
                'model__max_depth': [-1,10,20], #-1 is the default and means No max depth
                'model__min_child_weight': [0.01, 0.001, 0.0001],
                'model__min_child_samples': [10, 20,40], #alias: min_data_in_leaf
               }


# Kept for backward compatibility: every combination of the default grid above.
combinations = list(product(*parameters.values()))

def get_combinations(parameters):
    '''
    Expand a parameter grid into the list of all its parameter sets.

    parameters: dict {parameter name: list of candidate values}

    Returns a list of dicts {parameter name: value}, one per element of the Cartesian
    product of the candidate lists, in itertools.product order.
    '''
    # The product is computed from the argument; the previous version used the global
    # `combinations`, so any grid other than the default one produced wrong parameter sets.
    names = list(parameters.keys())
    return [dict(zip(names, values)) for values in product(*parameters.values())]


In [None]:
from xgboost import XGBClassifier, XGBRegressor

# Run models

In [None]:

toy = False

# config

# Feature sets to evaluate (names understood by get_splits, plus 'liwc22_semantic')
feature_vectors = ['all-MiniLM-L6-v2', 'srl_unvalidated','SRL GPT-4 Turbo', 'liwc22', 'liwc22_semantic'] # srl_unvalidated_text_descriptives','text_descriptives' ]
# Training set sizes: 'all' or a number of sampled training documents
sample_sizes = ['all', 150]


if task == 'classification':
	scoring = 'f1'
	metrics_to_report = 'all'
	# Classifiers are required here: the run cell calls predict_proba and tunes on F1,
	# neither of which works with a regressor.
	model_names = ['LGBMClassifier', 'LogisticRegression']

elif task == 'regression':
	scoring = 'neg_mean_squared_error'
	# metrics_to_report = ['Model','n', 'RMSE','RMSE per value','MAE','MAE per value',  'rho', 'gridsearch', 'Best parameters']
	model_names = ['LGBMRegressor', 'Ridge']
	metrics_to_report = 'all'

gridsearch = True # True: tune hyperparameters; 'minority': custom search on a validation set; anything else: no tuning
balance = True
output_dir = './data/output/ml_performance/'
os.makedirs(output_dir , exist_ok=True)


# 64,51,54 vs .4, .25, 56 (with much more training data)

In [None]:
from sklearn.preprocessing import LabelEncoder
from skopt import BayesSearchCV # had to replace np.int for in in transformers.py


from importlib import reload
reload(metrics_report)


from concept_tracker.utils.metrics_report import cm, classification_report, regression_report
regression_report


In [None]:
from sklearn import metrics
def regression_report(y_test,y_pred,y_train=None,gridsearch=None, best_params=None,feature_vector=None,model_name=None,metrics_to_report = 'all', plot = True, save_fig_path = None, n = 'all', round_to = 2, figsize=(4,8), ordinal_ticks = True):
	'''
	Summarize the performance of a regression model in a one-row DataFrame and
	optionally draw a scatter plot of true vs. predicted values.

	y_test, y_pred: array-like, true and predicted values
	y_train: array-like or None, only used to report the range of the training target
	gridsearch, best_params, feature_vector, model_name, n: stored in the table as given
	metrics_to_report: 'all' or a list of metric names; the per-value and macro-averaged
		errors are added for 'all' or when the list contains both 'RMSE per value' and 'MAE per value'
	plot: bool, draw the scatter plot (and save it if save_fig_path is given)
	save_fig_path: str or None, path without extension; the plot is saved to save_fig_path + '.png'
	round_to: int, number of decimals
	figsize: tuple, figure size in inches
	ordinal_ticks: bool, label the x axis with the observed values when there are fewer than 12

	Returns a DataFrame with one row, indexed by model_name.
	'''
	y_test_values = np.asarray(y_test)
	y_pred_values = np.asarray(y_pred)

	# Metrics
	# https://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics
	# RMSE is computed as sqrt(MSE): the `squared=False` argument no longer exists in recent scikit-learn
	rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
	mae = metrics.mean_absolute_error(y_test, y_pred)
	r2 = metrics.r2_score(y_test, y_pred)
	r, _ = pearsonr(y_test, y_pred)
	rho, _ = spearmanr(y_test, y_pred)

	results_dict = {
		'Features':feature_vector,
		'Estimator':model_name,
		'n':n,
		'y_train_min': np.min(y_train) if y_train is not None else np.nan,
		'y_train_max': np.max(y_train) if y_train is not None else np.nan,
		'RMSE':np.round(rmse,round_to ),
		'MAE':np.round(mae,round_to ),
		'R^2':np.round(r2,round_to ),
		'r':np.round(r,round_to ),
		'rho':np.round(rho,round_to ),
		'gridsearch':gridsearch,
		'Best parameters': str(best_params),
		}
	results = pd.DataFrame(results_dict, index=[model_name]).round(3)

	if metrics_to_report == 'all' or ('RMSE per value' in metrics_to_report and 'MAE per value' in metrics_to_report):
		# Error restricted to the documents of each true value, and its unweighted (macro) average
		rmse_per_value = []
		mae_per_value = []
		for value in np.unique(y_test_values):
			has_value = y_test_values == value
			rmse_i = np.sqrt(metrics.mean_squared_error(y_test_values[has_value], y_pred_values[has_value]))
			mae_i = metrics.mean_absolute_error(y_test_values[has_value], y_pred_values[has_value])
			rmse_per_value.append(np.round(rmse_i,round_to ))
			mae_per_value.append(np.round(mae_i,round_to ))

		macro_avg_rmse = np.round(np.mean(rmse_per_value), round_to)
		macro_avg_mae = np.round(np.mean(mae_per_value), round_to)
		results_dict.update({
		'RMSE per value':f"{rmse_per_value}",
		'MAE per value':f"{mae_per_value}",
		'Macro avg. RMSE':f"{macro_avg_rmse}",
		'Macro avg. MAE':f"{macro_avg_mae}",
		})
		results = pd.DataFrame(results_dict, index=[model_name]) # replace with updated metrics

	# Plot result for a regression task: true value vs predicted values
	# ============================================================
	if plot:
		plt.clf()
		plt.figure(figsize=figsize)

		plt.style.use('default')
		plt.scatter(y_test, y_pred, alpha = 0.05)
		plt.xlabel('True values')
		plt.ylabel('Predicted values')

		ticks = list(np.unique(y_test_values))
		if ordinal_ticks and len(ticks)<12:
			plt.xticks(ticks=ticks,labels = [str(int(tick)) for tick in ticks])

		plt.tight_layout()
		if save_fig_path:
			plt.savefig(save_fig_path+'.png', dpi=300)
	return results



In [None]:
# if feature_vector:
	# 	model_name_for_df = f"{feature_vector} {model_name}"
	# else:
	# 	model_name_for_df = f"{model_name}"

In [None]:
# Feature sets that the run below will iterate over.
feature_vectors

In [None]:
np.random.seed(123)

# TODO: see where to save feature_vector (tfidf, liwc22) and where to save model_name (Ridge, LightGBM)

# One timestamp shared by every file written in this run
ts_i = datetime.datetime.now(datetime.timezone.utc).strftime('%y-%m-%dT%H-%M-%S')

if toy:
	sample_sizes = [150]
	feature_vectors = feature_vectors[:2]

# For every training-set size, feature set and model: fit (optionally with hyperparameter
# search), predict on the test set, and save predictions, metrics and feature importance.
for n in sample_sizes:
	results = []

	for feature_vector in feature_vectors:

		if toy:
			output_dir_i = output_dir + f'results_{ts_i}_toy/'
		else:
			output_dir_i = output_dir + f'results_{ts_i}_{n}_{task}_{balance_values[-1]}/'

		os.makedirs(output_dir_i, exist_ok=True)

		if feature_vector == 'liwc22_semantic':
			X_train, y_train, X_test, y_test = get_splits('liwc22')
			X_train = X_train[liwc_semantic]
			X_test = X_test[liwc_semantic]
		else:
			X_train, y_train,X_test, y_test = get_splits(feature_vector)

		# Subsample the training set. `assign` returns a new frame, so the label column is
		# never written into the feature frame stored in `dfs` (writing it there made 'y'
		# show up as a feature in later runs).
		if toy or n != 'all':
			if toy:
				train_sample = X_train.assign(y=y_train).sample(n = 100)
			else:
				train_sample = X_train.assign(y=y_train).sample(n = n, random_state=123)
			y_train = train_sample['y'].values
			X_train = train_sample.drop('y', axis=1)

		if task == 'classification':
			# Fit and transform the labels to integers.
			# NOTE(review): LabelEncoder numbers the classes alphabetically, which need not match
			# the order of `balance_values` passed to cm() below - confirm which class is coded 1.
			encoder = LabelEncoder()
			y_train = encoder.fit_transform(y_train)
			y_test = encoder.transform(y_test)

		for model_name in model_names:

			pipeline = get_pipelines(feature_vector, model_name = model_name)
			print(pipeline)

			if gridsearch == 'minority':
				# Exhaustive search scored on a validation set, selecting the parameter set with
				# the lowest RMSE on the last value.
				# NOTE(review): needs X_val / y_val, which get_splits() does not return.
				parameters = get_params(feature_vector,model_name=model_name, toy=toy)
				parameter_set_combinations = get_combinations(parameters)
				scores = {}
				for i, param_set in enumerate(parameter_set_combinations):
					pipeline.set_params(**param_set)
					pipeline.fit(X_train,y_train)
					y_pred = pipeline.predict(X_val) # validation set
					y_val_values = np.asarray(y_val)
					y_pred_values = np.asarray(y_pred)
					rmse = np.sqrt(metrics.mean_squared_error(y_val_values, y_pred_values))
					rmse_per_value = []
					for value in np.unique(y_val_values):
						has_value = y_val_values == value
						rmse_i = np.sqrt(metrics.mean_squared_error(y_val_values[has_value], y_pred_values[has_value]))
						rmse_per_value.append(rmse_i)
					scores[i] = [rmse]+rmse_per_value+[str(param_set)]
				scores = pd.DataFrame(scores).T
				scores.columns = ['RMSE', 'RMSE_2', 'RMSE_3', 'RMSE_4', 'Parameters']
				scores = scores.sort_values('RMSE_4')
				# The row label is the position of the parameter set, so no eval() of its string form is needed
				best_params = parameter_set_combinations[scores.index[0]]
				pipeline.set_params(**best_params)
				pipeline.fit(X_train,y_train)
				y_pred = pipeline.predict(X_test)

			elif gridsearch == True:
				parameters = get_params(feature_vector,model_name=model_name, toy=toy)

				pipeline = BayesSearchCV(pipeline, parameters, cv=5, scoring=scoring, return_train_score=False,
				n_iter=32, random_state=123)
				# Defensive: never let the label be used as a feature
				if feature_vector != 'tfidf':
					if 'y' in X_train.columns:
						warnings.warn('y var is in X_train, removing')
						X_train = X_train.drop('y', axis=1)

				pipeline.fit(X_train,y_train)
				best_params = pipeline.best_params_
				best_model = pipeline.best_estimator_
				if feature_vector != 'tfidf':
					if 'y' in X_test.columns:
						warnings.warn('y var is in X_test, removing')
						X_test = X_test.drop('y', axis=1)
				y_pred = best_model.predict(X_test)
			else:
				pipeline.fit(X_train,y_train)
				best_params = 'No hyperparameter tuning'
				y_pred = pipeline.predict(X_test)

			# Predictions
			y_pred_df = pd.DataFrame(y_pred)
			y_pred_df.to_csv(output_dir_i+f'y_pred_{feature_vector}_{model_name}_gridsearch-{gridsearch}_{n}_{ts_i}.csv', index=False)
			path = output_dir_i + f'scatter_{feature_vector}_{model_name}_gridsearch-{gridsearch}_{n}_{ts_i}'

			# Performance
			if task == 'classification':
				cm_df_meaning, cm_df, cm_df_norm = cm(y_test,y_pred, output_dir_i, model_name, ts_i, classes = balance_values, save=True)
				y_proba = pipeline.predict_proba(X_test)       # Get predicted probabilities
				y_proba_1 = y_proba[:,1]
				# Threshold at 0.5 and convert to 0/1 (the former `>=0.5*1` multiplied the threshold, not the result)
				y_pred = (y_proba_1 >= 0.5).astype(int)
				results_i = classification_report(y_test, y_pred, y_proba_1, output_dir_i,gridsearch=gridsearch,
										best_params=best_params,feature_vector=feature_vector,model_name=model_name,round_to = 2, ts = ts_i)
			elif task == 'regression':
				results_i = regression_report(y_test,y_pred,y_train=y_train,
										metrics_to_report = metrics_to_report,
										gridsearch=gridsearch,
										best_params=best_params,feature_vector=feature_vector,model_name=model_name, plot = True, save_fig_path = path,n = n, round_to = 2)
			results_i.to_csv(output_dir_i + f'results_{feature_vector}_{model_name}_gridsearch-{gridsearch}_{n}_{ts_i}.csv')
			display(results_i)
			# The combined table is rewritten after every model so partial runs are not lost
			results.append(results_i)
			results_df = pd.concat(results)
			results_df = results_df.reset_index(drop=True)
			results_df.to_csv(output_dir_i + f'results_{n}_{ts_i}.csv', index=False)

			# Feature importance
			if feature_vector == 'tfidf':
				if model_name in ['XGBRegressor']:
					warnings.warn('Need to add code to parse XGBoost feature importance dict')
				else:
					feature_importances = tfidf_feature_importances(pipeline, top_k = 50, savefig_path = output_dir_i + f'feature_importance_{feature_vector}_{model_name}_{n}_{ts_i}')
			else:
				feature_names = X_train.columns
				feature_importance = generate_feature_importance_df(pipeline, model_name,feature_names,  xgboost_method='weight', model_name_in_pipeline = 'model')
				if feature_importance is not None:       # I only implemented a few methods for a few models
					feature_importance.to_csv(output_dir_i + f'feature_importance_{feature_vector}_{model_name}_gridsearch-{gridsearch}_{n}_{ts_i}.csv', index = False)
					# display(feature_importance.iloc[:50])

			# NaN analysis
			if isinstance(X_train, pd.DataFrame):
				df = X_train.copy()
				# Find the column and index of NaN values
				nan_indices = df.index[df.isnull().any(axis=1)].tolist()
				nan_columns = df.columns[df.isnull().any()].tolist()
				# print("Indices of NaN values:", nan_indices)
				print("Columns with NaN values:", nan_columns)
				print(df.size)
				nans = df.isna().sum().sum()
				print('% of nans:', np.round(nans/df.size,3))
			
	
		
		
		
			
			
			

In [None]:
# Sanity check: features and labels of the combined lexicon + TextDescriptives set
# have matching lengths within each split. Note that this overwrites the global
# X_train / y_train / X_test / y_test.
X_train, y_train,X_test, y_test = get_splits('srl_unvalidated_text_descriptives')
print(len(X_train), len(y_train), len(X_test), len(y_test))

In [None]:
# Sanity check: number of predictions vs. number of test labels
# (uses whatever y_pred / y_test were assigned last).
print(len(y_pred), len(y_test))

# Error analysis


In [None]:
# Run to analyze: timestamp and training size identify the results folder
ts_i = '24-02-15T20-17-48'
n = 'all'

output_dir_i = output_dir + f'results_{ts_i}_{n}_{task}_{balance_values[-1]}/'

results = []

# Only the first feature set and the first model are loaded (see the two `break`s)
for feature_vector in feature_vectors:
	# get_splits returns four values (there is no validation split)
	if feature_vector == 'liwc22_semantic':
		X_train, y_train, X_test, y_test = get_splits('liwc22')
		X_train = X_train[liwc_semantic]
		X_test = X_test[liwc_semantic]
	else:
		X_train, y_train, X_test, y_test = get_splits(feature_vector)

	# Subsample without writing the label column into the stored feature frame
	if toy or n != 'all':
		if toy:
			train_sample = X_train.assign(y=y_train).sample(n = 100)
		else:
			train_sample = X_train.assign(y=y_train).sample(n = n, random_state=42)
		y_train = train_sample['y'].values
		X_train = train_sample.drop('y', axis=1)

	if task == 'classification':
		encoder = LabelEncoder()

		# Fit and transform the labels to integers
		y_train = encoder.fit_transform(y_train)
		y_test = encoder.transform(y_test)

	for model_name in model_names:
		y_pred = pd.read_csv(output_dir_i+f'y_pred_{feature_vector}_{model_name}_gridsearch-{gridsearch}_{n}_{ts_i}.csv')
		break
	break


In [None]:
from sklearn import metrics
# Show the 10 test documents with true value `i` whose predictions are lowest relative to it
# (most negative error first).
y_df = pd.DataFrame(y_pred).copy()
i = 2
# Positional pairing: prediction k belongs to test document k
y_df['y_test'] = np.asarray(y_test)
y_df.columns = ['y_pred', 'y_test']
# .copy() so that adding the error column does not operate on a slice of y_df
y_df_i = y_df[y_df['y_test']==i].copy()
y_df_i['error'] = y_df_i['y_pred'] - y_df_i['y_test']
y_df_i = y_df_i.sort_values(by='error')
X_test_text = dfs['test']['df_text']
print(X_test_text.shape, y_df.shape)
# display(X_test_text.head(), y_df[:5])
display(y_df_i.iloc[:10])
# y_df has a 0..n-1 index, i.e. row positions in the test set, hence iloc rather than loc
worst_docs = X_test_text.iloc[y_df_i.index[:10]]
display(worst_docs)
docs = worst_docs['text'].to_list()

print(docs)
# metrics.mean_absolute_error(y_test, y_pred.values)


In [None]:
import dill
sys.path.append( './../../concept-tracker/') # TODO: replace with pip install construct-tracker
from concept_tracker import lexicon


def load_lexicon(path):
	'''
	Load a pickled lexicon object from disk with dill.

	path: str, path to the pickle file
	returns: the unpickled object

	NOTE(review): unpickling can execute arbitrary code, so only load lexicon files
	that come from a trusted source.
	'''
	# Context manager closes the file handle even if unpickling fails.
	# The local name avoids shadowing the imported `lexicon` module.
	with open(path, "rb") as f:
		loaded_lexicon = dill.load(f)
	return loaded_lexicon
# Load the pickled suicide risk lexicon
srl_path = "./data/input/lexicons/suicide_risk_lexicon_calibrated_unmatched_tokens_unvalidated_24-02-15T19-30-52.pickle"
srl = load_lexicon(srl_path)


# Run the lexicon over the selected documents and keep the matches as well as the features.
# NOTE(review): this rebinds `feature_vectors`, which other cells use as a list of feature-set names.
extraction = lexicon.extract(
	docs,
	srl.constructs,
	normalize = normalize_lexicon,
	return_matches = True,
	add_lemmatized_lexicon = True,
	lemmatize_docs = False,
	exact_match_n = srl.exact_match_n,
	exact_match_tokens = srl.exact_match_tokens,
)
feature_vectors, matches_counter_d, matches_per_doc, matches_per_construct = extraction

In [None]:
# Print one of the selected documents and show its lexicon matches,
# with the construct columns in alphabetical order.
i = 2
print(docs[i])
constructs_alphabetical = sorted(constructs_in_order)
pd.DataFrame(matches_per_doc[i])[constructs_alphabetical]

# Clean up results table

In [None]:
def insert_empty_row(df, index_to_insert):
	'''
	Return a copy of df with one all-NaN row inserted at a given row position.

	df: pandas DataFrame (left unmodified)
	index_to_insert: int, positional index (as used by .iloc) at which the empty row is placed
	returns: new DataFrame with len(df) + 1 rows and a fresh 0..n index
	'''
	rows_before = df.iloc[:index_to_insert]
	rows_after = df.iloc[index_to_insert:]

	# One row of NaN with the same columns as df
	empty_row = pd.DataFrame(np.nan, index=[0], columns=df.columns)

	# The original index is discarded anyway, so let concat renumber the rows.
	# This avoids doing arithmetic on the index (which fails for non-numeric labels)
	# and avoids setting the index on a slice of df.
	return pd.concat([rows_before, empty_row, rows_after], ignore_index=True)

In [None]:
sample_sizes = ['all', 150] # TODO
model_names = ['LGBMRegressor', 'Ridge']
timestamp = '24-02-08T20-05-33'

# Make sure the destination folder exists before writing the tables
tables_dir = './data/output/tables/'
os.makedirs(tables_dir, exist_ok=True)

# Columns that are not shown in the cleaned tables
columns_to_drop = ['n','Estimator',  'gridsearch', 'Best parameters', 'y_train_min', 'y_train_max', 'R^2', 'r']

# For every sample size and model, write one cleaned results table
for n in sample_sizes:
	# The results file only depends on the sample size, so read it once per n
	results_dir = f'results_{timestamp}_{n}/'
	results_n = pd.read_csv('./data/output/ml_performance/'+results_dir+f'results_{n}_{timestamp}.csv')

	for model in model_names:
		# Keep the rows of this model (substring match on the Estimator column)
		results_df = results_n[results_n['Estimator'].str.contains(model)]
		results_df = results_df.reset_index(drop=True)
		results_df = results_df.drop(columns_to_drop, axis = 1)
		# Blank separator rows at positions 4 and 6
		# NOTE(review): presumably these separate groups of feature sets - confirm
		results_df = insert_empty_row(results_df, 4)
		results_df = insert_empty_row(results_df, 6)
		results_df.to_csv(tables_dir+f'results_{model}_{n}.csv', index=False)
		
		



# Feature importance plot

In [None]:
# Locate a saved feature-importance CSV in the results folder and display it.
# NOTE(review): this rebinds the notebook-wide `input_dir`, `timestamp`, `model` and `feature_vectors`.
timestamp = 'results_24-02-08T15-58-41'
model = 'LGBMRegressor'
input_dir = './data/output/ml_performance/'+timestamp+'_all/'
files = os.listdir(input_dir)
feature_vectors = ['srl_unvalidated', 'liwc22_semantic']
table_names = ['SRL unvalidated', 'LIWC-22 semantic']

rank_col_name = 'Rank'
files
feature_importance = []
# `file1` is overwritten on every iteration, so after the loop it holds the match for the
# LAST feature vector only. `[0]` raises IndexError if no file name matches.
for file, table_name in zip(feature_vectors, table_names):
    file1 = [n for n in files if ('feature_importance_'+file in n and 'clean' not in n)][0]
    
print(file1)
fi = pd.read_csv(input_dir+file1)
# Clean-up steps kept for reference (they are applied in a later cell):
# fi.columns = ['Feature', 'Split', 'Gain']
# fi=fi.drop('Split', axis=1).round(1)
# fi = fi.reset_index()
# fi.columns = [rank_col_name, 'Feature', 'Gain']
# fi[rank_col_name]+=1
# fi[rank_col_name] = fi[rank_col_name].astype(str)
fi

In [None]:
# For each feature, correlate feature with y
import math
def _spearman_rho_by_feature(X, y, report_nan=False):
	'''
	Spearman correlation between every column of X and y, ignoring missing pairs.

	X: DataFrame, one column per feature
	y: array-like with one value per row of X
	report_nan: bool, print the name of each feature whose correlation is NaN
	returns: dict {feature name: rho rounded to 2 decimals}; NaN when a feature has
		no row where both the feature and y are present
	'''
	y_values = np.asarray(y)
	y_present = pd.notna(y_values)
	rho_by_feature = {}
	for feature in X.columns:
		x_values = X[feature].values
		# keep only the rows where both y and the feature are present
		keep = y_present & pd.notna(x_values)
		if keep.any():
			r, _p = spearmanr(y_values[keep], x_values[keep])
		else:
			# nothing to correlate; record NaN instead of failing
			r = np.nan
		rho_by_feature[feature] = np.round(r, 2)
		if report_nan and pd.isna(r):
			print(feature)
	return rho_by_feature


# LIWC-22: correlate each feature with y on the training split
liwc22_X = dfs['train']['liwc22_X']
liwc22_y = dfs['train']['liwc22_y']
liwc_rho = _spearman_rho_by_feature(liwc22_X, liwc22_y, report_nan=True)


# SRL unvalidated: correlate each feature with y on the training split
srl_unv_X = dfs['train']['srl_unvalidated'].drop('y', axis=1)
srl_unv_y = dfs['train']['srl_unvalidated']['y'].values
srl_unv_rho = _spearman_rho_by_feature(srl_unv_X, srl_unv_y)
	




In [None]:
# Build the cleaned feature-importance tables (rank, feature, gain, Spearman rho) for two
# feature sets, then save the side-by-side table, its top 20 rows, and a top-15/bottom-10 version.
# Relies on `liwc_rho` and `srl_unv_rho` from the correlation cell.
timestamp = 'results_24-02-08T15-58-41'
model = 'LGBMRegressor'
input_dir = './data/output/ml_performance/'+timestamp+'_all/'
files = os.listdir(input_dir)
feature_vectors = ['srl_unvalidated', 'liwc22_semantic']
table_names = ['SRL unvalidated', 'LIWC-22 semantic']

rank_col_name = 'Rank'

# Make sure the folder for the final table exists before writing to it
tables_dir = './data/output/tables/'
os.makedirs(tables_dir, exist_ok=True)


def _top_and_bottom(df, n_top=15, n_bottom=10):
	'''Return the first n_top rows, one all-NaN separator row, and the last n_bottom rows of df.'''
	empty_row = pd.DataFrame(np.nan, index=[0], columns=df.columns)
	return pd.concat([df.head(n_top), empty_row, df.tail(n_bottom)], ignore_index=True)


feature_importance = []
timestamp_i = timestamp.replace('results_', '')
for file, table_name in zip(feature_vectors, table_names):
	file1 = f'feature_importance_{file}_{model}_gridsearch-True_all_{timestamp_i}.csv'

	fi = pd.read_csv(input_dir+file1)
	# The saved file is expected to have exactly three columns
	fi.columns = ['Feature', 'Split', 'Gain']
	fi = fi.drop('Split', axis=1).round(1)
	# Row position becomes a 1-based rank stored as text
	fi = fi.reset_index()
	fi.columns = [rank_col_name, 'Feature', 'Gain']
	fi[rank_col_name] += 1
	fi[rank_col_name] = fi[rank_col_name].astype(str)
	# Attach the correlation with y from the matching lookup
	rho_by_feature = liwc_rho if 'liwc22' in file else srl_unv_rho
	fi['rho'] = fi['Feature'].map(rho_by_feature)

	fi.to_csv(input_dir+'feature_importance_'+file+'_clean.csv', index=False)

	# Two-level header so the tables can be placed side by side
	columns = pd.MultiIndex.from_tuples(
		[(table_name, column_name) for column_name in (rank_col_name, 'Feature', 'Gain', 'rho')]
	)
	fi.columns = columns
	feature_importance.append(fi)

# Side-by-side table; the rank column of the second table is dropped
feature_importance_df = pd.concat([feature_importance[0],feature_importance[1].drop(columns=(table_names[1], rank_col_name))],axis=1)

# Separate name for the joined label so `feature_vectors` stays a list for other cells
feature_vectors_str = '_'.join(feature_vectors)

feature_importance_df.to_csv(input_dir+f'feature_importance_{feature_vectors_str}_gridsearch-True_all_{timestamp}.csv', index=False)
display(feature_importance_df)

feature_importance_df.iloc[:20].to_csv(input_dir+f'feature_importance_{feature_vectors_str}_gridsearch-True_all_{timestamp}_top20.csv', index=False)

# top 15 and bottom 10 of each table, separated by an empty row
df0 = _top_and_bottom(feature_importance[0])
df1 = _top_and_bottom(feature_importance[1])

feature_importance_df = pd.concat([df0,df1],axis=1)
feature_importance_df.to_csv(tables_dir+f'feature_importance_{feature_vectors_str}_gridsearch-True_all_{timestamp}_top_and_bottom.csv', index=False)
display(feature_importance_df)



In [None]:
# List the entries stored for the training split
dfs['train'].keys()

In [None]:
# Show the 'liwc22_y' entry of the training split
dfs['train']['liwc22_y']

In [None]:
# Show the column names of the 'srl_unvalidated' training table
dfs['train']['srl_unvalidated'].columns

In [None]:
# Rebuild the cleaned feature-importance table for the file currently held in `file1`
fi = pd.read_csv(input_dir+file1)
fi.columns = ['Feature', 'Split', 'Gain']
fi=fi.drop('Split', axis=1).round(1)
# Row position becomes a 1-based rank stored as text
fi = fi.reset_index()
fi.columns = [rank_col_name, 'Feature', 'Gain']
fi[rank_col_name]+=1
fi[rank_col_name] = fi[rank_col_name].astype(str)
# Add the rho column, using the same lookup rule as the table-building loop
# (LIWC-22 files use liwc_rho, everything else srl_unv_rho); the next cell reads fi['rho'].
fi['rho'] = fi['Feature'].map(liwc_rho if 'liwc22' in file1 else srl_unv_rho)


In [None]:
# Show the 'rho' column of the feature-importance table
fi['rho'] 

# Ordinal classification

In [None]:
# !pip install scikit-learn==1.2.0

In [None]:
# Show the installed scikit-learn version
from importlib import reload
import sklearn
sklearn.__version__

In [None]:
# Use the printable representation of `model` as its name, then display it
model_name = repr(model)
model_name

In [None]:
# Settings for the ordinal-classification experiment
feature_vector = 'srl_unvalidated'  # feature set passed to get_splits / get_pipelines / get_params
scoring = 'f1_macro'                # scoring passed to the hyperparameter search
gridsearch = True                   # run the hyperparameter search instead of a plain fit
toy = True                          # forwarded to get_params and used to tag the output folder

In [None]:
# Show the currently selected feature set
feature_vector

In [None]:
# Show the current model name
model_name

In [None]:
# Look up and display the parameters returned by get_params for the current
# feature set and model (these are what the search cell passes to BayesSearchCV)
parameters = get_params(feature_vector,model_name=model_name, toy=toy)
parameters

In [None]:
# Predict on the test set with the current `pipeline`.
# NOTE(review): within this part of the notebook `pipeline` is only assigned in the training
# cell further down, so this cell appears to rely on that cell having been run first.
y_pred = pipeline.predict(X_test)

In [None]:
# TODO crossvalidation
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, cross_validate
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, cross_validate
sys.path.append('./../../concept-tracker/')
from concept_tracker.ordinal import OrdinalClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics


# Timestamped output folder for this run ('_toy' suffix when running in toy mode)
ts_i = datetime.datetime.utcnow().strftime('%y-%m-%dT%H-%M-%S')

if toy:
    output_dir_i = output_dir + f'results_{ts_i}_toy/'
else:
    output_dir_i = output_dir + f'results_{ts_i}/'
    
os.makedirs(output_dir_i, exist_ok=True)


results_all = []

# disease_class_progression = ["low", "medium", "high"]



# Train / validation / test splits for the selected feature set
X_train, y_train,X_val, y_val, X_test, y_test = get_splits(feature_vector)


# For tf-idf the vectorizer is fit on the training documents only and then applied to the test documents
if feature_vector == 'tfidf':
    X_train = vectorizer.fit_transform(X_train)
    X_test = vectorizer.transform(X_test)

random_state = 42

# Models
# NOTE(review): the SVC assigned first is immediately replaced by LGBMClassifier(),
# so the SVC is never used.


model = SVC(kernel='linear', class_weight="balanced", 
              probability=True 
             )
model = LGBMClassifier()
oc_model = OrdinalClassifier(model)



# svc_bal = SVC(kernel='rbf', class_weight="balanced", probability=True)
# svc_imb = SVC(kernel='linear', class_weight='None', probability=True)
# svc_bal = RandomForestClassifier(class_weight="balanced",)

# oc_imb = OrdinalClassifier(svc_imb)

# NOTE(review): both entries use the key 'LGBMClassifier', so the dict keeps only the
# second one (the plain `model`); the OrdinalClassifier entry is silently dropped.
models = {'LGBMClassifier':oc_model, 
          'LGBMClassifier':model}

# Parameter sets applied to ordinal models by the training loop (first pass, second pass)
oc_params = [{"reverse_classes": True}, {'reverse_classes': False}]

# scoring = ['f1_weighted', 'precision_weighted', 'recall_weighted', 'roc_auc']
# cv_results = cross_validate(model, X, y_t, cv=skf, scoring=scoring, return_train_score=False)

tests = []



# Train your model on X_train and y_train
# Evaluate your model on X_test and y_test

# for i, (train_index, test_index) in enumerate(skf.split(X, y_t)):
#     X_train, X_test = X[train_index], X[test_index]
#     y_train, y_test = [y_t[i] for i in train_index], [y_t[i] for i in test_index]



# X_train, X_test, y_train, y_test = train_test_split(X, y_t, shuffle=True, test_size=0.2, 
#                                                     # random_state=random_state, 
#                                                     stratify=y_t)

# Train, predict and evaluate every entry of `models`.
# The pipeline is built from the model NAME via get_pipelines(); the model object itself
# is only used for set_params() and for labelling the results.
ord_pass = 0
for model_name, model  in models.items():
    param ={}
    if "Ordinal" in model.__repr__():
        if ord_pass == 0:
            param = oc_params[ord_pass]  #first time through
            ord_pass +=1
        elif ord_pass==1:
            param = oc_params[ord_pass]  # second pass
    model.set_params(**param)

    pipeline = get_pipelines(feature_vector, model_name = model_name)
    print(pipeline)

    if gridsearch == True:
        parameters = get_params(feature_vector,model_name=model_name, toy=toy)
        
        pipeline = BayesSearchCV(pipeline, parameters, cv=5, scoring=scoring, return_train_score=False,
        n_iter=32)    
        # Safety check: stop if the label column leaked into the features
        if feature_vector != 'tfidf':
            if 'y' in X_train.columns:
                print("stopping: found label column 'y' in X_train")
                break
        pipeline.fit(X_train,y_train)
        best_params = pipeline.best_params_
        best_model = pipeline.best_estimator_
        if feature_vector != 'tfidf':
            if 'y' in X_test.columns:
                print("stopping: found label column 'y' in X_test")
                break
        y_pred = best_model.predict(X_test)
    else:
        pipeline.fit(X_train,y_train)
        best_params = 'No hyperparameter tuning'
        y_pred = pipeline.predict(X_test)
    
    # Predictions
    y_pred_df = pd.DataFrame(y_pred)
    y_pred_df.to_csv(output_dir_i+f'y_pred_{feature_vector}_{model_name}_gridsearch-{gridsearch}_{ts_i}.csv', index=False)

    # Performance
    print(model.__repr__())
    # The original format string had no placeholder, so `param` was never printed
    print("params are: {}".format(param))

    y_pred = pipeline.predict(X_test)

    # Metrics
    print(metrics.classification_report(y_test, y_pred))                # here we need to print to view correctly

    # Custom classification report (`average` is expected to be defined in an earlier cell)
    precision = metrics.precision_score(y_test, y_pred, average =average)
    recall = metrics.recall_score(y_test, y_pred, average =average)
    # specificity is the recall of the negative class or control group
    # NOTE(review): check whether pos_label has any effect when `average` is not 'binary'
    specificity = metrics.recall_score(y_test, y_pred, pos_label=0, average =average)
    f1 = metrics.f1_score(y_test, y_pred, average =average)

    # Here we would use all probabilities, not just Y=1; in binary we use y_proba_1
    # roc_auc = metrics.roc_auc_score(y_test, y_proba, average ='weighted', multi_class='ovr')

    results_dict = {
        'Model' : f"{feature_vector} {model_name}",
        'Precision':precision,
        'Recall':recall,
        'Specificity':specificity,    
        'F1':f1,
        # 'ROC AUC':roc_auc,
        }    
    # Row label: wrapped estimator and reverse_classes setting as reported by get_params()
    # (both are None when the model exposes no such parameters)
    model_name  = str(model.get_params().get('estimator'))+' reverse_classes='+str(model.get_params().get('reverse_classes'))
    results = pd.DataFrame(results_dict, index=[model_name]).round(3)
    display(results)
    results_all.append(results) 
    # TODO save output_dir_i


In [None]:
import seaborn as sns
# Small font so that all feature labels fit on the figure
sns.set(font_scale = 0.5)

# Hierarchically clustered heatmap of the Spearman correlations between training features
spearman_corr = X_train.corr(method='spearman')
sns.clustermap(spearman_corr)
plt.savefig('dendrogram_similarity_lexicon.png', dpi = 300)


### Train

In [None]:
# List the entries stored for the training split
dfs['train'].keys()

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
# import xgboost as xgb                          # If installation needed: conda install py-xgboost               
from sklearn.model_selection import ParameterGrid

In [None]:

# Pipelines avoid us making mistakes with fit, fit_transform and transform
# across train and test sets. They are also easier to read, i.e., they organize
# your code. 

# Configuration
crossvalidation_k = 5
scoring = 'neg_mean_squared_error'
verbose = 1                     # how much to print regarding model training



pipelines = {
    'Ridge': 
        Pipeline([
            ('scaler', StandardScaler()),
            ('estimator', Ridge(random_state = 1234))
            ]),
    # 'SVR': Pipeline([
    #     ('scaler', StandardScaler()),
    #     ('estimator', SVR(verbose=True))
    #     ]),
    }


paramater_grids = {    
    # use the name of the step in the pipeline for which you want to tune hyperparameters
    # use a double underscore to access the hyperparameter
    'Ridge' : {
        "estimator__alpha": [0.0001, 0.01, 1, 100],
        },

    'SGDRegressor' : {
        "estimator__penalty": ['l1', 'l2'],
        "estimator__alpha": [0.0001, 0.01, 1, 100],
        },
    'SVR' : {
        "estimator__kernel": ['linear', 'rbf'],
        "estimator__C": [0.01,0.1, 1, 10, 100],
        },

    }


# How many runs will this imply?
# Printed explicitly: a bare expression in the middle of a cell is not displayed.
model_name_i = 'SVR'
param_grid = paramater_grids.get(model_name_i)
print(len(ParameterGrid(param_grid)))


# Train models, evaluate, feature importance, and save outputs
# ============================================================
ts = datetime.datetime.utcnow().strftime('%y-%m-%dT%H-%M-%S')

os.makedirs(f'results_{ts}', exist_ok=True)

results_all = [] # to store results for all models
for model_name_i in pipelines.keys():
    pipeline_i = pipelines.get(model_name_i)
    params_i = paramater_grids.get(model_name_i)
    
    # Identity check: `== None` would go through the object's own __eq__
    if pipeline_i is None or params_i is None:
        print('did not find model configuration:', model_name_i)
        break
    
    model = GridSearchCV(pipeline_i ,
                      param_grid=params_i,
                      scoring=scoring ,
                      cv=crossvalidation_k, 
                      verbose=verbose, 
                      error_score='raise'
                      )                             # define gridsearch CV
    # You could try RandomizedSearchCV instead which will give you faster performance, maybe only slightly worse https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html
    
    model.fit(X_train, y_train)  # train model
    best_params= model.best_params_                   # best params
    print('best parameters: ', best_params)
    # NOTE(review): the file name has no model name, so each model overwrites the previous file
    pd.DataFrame(model.cv_results_).to_csv(f'results_{ts}/cv_results_all_{ts}.csv')
    
    # Performance evaluation
    # ====================================================================
    y_pred = model.predict(X_test)            

    # If it's a classification task, you can get:
        # If this were classification you could compute predict_proba() 
        # confusion matrix
        # print(metrics.classification_report(y_test, y_pred))                # here we need to print to view correctly
        # classification metrics
    
    y_pred_df = pd.DataFrame(y_pred)
    y_pred_df.to_csv(f'results_{ts}/{model_name_i}_{ts}.csv', index=False)
    # NOTE(review): the scatter plot goes to `output_dir`, the other outputs to results_{ts}/
    path = output_dir + f'scatter_{model_name_i}_{ts}'
    results =regression_report(y_test,y_pred,y_train=y_train,best_params=best_params,model_name=model_name_i, plot = True, save_fig_path = path, round_to = 2)
    results_all.append(results)
    

    # feature importance  
    # ============================================================
    # feature_importance = feature_importance_df(model, model_name_i,feature_names,  xgboost_method='weight')
    # if str(feature_importance) != 'None':       # I only implemented a few methods for a few models
    #     feature_importance.to_csv(f'results_{ts}/feature_importance_{model_name_i}_{ts}.csv')        
    


results_all = pd.concat(results_all)
results_all.to_csv(f'results_{ts}/results_{ts}.csv')        
results_all




In [None]:
# Show the combined results table
results_all

In [None]:
# Show the feature names of the training matrix
X_train.columns

In [None]:


# Feature importance of the fitted search object; show the first 50 rows.
# Calls generate_feature_importance_df (the function defined near the top of the notebook):
# the name `feature_importance_df` is bound to a DataFrame by the feature-importance
# table cell, so calling it would fail.
fi_df = generate_feature_importance_df(model, model_name_i, X_train.columns, xgboost_method = 'weight', model_name_in_pipeline = 'estimator')
fi_df.iloc[:50]



In [None]:


# Feature importance of the fitted search object; show the full table.
# Calls generate_feature_importance_df (the function defined near the top of the notebook):
# the name `feature_importance_df` is bound to a DataFrame by the feature-importance
# table cell, so calling it would fail.
fi_df = generate_feature_importance_df(model, model_name_i, X_train.columns, xgboost_method = 'weight', model_name_in_pipeline = 'estimator')
fi_df



# Deep learning embeddings

In [None]:
# Pull the 'X' and 'y' entries of each split out of `dfs`
X_train, X_val, X_test = (dfs[split]['X'] for split in ('train', 'val', 'test'))
y_train, y_val, y_test = (dfs[split]['y'] for split in ('train', 'val', 'test'))

from collections import Counter
Counter(y_train) # Make sure it's balanced

In [None]:
# !pip install -i sentence-transformers==2.2.2

### Encode 
TODO: Change max_seq_length to 500

In [None]:
# !pip install -U sentence-transformers 
from sentence_transformers import SentenceTransformer, util 

# Encode the documents with their sentence embeddings.
# Lists of pre-trained sentence transformers:
# https://www.sbert.net/docs/pretrained_models.html
# https://huggingface.co/models?library=sentence-transformers

# The progress bar shows how long it will take to embed the documents.

# all-MiniLM-L6-v2 is optimized for semantic similarity of paraphrases
embedding_model_name = 'all-MiniLM-L6-v2'
sentence_embedding_model = SentenceTransformer(embedding_model_name)       # load embedding

# TODO: Change max_seq_length to 500
# Note: sbert only uses a limited number of tokens because it is meant for sentences;
# print the current limit.
print(sentence_embedding_model.max_seq_length)
# You can increase it to be closer to the base model it was trained on (BERT has 512):
# sentence_embedding_model._first_module().max_seq_length = 500
# print(sentence_embedding_model .max_seq_length) # now it takes up to 500, but will be a bit slower to encode and might not change performance a whole lot in this case


In [None]:
# Use the 'embeddings_balanced' entries as the training and test feature matrices.
# NOTE(review): y_train / y_test are not reassigned in this cell - confirm that they are
# row-aligned with these matrices.
X_train = dfs['train']['embeddings_balanced']

X_test = dfs['test']['embeddings_balanced']


In [None]:

# Pipelines avoid us making mistakes with fit, fit_transform and transform
# across train and test sets. They are also easier to read, i.e., they organize
# your code. 

# Configuration
crossvalidation_k = 3
scoring = 'neg_mean_squared_error'
verbose = 1                     # how much to print regarding model training

ts = datetime.datetime.utcnow().strftime('%y-%m-%dT%H-%M-%S')
output_dir_i = f'results_embeddings_{ts}/'
os.makedirs(output_dir_i, exist_ok=True)

pipelines = {
    'Ridge': 
        Pipeline([
            ('scaler', StandardScaler()),
            ('estimator', Ridge(random_state = 1234))
            ]),
    # 'SVR': Pipeline([
    #     ('scaler', StandardScaler()),
    #     ('estimator', SVR(verbose=True))
    #     ]),
    }


paramater_grids = {    
    # use the name of the step in the pipeline for which you want to tune hyperparameters
    # use a double underscore to access the hyperparameter
    'Ridge' : {
        "estimator__alpha": [0.0001, 0.01, 1, 100],
        },

    'SGDRegressor' : {
        "estimator__penalty": ['l1', 'l2'],
        "estimator__alpha": [0.0001, 0.01, 1, 100],
        },
    'SVR' : {
        "estimator__kernel": ['linear', 'rbf'],
        "estimator__C": [0.01,0.1, 1, 10, 100],
        },

    }


# How many runs will this imply?
# Printed explicitly: a bare expression in the middle of a cell is not displayed.
model_name_i = 'SVR'
param_grid = paramater_grids.get(model_name_i)
print(len(ParameterGrid(param_grid)))


# Train models, evaluate, feature importance, and save outputs
# ============================================================

results_all = [] # to store results for all models
for model_name_i in pipelines.keys():
    pipeline_i = pipelines.get(model_name_i)
    params_i = paramater_grids.get(model_name_i)
    
    # Identity check: `== None` would go through the object's own __eq__
    if pipeline_i is None or params_i is None:
        print('did not find model configuration:', model_name_i)
        break
    
    model = GridSearchCV(pipeline_i ,
                      param_grid=params_i,
                      scoring=scoring ,
                      cv=crossvalidation_k, 
                      verbose=verbose, 
                      error_score='raise'
                      )                             # define gridsearch CV
    # You could try RandomizedSearchCV instead which will give you faster performance, maybe only slightly worse https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html
    
    model.fit(X_train, y_train)  # train model
    best_params= model.best_params_                   # best params
    print('best parameters: ', best_params)
    # NOTE(review): the file name has no model name, so each model overwrites the previous file
    pd.DataFrame(model.cv_results_).to_csv(f'results_embeddings_{ts}/cv_results_all_{ts}.csv')
    
    # Performance evaluation
    # ====================================================================
    y_pred = model.predict(X_test)            

    y_pred_df = pd.DataFrame(y_pred)
    y_pred_df.to_csv(f'{output_dir_i}/{model_name_i}_{ts}.csv', index=False)
    path = output_dir_i + f'scatter_{model_name_i}_{ts}'
    # Report under the name of the model trained in this iteration (model_name_i);
    # `model_name` is a leftover variable from other cells.
    results =regression_report(y_test,y_pred,y_train=y_train,best_params=best_params,model_name=model_name_i, plot = True, save_fig_path = path, round_to = 2)
    results_all.append(results)
    

    # feature importance  
    # ============================================================
    # feature_importance = feature_importance_df(model, model_name_i,feature_names,  xgboost_method='weight')
    # if str(feature_importance) != 'None':       # I only implemented a few methods for a few models
    #     feature_importance.to_csv(f'results_{ts}/feature_importance_{model_name_i}_{ts}.csv')        
    


results_all = pd.concat(results_all)
# Plain string concatenation: `{output_dir_i}` was a set literal, and set + str raises TypeError
results_all.to_csv(output_dir_i+f'results_{ts}.csv')        
results_all



    

In [None]:
# Show the results of the most recently evaluated model
results

In [None]:
# Show the combined results table
results_all