In [1]:
%matplotlib inline

import pandas as pd
import numpy as np 

from os.path import basename, exists, isdir, isfile, join, splitext
from os import chdir

# Folder the ranking files are read from and the category tables are written to #
BASE_DIR = join('..', 'analyses', 'single')

# Incidence levels; used as the suffix of the ranking column names #
# (e.g. 'bm25_1000', 'qld_10') #
TOP_INCIDENCE = '10'
GLOBAL_INCIDENCE = '1000'

# Symbols written into the category tables, one per exposure bucket #
LOW_EXP = '-'
MED_EXP_L = '▼'
MED_EXP_H = '▲'
HIGH_EXP = '+'
# Not referenced by the visible cells; presumably marks an oscillating #
# question -- TODO confirm #
OSC = 'x'

In [2]:
# Question rankings of the Positive and Control groups (eRisk and CLPsych). #
# Every file is tab-separated and its first line is the column header. #
(
    clpsych_positive_rankings,
    erisk_positive_rankings,
    clpsych_control_rankings,
    erisk_control_rankings,
) = (
    pd.read_csv(join(BASE_DIR, ranking_file), sep='\t', header=0)
    for ranking_file in (
        'ranking_clpsych15.tsv',       # Positive group, CLPsych
        'ranking_eRisk18.tsv',         # Positive group, eRisk
        'ranking_clpsych15_ctrl.tsv',  # Control group, CLPsych
        # Control group, eRisk.
        # NOTE(review): unlike the CLPsych control file this name has no
        # '_ctrl' suffix and differs only by year -- confirm it is intended.
        'ranking_eRisk19.tsv',
    )
)

In [4]:
def categorise(ranking):
    """Group question ids by the position they occupy in a ranking.

    Parameters
    ----------
    ranking : pandas.Series
        Question ids listed in ranked order. The Series index is used as the
        zero-based rank, so the entry at index 0 is at position 1. Ids are
        used to index a 22-slot array, so they must lie in 0..21.

    Returns
    -------
    list of list of int
        Four lists of question ids, in this order: positions 1-5 (high),
        6-10 (moderate-high), 11-15 (moderate-low) and 16-21 (low). Within
        each list the ids are in ascending order.

    Raises
    ------
    Exception
        If any question id in 1..21 ends up with a position outside 1..21,
        e.g. because it does not appear in ``ranking``.
    """
    question_ranking_positions = np.zeros(22, dtype=np.uint16)
    # Series.items() replaces Series.iteritems(), which was removed in pandas 2.0 #
    for index, qid in ranking.items():
        # The index starts from zero, positions start from one #
        question_ranking_positions[qid] = index + 1

    # Half-open [lower, upper) position ranges: high, moderate-high, #
    # moderate-low, low #
    bounds = [(1, 6), (6, 11), (11, 16), (16, 22)]

    exposition = [[] for _ in bounds]
    # Slot 0 is skipped: question ids start at 1 #
    for qid, q_pos in enumerate(question_ranking_positions[1:], start=1):
        for bucket, (lower, upper) in zip(exposition, bounds):
            if lower <= q_pos < upper:
                bucket.append(qid)
                break
        else:
            # This should never happen, otherwise there is something wrong #
            raise Exception(
                'Something went wrong! Question %d has position %d' % (qid, q_pos)
            )
    return exposition

def fillup_table(qbc, category_table_col):
    """Write the exposure symbol of every question into a table column.

    ``qbc`` holds four collections of question ids, indexed 0..3, which are
    marked with HIGH_EXP, MED_EXP_H, MED_EXP_L and LOW_EXP respectively.
    ``category_table_col`` is indexed by question id and modified in place;
    nothing is returned.
    """
    symbols = (HIGH_EXP, MED_EXP_H, MED_EXP_L, LOW_EXP)
    for bucket_idx, symbol in enumerate(symbols):
        for qid in qbc[bucket_idx]:
            category_table_col[qid] = symbol

In [5]:
# Iterate over the Questions Rankings and categorise each question #
# retrieval models = ['bm25_10', 'bm25_100', 'bm25_1000', 'qld_10', 'qld_100', 'qld_1000']

# Select the desired granularity level (global vs. top) #
# [GLOBAL_INCIDENCE, TOP_INCIDENCE] #
hits = GLOBAL_INCIDENCE

bm25 = 'bm25_%s' % hits
qld = 'qld_%s' % hits

# One row per question id and one column per (collection, model) pair. #
# Row 0 is never filled because question ids start at 1. #
# The one-character string dtype is spelled out as 'U1' because the #
# np.unicode_ alias was removed in NumPy 2.0; every symbol is one character. #
category_table_positive = np.full([22, 4], '', dtype='U1')
category_table_control = np.full([22, 4], '', dtype='U1')

# Positive group first, then Control group. #
# Column order: eRisk/BM25, eRisk/QLD, CLPsych/BM25, CLPsych/QLD #
for category_table, erisk_rankings, clpsych_rankings in (
    (category_table_positive, erisk_positive_rankings, clpsych_positive_rankings),
    (category_table_control, erisk_control_rankings, clpsych_control_rankings),
):
    column_sources = (
        (erisk_rankings, bm25),
        (erisk_rankings, qld),
        (clpsych_rankings, bm25),
        (clpsych_rankings, qld),
    )
    for col, (rankings, model) in enumerate(column_sources):
        questions_by_category = categorise(rankings.loc[:, model])
        fillup_table(questions_by_category, category_table[:, col])

In [6]:
# Name the output files after the granularity level selected in `hits` #
exp = 'global' if hits == GLOBAL_INCIDENCE else 'top'

# UTF-8 is requested explicitly so that writing the '▲' / '▼' symbols does #
# not depend on the platform's default text encoding #
filepath = join(BASE_DIR, 'category_table_%s_exp_positive.tsv' % exp)
np.savetxt(filepath, category_table_positive, fmt='%s', delimiter='\t', encoding='utf-8')

# The control file must be written from the control table (the positive #
# table used to be saved here a second time) #
filepath = join(BASE_DIR, 'category_table_%s_exp_control.tsv' % exp)
np.savetxt(filepath, category_table_control, fmt='%s', delimiter='\t', encoding='utf-8')

# -- The output produced will have the following format -- #

# +	▼	+	-
# ▲	▼	▲	▲
# ▲	-	▲	▼
# ...

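# Columns are: BM25_R, QLD_R, BM25_T, QLD_T, where "R" corresponds to Reddit (eRisk)
# and "T" to Twitter (CLPsych). Row i holds question i; row 0 is never filled, so the
# first line of each file contains only empty fields.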