# __Debugging Scores__

In [102]:
import itertools
import os
import pickle
from operator import itemgetter
from pathlib import Path
from collections import Counter, OrderedDict, defaultdict
import pandas as pd
from tqdm import tqdm as tqdm
import math
import numpy as np
import global_options
from culture import culture_dictionary, file_util

In [3]:
# Load the expanded culture dictionary from CSV.
# culture_dict is iterated later as {dimension name: words}; all_dict_words is
# presumably the combined collection of words across dimensions -- verify
# against culture_dictionary.read_dict_from_csv.
current_dict_path = "outputs/dict/expanded_dict.csv"
culture_dict, all_dict_words = culture_dictionary.read_dict_from_csv(current_dict_path)

Importing dict: outputs/dict/expanded_dict.csv
Number of words in integrity dimension: 366
Number of words in teamwork dimension: 460
Number of words in innovation dimension: 287
Number of words in respect dimension: 232
Number of words in quality dimension: 202


In [8]:
len(all_dict_words)

1547

In [9]:
# NOTE(review): word_sim_weights is not used by any cell shown in this notebook;
# presumably per-word weights for a similarity-weighted scoring variant -- confirm.
word_sim_weights = culture_dictionary.compute_word_sim_weights(current_dict_path)

## __Pre Score : Construct Doc Level Corpus__

In [14]:
def construct_doc_level_corpus(sent_corpus_file, sent_id_file):
    """Construct a document-level corpus from a sentence-level corpus and write it to disk.

    Sentences are grouped by document ID (the part of each sentence ID before the
    first "_") and concatenated in file order. Every sentence is preceded by a
    single space, so each document string starts with a leading space.

    Dumps "corpus_doc_level.pickle" and "doc_ids.pickle" to
    Path(global_options.OUTPUT_FOLDER, "scores", "temp"), creating that folder
    if it does not exist yet.

    Arguments:
        sent_corpus_file {str or Path} -- The sentence corpus after parsing and cleaning, each line is a sentence
        sent_id_file {str or Path} -- The sentence ID file; line i is the ID (docID_sentenceID) of line i in sent_corpus_file

    Returns:
        [str], [str], int -- a tuple of a list of documents, a list of document IDs, and the number of documents

    Raises:
        AssertionError -- if the two input files do not have the same number of lines
    """
    print("Constructing doc level corpus")
    # sentence level corpus and the matching sentence IDs
    sent_corpus = file_util.file_to_list(sent_corpus_file)
    sent_IDs = file_util.file_to_list(sent_id_file)
    assert len(sent_IDs) == len(sent_corpus)

    # Collect the sentences of each document in a list and join once at the end:
    # repeated "+=" on a growing string is quadratic in the document length.
    doc_sentences = defaultdict(list)
    for sent_id, sentence in zip(sent_IDs, sent_corpus):
        doc_id = sent_id.split("_")[0]
        doc_sentences[doc_id].append(sentence)

    # create doc level corpus; dict insertion order keeps documents in first-seen order
    doc_ids = list(doc_sentences.keys())
    corpus = [
        "".join(" " + sentence for sentence in sentences)
        for sentences in doc_sentences.values()
    ]
    assert len(corpus) == len(doc_ids)

    # Make sure the output folder exists before writing the pickles.
    out_dir = Path(global_options.OUTPUT_FOLDER, "scores", "temp")
    out_dir.mkdir(parents=True, exist_ok=True)
    with open(out_dir / "corpus_doc_level.pickle", "wb") as out_f:
        pickle.dump(corpus, out_f)
    with open(out_dir / "doc_ids.pickle", "wb") as out_f:
        pickle.dump(doc_ids, out_f)

    N_doc = len(corpus)
    return corpus, doc_ids, N_doc

__Toma el texto procesado (trigramas), lo carga y une las oraciones de cada documento__

In [28]:
# Define the input path in this cell: it was otherwise only assigned in a later
# cell, so this line raised NameError on a fresh kernel (Restart & Run All).
sent_corpus_file = "data/processed/trigram/documents.txt"
sent_corpus = file_util.file_to_list(sent_corpus_file)

In [29]:
sent_corpus[:10]

['thank_sir',
 'lady_gentleman time will begin_question_answer_session',
 'operator_instruction',
 '[ner:ordinal]_question_come [ner:person] [ner:organization]',
 'please_go_ahead',
 'good_morning_guy',
 'fabulous_quarter',
 'thanks [ner:person]',
 'difference increase revenue increase margin [ner:ordinal]_party increase cost',
 'tell [ner:date] cost_increase']

In [30]:
# Inputs for construct_doc_level_corpus: the sentence-level corpus and the
# sentence ID file whose lines correspond one-to-one with the corpus lines.
sent_corpus_file = "data/processed/trigram/documents.txt"
sent_id_file = "data/processed/parsed/document_sent_ids.txt"

In [31]:
corpus, doc_ids, N_doc = construct_doc_level_corpus(sent_corpus_file,sent_id_file)

Constructing doc level corpus


In [32]:
N_doc

1393

In [35]:
corpus[0][:200]

' thank_sir lady_gentleman time will begin_question_answer_session operator_instruction [ner:ordinal]_question_come [ner:person] [ner:organization] please_go_ahead good_morning_guy fabulous_quarter tha'

## __Calculate DF: crea un diccionario con el número de documentos en los que aparece cada palabra__

In [36]:
def calculate_df(corpus):
    """Calculate and dump a document-frequency dict for all the words.

    A word is counted at most once per document, no matter how many times it
    occurs in it. The result is pickled to "doc_freq.pickle" under
    Path(global_options.OUTPUT_FOLDER, "scores", "temp").

    Arguments:
        corpus {[str]} -- a list of documents (whitespace-separated tokens)

    Returns:
        {defaultdict[str: int]} -- number of documents each word appears in
    """
    print("Calculating document frequencies.")
    df_dict = defaultdict(int)
    for document in tqdm(corpus):
        # set() collapses repeated tokens so each document contributes 1 per word
        for token in set(document.split()):
            df_dict[token] += 1

    # save df dict
    doc_freq_path = Path(
        global_options.OUTPUT_FOLDER, "scores", "temp", "doc_freq.pickle"
    )
    with open(doc_freq_path, "wb") as out_f:
        pickle.dump(df_dict, out_f)
    return df_dict

In [37]:
word_doc_freq = calculate_df(corpus)

Calculating document frequencies.


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1393/1393 [00:00<00:00, 2084.46it/s]


In [41]:
word_doc_freq.get("back")

1241

In [45]:
len(word_doc_freq.keys())

82162

# __Computing using TF-IDF__

In [105]:
doc = corpus[0]

In [106]:
# contribution: per-word TF-IDF weight divided by document length, accumulated
# by the scoring loop further down.
# results: list that collects one row of dimension scores per document.
contribution = defaultdict(int)
results = []

In [107]:
# Tokenise the document on whitespace; dimension_count will hold one
# accumulated score per culture dimension.
document = doc.split()
dimension_count = OrderedDict()

In [108]:
# Alias for the loaded culture dictionary (same object, not a copy).
# NOTE(review): the name presumably mirrors a parameter of the library scoring
# function being debugged -- confirm.
expanded_words = culture_dict

In [109]:
# Start every culture dimension at a score of zero (keys keep dictionary order).
dimension_count.update(dict.fromkeys(expanded_words, 0))
# Term frequencies of the current document: token -> number of occurrences.
c = Counter(document)

In [110]:
dimension_count

OrderedDict([('integrity', 0),
             ('teamwork', 0),
             ('innovation', 0),
             ('respect', 0),
             ('quality', 0)])

In [111]:
c

Counter({'thank_sir': 1,
         'lady_gentleman': 1,
         'time': 12,
         'will': 34,
         'begin_question_answer_session': 1,
         'operator_instruction': 2,
         '[ner:ordinal]_question_come': 1,
         '[ner:person]': 41,
         '[ner:organization]': 8,
         'please_go_ahead': 10,
         'good_morning_guy': 3,
         'fabulous_quarter': 1,
         'thanks': 9,
         'difference': 2,
         'increase': 19,
         'revenue': 3,
         'margin': 14,
         '[ner:ordinal]_party': 1,
         'cost': 6,
         'tell': 2,
         '[ner:date]': 141,
         'cost_increase': 1,
         'want': 6,
         'handle': 2,
         'sure': 2,
         'think': 60,
         'table': 1,
         'sorry': 1,
         '[ner:location]': 44,
         'come': 6,
         'listen': 1,
         'go': 40,
         'try': 5,
         'give': 13,
         'religion': 1,
         '[ner:money]': 44,
         '[ner:duration]': 40,
         'revenue_per_[ner:d

In [112]:
# Alias for the document-frequency dict returned by calculate_df
# (word -> number of documents containing it).
df_dict = word_doc_freq

In [113]:
len(document)

2637

In [114]:
# Accumulate the TF-IDF weight of every dictionary word found in the document.
for token, term_count in c.items():  # (word, occurrences in this document)
    for dimension, words in expanded_words.items():  # (dimension, its words)
        if token not in words:
            continue  # the word does not belong to this dimension
        # w_ij = tf * idf, where
        #   term_count     = times the word appears in this document
        #   N_doc          = total number of documents
        #   df_dict[token] = number of documents that contain the word
        w_ij = term_count * math.log(N_doc / df_dict[token])
        # Add the weight to the running score of the dimension.
        dimension_count[dimension] += w_ij
        # Per-word contribution, normalised by the document length in tokens.
        contribution[token] += w_ij / len(document)

# Order the dimensions alphabetically by name.
dimension_count = OrderedDict(sorted(dimension_count.items(), key=itemgetter(0)))

In [115]:
dimension_count

OrderedDict([('innovation', 30.205946679520117),
             ('integrity', 12.241819641474017),
             ('quality', 42.91781604784668),
             ('respect', 20.447753453549858),
             ('teamwork', 27.798470991302864)])

In [116]:
# One row per document: the score of each dimension followed by the document length.
result = [*dimension_count.values(), len(document)]
# Build the array directly from the row. Appending to `results` and then
# rebinding it from list to ndarray made this cell fail with AttributeError
# when it was run a second time.
results = np.array([result])

In [117]:
results

array([[  30.20594668,   12.24181964,   42.91781605,   20.44775345,
          27.79847099, 2637.        ]])

In [120]:
# Columns: dimension names in alphabetical order (the order dimension_count was
# sorted into), followed by the document length.
score_columns = sorted(expanded_words.keys()) + ["document_length"]
df = pd.DataFrame(results, columns=score_columns)
# Placeholder ID; this debugging run scores a single document.
df["Doc_ID"] = "Document_ID"

In [121]:
df

Unnamed: 0,innovation,integrity,quality,respect,teamwork,document_length,Doc_ID
0,30.205947,12.24182,42.917816,20.447753,27.798471,2637.0,Document_ID
