## Split training and test set
This notebook split the dataset in training set and test set.

The input is taken from raw/inquiry_lessons.

The output is exported to data/tables

### Import and path definition

In [1]:
import os
import pickle
import pandas as pd
import sys
import random

root_path = os.path.dirname(os.path.abspath(os.getcwd()))
# to import src is necessary to append the root_path to the path
sys.path.append(root_path)

from collections import Counter
from src import topic_preprocessing as topic_pp
from src import utterance_proccesing as up

raw_path = os.path.join(root_path,'raw')
data_path = os.path.join(root_path,'data')
pickle_path = os.path.join(raw_path,'pickles')
results_path = os.path.join(root_path,'results')
transcriptions_path = os.path.join(raw_path,'inquiry_lessons')
tables_path = os.path.join(data_path,'tables')

In [2]:
import importlib
importlib.reload(up)

<module 'src.utterance_proccesing' from 'C:\\Users\\CATALINA ESPINOZA\\Documents\\ciae\\Classification_IBL\\src\\utterance_proccesing.py'>

### Define global variables

In [3]:
WITH_STEMMING = True
REMOVE_STOPWORDS = True
MINIMUM_WORDS_PER_PHRASE = 0
GROUP = -1
SEED = 10
NUM_TOPICS = 60
VERSION = 2
random.seed(SEED)

### Load excels

In [4]:
lessons = []
phases = []
utterances_percentage = []
group = []
groups = []
groups_dict = {}
if GROUP < 0:
    for a_lesson in os.listdir(transcriptions_path):
        if a_lesson.startswith('~'):
            continue
        groups_dict[a_lesson] = {}
        a_path = os.path.join(transcriptions_path,a_lesson)
        df = pd.read_excel(a_path)
        try:
            lessons.append(df['Utterance'].values)
            phases.append(df['Phase'].values)       
            ut_numbers = list(df.index)
            ut_order = list(map(lambda x:x*1.0/len(ut_numbers),ut_numbers))
            utterances_percentage.append(ut_order)
            group.append(a_lesson)
            groups+= [a_lesson for i in df['Phase'].values]
            groups_dict[a_lesson]['phases'] = df['Phase'].values
            groups_dict[a_lesson]['utterances'] = [v for v in df['Utterance'].values if v==v]
            groups_dict[a_lesson]['ut_order'] = ut_order
            print("Loaded {}".format(a_lesson))
        except:
            lessons.append(df['Unnamed: 7'].values)
            phases.append(df['Phase'].values)
            ut_numbers = list(df.index)
            utterances_percentage.append(list(map(lambda x:x*1.0/len(ut_numbers),ut_numbers)))
            group.append(a_lesson)
            print("Loaded {}".format(a_lesson))
else:
    print("Error {}".format(a_lesson))

Loaded Group12_2016_No_scaffold.xlsx
Loaded Group2_2017_Scaffold1.xlsx
Loaded Group3C_2017_Scaffold2.xlsx
Loaded Group3_2016_No_scaffold.xlsx
Loaded Group3_2017_Scaffold2.xlsx
Loaded Group4B_2017_Scaffold2.xlsx
Loaded Group4_2017_Scaffold1.xlsx
Loaded Group5_2016_No_scaffold.xlsx
Loaded Group5_2017_Scaffold2.xlsx
Loaded Group6A_2017_Scaffold1.xlsx
Loaded Group9_2016_No_scaffold.xlsx


In [5]:
len(groups_dict)

11

In [6]:
test_set_length = 2
test_set = random.sample(groups_dict.keys(),test_set_length)
print(test_set)

['Group6A_2017_Scaffold1.xlsx', 'Group12_2016_No_scaffold.xlsx']


In [7]:
train_set = []
for i in groups_dict.keys():
    if i in test_set:
        continue
    train_set.append(i)
print(len(train_set))

9


In [8]:
train_set

['Group2_2017_Scaffold1.xlsx',
 'Group3C_2017_Scaffold2.xlsx',
 'Group3_2016_No_scaffold.xlsx',
 'Group3_2017_Scaffold2.xlsx',
 'Group4B_2017_Scaffold2.xlsx',
 'Group4_2017_Scaffold1.xlsx',
 'Group5_2016_No_scaffold.xlsx',
 'Group5_2017_Scaffold2.xlsx',
 'Group9_2016_No_scaffold.xlsx']

### Preprocess data

In [9]:
for g in groups_dict:
    clean_utterances = [topic_pp.finnish_preprocessing(x) for x in groups_dict[g]['utterances']]
    clean_utterances = [topic_pp.preprocessing(x,REMOVE_STOPWORDS,WITH_STEMMING) for x in clean_utterances]
    groups_dict[g]['clean_utterances'] = clean_utterances

In [10]:
words = []
for g in groups_dict:
    for element in groups_dict[g]['clean_utterances']:
        for word in element:
            words.append(word)
dict_with_fq = dict(Counter(words))

In [11]:
len(groups_dict.keys())

11

In [12]:
tag_unfrequent = ''
for g in groups_dict:
    try:
        utterances = list(groups_dict[g]['clean_utterances'])
        copy_phases = list(groups_dict[g]['phases'])
        copy_ut_order = list(groups_dict[g]['ut_order'])
        groups_dict[g]['clean_utterances'] = []
        groups_dict[g]['ut_order'] = []
        groups_dict[g]['phases'] = []
        for i,phrase in enumerate(utterances):
            a_phrase = []
            for w in phrase:
                aux_w = w
                if dict_with_fq[w]<=1:
                    if tag_unfrequent == '':
                        continue
                    else:
                        aux_w = tag_unfrequent
                a_phrase.append(aux_w)
            if a_phrase != [] or len(a_phrase) != a_phrase.count(''):
                groups_dict[g]['clean_utterances'].append(a_phrase)
                groups_dict[g]['ut_order'].append(copy_ut_order[i])
                groups_dict[g]['phases'].append(copy_phases[i])
    except:
        print(g)

### Load lda model

In [13]:
a_name = 'lda_textbooks_chunksize_alpha_auto_v{}_seed_{}_{}_{}_{}.pickle'.format(VERSION,SEED,NUM_TOPICS,REMOVE_STOPWORDS,WITH_STEMMING)
model_file = os.path.join(results_path,'lda_models',a_name)
with open(model_file,'rb') as f:
    ldamodel = pickle.load(f)



In [14]:
dict_file = os.path.join(pickle_path,'dictionary_v{}_seed_{}_{}_{}_{}.pickle'.format(VERSION,SEED,NUM_TOPICS,REMOVE_STOPWORDS,WITH_STEMMING))
with open(dict_file,'rb') as f:
    dictionary = pickle.load(f)
print("Dictionary length removing unfrequent words: {}".format(len(dictionary)))

Dictionary length removing unfrequent words: 12960


In [15]:
def get_topic_distribution_phrase(a_phrase):
    bow = dictionary.doc2bow(a_phrase)
    T = ldamodel.get_document_topics(bow,minimum_probability=0,minimum_phi_value=0.001)
    return [x[1] for x in T]

#### Description training and test set

training set

In [16]:
the_keys = range(1,6)
phases_n = [0 for n in range(len(the_keys))]
for g in groups_dict:
    if g in train_set:
        for k in the_keys:
            n = groups_dict[g]['phases'].count(k)
            phases_n[k-1]+= n
for k in the_keys:
    print("key {}, total {}".format(k,phases_n[k-1]))

key 1, total 403
key 2, total 175
key 3, total 406
key 4, total 62
key 5, total 554


In [17]:
403+175+407+62+555

1602

test set

In [18]:
the_keys = range(1,6)
phases_n = [0 for n in range(len(the_keys))]
for g in groups_dict:
    if g in test_set:
        for k in the_keys:
            n = groups_dict[g]['phases'].count(k)
            phases_n[k-1]+= n
for k in the_keys:
    print("key {}, total {}".format(k,phases_n[k-1]))

key 1, total 72
key 2, total 33
key 3, total 70
key 4, total 8
key 5, total 98


### Build table with utterances, topics distribution and phases

#### Build table with train_set

In [18]:
df = up.build_simple_df(groups_dict,train_set,ldamodel,dictionary)
file_name = '[train]IBL_topic_distribution_by_utterance_minimum_5_words_with_stemming_{}_{}.xlsx'.format(WITH_STEMMING,NUM_TOPICS)
df.to_excel(os.path.join(tables_path,'train',file_name))

In [22]:
for group in train_set:
    df = up.build_simplest_df(groups_dict,[group],ldamodel,dictionary)
    file_name = '[train]{}_{}_{}.xlsx'.format(group,WITH_STEMMING,NUM_TOPICS)
    df.to_excel(os.path.join(tables_path,'fv_62_removing_more_stopwords','train',file_name))

#### Build table with test_set

In [19]:
df = up.build_simple_df(groups_dict,[test_set[0]],ldamodel,dictionary)
file_name = '[test1]IBL_topic_distribution_by_utterance_minimum_5_words_with_stemming_{}_{}.xlsx'.format(WITH_STEMMING,NUM_TOPICS)
df.to_excel(os.path.join(tables_path,'test',file_name))

In [20]:
df = up.build_simple_df(groups_dict,[test_set[1]],ldamodel,dictionary)
file_name = '[test2]IBL_topic_distribution_by_utterance_minimum_5_words_with_stemming_{}_{}.xlsx'.format(WITH_STEMMING,NUM_TOPICS)
df.to_excel(os.path.join(tables_path,'test',file_name))

In [21]:
for group in test_set:
    df = up.build_simplest_df(groups_dict,[group],ldamodel,dictionary)
    file_name = '[test]{}_{}_{}.xlsx'.format(group,WITH_STEMMING,NUM_TOPICS)
    df.to_excel(os.path.join(tables_path,'fv_62_removing_more_stopwords','test',file_name))

### Build excel with before and after

#### Training set

In [21]:
df = up.build_next_utterances_df(groups_dict,train_set,ldamodel,dictionary)
file_name = '[train]IBL_topic_distribution_by_utterance_with_phrase_before_and_after_time_utterance_minimum_0_words_with_stemming_{}_{}.xlsx'.format(WITH_STEMMING,NUM_TOPICS)
df.to_excel(os.path.join(tables_path,'train',file_name))

In [19]:
for group in train_set:
    df = up.build_simplest_next_utterances_df(groups_dict,[group],ldamodel,dictionary)
    file_name = '[train]{}_{}_{}.xlsx'.format(group,WITH_STEMMING,NUM_TOPICS)
    df.to_excel(os.path.join(tables_path,'fv_182_removing_more_stopwords','train',file_name))

#### Test set

In [22]:
df = up.build_next_utterances_df(groups_dict,[test_set[0]],ldamodel,dictionary)
file_name = '[test1]IBL_topic_distribution_by_utterance_before_after_{}_{}.xlsx'.format(WITH_STEMMING,NUM_TOPICS)
df.to_excel(os.path.join(tables_path,'test',file_name))

In [23]:
df = up.build_next_utterances_df(groups_dict,[test_set[1]],ldamodel,dictionary)
file_name = '[test2]IBL_topic_distribution_by_utterance_before_after_{}_{}.xlsx'.format(WITH_STEMMING,NUM_TOPICS)
df.to_excel(os.path.join(tables_path,'test',file_name))

In [20]:
for group in test_set:
    df = up.build_simplest_next_utterances_df(groups_dict,[group],ldamodel,dictionary)
    file_name = '[test]{}_{}_{}.xlsx'.format(group,WITH_STEMMING,NUM_TOPICS)
    df.to_excel(os.path.join(tables_path,'fv_182_removing_more_stopwords','test',file_name))

### Build excel with co-occurrences

#### Train set

In [29]:
df = up.build_co_occurrence_df(groups_dict,train_set,ldamodel,dictionary)
file_name = '[train]IBL_topic_distribution_by_utterance_with_co_occurrence_time_utterance_minimum_0_words_with_stemming_{}_{}.xlsx'.format(WITH_STEMMING,NUM_TOPICS)
df.to_excel(os.path.join(tables_path,'train',file_name))

#### Test set

In [30]:
df = up.build_co_occurrence_df(groups_dict,[test_set[0]],ldamodel,dictionary)
file_name = '[test1]IBL_topic_distribution_by_utterance_with_co_occurrence_time_utterance_minimum_0_words_with_stemming_{}_{}.xlsx'.format(WITH_STEMMING,NUM_TOPICS)
df.to_excel(os.path.join(tables_path,'test',file_name))

In [31]:
df = up.build_co_occurrence_df(groups_dict,[test_set[1]],ldamodel,dictionary)
file_name = '[test2]IBL_topic_distribution_by_utterance_with_co_occurrence_time_utterance_minimum_0_words_with_stemming_{}_{}.xlsx'.format(WITH_STEMMING,NUM_TOPICS)
df.to_excel(os.path.join(tables_path,'test',file_name))

### Build excel with windows

#### Train set

In [None]:
df = up.build_windows_df(groups_dict,train_set,ldamodel,dictionary,3)
file_name = '[train]IBL_topic_distribution_by_utterance_windows_time_utterance_minimum_0_words_with_stemming_{}_{}.xlsx'.format(WITH_STEMMING,NUM_TOPICS)
df.to_excel(os.path.join(tables_path,'train',file_name))

#### Test set

In [None]:
df = up.build_windows_df(groups_dict,[test_set[0]],ldamodel,dictionary)
file_name = '[test1]IBL_topic_distribution_by_utterance_with_co_occurrence_time_utterance_minimum_0_words_with_stemming_{}_{}.xlsx'.format(WITH_STEMMING,NUM_TOPICS)
df.to_excel(os.path.join(tables_path,'test',file_name))

In [None]:
df = up.build_windows_df(groups_dict,train_set,ldamodel,dictionary,3)
file_name = '[train]IBL_topic_distribution_by_utterance_windows_time_utterance_minimum_0_words_with_stemming_{}_{}.xlsx'.format(WITH_STEMMING,NUM_TOPICS)
df.to_excel(os.path.join(tables_path,'train',file_name))