# Preparación y curación de los datos

---
## Convenciones

### Dataframes principales

`msg_df` dataframe de mensajes

`md_df` dataframe de metadata

`agg_msg_df` dataframe de mensajes totalizado

`res_df` dataframe resultante del merge del dataframe de metadata y el dataframe de mensajes totalizado

### Prefijos de columnas nuevas

`n_` cantidad count

`n_words` cantidad de palabras

`avg_` promedio

---


## Imports

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math

from ast import literal_eval
from collections import Counter

import re

from type_to_fix import T2F

## Columnas de la metadata a incluir en el resultado

**Nota:**
*Incluidas en la notebook para que sean facilmente "comentables"*

In [2]:
C2K = list()
C2K.append('session_id')
C2K.append('tutor_id')
C2K.append('student_id')
C2K.append('timestamp')
C2K.append('feedback_score')
C2K.append('wait_time')
C2K.append('ended_by_reason')
C2K.append('session_category')
C2K.append('consolidated_session_category')
C2K.append('student_complained')
C2K.append('student_complaint_clarity')
C2K.append('student_complaint_speed')
C2K.append('student_complaint_subject')
C2K.append('student_complaint_other')
C2K.append('session_tag_cheating')
C2K.append('session_tag_inappropriate')
C2K.append('session_tag_other_subject')
C2K.append('session_tag_no_material')
C2K.append('session_tag_student_left')
C2K.append('session_tag_student_not_engaging')
C2K.append('session_tag_used_whiteboard')
C2K.append('student_rating')
C2K.append('length_of_session')
C2K.append('avg_tutor_response_time')
C2K.append('max_tutor_response_time')
C2K.append('subject')
C2K.append('subtopic')
# C2K.append('rubric_version')
C2K.append('tutor_gender')
C2K.append('tutor_international_name')
C2K.append('tutor_fired')
C2K.append('tutor_math_exam_score')
C2K.append('tutor_physics_exam_score')
C2K.append('tutor_chemistry_exam_score')
C2K.append('tutor_last_sign_in_country')
C2K.append('tutor_age')
#C2K.append('student_platform')
#C2K.append('student_app_version')
#C2K.append('student_transactions_amount')


## Carga de archivos CSV de origen

**Nota:**
*Descomentar el archivo de mensajes necesario*

In [3]:
data_dir = os.path.join('..', 'dataset')

# dataset de mensajes
msg_data_file_name = 'dev_yup_messages_preprocessed.csv'
#msg_data_file_name = 'train_yup_messages_preprocessed.csv'
#msg_data_file_name = 'test_yup_messages_preprocessed.csv'
#msg_data_file_name = 'yup_messages_preprocessed.csv'

msg_full_data_file_name = os.path.join(data_dir, msg_data_file_name)
msg_df = pd.read_csv(msg_full_data_file_name)
msg_df = msg_df.astype({'created_at': 'datetime64[ns, UTC]'})

# dataset de metadata
md_data_file_name = 'datadump-20150801-20171219.csv'
md_full_data_file_name = os.path.join(data_dir, md_data_file_name)
md_df = pd.read_csv(md_full_data_file_name, low_memory=False)

# dataset de metadata
md_df.columns = [column.strip() for column in md_df.columns]
md_df = md_df[C2K]
md_df = md_df.astype(T2F)

# Caracterización del dataset de mensajes
print ('\nDimensiones del dataset de mensajes')
print ('Filas: {}'.format(msg_df.shape[0]))
print ('Columnas: {}'.format(msg_df.shape[1]))
print ('Cantidad de sesiones: ', msg_df['session_id'].nunique())
print ('Cantidad de turnos: {}'.format(msg_df.shape[0]))
print ('Cantidad de turnos del tutor: {}'.format(len(msg_df[msg_df.sent_from=='tutor'])))
print ('Cantidad de turnos del estudiante: {}'.format(len(msg_df[msg_df.sent_from=='student'])))

# Caracterización del dataset de metadata
print ('\nDimensiones del dataset de metadata')
print ('Filas: {}'.format(md_df.shape[0]))
print ('Columnas: {}'.format(md_df.shape[1]))
print ('Cantidad de sesiones: ', md_df['session_id'].nunique())


Dimensiones del dataset de mensajes
Filas: 1574069
Columnas: 6
Cantidad de sesiones:  25693
Cantidad de turnos: 1574069
Cantidad de turnos del tutor: 810024
Cantidad de turnos del estudiante: 601453

Dimensiones del dataset de metadata
Filas: 63265
Columnas: 35
Cantidad de sesiones:  63265


## Cálculo de la longitud de los mensajes y features derivadas de éstas

In [4]:
# Cálculo de las longitudes de los mensajes (cantidad de palabas)

msg_df['tokens'] = msg_df.text.apply(lambda x: literal_eval(x))
msg_df['n_words'] = msg_df.tokens.apply(lambda x: len(x))

In [5]:
# Número de palabras tutor y estudiante

agg_tutor_msg_df = msg_df[msg_df['sent_from']=='tutor'].groupby('session_id').agg(
    avg_words_tutor = ('n_words', np.mean),
    n_words_tutor = ('n_words', sum),
    n_msg_tutor = ('sent_from', lambda x: x.eq('tutor').sum())
    
)
agg_student_msg_df = msg_df[msg_df['sent_from']=='student'].groupby('session_id').agg(
    avg_words_student = ('n_words', np.mean),
    n_words_student = ('n_words', sum),
    n_msg_student = ('sent_from', lambda x: x.eq('student').sum())
    
)

## Otras agregaciones del dataset de mansajes

In [6]:
# Otras agregaciones del dataset de mensajes 
agg_msg_df = msg_df.groupby('session_id').agg(
        # started =('created_at', min),
        # ended =('created_at', max),
        duration = ('created_at', lambda x: (max(x) - min(x))),
        n_msg_bot = ('sent_from', lambda x: x.eq('bot').sum()),
        n_msg_system = ('sent_from', lambda x: x.isin(list(['system alert', 'system info', 'system warn'])).sum()),
        n_msg_content_text = ('content_type', lambda x: x.eq('text').sum()),
        n_msg_content_image = ('content_type', lambda x: x.eq('image').sum()),
)

In [7]:
print(agg_tutor_msg_df.shape)
print(agg_student_msg_df.shape)
print(agg_msg_df.shape)

(22359, 3)
(25195, 3)
(25693, 5)


## Unión de los dataframes

*Las métricas que no tienen valores en el left join se les asigna 0*

In [8]:
# Joins
res_df = pd.merge(md_df, agg_msg_df, on=["session_id", "session_id"])
res_df = pd.merge(res_df, agg_tutor_msg_df, how="left", on=["session_id", "session_id"])
res_df = pd.merge(res_df, agg_student_msg_df, how="left", on=["session_id", "session_id"])

# Llenar los NAN producto del left join con ceros
res_df['avg_words_tutor'] = res_df['avg_words_tutor'].fillna(0)
res_df['n_words_tutor'] = res_df['n_words_tutor'].fillna(0)
res_df['n_msg_tutor'] = res_df['n_msg_tutor'].fillna(0)

res_df['avg_words_student'] = res_df['avg_words_student'].fillna(0)
res_df['n_words_student'] = res_df['n_words_student'].fillna(0)
res_df['n_msg_student'] = res_df['n_msg_student'].fillna(0)

## Caracterización de la variable `student_rating` a `student_rating_cat`

In [9]:
# Categorización de la variable student_rating

def student_rating_category(x):
    
    if x <= 2:
        return 0
    
    if x >= 4:
        return 1

    return 'neutra'

# Eliminamos las filas nulas
res_df = res_df.dropna(subset=['student_rating'])

# Categorización de la variable numérica
res_df['student_rating_cat'] = res_df.student_rating.apply(student_rating_category)

# Remoción de las filas con calificación neutra
print ('Remoción de {} sesiones calificadas como neutras'.format(len(res_df[res_df['student_rating_cat'] == 'neutra'])))
res_df = res_df[res_df['student_rating_cat'] != 'neutra']

Remoción de 788 sesiones calificadas como neutras


In [10]:
# Caracterización del dataset combinado
print ('\nDimensiones del dataset combinado')
print ('Filas: {}'.format(res_df.shape[0]))
print ('Columnas: {}'.format(res_df.shape[1]))
print ('Cantidad de sesiones: ', res_df['session_id'].nunique())


Dimensiones del dataset combinado
Filas: 17429
Columnas: 47
Cantidad de sesiones:  17429


## Drop de columnas que no se van a usar

In [11]:
columns_to_keep = list()
columns_to_keep.append('tutor_id')
columns_to_keep.append('tutor_age')
columns_to_keep.append('session_tag_no_material')
columns_to_keep.append('session_tag_student_left')
columns_to_keep.append('session_tag_student_not_engaging')
columns_to_keep.append('student_complained')
columns_to_keep.append('student_complaint_clarity')
columns_to_keep.append('student_complaint_speed')
columns_to_keep.append('student_complaint_subject')
columns_to_keep.append('student_complaint_other')
columns_to_keep.append('session_tag_cheating')
columns_to_keep.append('session_tag_inappropriate')
columns_to_keep.append('session_tag_other_subject')
columns_to_keep.append('avg_words_tutor')
columns_to_keep.append('n_words_tutor')
columns_to_keep.append('n_msg_tutor')
columns_to_keep.append('avg_words_student')
columns_to_keep.append('n_words_student')
columns_to_keep.append('n_msg_student')
columns_to_keep.append('student_rating_cat')

final_df = res_df[columns_to_keep]

mean_tutor_age = final_df.tutor_age.mean().astype(int)
for i in final_df[final_df.tutor_age.isna()].index:
    final_df.loc[i, ['tutor_age']] = mean_tutor_age

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


## Guardado del archivo de resultados

In [12]:
# Salva archivo csv con el prefijo del archivo procesado
curated_data_file_name = msg_data_file_name.split('_')[0] + '_curated_data.csv'
curated_full_data_file_name = os.path.join(data_dir, curated_data_file_name)
final_df.to_csv(curated_full_data_file_name, index=False)