This is a code example that was run with the help of my science advisor, who provided me with data analytics.

In [None]:
import json
import pandas as pd
import numpy as np

from statistics import mean

import collections
from tqdm.auto import tqdm

In [None]:
# Load the exported dialogues from the JSON file into a DataFrame.
# Later cells read its 'rating' and 'utterances' columns.
data = pd.read_json('alexaprize-export.json')

In [None]:
# Split the export by row position: the first 352 rows, then every row after them.
df1, df2 = data.iloc[:352, :], data.iloc[352:, :]

In [None]:
# Frames that dfs() iterates over, in processing order.
FNAMES = [df1, df2]

In [None]:
def dfs(n=float('inf')):
    """Yield each entry of ``FNAMES`` as a ``pandas.DataFrame``.

    Iteration is wrapped in a ``tqdm`` progress bar.

    Args:
        n: Not used by the body; kept so existing call sites stay valid.

    Yields:
        One DataFrame per entry of ``FNAMES``, in order. Nothing is yielded
        when ``FNAMES`` is empty.
    """
    for fname in tqdm(FNAMES):
        # The yield must live inside the loop: placed after it, only the last
        # frame would ever be produced.
        yield pd.DataFrame(fname)

## Get length

In [None]:
def calc_len():
    """Return the rating and length (in turns) of every rated dialogue.

    Dialogues whose rating is missing or equal to 0 are excluded.

    Returns:
        DataFrame with columns ``rating`` and ``length_in_turns``; rows keep
        the index labels they had in the frames produced by ``dfs()``.
        An empty frame with those columns when ``dfs()`` yields nothing.
    """
    columns = ['rating', 'length_in_turns']
    per_frame = []

    for df in dfs():
        # Work on a derived Series instead of overwriting df['rating'], so the
        # frame handed out by dfs() is left untouched.
        ratings = df['rating'].fillna(0)
        rated = df.loc[ratings != 0]

        # Lengths are computed per frame so they always line up with this
        # frame's ratings.
        per_frame.append(pd.DataFrame({
            'rating': rated['rating'],
            'length_in_turns': [len(turns) for turns in rated['utterances'].tolist()],
        }))

    if not per_frame:
        return pd.DataFrame(columns=columns)

    return pd.concat(per_frame)

In [None]:
def get_length(df):
    """Return the number of turns in every dialogue of ``df``.

    Args:
        df: DataFrame with an ``utterances`` column whose cells support
            ``len()`` (one sequence of turns per dialogue).

    Returns:
        DataFrame with a single ``length_in_turns`` column, indexed 1..N in
        the row order of ``df``. Empty (but with the column present) when
        ``df`` has no rows.
    """
    lengths = [len(dialogue) for dialogue in df['utterances'].tolist()]

    # Build the frame once instead of re-creating it for every dialogue.
    return pd.DataFrame(
        {'length_in_turns': lengths},
        index=range(1, len(lengths) + 1),
    )

## Get sentiment

In [None]:
def get_sentiment(dfs):
    """Collect per-turn sentiment scores of human turns for each dialogue.

    Rows whose rating equals 0 are skipped. For every human turn, each key of
    ``annotations.combined_classification.sentiment_classification`` named
    ``positive`` / ``neutral`` / ``negative`` contributes 1 / 0 / -1.
    Turns lacking that annotation path are ignored.

    Args:
        dfs: DataFrame with ``rating`` and ``utterances`` columns. (Inside
            this function the name shadows the module-level ``dfs()``.)

    Returns:
        DataFrame with one ``sentiments`` column holding a list of scores per
        dialogue, indexed 1..N over the kept rows. Empty (with the column
        present) when no row is kept.
    """
    label_scores = {'positive': 1, 'neutral': 0, 'negative': -1}

    rated = dfs[dfs['rating'] != 0]
    dialogue_data = {}

    for dia, dialogue in enumerate(rated['utterances'].tolist(), start=1):
        sentiments = []
        for turn in dialogue:
            if turn['user']['user_type'] != 'human':
                continue
            try:
                labels = turn['annotations']['combined_classification']['sentiment_classification']
            except KeyError:
                # Turn carries no sentiment annotation; nothing to record.
                continue
            # NOTE(review): every matching key is scored. If the annotation
            # maps all three labels to probabilities, each turn adds 1, 0 and
            # -1 and the dialogue sum is always 0 - confirm the schema.
            sentiments.extend(
                label_scores[label] for label in labels if label in label_scores
            )
        dialogue_data[dia] = {'sentiments': sentiments}

    if not dialogue_data:
        return pd.DataFrame(columns=['sentiments'])

    # Build the frame once, after all dialogues have been processed.
    return pd.DataFrame(dialogue_data).T

In [None]:
a = None


def overall_sentiment():
    """Attach an overall sentiment sign to every rated dialogue.

    For each frame from ``dfs()``, rows with a missing or zero rating are
    dropped, the per-turn scores from ``get_sentiment`` are summed per
    dialogue, and the sum is reduced to its sign (1, 0 or -1).

    Side effects:
        Rebinds the module-level ``a`` to the most recently processed frame.

    Returns:
        All processed frames concatenated, with a ``sentiment`` column and a
        fresh 1-based index.

    Raises:
        ValueError: from ``pd.concat`` when no frame contains rated rows.
    """
    global a
    notnan_dfs = []

    for df in dfs():
        # .copy() gives an independent frame, so the column assignments below
        # do not act on a slice of another DataFrame.
        notnan_df = df.dropna(subset=['rating']).loc[df['rating'] != 0].copy()
        if notnan_df.empty:
            continue

        per_turn = get_sentiment(notnan_df)['sentiments']
        per_turn.index = notnan_df.index
        notnan_df['sentiment'] = per_turn
        a = notnan_df

        # Collapse each dialogue's summed turn scores to -1 / 0 / 1.
        notnan_df['sentiment'] = [
            int(np.sign(sum(scores))) for scores in notnan_df['sentiment']
        ]
        notnan_dfs.append(notnan_df)

    concat_df = pd.concat(notnan_dfs, ignore_index=True)
    concat_df.index += 1

    return concat_df

## Skills

In [None]:
def get_skills(df):
    """Count how often each skill answered in every dialogue.

    Only turns whose ``user.user_type`` is ``'bot'`` are counted; their
    ``active_skill`` value is the counted key.

    Args:
        df: DataFrame with an ``utterances`` column of turn lists.

    Returns:
        DataFrame with one ``skill_name`` column holding a
        ``{skill: count}`` dict per dialogue, indexed 1..N. Empty (with the
        column present) when ``df`` has no rows.
    """
    all_skills = {}

    for dia, dialogue in enumerate(df['utterances'].tolist(), start=1):
        freq_skills = collections.Counter(
            turn['active_skill']
            for turn in dialogue
            if turn['user']['user_type'] == 'bot'
        )
        all_skills[dia] = {'skill_name': dict(freq_skills)}

    if not all_skills:
        return pd.DataFrame(columns=['skill_name'])

    # Build the frame once, after all dialogues have been processed.
    return pd.DataFrame(all_skills).T

In [None]:
def get_skill_for_dataframes():
    """Build one table of rated dialogues with a count column per skill.

    Each frame from ``dfs()`` is reduced to rows with a non-missing, non-zero
    rating, its ``get_skills`` dicts are expanded into columns, and both are
    joined side by side on a 1-based index. Missing values in the final
    concatenation are replaced by 0.
    """
    collected = []

    for df in dfs():
        rated = df.dropna(subset=['rating']).loc[df['rating'] != 0]
        if rated.shape[0] == 0:
            continue

        skill_counts = get_skills(rated)
        skill_counts.index = rated.index
        # One column per skill name found in the per-dialogue dicts.
        expanded = skill_counts['skill_name'].apply(pd.Series)

        renumbered = rated.reset_index(drop=True)
        renumbered.index = renumbered.index + 1
        expanded.index = renumbered.index

        collected.append(pd.concat([renumbered, expanded], axis=1))

    combined = pd.concat(collected, ignore_index=True)
    combined.index = combined.index + 1

    return combined.fillna(0)

In [None]:
# Per-dialogue skill counts for all rated dialogues; the column whose name is
# the empty string is renamed to 'no_skill'.
SKILLS = get_skill_for_dataframes().rename(columns={'': 'no_skill'})

In [None]:
# Per-skill Spearman result: coefficient, p-value and a yes/no flag.
# The flag is 'no' when p exceeds alpha and 'yes' otherwise.
# NOTE(review): skills_list and spearmanr_corr are not defined in the cells
# shown here - confirm they are created before this cell runs.
skills_dict_spearman = {}

for skill in skills_list:
    coef, p = spearmanr_corr(SKILLS, skill)
    alpha = 0.05
    skills_dict_spearman[skill] = {
        'Spearman': coef,
        'p_value': p,
        'correlation': 'no' if p > alpha else 'yes',
    }

In [None]:
# Tabulate the Spearman results: one row per skill, one column per field.
skills_spearman = pd.DataFrame.from_dict(skills_dict_spearman, 
                                          orient='index', 
                                          columns=['Spearman', 'p_value', 'correlation'])

In [None]:
# Per-skill Kendall result: coefficient, p-value and a yes/no flag.
# The flag is 'no' when p exceeds alpha and 'yes' otherwise.
# NOTE(review): skills_list and kendalltau_corr are not defined in the cells
# shown here - confirm they are created before this cell runs.
skills_dict_kendall = {}

for skill in skills_list:
    coef, p = kendalltau_corr(SKILLS, skill)
    alpha = 0.05
    skills_dict_kendall[skill] = {
        'Kendall': coef,
        'p_value': p,
        'correlation': 'no' if p > alpha else 'yes',
    }

In [None]:
# Tabulate the Kendall results: one row per skill, one column per field.
skills_kendall = pd.DataFrame.from_dict(skills_dict_kendall, 
                                          orient='index', 
                                          columns=['Kendall', 'p_value', 'correlation'])

## Dialogue Acts

In [None]:
def get_das(df):
    """Collect the first MIDAS classification of each human turn.

    Turns without a ``user`` entry, or whose ``user.user_type`` is not
    ``'human'``, are ignored, as are human turns whose
    ``annotations.midas_classification`` is absent or empty.

    Returns:
        Dict mapping the 1-based dialogue number to
        ``{'all_das': [first classification of each qualifying turn]}``.
    """
    midas_data = {}

    for number, dialogue in enumerate(df['utterances'].tolist(), start=1):
        first_labels = []
        for turn in dialogue:
            speaker = turn.get('user', {'user_type': None})
            if speaker['user_type'] != 'human':
                continue
            if 'midas_classification' not in turn['annotations'].keys():
                continue
            midas = turn['annotations']['midas_classification']
            if len(midas) != 0:
                first_labels.append(midas[0])

        midas_data[number] = {'all_das': first_labels}

    return midas_data

In [None]:
def get_das_freq(das_dict):
    """Average the dialogue-act scores of each dialogue.

    For every dialogue, each label found in the first collected
    classification is averaged over all collected classifications of that
    dialogue.

    Args:
        das_dict: Mapping as returned by ``get_das``: each value holds an
            ``all_das`` list of ``{label: score}`` dicts.

    Returns:
        DataFrame with one ``all_das`` column holding a ``{label: mean}``
        dict per dialogue, indexed 1..N in iteration order of ``das_dict``.
        Dialogues without any classification get an empty dict, so the
        result always has exactly one row per input dialogue. Empty (with
        the column present) when ``das_dict`` is empty.

    Raises:
        KeyError: if a later classification lacks a label present in the
            first one.
    """
    avr_das = {}

    for dia, das in enumerate(das_dict.values(), start=1):
        turns = das['all_das']
        if turns:
            averaged = {
                label: sum(turn[label] for turn in turns) / len(turns)
                for label in turns[0]
            }
        else:
            # Keep a placeholder row: callers re-index the result against the
            # source frame, which requires one row per dialogue.
            averaged = {}
        avr_das[dia] = {'all_das': averaged}

    if not avr_das:
        return pd.DataFrame(columns=['all_das'])

    return pd.DataFrame(avr_das).T

In [None]:
def get_das_for_dataframes():
    """Build one table of rated dialogues with a column per dialogue act.

    Each frame from ``dfs()`` is reduced to rows with a non-missing, non-zero
    rating, the averaged dialogue-act dicts from ``get_das_freq`` are
    expanded into columns, and both are joined side by side on a 1-based
    index. Missing values in the final concatenation are replaced by 0.
    """
    collected = []

    for df in dfs():
        rated = df.dropna(subset=['rating']).loc[df['rating'] != 0]
        if rated.shape[0] == 0:
            continue

        act_means = get_das_freq(get_das(rated))
        act_means.index = rated.index
        # One column per dialogue-act label found in the per-dialogue dicts.
        expanded = act_means['all_das'].apply(pd.Series)

        renumbered = rated.reset_index(drop=True)
        renumbered.index = renumbered.index + 1
        expanded.index = renumbered.index

        collected.append(pd.concat([renumbered, expanded], axis=1))

    combined = pd.concat(collected, ignore_index=True)
    combined.index = combined.index + 1

    return combined.fillna(0)

In [None]:
# Rated dialogues joined with their averaged dialogue-act scores.
DAS = get_das_for_dataframes()

In [None]:
# Dialogue-act labels that the correlation cells below look up in DAS.
das_list = [
    'command',
    'comment',
    'opinion',
    'complaint',
    'statement',
    'neg_answer',
    'pos_answer',
    'dev_command',
    'appreciation',
    'other_answers',
    'yes_no_question',
    'open_question_factual',
    'open_question_opinion',
]

In [None]:
# Per-dialogue-act Spearman result: coefficient, p-value and a yes/no flag.
# The flag is 'no' when p exceeds alpha and 'yes' otherwise.
# NOTE(review): spearmanr_corr is not defined in the cells shown here -
# confirm it is created before this cell runs.
das_dict_spearman = {}

for das in das_list:
    coef, p = spearmanr_corr(DAS, das)
    alpha = 0.05
    das_dict_spearman[das] = {
        'Spearman': coef,
        'p_value': p,
        'correlation': 'no' if p > alpha else 'yes',
    }

# One row per dialogue act, one column per result field.
das_spearman = pd.DataFrame.from_dict(
    das_dict_spearman,
    orient='index',
    columns=['Spearman', 'p_value', 'correlation'],
)

In [None]:
# Per-dialogue-act Kendall result: coefficient, p-value and a yes/no flag.
# The flag is 'no' when p exceeds alpha and 'yes' otherwise.
# NOTE(review): kendalltau_corr is not defined in the cells shown here -
# confirm it is created before this cell runs.
das_dict_kendall = {}

for das in das_list:
    coef, p = kendalltau_corr(DAS, das)
    alpha = 0.05
    das_dict_kendall[das] = {
        'Kendall': coef,
        'p_value': p,
        'correlation': 'no' if p > alpha else 'yes',
    }

# One row per dialogue act, one column per result field.
das_kendall = pd.DataFrame.from_dict(
    das_dict_kendall,
    orient='index',
    columns=['Kendall', 'p_value', 'correlation'],
)

In [None]:
# Put the dialogue-act correlation tables side by side (aligned on their index).
# NOTE(review): das_pearson is not defined in the cells shown here - confirm it
# is created before this cell runs.
final_das = pd.concat([das_pearson, das_spearman, das_kendall], axis=1)

## Sentiment change

In [None]:
def splitlist(inputlist, n):
    """Cut ``inputlist`` at position ``n``.

    Returns:
        Tuple ``(inputlist[:n], inputlist[n:])``.
    """
    return inputlist[:n], inputlist[n:]

In [None]:
s = None


def sentiment_change():
    """Flag how sentiment moves between the two halves of each rated dialogue.

    For each frame from ``dfs()``, rows with a missing or zero rating are
    dropped and the per-turn scores from ``get_sentiment`` are split in the
    middle. The result per dialogue is 1 when the second half sums higher,
    -1 when it sums lower, 0 when both halves are equal, and -2 when the
    dialogue has three or fewer scores.

    Side effects:
        Rebinds the module-level ``s`` to the most recently processed frame.

    Returns:
        All processed frames concatenated, with a
        ``sentiment_by_the_end_of_dialogue`` column and a fresh 1-based index.

    Raises:
        ValueError: from ``pd.concat`` when no frame contains rated rows.
    """
    global s
    notnan_dfs = []

    for df in dfs():
        # .copy() gives an independent frame, so the column assignments below
        # do not act on a slice of another DataFrame.
        notnan_df = df.dropna(subset=['rating']).loc[df['rating'] != 0].copy()
        if notnan_df.empty:
            continue

        per_turn = get_sentiment(notnan_df)['sentiments']
        per_turn.index = notnan_df.index
        notnan_df['sentiment'] = per_turn
        s = notnan_df

        sentiment_changes = []
        for sent in notnan_df['sentiment']:
            if len(sent) <= 3:
                # Too few scores to compare halves.
                # NOTE(review): find_bad_features encodes this case as 2,
                # not -2 - confirm which sentinel is intended.
                sentiment_changes.append(-2)
                continue
            first_half, second_half = splitlist(sent, len(sent) // 2)
            sentiment_changes.append(
                int(np.sign(sum(second_half) - sum(first_half)))
            )

        del notnan_df['sentiment']
        notnan_df['sentiment_by_the_end_of_dialogue'] = sentiment_changes
        notnan_dfs.append(notnan_df)

    concat_df = pd.concat(notnan_dfs, ignore_index=True)
    concat_df.index += 1

    return concat_df

## Dialogue ends abruptly

In [None]:
def abrupt_end_dialogue(df):
    """Flag dialogues whose last bot utterance ends with a question mark.

    Args:
        df: DataFrame with an ``utterances`` column of turn lists. Bot turns
            are those whose ``user.user_type`` is ``'bot'``; the last one
            must provide ``text`` and ``active_skill``.

    Returns:
        DataFrame indexed 1..N with columns ``abrupt_end`` (1 when the last
        bot text ends with ``'?'``, else 0) and ``last_skill`` (the
        ``active_skill`` of that turn). A dialogue without any bot turn gets
        ``abrupt_end`` 0 and ``last_skill`` None. Empty (with both columns
        present) when ``df`` has no rows.
    """
    columns = ['abrupt_end', 'last_skill']
    rows = []

    for dialogue in df['utterances'].tolist():
        bot_turns = [
            turn for turn in dialogue
            if turn.get('user', {'user_type': None})['user_type'] == 'bot'
        ]
        if not bot_turns:
            # Nothing the bot said, so there is no closing question to flag.
            rows.append({'abrupt_end': 0, 'last_skill': None})
            continue

        final_turn = bot_turns[-1]
        rows.append({
            # endswith() is also safe when the text is an empty string.
            'abrupt_end': int(final_turn['text'].endswith('?')),
            'last_skill': final_turn['active_skill'],
        })

    if not rows:
        return pd.DataFrame(columns=columns)

    # dtype=object keeps the cell values exactly as collected.
    return pd.DataFrame(
        rows,
        index=range(1, len(rows) + 1),
        columns=columns,
        dtype=object,
    )

In [None]:
def get_abrupt_end_for_dataframes():
    """Join every rated dialogue with its abrupt-end flag and last skill.

    Each frame from ``dfs()`` is reduced to rows with a non-missing, non-zero
    rating and joined side by side with the ``abrupt_end_dialogue`` result on
    a 1-based index; the per-frame tables are then concatenated and
    renumbered from 1.
    """
    collected = []

    for df in dfs():
        rated = df.dropna(subset=['rating']).loc[df['rating'] != 0]
        if rated.shape[0] == 0:
            continue

        endings = abrupt_end_dialogue(rated)

        renumbered = rated.reset_index(drop=True)
        renumbered.index = renumbered.index + 1
        endings.index = renumbered.index

        collected.append(pd.concat([renumbered, endings], axis=1))

    combined = pd.concat(collected, ignore_index=True)
    combined.index = combined.index + 1

    return combined

## User's average answers are too short

In [None]:
def short_answers(df):
    """Flag dialogues whose human replies average between one and two words.

    Words are obtained with ``str.split()`` on the ``text`` of every turn
    whose ``user.user_type`` is ``'human'``.

    Args:
        df: DataFrame with an ``utterances`` column of turn lists.

    Returns:
        DataFrame indexed 1..N with one ``average_length_too_short`` column:
        1 when the mean reply length is strictly between 1 and 2 words,
        otherwise 0. A dialogue without human turns gets 0. Empty (with the
        column present) when ``df`` has no rows.
    """
    flags = []

    for dialogue in df['utterances'].tolist():
        word_counts = [
            len(turn['text'].split())
            for turn in dialogue
            if turn.get('user', {'user_type': None})['user_type'] == 'human'
        ]
        # mean() raises StatisticsError on an empty sequence, so a dialogue
        # without human turns is treated as "not too short".
        average_words = mean(word_counts) if word_counts else 0

        # NOTE(review): the bounds are exclusive, so an average of exactly
        # one word is NOT flagged - confirm this is intended.
        flags.append(1 if 1 < average_words < 2 else 0)

    return pd.DataFrame(
        {'average_length_too_short': flags},
        index=range(1, len(flags) + 1),
    )

In [None]:
def get_short_answers_for_dataframes():
    """Join every rated dialogue with its short-answer flag.

    Each frame from ``dfs()`` is reduced to rows with a non-missing, non-zero
    rating and joined side by side with the ``short_answers`` result on a
    1-based index; the per-frame tables are then concatenated and renumbered
    from 1.
    """
    collected = []

    for df in dfs():
        rated = df.dropna(subset=['rating']).loc[df['rating'] != 0]
        if rated.shape[0] == 0:
            continue

        short_flags = short_answers(rated)

        renumbered = rated.reset_index(drop=True)
        renumbered.index = renumbered.index + 1
        short_flags.index = renumbered.index

        collected.append(pd.concat([renumbered, short_flags], axis=1))

    combined = pd.concat(collected, ignore_index=True)
    combined.index = combined.index + 1

    return combined

## Skills change too frequently

In [None]:
def freq_change_skill(df):
    """Report whether the bot mostly switches skill between consecutive turns.

    For each dialogue, every pair of consecutive bot turns is marked 0 when
    both have the same ``active_skill`` and 1 when they differ. The
    dialogue's value is the more frequent mark; on a tie, the mark that
    occurred first wins.

    Args:
        df: DataFrame with an ``utterances`` column of turn lists.

    Returns:
        DataFrame indexed 1..N with one ``skill_changes_too_freq`` column
        holding 0, 1, or 2 (2 = fewer than two bot turns, nothing to
        compare). Empty (with the column present) when ``df`` has no rows.
    """
    verdicts = []

    for dialogue in df['utterances'].tolist():
        bot_skills = [
            turn['active_skill']
            for turn in dialogue
            if turn.get('user', {'user_type': None})['user_type'] == 'bot'
        ]
        # Counter keeps first-seen order, which max() relies on for ties.
        transitions = collections.Counter(
            0 if current == following else 1
            for current, following in zip(bot_skills, bot_skills[1:])
        )
        if transitions:
            verdicts.append(max(transitions, key=transitions.get))
        else:
            verdicts.append(2)

    # Each dialogue's verdict is computed exactly once, then tabulated.
    freq_skills_df = pd.DataFrame({'skill_changes_too_freq': verdicts})
    freq_skills_df.index += 1

    return freq_skills_df

In [None]:
def get_skill_change_for_dataframes():
    """Join every rated dialogue with its skill-change verdict.

    Each frame from ``dfs()`` is reduced to rows with a non-missing, non-zero
    rating and joined side by side with the ``freq_change_skill`` result,
    re-labelled to the frame's own index. The per-frame tables are
    concatenated, renumbered from 1, and missing values are replaced by 0.
    """
    collected = []

    for df in dfs():
        rated = df.dropna(subset=['rating']).loc[df['rating'] != 0]
        if rated.shape[0] == 0:
            continue

        change_flags = freq_change_skill(rated)
        change_flags.index = rated.index

        collected.append(pd.concat([rated, change_flags], axis=1))

    combined = pd.concat(collected, ignore_index=True)
    combined.index = combined.index + 1

    return combined.fillna(0)

## Calculate all metrics

In [None]:
a = None


def find_bad_features():
    """Compute every per-dialogue feature for all rated dialogues.

    For each frame from ``dfs()``, rows with a missing or zero rating are
    dropped and these columns are added:

    * ``length_in_turns`` - from ``get_length``.
    * ``overall_sentiment`` - sign (1, 0, -1) of the summed per-turn scores
      from ``get_sentiment``.
    * ``sentiment_by_the_end_of_dialogue`` - 1 when the second half of the
      scores sums higher than the first, -1 when lower, 0 when equal, and 2
      when the dialogue has three or fewer scores.
    * the columns returned by ``abrupt_end_dialogue``, ``short_answers`` and
      ``freq_change_skill``.

    Side effects:
        Rebinds the module-level ``a`` to the most recently processed frame.

    Returns:
        All processed frames concatenated, with a fresh 1-based index.

    Raises:
        ValueError: from ``pd.concat`` when no frame contains rated rows.
    """
    global a
    notnan_dfs = []

    for df in dfs():
        # .copy() gives an independent frame, so the column assignments below
        # do not act on a slice of another DataFrame.
        notnan_df = df.dropna(subset=['rating']).loc[df['rating'] != 0].copy()
        if notnan_df.empty:
            continue

        # length
        lengths = get_length(notnan_df)['length_in_turns']
        lengths.index = notnan_df.index
        notnan_df['length_in_turns'] = lengths

        # sentiment
        per_turn = get_sentiment(notnan_df)['sentiments']
        per_turn.index = notnan_df.index
        notnan_df['sentiment'] = per_turn
        a = notnan_df

        notnan_df['overall_sentiment'] = [
            int(np.sign(sum(sent))) for sent in notnan_df['sentiment']
        ]

        # sentiment change
        sentiment_changes = []
        for sent in notnan_df['sentiment']:
            if len(sent) <= 3:
                # Too few scores to compare halves.
                # NOTE(review): sentiment_change encodes this case as -2,
                # not 2 - confirm which sentinel is intended.
                sentiment_changes.append(2)
                continue
            first_half, second_half = splitlist(sent, len(sent) // 2)
            sentiment_changes.append(
                int(np.sign(sum(second_half) - sum(first_half)))
            )

        # The raw per-turn lists are only an intermediate; drop them.
        del notnan_df['sentiment']
        notnan_df['sentiment_by_the_end_of_dialogue'] = sentiment_changes

        # The helpers below return frames indexed from 1, so renumber to match.
        reset_ix = notnan_df.reset_index(drop=True)
        reset_ix.index += 1

        # dialogue ends too abruptly
        all_ends = abrupt_end_dialogue(reset_ix)

        # user's answers are mostly very short
        short_reply = short_answers(reset_ix)

        # skills change frequently
        all_skills = freq_change_skill(reset_ix)

        new_df = pd.concat([reset_ix, all_ends,
                            short_reply, all_skills], axis=1)
        notnan_dfs.append(new_df)

    concat_df = pd.concat(notnan_dfs, ignore_index=True)
    concat_df.index += 1

    return concat_df

In [None]:
# Final table: every rated dialogue together with all computed features.
RESULT_DIALOGUES = find_bad_features()