In [1]:
import firebase_admin
from firebase_admin import credentials
from firebase_admin import firestore
from firebase_admin import db
import pickle
import os
from psycopg2.pool import ThreadedConnectionPool
import psycopg2.extras



def setup_firebase():
    config = {
        "apiKey": "AIzaSyCznO4_adkIorJSXO6wNNPkR7D-HBFEfe0",
        "authDomain": "simpleintervals.firebaseapp.com",
        "databaseURL": "https://simpleintervals.firebaseio.com",
        "storageBucket": "simpleintervals.appspot.com",
        "serviceAccount": "AUTH/SimpleIntervals-f3d1a6f15f68.json"
    }

    # Use a service account
    cred = credentials.Certificate('AUTH/SimpleIntervals-f3d1a6f15f68.json')
    firebase_admin.initialize_app(cred, config)

class ProcessSafePoolManager(object):
    def __init__(self, *args, **kwargs):
        self.last_seen_process_id = os.getpid()
        self.args = args
        self.kwargs = kwargs
        self._init()

    def _init(self):
        self._pool = ThreadedConnectionPool(*self.args, **self.kwargs)

    def getconn(self):
        current_pid = os.getpid()
        if not (current_pid == self.last_seen_process_id):
            self._init()
            print("New id is " + current_pid + ", old id was " + self.last_seen_process_id)
            self.last_seen_process_id = current_pid
        return self._pool.getconn()

    def putconn(self, conn):
        return self._pool.putconn(conn)
    
def setup_postgres():
    SQL_PASSWORD = 'yjnc2xRbEENHbmQ'
    pool = None
    try:
        pool = ProcessSafePoolManager(1, 10, 
                                      host='35.238.139.24',
                                      port='5432',
                                      user='server',
                                      password=SQL_PASSWORD,
                                      sslmode='verify-ca',
                                      sslrootcert='AUTH/server-ca.pem',
                                      sslkey='AUTH/client.key',
                                      sslcert='AUTH/client.crt',
                                      database='portal',
                                      connect_timeout=3)

    except Exception as e:
        print("FAILED", e)
    if (pool):
        print("Connection pool created successfully")
    return pool

#setup_firebase()
pool = setup_postgres()
def get_user_dict():
    if not os.path.exists("user_dict.pickl") and pool:
        ps_connection  = pool.getconn()
        if (ps_connection):
            print("successfully received connection from connection pool ")
            ps_cursor = ps_connection.cursor(cursor_factory = psycopg2.extras.DictCursor)

            ps_cursor.execute("select uid, email, firstname, lastname, child_list from users_min")
            records = ps_cursor.fetchall()
            ps_cursor.close()
            #Use this method to release the connection object and send back ti connection pool
            pool.putconn(ps_connection)
            print("Put away a PostgreSQL connection")

            user_dict = {}
            for row in records:
                user_dict[row['uid']] = {'email': row['email'], 'firstname': row['firstname'], 
                                         'lastname': row['lastname'], 'childList': row['child_list']}
            return user_dict
    elif os.path.exists('user_dict.pickl'):
        user_dict = pickle.load(open("user_dict.pickl", "rb"))
        return user_dict
    else:
        return None
    
user_dict = get_user_dict()
setup_firebase()

Connection pool created successfully
successfully received connection from connection pool 
Put away a PostgreSQL connection


In [2]:
#get current data that are in the queue and see which can be send out without changes
def GetCompletedTriages(min_time, max_age):
    ps_connection = pool.getconn()
    ps_cursor = ps_connection.cursor(cursor_factory = psycopg2.extras.NamedTupleCursor)

    q = """  
        SELECT t.*, hist.id, hist.details, hist.agent, hist.time as hist_time, hist.duration, hist.notes 
        FROM triage as t RIGHT JOIN triage_history as hist on t.triage_key = hist.triage_key 
        WHERE t.status = %s AND t.time >= %s and t.age <= %s ORDER BY t.time desc;
        """
    
    ps_cursor.execute(q, ('Done', min_time, max_age))

    triage_dict = {}
    for rec in ps_cursor:
        t = rec._asdict()
        key = t['triage_key']
        if key not in triage_dict:
            triage_dict[key] = {'uid': t['uid'],
                                'cid': t['cid'],
                                'age': t['age'],
                                'sql_key': key,
                                'status': t['status'],
                                'due': t['due'],
                                'time': t['time'],
                                'turnaround_time': t['turnaround_time'],
                                'rec_request_key': t['rec_request_key'],
                                'request': t['request'],
                               }
            if t['assignee']:
                triage_dict[key]['assignee'] = t['assignee']
            if t['generate']:
                triage_dict[key]['generate'] = t['generate']
                
            triage_dict[key]['history'] = {}
    
        if t['details'] == 'recs_history':
            triage_dict[key]['recs_history'] = t['notes']
        else:            
            history = triage_dict[key]['history']
            history_id = t['id']
            history[history_id] = {'time': t['hist_time']}
            if t['agent']:
                history[history_id]['agent'] = t['agent']
            if t['notes']:
                history[history_id]['notes'] = t['notes']
            if t['details']:
                history[history_id]['details'] = t['details']
            if t['duration']:
                history[history_id]['duration'] = t['duration']
    
    for case_id in list(triage_dict.keys()):
        v = triage_dict[case_id]
        no_change = 0
        change = 0
        for k, h in v['history'].items():
            if 'details' not in h:
                continue
            if h['details'] == 'FQ_noChangeNeeded':
                no_change += 1
            if h['details'] == 'FQ_yesChangeNeeded':
                change += 1
        if change + no_change > 1 or change + no_change == 0:
            del triage_dict[case_id]
                
    pool.putconn(ps_connection)
    
    #we also need to add in recs_history
    return triage_dict

reorder_triages = GetCompletedTriages(1619850739, 4)


In [3]:
# read it with threads
from concurrent import futures

def get_draft_recommendations_in_triages(triages, print_debug=False):
    dbrecs = {}
    def get_draft_recommendations(cid):
        recs = db.reference('draft_recommendations/'+cid).get()
        dbrecs[cid] = recs

    ex = futures.ThreadPoolExecutor(max_workers=4)
    if print_debug:
        print('main: starting')
    wait_for = [
        ex.submit(get_draft_recommendations, v['cid'])
        for k, v in triages.items()
    ]

    i = 0
    for f in futures.as_completed(wait_for):
        if i % 100 == 0 and print_debug:
            print("{} user rec dowloaded", i)
        i += 1
    return dbrecs
        
dbrecs = get_draft_recommendations_in_triages(reorder_triages, True)

main: starting
{} user rec dowloaded 0
{} user rec dowloaded 100
{} user rec dowloaded 200
{} user rec dowloaded 300
{} user rec dowloaded 400
{} user rec dowloaded 500
{} user rec dowloaded 600
{} user rec dowloaded 700
{} user rec dowloaded 800
{} user rec dowloaded 900
{} user rec dowloaded 1000
{} user rec dowloaded 1100
{} user rec dowloaded 1200
{} user rec dowloaded 1300
{} user rec dowloaded 1400
{} user rec dowloaded 1500
{} user rec dowloaded 1600
{} user rec dowloaded 1700
{} user rec dowloaded 1800
{} user rec dowloaded 1900
{} user rec dowloaded 2000
{} user rec dowloaded 2100
{} user rec dowloaded 2200
{} user rec dowloaded 2300
{} user rec dowloaded 2400
{} user rec dowloaded 2500
{} user rec dowloaded 2600
{} user rec dowloaded 2700
{} user rec dowloaded 2800
{} user rec dowloaded 2900
{} user rec dowloaded 3000
{} user rec dowloaded 3100
{} user rec dowloaded 3200
{} user rec dowloaded 3300
{} user rec dowloaded 3400
{} user rec dowloaded 3500
{} user rec dowloaded 360

In [4]:
def keep_recs_after_time(dbrecs, timestamp):
    #keep last years data for dbrecs
    last_year_recs = {}
    for cid, recs in dbrecs.items():
        if isinstance(recs, list):
            continue
    
        if not recs:
            continue
            
        new_recs = {}
        for recdate, rec in recs.items():
            if int(recdate) < timestamp:
                continue
            new_recs[recdate] = rec
    
        if new_recs:
            last_year_recs[cid] = new_recs
    return last_year_recs

last_year_recs = keep_recs_after_time(dbrecs, 1626984685)

In [5]:
def get_sleep_consultant_id_name(fsdb):
    docs = fsdb.collection(u'admins').stream()
    name_to_assignee = {}
    for doc in docs:
        data = doc.to_dict()
        if not data['assignable']:
            continue
        name_to_assignee[doc.to_dict()['display_name']] = doc.id
    
    return {v:k for k, v in name_to_assignee.items()}

fsdb = firestore.client()
assignee_to_name = get_sleep_consultant_id_name(fsdb)

In [6]:
import unicodedata
from bs4 import BeautifulSoup
import html
import re
import datetime as dt
import delorean as dn
from diff_match_patch import diff_match_patch
import multiprocessing as mp

def cleanhtml(raw_html):
    cleantext = BeautifulSoup(html.unescape(raw_html)).get_text()
    cleantext = unicodedata.normalize("NFKD", cleantext)
    #return ''.join(cleantext.splitlines())
    return cleantext

def textdiff(text1,text2, print_debug=False):
    added = 0
    subbed = 0
    
    if print_debug:
        print(text1)
        print(text2)
    dmp = diff_match_patch()
    
    #there is something going on with the timeout when running 4 process at the same time
    #maybe CPU throttle or vm issues. setting timeout to 2.5 to get good results
    dmp.Diff_Timeout = 0
    diff = dmp.diff_main(text1, text2)
    
    if print_debug:
        print(diff)
    dmp.diff_cleanupSemantic(diff)
    if print_debug:
        print("after cleaning")
        print(diff)
    sentences_added = []
    sentences_subbed = []
    comp = ''
    

    for d in diff:
        if not d[1].strip():
            continue
        
        if d[0] == 1:
            added += len(d[1])
            comp += '+'
            if len(d[1]) > 80:
                sentences_added.append(d[1].strip())
        elif d[0] == -1:
            subbed += len(d[1])
            comp += '-'
            if len(d[1]) > 80:
                sentences_subbed.append(d[1].strip()) #removal
        else:
            comp += ' '
            
        
    return {'add':added,'sub':subbed, 'compare':comp, 
            'sentences_added': sentences_added,
            'sentences_subbed': sentences_subbed}

def alignRecs(finalrecs,autorecs):
    aligned = []
    for autorec in autorecs:
        if 'type' in autorec:
            if autorec['type'] == 'Schedule':
                autorec['htmltext'] = '<table>'
                for entry in autorec['entries']:
                    autorec['htmltext'] += '<tr><td width=140px>' + entry['description'] +'</td><td>' + entry['time'] + '</td></tr>'
                autorec['htmltext'] += '</table>'
            elif autorec['type'] == 'Divider':
                autorec['description'] = autorec['description']+' (div)'

    rec_index = 0
    rec_key = -1
    for finalrec in finalrecs:
        rec_key += 1
        cur_source = 'skip'
        finalrec['draft_index'] = rec_index
        if 'type' in finalrec:
            if finalrec['type'] == 'Schedule':
                finalrec['htmltext'] = '<table>'
                for entry in finalrec['entries']:
                    finalrec['htmltext'] += '<tr><td width=140px>' + entry['description'] + '</td><td>' + entry['time'] + '</td></tr>'
                finalrec['htmltext'] += '</table>'
            elif finalrec['type'] == 'Divider':
                finalrec['description'] = finalrec['description']+' (div)'

        if 'rec_sig' in finalrec:
            if 'source' in finalrec['rec_sig']:
                cur_source = finalrec['rec_sig']['source']

        used_flag = 0
        if cur_source != 'skip':
            auto_key = -1
            for autorec in autorecs:
                auto_key += 1
                if 'rec_sig' in autorec:
                    if 'source' in autorec['rec_sig']:
                        if cur_source == autorec['rec_sig']['source']:
                            aligned.append([finalrec,autorec])
                            autorec['used'] = rec_key
                            used_flag = 1

        #try matching rec tag
        if used_flag == 0:
            cur_source = 'skip'
            if 'rectag' in finalrec:
                cur_source = finalrec['rectag']
                if cur_source != 'skip':
                    for autorec in autorecs:
                        if 'rectag' in autorec:
                            if cur_source == autorec['rectag']:
                                aligned.append([finalrec,autorec])
                                autorec['used'] = rec_key
                                used_flag = 1

        #try matching descriptions
        if used_flag == 0:
            cur_source = 'skip'
            if 'description' in finalrecs[rec_key]:
                cur_source = finalrec['description']

                cur_source_strp = stripTAG(cur_source)

                if cur_source != 'skip':
                    auto_key = -1
                    for autorec in autorecs:
                        auto_key += 1
                        if 'description' in autorec:

                            auto_des_strp = stripTAG(autorec['description'])

                            if auto_des_strp in cur_source_strp:
                                if 'type' in finalrec:
                                    cur_type = finalrec['type']
                                    if 'type' in autorec:
                                        if autorec['type'] == cur_type:
                                            aligned.append([finalrec,autorec])
                                            autorec['used'] = rec_key
                                            used_flag = 1

        if used_flag == 0:
            aligned.append([finalrec,{'description':'','htmltext':''}])

    #unused recs
    auto_key = -1
    for autorec in autorecs:
        auto_key += 1
        if 'used' in autorec:
            ok = 1
        else:
            if auto_key == 0:
                aligned.insert(0,[{'description':'','htmltext':''},autorec])
            else:
                #check position of previous card
                previous_description = autorecs[auto_key-1]['description']
                insert_position = -1
                previous_position = -1
                for cur_aligned in aligned:
                    previous_position += 1
                    if cur_aligned[1]['description'] == previous_description:
                        insert_position = int(previous_position) + 1

                if int(insert_position) == -1:
                    aligned.append([{'description':'','htmltext':''},autorec])
                elif insert_position == len(aligned):
                    aligned.append([{'description':'','htmltext':''},autorec])
                else:
                    aligned.insert(int(insert_position),[{'description':'','htmltext':''},autorec])
    return aligned

def stripTAG(string_input):
    if string_input[0:3] == 'NEW':
        string_output = string_input[3:].strip()
    elif string_input[0:7] == 'UPDATED':
        string_output = string_input[7:].strip()
    else:
        string_output = string_input
    return string_input

def format_recs(recs_to_format, key):
    temp_recs = []
    ii = 0
    for rec in recs_to_format:
        thisrec = {}
        cur = rec
        thisrec['key'] = key
        thisrec['id'] = ii
        ii+=1
        thisrec['type'] = cur['type']
        if 'rec_tag' in cur:
            thisrec['rectag'] = cur['rectag']
        thisrec['description'] = cur['description']
        if cur['rec_sig'] != None:
            thisrec['rec_sig'] = cur['rec_sig']
        if cur['type'] == "Schedule":
            thisrec['entries'] = cur['entries']
            thisrec['htmltext'] = '<table>'
            for entry in cur['entries']:
                thisrec['htmltext'] += '<tr><td width=140px>' + entry['description'] + '</td><td>' + entry['time'] + '</td></tr>'
            thisrec['htmltext'] += '</table>'
        else:
            temp = cur['text']
            #replace newline
            temp = temp.replace("\n", "")
            thisrec['htmltext'] = temp

        temp_recs.append(thisrec)
    return temp_recs

def compare_chooseDate(editted_recs, auto_recs, key):
    diff_output = []
    
    original_recs = format_recs(auto_recs, key)
    aligned = alignRecs(editted_recs, original_recs)
    
    for alignedrec in aligned:
        sentences_added = []
        sentences_subbed = []
        if 'description' in alignedrec[0]:
            cur_rec_des = alignedrec[0]['description']
            cur_rec_des = cleanhtml(cur_rec_des)
        else:
            cur_rec_des = ''
        
        if 'description' in alignedrec[1]:
            cur_auto_des = alignedrec[1]['description']
            cur_auto_des = cleanhtml(cur_auto_des)
        else:
            cur_auto_des = ''
        
        diff_out = textdiff(cur_auto_des,cur_rec_des)
        
        added = int(diff_out['add'])
        subbed = int(diff_out['sub'])
        
       
        sentences_added.append(tuple(diff_out['sentences_added']))
        sentences_subbed.append(tuple(diff_out['sentences_subbed']))

        
        if 'htmltext' in alignedrec[0]:
            cur_rec_text = alignedrec[0]['htmltext']
            if 'type' in alignedrec[0]:
                if alignedrec[0]['type'] != "Schedule":
                    cur_rec_text = cleanhtml(cur_rec_text)
        else:
            cur_rec_text = ''

        if 'htmltext' in alignedrec[1]:
            cur_auto_text = alignedrec[1]['htmltext']
            if 'type' in alignedrec[1]:
                if alignedrec[1]['type'] != "Schedule":
                    cur_auto_text = cleanhtml(cur_auto_text)
        else:
            cur_auto_text = ''

        diff_out = textdiff(cur_auto_text,cur_rec_text)
        
        added += int(diff_out['add'])
        subbed += int(diff_out['sub'])
        
        sentences_added.append(tuple(diff_out['sentences_added']))
       
        sentences_subbed.append(tuple(diff_out['sentences_subbed']))

                
        diff_output.append(({'description':cur_rec_des,'text':cur_rec_text},
                            {'description':cur_auto_des,'text':cur_auto_text},
                            {'add':added, 'sub':subbed},
                            {'sentences_added': sentences_added, 'sentences_subbed': sentences_subbed}))
    return diff_output
                                         
def parse_output(compare_output):
    compared_list = []
    for aligned in compare_output:
        if aligned[1] == {'description': '', 'text': ''}:
            cur_status = 'added'
            title = aligned[0]['description']
        elif aligned[0] == {'description': '', 'text': ''}:
            cur_status = 'removed'
            title = aligned[1]['description']
        else:
            cur_status = '+:'+str(aligned[3]['add'])+',-:'+str(aligned[3]['sub'])
            title = aligned[0]['description']
        compared_list.append({'status':cur_status,'title':title})
    return compared_list

def compare_rec_with_autogen(triage, assignee_name, childList, cur_time, drecs):
    key = triage['sql_key']
    cid = triage.get('cid', '') or triage.get('uid', '')
    uid = triage['uid']
    
    assignee_name
    
    request = triage['request']
    status = triage['status']
    
    output = {}

    #bug since we are actually looping through all the recommendations for a child and it may not
    #be the first one

    for new_recdate, new_rec_space in drecs.items():
        try:
            creation_sig = new_rec_space['rec_signature']['creation_sig']
            
            if 'request_key' not in creation_sig:
                continue
                
            if creation_sig['request_key'] != triage['rec_request_key']:
                continue
            
            if 'auto_gen' not in new_rec_space:
                continue
            
            if new_rec_space['notUL'] == 1:
                continue
                
            auto_recs = new_rec_space['auto_gen']['rec_json']
            editted_recs = new_rec_space['rec_json']
            email = user_dict[uid]['email']
            
            out_put = compare_chooseDate(editted_recs, auto_recs, key)

            consultant = assignee_name
            age = triage['age']
            id_ = new_rec_space['rec_signature']['_id']
            searchArray = new_rec_space['auto_gen']['rec_signature']['searchArray']
            creation_sig = new_rec_space['rec_signature']['creation_sig']
            child = ''
            for entry in childList:
                if entry['cid'] == cid:
                    child = entry['nickname']
            
            output = {'id': id_, 'creation_sig': creation_sig, 'searchArray': searchArray, 'age':age,
                      'output':out_put,'date':cur_time,'email':email,'child':child,'uid':uid,'cid':cid,
                      'consultant':consultant,'key':key}
            
            break
        except Exception as e:
            print("exception", e)
            
    return output

def compare_recs(triages, dbrecs, user_dict, assignee_to_name):
    triage_compare_list = []
    cur_time = dn.epoch(dt.datetime.now().timestamp()).shift('America/Los_Angeles').datetime.strftime('%Y-%m-%d %H:%M')
    
    for key, cur_entry in triages.items():
        if ('uid' not in cur_entry or 'assignee' not in cur_entry or 
            'request' not in cur_entry or 'status' not in cur_entry):
            continue
        
        cid = cur_entry.get('cid', '') or cur_entry.get('uid', '')
        assignee = cur_entry['assignee']
        if assignee not in assignee_to_name:
            continue

        assignee_name = assignee_to_name[assignee]
        uid = cur_entry['uid']
        request = cur_entry['request']
        status = cur_entry['status']

        if (request == "CC full analysis" or request == 'EA full analysis') and (status == "Done"):
            if cid not in dbrecs:
                continue
            
            drecs = dbrecs[cid]        
            childList = user_dict[uid]['childList']
            output = compare_rec_with_autogen(cur_entry, assignee_name, childList, cur_time, drecs)
            
            if output:
                triage_compare_list.append(output)
    
    return triage_compare_list

def compare_recs_mp(triages, dbrecs, user_dict, assignee_to_name):
    data_for_cmp = []
    cur_time = dn.epoch(dt.datetime.now().timestamp()).shift('America/Los_Angeles').datetime.strftime('%Y-%m-%d %H:%M')
    
    def generate_data():
        for key, cur_entry in triages.items():
            if ('uid' not in cur_entry or 
                'request' not in cur_entry or 'status' not in cur_entry):
                print("uid/request/status", cur_entry)
                continue
        
            cid = cur_entry.get('cid', '') or cur_entry.get('uid', '')
            assignee = cur_entry.get('assignee', '123')
            
            assignee_name = assignee_to_name.get(assignee, "unknown")
            uid = cur_entry['uid']
            request = cur_entry['request']
            status = cur_entry['status']
            if uid not in user_dict:
                print("uid not in user_dict", uid)
                continue
            
            if (request == "CC full analysis" or request == 'EA full analysis') and (status == "Done") and cid in dbrecs:
                drecs = dbrecs[cid]
                childList = user_dict[uid]['childList']
                yield (cur_entry, assignee_name, childList, cur_time, drecs)
    
    
    with mp.Pool(processes=mp.cpu_count(), maxtasksperchild=100) as pool:
        triage_compare_list = pool.starmap(compare_rec_with_autogen, generate_data())
    
    return triage_compare_list



In [7]:
compare_triaage_list = compare_recs_mp(reorder_triages, last_year_recs, user_dict, assignee_to_name)

In [9]:
import re
def get_history(triage):
    history = {'SA': 0, 'toofew': 0, 'vip': 0, 'rec_history': 0, 'manynotes': 0, 'question': 0, 'cur_age': 0}
    notes_history = triage.get('history', {})
    
    nweeks = 10
    nrecs = 20
    nw_number = 0
    
    re_nweeks = re.compile(r"(\d+)(Weeks|W)")
    re_nrecs = re.compile(r"(\d+)(Recs|R)")
    
    for (time_key, history_entry) in sorted(notes_history.items()):
        agent = history_entry.get('agent', '')
#         if not agent or agent != 'autogen':
#             continue
            
        notes = history_entry.get("notes", '')
        if notes == '':
            continue
            
        
        notes = notes.split(',')
        for note in notes:
            note = note.strip()
            if 'VIP' in note:
                history['vip'] = 1
        
            if 'SA' in note:
                history['SA'] = 1
            
            if "NW " in note:
                try:
                    nw_number=int(note.replace("NW","").strip())
                    if nw_number > 200:
                        history['manynotes'] = 1
                except Exception as e:
                    print(note)
            
            if 'sibling' in note:
                history['rec_history'] = 1 
        
            m = re_nweeks.match(note)
            if m:
                nweeks = int(m.group(1))
          
                
            m = re_nrecs.match(note)
            if m:
                nrecs = int(m.group(1))

    history['toofew'] = 1 if (nweeks < 4 or nrecs < 8) else 0
    history['nweeks'] = nweeks
    history['nrecs'] = nrecs
    history['nw_number'] = nw_number
    recs_history = triage.get("recs_history", "")
    if recs_history:
        num_recs_history = len(recs_history.splitlines())
        history['rec_history'] = 2 if num_recs_history > 2 else 1
        
    triage_req = triage.get('request', {})
    history['question'] = 1 if 'question' in triage_req else 0
    history['cur_age'] = triage.get("age", 0) if triage_req else 0
    return history

def case_classifier(triage):
    history = get_history(triage)
    rec_type = []
    rec_type_dict = {'vip': -1, 'repeat': -1, 'greater2': -1, 'toofew': -1,
                     'manynotes': -1, 'less4month': -1, 'greater11month': -1,
                     'SA': -1, 'general': -1, 'question': -1}

    if history['question']:
        rec_type.append('question')
        rec_type_dict['question'] = 1
    else:
        if history['vip']:
            rec_type.append('vip')
            rec_type_dict['vip'] = 1
        if history['rec_history'] == 1 or history['rec_history'] == 2:
            rec_type.append("repeat")
            rec_type_dict['repeat'] = 1
        if history['rec_history'] == 2:
            rec_type.append('+2')
            rec_type_dict['greater2'] = 1
        if history['toofew']:
            rec_type.append('toofew')
            rec_type_dict['toofew'] = 1
        if history['cur_age'] < 4:
            rec_type.append('<4m')
            rec_type_dict['less4month'] = 1
        if history['manynotes']:
            rec_type.append('manynotes')
            rec_type_dict['manynotes'] = 1
        if history['cur_age'] > 11:
            rec_type.append('>11m')
            rec_type_dict['greater11month'] = 1
        if history['SA']:
            rec_type.append('SA')
            rec_type_dict['SA'] = 1
        if not rec_type:
            rec_type.append('general')
            rec_type_dict['general'] = 1
    rec_type_dict['nweeks'] = history['nweeks']
    rec_type_dict['nrecs'] = history['nrecs']
    rec_type_dict['nw_number'] = history['nw_number']
        
    return rec_type, rec_type_dict


In [10]:
def get_triage_features(triages):
    triage_features = {}
    for _, triage in triages.items(): 
        rec_request_key = triage.get('rec_request_key', '')
        if rec_request_key == '':
            continue
        _, features = case_classifier(triage)
        for t, hist in triage['history'].items():
            if 'details' in hist and hist['details'] == 'FQ_yesChangeNeeded':
                features['NoChangeNeeded'] = 0
        
        for t, hist in triage['history'].items():
            if 'details' in hist and hist['details'] == 'FQ_noChangeNeeded':
                features['NoChangeNeeded'] = 1
                
        if 'NoChangeNeeded' not in features:
            features['NoChangeNeeded'] = -5
        triage_features[rec_request_key] = features
        
        
    return triage_features
    
triage_features = get_triage_features(reorder_triages)


In [None]:
no = 0
yes = 0
for k, v in triage_features.items():
    if v['NoChangeNeeded'] == 1:
        no += 1
    elif v['NoChangeNeeded'] == 0:
        yes += 1
    
print("no, yes", no, yes, no/(no+yes))

In [11]:
def GetFirstReq(reorder_triage):
    first_time_triage = set()
    req_key_to_triage = {}
    triage_to_consultant = {}
    for _, t in reorder_triage.items():
        if 'assignee' in t and t['assignee'] not in assignee_to_name:
            continue
        if 'recs_history' not in t:
            first_time_triage.add(t['sql_key'])
        if 'rec_request_key' in t:
            req_key_to_triage[t['rec_request_key']] = t['sql_key']
        if 'assignee' in t:
            triage_to_consultant[t['sql_key']] = assignee_to_name[t['assignee']]
        else:
            triage_to_consultant[t['sql_key']] = 'nobody'
    return first_time_triage, req_key_to_triage, triage_to_consultant

first_time_triage, req_key_to_triage, triage_to_consultant = GetFirstReq(reorder_triages)

In [12]:
import delorean as dn
from collections import defaultdict
date_created = []
num_changes = []

changes_sections = defaultdict(int)

num_changes_dict = defaultdict(int)
compare_triage_list1 = [d for d in compare_triaage_list if d]

for d in compare_triage_list1:
    triage_key = d['key']
    if triage_key not in reorder_triages:
        print("missing triage key in reorder_triages")
        continue
        
    if triage_key not in first_time_triage:
        print(reorder_triages[triage_key])
        print("not first time")
        continue
    try:
        history = reorder_triages[triage_key]['history']
        last_key = sorted(history.keys())[-1]
    except Exception as e:
        print(reorder_triages[triage_key])
    
    cid = d['cid']
    request_key = d['creation_sig']['request_key']
    recs = dbrecs[cid]
    #get the right rec
    real_rec = None
    for rec_date, rec in recs.items():
        if 'rec_signature' not in rec:
            continue
        
        if 'creation_sig' not in rec['rec_signature']:
            contnue
        
        if 'request_key' not in rec['rec_signature']['creation_sig']:
            continue
        if rec['rec_signature']['creation_sig']['request_key'] == request_key:
            real_rec = rec
            break
    
    if real_rec is None:
        continue
    
    rec_json = real_rec['rec_json']
    timestamp = last_key
    output = d['output']
    num = 0
    for o in output:
        if 'div' in o[0]['description'].lower():
            continue
        
        if 'schedule' in o[0]['description'].lower():
            continue
#         if o[0]['description'].lower() == 'explanation of schedule':
#             continue
        skip = False
        for card in rec_json:
            if card['description'] == o[0]['description']:
                skip = card['type'] == 'Schedule'
                break
        if skip:
            continue
        if o[2]['add'] + o[2]['sub'] < 50:
            continue
        num += 1
        changes_sections
    date_created.append(timestamp)
    num_changes.append(num)
    num_changes_dict[num] += 1
    


not first time


In [13]:
import pandas as pd
df = pd.DataFrame(num_changes_dict.items(), columns=['num_changes', 'count'])
df = df.sort_values(['num_changes'])
total = df['count'].sum()
df['% change'] = df['count']/total
from IPython.display import display, HTML
display(HTML(df.to_html(index=False)))
print(sum(df['count']))

num_changes,count,% change
0,330,0.229965
1,508,0.354007
2,298,0.207666
3,140,0.097561
4,57,0.039721
5,13,0.009059
6,40,0.027875
7,32,0.0223
8,12,0.008362
9,5,0.003484


1435


In [25]:
from collections import defaultdict
import pandas as pd
import numpy as np

from collections import defaultdict
def get_rec_requests():
    """Get all the requests"""
    #THIS IS TOO SLOW NOW SINCE WE HAVE MANY REC REQUESTS NOW. MOVE TO USING THREADS
    dbrequests = db.reference('rec_requests').get()
    rec_request_free_text = defaultdict(list)
    
    for user_key, request in dbrequests.items():
        all_requests = defaultdict(list)
        for rec_key, rec in request.items():
            cid = user_key
            if 'cid' in rec:
                cid = rec['cid']
            all_requests[cid].append((rec_key, rec['time']))
            if 'questionnaire' in rec:
                user_q = rec['questionnaire']
                for q_key, ques in user_q.items():
                    if 'answer' in ques:
                        ans = ques['answer']
                        if "multi" in ques['type']:
                            for multi in ans:
                                if "_other" in multi['label']:
                                    other_answer = multi['encode']
                                    if other_answer != 'n/a' and other_answer != 'none' and other_answer != 'no' and other_answer != '':
                                        rec_request_free_text[rec_key].append((q_key, multi['label'], other_answer))
                        elif "label" in ques:
                            if"_other" in ans['label']:
                                other_answer = ans['encode'].strip().lower()
                                if other_answer != 'n/a' and other_answer != 'none' and other_answer != 'no' and other_answer != '':
                                    rec_request_free_text[rec_key].append((q_key, ans['label']), other_answer)
                        elif "textarea" in ques['type']:
                            textarea_answer = ans['encode'].strip().lower()
                            if textarea_answer != 'n/a' and textarea_answer != 'none' and textarea_answer != 'no' and textarea_answer != '':
                                rec_request_free_text[rec_key].append((q_key, 'textarea', textarea_answer))
    

    return rec_request_free_text


def has_diff(output, rec_schedule=False):
    diff_sections = []
    for o in output:
        c = o[2]
        changes = c['add'] + c['sub']
        description = o[0]['description'] or o[1]['description']
        if ( changes >= 50 and "optional" not in description.lower() and 
            'div' not in description.lower() ):
            
            #if we want to include schedule changes, then just append it
            if rec_schedule:
                diff_sections.append(description)
            elif 'recommended schedule' not in description.lower():
                #if we are not sure about schedule changes, we should check that recommend schedule does not appear
                diff_sections.append(description)
    
    return diff_sections
            
    
def has_warnings(drecs, id_):
    auto_rec_json = []
    for new_recdate, new_rec_space in drecs.items():
        try:
            creation_sig = new_rec_space['rec_signature']['creation_sig']
            
            if 'request_key' not in creation_sig:
                continue
                
            if new_rec_space['rec_signature']['_id'] != id_:
                continue
            
            if 'auto_gen' not in new_rec_space:
                continue
            
            auto_rec_json = new_rec_space['auto_gen']['rec_json']
            break
        except Exception as e:
            pass
    
    for card in auto_rec_json:
        if (card['type'] not in ['Schedule', 'Divider'] and
            card['description'].strip() not in ['Explanation of schedule', 'Sleep Profile', 'Additional Notes']):
        
            checkwhy = card['text'].strip().split("Why:")
            checkhow = card['text'].strip().split("How:")
            checknotes = card['text'].strip().split("Notes:")
            checknote = card['text'].strip().split("Note:")
            checkAddnotes = card['text'].strip().split("Additional Notes:")
            checkqa = card['text'].strip().split("Q:")

            if card['description'].strip()=='Beginning of the night':
                bOTN = 1
            if card['description'].strip()=='Falling asleep at bedtime':
                fAABT = 1
            if len(checkwhy) > 2:
                print(card['description'])
                print("checkwhy > 2")
                return True
            elif len(checkhow) > 2:
                print(card['description'])
                print("checkhow > 2")
                return True
            elif len(checkhow) < 2:
                print(card['description'])
                print("checkhow < 2")
                return True
            elif len(checkwhy) < 2:
                print(card['description'])
                print("checkwhy < 2")
                return True
            elif len(checknotes) > 1 and (checknotes[0] == '' or checknotes[0] == '<strong>'):
                print(card['description'])
                print("checknotes > 1")
                return True
            elif len(checknote) > 1 and (checknote[0] == '' or checknote[0] == '<strong>'):
                print(card['description'])
                print("checknote > 1")
                return True
            elif len(checkAddnotes) > 1 and (checkAddnotes[0] == '' or checkAddnotes[0] == '<strong>'):
                print(card['description'])
                print("checaddknote > 1")
                return True
            elif len(checkqa) > 1 and checkqa[0] == '':
                print(card['description'])
                print("checkqa > 1")
                return True
    return False
                    
def failed_schedule(output):
    sentences = ['fail', 'failed', "total sleep hours in the schedule is less than the minimum required sleep",
                 "schedule does not follow user's reported parameters", "total night hours is"]
    for o in output:
        if 'recommended schedule' in o[1]['description'].lower():
            text = o[1]['text'].lower()
            for s in sentences:
                if s in text:
                    return True
    return False


def time_convert(hhmm):
    if not isinstance(hhmm, str):
        return hhmm
    hhmm = hhmm.split(':')
    return int(hhmm[0])*3600 + int(hhmm[1])*60

def GetTrainingData(rec_request_free_text, triages_compare_list, req_key_to_triage, 
                    triage_features, consultants_list, useConsultantAnswerForChange=False):
    triage_to_t = defaultdict(list)
    sleep_training = []
    anything_else = []
    
    for t in triages_compare_list:
        if not t or 'creation_sig' not in t or 'request_key' not in t['creation_sig']:
            continue
    
        req_key = t['creation_sig']['request_key']
        if req_key not in req_key_to_triage:
            continue
    
        triage_key = req_key_to_triage[req_key]
        if triage_key not in first_time_triage:
            continue
    
        if triage_key not in triage_to_consultant:
            continue
            
        triage_to_t[triage_key].append(t)
    
    searchArrays = []
    y = []
    for triage_key, t_list in triage_to_t.items():
        unique_t = {}
        for t in t_list:
            unique_t[t['id']] = t
        all_ts = []
        for _, t in unique_t.items():
            all_ts.append(t)
        sorted(all_ts, key=lambda x: x['id'], reverse=True)
        t = all_ts[0]
        age = t['searchArray'].get('age', 0) or t['searchArray'].get('calc_age', -1)
        if age < 0:
            continue
        
        if age > 4:
            continue
            
        rec_request_key = t['creation_sig']['request_key']
        
        if rec_request_key not in triage_features:
            continue
        
        consultant = triage_to_consultant[triage_key]
        
        if consultant not in consultants_list:
            continue
        if '_id' in t['searchArray']:
            #some screwed up searchArray
            tmp = t['searchArray']['searchArray']
            t['searchArray'] = tmp
        
        if '_id' in t['searchArray']:
            print(t['searchArray'])
            print(asdfsas)
        t['searchArray']['uid'] = t['uid']    
        t['searchArray']['cid'] = t['cid']
        t['searchArray']['email'] = t['email']
        t['searchArray']['child'] = t['child']
        t['searchArray']['req_key'] = rec_request_key
        t['searchArray']['consultant'] = triage_to_consultant[triage_key]
        t['searchArray']['free_text'] = 1 if rec_request_key in rec_req_free_txt else 0
        has_tried_sleep_training = 0
        
        gender = t['searchArray'].get('gender', 'NA')
        
        diff_sections = has_diff(t['output'], False) #ignore schedule
        no_diff = 0 if diff_sections else 1
        
        sleep_training_note = ''
       
        if t['searchArray']['free_text'] == 1:
            for (qkey, label, ans) in rec_req_free_txt[rec_request_key]:
                if qkey == '30173308':
                    sleep_training_note = ans
                    sleep_training.append((t['uid'], t['cid'], t['email'], age, gender, ans, no_diff))
                
        other_ans = []
        for (qkey, label, ans) in rec_req_free_txt[rec_request_key]:
            if '_other' in label:
                other_ans.append(ans)
                
        t['searchArray']['other_ans'] = ' '.join(other_ans)
        t['searchArray']['other_len'] = len(t['searchArray']['other_ans'])
        t['searchArray']['slee_training'] = sleep_training_note
        t['searchArray']['sleep_training_len'] = len(sleep_training_note)
        
        if 'anything_else' in t['searchArray']:
            t['searchArray']['anything_else_length'] = len(t['searchArray']['anything_else'])
            anything_else.append((t['uid'], t['cid'], t['email'], age, gender, t['searchArray']['anything_else'], no_diff))
        else:
            t['searchArray']['anything_else_length'] = 0
        for k, v in triage_features[rec_request_key].items():
            if k == 'NoChangeNeeded':
                continue
            t['searchArray'][k] = v
        
       
        if 'NoChangeNeeded' in t['searchArray']:
            del t['searchArray']['NoChangeNeeded']
        cid = t['cid']
        t['searchArray']['bad_schedule'] = failed_schedule(t['output'])
        t['searchArray']['has_warning'] = has_warnings(dbrecs[cid], t['id'])
       
        if 'recWakeUpTime' in t['searchArray']:
            t['searchArray']['desired_diff_wakeup'] = t['searchArray']['recWakeUpTime'] - t['searchArray']['uiDesiredWakeup']
        if 'recNightStartTime' in t['searchArray']:
            t['searchArray']['desired_diff_bedtime'] = t['searchArray']['recNightStartTime'] - t['searchArray']['earliestGetforBedtime']
        searchArrays.append(t['searchArray'])
        for diff_section in diff_sections:
            colname = 'diffSection_'+diff_section
            t['searchArray'][colname] = 0
            if diff_section in t['searchArray']:
                del t['searchArray'][diff_section]
        
        
        feature = triage_features[rec_request_key]
        if useConsultantAnswerForChange:
            if 'NoChangeNeeded' in feature:
                y.append(feature['NoChangeNeeded'])
            else:
                y.append(no_diff)
        else:
            y.append(no_diff)
    
    #1 means there is no change and 0 means there is a change
    df = pd.DataFrame(searchArrays)
    df = pd.concat([df, pd.get_dummies(df['consultant'], prefix='consultant')], axis=1)
    
   
    #need to do this because some of the older searchArray had times as a string hh:mm
    if 'uiFallAsleepTyp' in df:
        df['uiFallAsleepTyp'] = df['uiFallAsleepTyp'].apply(time_convert)
    if 'uiWakeUpTyp' in df:
        df['uiWakeUpTyp'] = df['uiWakeUpTyp'].apply(time_convert)

    return df, np.array(y), sleep_training, anything_else

consultants = assignee_to_name.values()
#ignoring Jolan and Juliet since they change everytthing
consultants = [c for c in consultants if c not in ['Marko', 'Timm', 'Daniel', 'DH_test', 'Robert', 'Allan', 'Alejandra', 'Sophia', 'Seng BSS', 'Meg']]
consultants.append('nobody')
rec_req_free_txt = get_rec_requests()
X, y, sleep_training, anything_else = GetTrainingData(rec_req_free_txt, compare_triage_list1, 
                                                      req_key_to_triage, triage_features, 
                                                      consultants, useConsultantAnswerForChange=True)


Beginning of naps
checkwhy > 2
Beginning of the night
checkwhy > 2
Beginning of the night
checkwhy > 2
Beginning of the night
checkhow < 2
Beginning of naps
checkhow < 2
Beginning of naps
checkhow > 2
Beginning of naps
checkhow > 2
Beginning of naps
checkwhy > 2
Beginning of naps
checkwhy > 2
Beginning of the night
checkwhy > 2
Beginning of naps
checkhow < 2
Beginning of naps
checkwhy > 2
Beginning of naps
checkwhy > 2
Beginning of the night
checkwhy > 2
Beginning of naps
checkwhy > 2
Beginning of naps
checkhow > 2
Beginning of the night
checkhow > 2
Beginning of the night
checkwhy > 2
Beginning of the night
checkwhy > 2
Beginning of the night
checkhow < 2
Beginning of the night
checkwhy > 2
Beginning of the night
checkhow > 2
Beginning of naps
checkhow > 2
Beginning of the night
checkhow > 2
Beginning of the night
checkhow < 2
Beginning of naps
checkwhy > 2
Beginning of the night
checkwhy > 2
Beginning of the night
checkhow < 2
Beginning of the night
checkhow < 2
Beginning of naps
che

In [None]:
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import roc_auc_score, confusion_matrix, roc_curve
from matplotlib import pyplot as plt

def xgb_using_cv(x_train, y_train, param_comb=100):
    clf_xg = XGBClassifier(learning_rate=0.025, n_estimators=100, 
                           objective='binary:logistic', 
                           batchseed=0, nthread=1)
    params = {
        'min_child_weight': [1, 2.0, 3.0],
        'gamma': [0, 1, 2.5],
        'subsample': [0.6, 0.8, 1.0],
        'max_depth': [1, 2, 3],
        'colsample_bytree': [0.4, 0.6, 0.8],
        'scale_pos_weight': [2, 3, 4]
        }

    folds = 4
   
    skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = 0)

    random_search = RandomizedSearchCV(clf_xg, param_distributions=params, n_iter=param_comb, 
                                       scoring='precision', n_jobs=4, cv=skf.split(x_train, y_train), 
                                       verbose=3, random_state=0)
    random_search.fit(x_train, y_train)
    print(random_search.best_params_)
    #now we fit the classifier on the training data
    clf_xg.set_params(**random_search.best_params_)
    clf_xg.set_params(nthread=4)
    clf_xg.fit(x_train, y_train)
    return clf_xg

def xgb_test(clf_xg, x_test, y_test):
    y_predict_test = clf_xg.predict(x_test)
    
    print(confusion_matrix(y_test, y_predict_test))
    y_predict_score = clf_xg.predict_proba(x_test)[:,1]
    fpr, tpr, thresh = roc_curve(y_test, y_predict_score)
    return fpr, tpr, y_predict_score


def clean_data(X, y, consultants):
    df1 = X.fillna(-5)

    for c in df1.columns:
        if 'free_text' in c or 'night_start_mis' in c or 'sleep_training' in c:
            continue
        if 'diffSection' in c: 
            df1 = df1.drop(columns=c)
        if 'consultant_' in c:
            consultant = c.split('_')[1]
            if consultants and consultant not in consultants:
                df1 = df1.drop(columns=c)
            if not consultants:
                df1 = df1.drop(columns=c)
            
    if consultants:
        consultant_indices = df1['consultant'].isin(consultants)
        df1 = df1[consultant_indices]
        y2 = y[consultant_indices]
    else:
        y2 = y
    
    if 'consultant' in df1.columns:
        df1 = df1.drop(columns='consultant')
    df1 = df1.apply(pd.to_numeric)
    
    return df1, y2


def split_data(X, y, use_random=True):
    #np.random.seed(10)
    np.random.seed(20)
    if use_random:
        mask = np.random.rand(len(X)) < 0.6
    else:
        mask = np.arange(0, len(X)) < 0.8*len(X)
    
    X_train, y_train = X[mask], y[mask]
    X_test, y_test = X[~mask], y[~mask]
    return X_train, y_train, X_test, y_test

def ROC_All(X1, y1, params, consultants=[], random_split=True):
    X, y = clean_data(X1, y1, consultants)
    X_train, y_train, X_test, y_test = split_data(X, y, random_split)
    if params:
        clf_xg = XGBClassifier(learning_rate=0.05, n_estimators=1600, 
                           objective='binary:logistic', seed=0, nthread=8)
        clf_xg.set_params(**params)
        clf_xg.fit(X_train, y_train)
    else:
        clf_xg = xgb_using_cv(X_train, y_train, 100)
    fpr_all, tpr_all, y_pred_score = xgb_test(clf_xg, X_test, y_test)
    
    #do the fpr, tpr for different consultants
    fig, ax = plt.subplots()
    ax.plot(fpr_all, tpr_all, label='consultant_feature')
    ax.plot([0, 1], [0, 1])
    
    return clf_xg, fpr_all, tpr_all, X_train, y_train, X_test, y_test


In [17]:
X

Unnamed: 0,age,ageMagIndex,analysisType,anything_else,bedTimeRoutineLength,birthday,calc_age,catBedTimeOffCONT,catBedtimeNotConsistent,catBedtimeUnconventional,...,consultant_Amy,consultant_AmyB,consultant_Beth,consultant_Cindy,consultant_Daniella,consultant_Eileen,consultant_Janelle,consultant_Jen,consultant_Jolan,consultant_Juliet
0,4,3,0,Judah is very attached to his soother when it ...,2700,2021-04-01,4,461.500000,1,0,...,0,0,0,0,0,0,0,0,1,0
1,2,1,0,,2700,2021-07-06,2,8100.000000,0,1,...,0,0,0,0,0,0,0,0,1,0
2,0,0,0,Newborn,1800,2021-08-27,0,-11700.000000,0,0,...,0,0,0,0,0,0,0,0,1,0
3,1,0,0,He loves sleeping on his stomach but we do und...,2700,2021-07-17,1,-1800.000000,0,0,...,0,0,0,0,0,0,0,0,1,0
4,3,2,0,Work in office and remotely \nMultiple caregiv...,2700,2021-05-28,3,4904.250000,1,1,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1430,2,1,0,,1800,2021-05-07,2,6581.500000,1,1,...,0,0,0,0,1,0,0,0,0,0
1431,3,2,0,Dad works out of town M-F and is home on the w...,1800,2021-04-14,3,105.000000,0,0,...,0,0,0,1,0,0,0,0,0,0
1432,2,1,0,,1800,2021-05-04,2,723.714286,0,0,...,0,0,0,1,0,0,0,0,0,0
1433,1,0,0,,2700,2021-06-04,1,-4651.666667,1,0,...,0,0,0,1,0,0,0,0,0,0


In [None]:
diff_sections = []
for c in X.columns:
    if 'diffSection_' in c:
        diff_sections.append(c)
        X[c].fillna(1, inplace=True)

diff_sections = [c for c in diff_sections if ('div' not in c and 'optional' not in c.lower())]
diff_sections = [c for c in diff_sections if 'recommended schedule' not in c.lower()]


In [None]:
for c in X.columns:
    if 'diffSection_' in c and ('rescue' in c.lower() or '5AM' in c.lower() or '5 AM' in c.lower()):
        print(c)

In [None]:
X['consultant'].unique()

In [19]:
#columns which does not make sense to use especially since some are missing
#age maybe important since premature weeks can affect the recommendations
#parTotalDaysleepAvg - some missing data, for now we will not put it in

columns_to_drop = ['age', 'gender', 'ageMagIndex', 'analysisType', 'parAgeRoundUp', 
                   'recTotalSleepforCustom', 'recTotalSleepforSched', 
                   'recTotalSleepforEoS', 'other_ans',
                   'uiEarliestDinner', 'dinner_time',
                   '28903230_other', '28903229_other', 'sched_morning_relaxed', 'sched_morning_relaxed_OK',
                   'sched_next_ageGroup', 'sched_night_relaxed',
                   'sched_night_relaxed_OK', 'sched_shorten_naps']

# columns_to_drop = ['age', 'ageMagIndex', 'analysisType', 'parAgeRoundUp', 
#                    'recTotalSleepforCustom', 'recTotalSleepforSched',
#                    'recTotalSleepforEoS', 'other_ans', 'bedTimeRoutineLength',
#                    'uiDesiredWakeup', 'recWakeUpTime', 'free_text', 'earliest_getready_bedtime',
#                    'uiEarliestDinner', 'earliestGetforBedtime', 'earliest_dinner', 'dinner_time', 'recNightStartTime',
#                    '28903230_other', '28903229_other', 'sched_morning_relaxed', 'sched_morning_relaxed_OK',
#        'sched_next_ageGroup', 'sched_night_relaxed',
#        'sched_night_relaxed_OK', 'sched_shorten_naps']

tmp = ['uiAlsoWhileHeldtoSleep', 'parTotalDaySleepAVG', 'num_recs', 'uiAlsoStrollertoSleep',]
#for each one of these groups, we will add a new label of -5 for not answering. one hot encoding and potentially
#that embedding stuff
ui_group = [['uiOnlyBreastFeedtoSleep', 'uiAlsoSlingtoSleep', 'uiAlsoOwnSpaceHandOnChildtoSleep', 
             'uiAlsoParentNextNotTouchingtoSleep'],
            
            ['uiHappyPleasantAfterNaps', 'uiTiredCrankyAfterNaps', 'uiAngryinPainAfterNaps', 
             'uiMoodVariesAfterNaps'],
            
            ['uiNoNightWaking', 'uiHungryNightWaking', 'uiWantsComfortingNightWaking', 
             'uiNightmareNightWaking', 'uiNightTerrorNightWaking', 'uiDiaperLeakNightWaking', 
             'uiPoopyDiaperNightWaking', 'uiUseToiletNightWaking', 'uiPlayPracticeSkillsNightWaking', 
             'uiWantstoGotoParentsNightWaking', 'uiItchyNightWaking', 'uiGasWindNightWaking', 
             'uiBlanketOffNightWaking', 'uiEnvironmentNightWaking', 'uiTeethingNightWaking'],
            
            ['napsameasnight', 'uiFallsAsleepUnassistedNap', 'uiInMotionNap', 'uiBreastFeedNap', 
             'uiBottleFeedNap', 'uiSlingNap', 'uiTransfertoBedNap', 'uiWatchTVNap', 
             'uiParentNextTouchingNap', 'uiParentNextNotTouchingNap', 'uiPacifierNap', 
             'uiComfortItemNap', 'uiSwingNap', 'uiSwaddledNap', 'uiLyingOnSomeoneNap', 'uiTransfertoBedNap'],
            
            ['uiFallsAsleepUnassistedBedTime', 'uiBreastFeedBedTime', 'uiSlingBedTime', 'uiInMotionBedTime', 
             'uiTransfertoBedBedTime', 'uiParentNextTouchingBedTime', 'uiParentNextNotTouchingBedTime', 
             'uiBottleFeedBedTime', 'uiPacifierBedTime', 'uiComfortItemBedTime', 'uiWatchTVBedTime', 
             'uiOwnSpaceHandOnChildBedTime', 'uiSwingBedTime', 'uiSwaddledBedTime', 'uiTransfertoBedBedTime'],
            
            ['uiCaredforByParentsDay', 'uiCaredforinNurseryDay', 'uiHomeNoParentsDay', 'uiVariesDayTimeCare'],
            
            ['uiTiredClingyBeforeNightSleep', 'uiEnergeticButTired1HrBeforeNightSleep', 
             'uiEnergeticBeforeNightSleep', 'uiHappyPleasantBeforeNightSleep'],

            ['uiWantstoGotoParentsNightWaking', 'uiItchyNightWaking', 'uiGasWindNightWaking', 
             'uiBlanketOffNightWaking', 'uiEnvironmentNightWaking', 'uiTeethingNightWaking'],
            
            ['uiHappyPleasantAfterMorningPlus1Hr', 'uiHappyPleasantAfterMorningTiredIn1Hr', 
             'uiTiredCrankyAfterMorning', 'uiMoodVariesAfterMorning'],
            
            ['uiMedicalNone', 'uiMedicalReflux', 'uiMedicalEczema', 'uiMedicalAsthma', 'uiMedicalSnoring', 
             'uiMedicalASD', 'uiMedicalADHD', 'uiMedicalSPDorAPD', 'uiMedicalNoDisclose', 'uiMedicalSweating'],
            
            ['uiOwnRoomOwnBedNight', 'uiSharedRoomOwnBedNight', 'uiParentRoomOwnBedNight', 'uiCoSleepNight', 
             'uiStartinOwnBedEndwithParentsNight'],
            
            ['uiPriorityNightWakings', 'uiPriorityNaps', 'uiPrioritySleepIn', 'uiPriorityEasierBedtime', 
             'uiPriorityCoSleeping', 'uiPrioritySchedule', 'uiPriorityNone'],
           ]

#these have multiple values and if we want to use one-hot encoding, need to add in multiple versions
ui_expand = ['uiContinueCoSleepNB', 'uiContinueShareRoomNB', 'uiRecentSleepIssueNB', 
             'uiSleepRoutineNB', 'uiStrangeSituationCryNB']



#again we need a -5 for this, but most of the values is mostly -5 (no answers)
ui_skip = [
           ['uiCaredforByParentsDayMon', 'uiCaredforinNurseryDayMon', 'uiHomeNoParentsDayMon', 
            'uiCaredforByParentsDayTue', 'uiCaredforinNurseryDayTue', 'uiHomeNoParentsDayTue', 
            'uiCaredforByParentsDayWed', 'uiCaredforinNurseryDayWed', 'uiHomeNoParentsDayWed', 
            'uiCaredforByParentsDayThu', 'uiCaredforinNurseryDayThu', 'uiHomeNoParentsDayThu', 
            'uiCaredforByParentsDayFri', 'uiCaredforinNurseryDayFri', 'uiHomeNoParentsDayFri'],

           
           ['uiFlexibleNapsDaycareNB'],
           
           ['uiResistsBedtime'],
           
           ['uiSharingNighDutiesNB'],
           
           ['uiTransfertoBedBedTime']
          ]

In [20]:
for c in ui_skip:
    columns_to_drop.extend(c)

# for c in X.columns:
#     if c.startswith('par'):
#         columns_to_drop.append(c)

columns_to_drop.extend(['birthday', 'child_name', 'desired_wakeup', 'dinner_time', 'earliest_getready_bedtime', 
                        'getready_bedtime', 'mhour', 'nhour', 'time_zone', 'uid', 'cid', 'req_key', 'email', 'child',
                        'Birthday', 'sibling_ages', 'sibling_age', 'date_submit', '28903232_other', 
                        'anything_else_length', 'other_len', 'dec_age', 'ssAdjust'])


par_skip = ['parNumDaysSelected']

columns_to_drop.extend(par_skip)

# columns_to_drop.extend(['birthday', 'child_name', 'desired_wakeup', 'dinner_time', 'earliest_getready_bedtime', 
#                         'getready_bedtime', 'mhour', 'nhour', 'time_zone', 'uid', 'cid', 'req_key', 'email', 'child', 'catBedTimeOffCONT',
#                         'dec_age', 'nrecs', 'nweeks', 'Birthday', 'sibling_ages', 'sibling_age', 'date_submit', '28903232_other'])

X1 = X
for c in columns_to_drop:
    if c in X1.columns:
        X1 = X1.drop(columns=c)

for c in X1.columns:
    if '_other' in c:
        X1 = X1.drop(columns=c)
        
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

X1 = X1.drop(columns=['anything_else', 'slee_training'])

X1['bad_schedule'] = X1['bad_schedule']*1
X1['has_warning'] = X1['has_warning']*1
X1.fillna(-5)


Unnamed: 0,bedTimeRoutineLength,calc_age,catBedTimeOffCONT,catBedtimeNotConsistent,catBedtimeUnconventional,catCannotSelfSettle,catDesiredWakeUp1HrEarlier,catDesiredWakeUpVTypWakeup,catDinnerTimeTooLate,catExclusivelyBottleFed,...,consultant_Amy,consultant_AmyB,consultant_Beth,consultant_Cindy,consultant_Daniella,consultant_Eileen,consultant_Janelle,consultant_Jen,consultant_Jolan,consultant_Juliet
0,2700,4,461.500000,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
1,2700,2,8100.000000,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
2,1800,0,-11700.000000,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,2700,1,-1800.000000,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
4,2700,3,4904.250000,1,1,1,0,-1,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1430,1800,2,6581.500000,1,1,0,0,-1,0,0,...,0,0,0,0,1,0,0,0,0,0
1431,1800,3,105.000000,0,0,1,0,-1,0,0,...,0,0,0,1,0,0,0,0,0,0
1432,1800,2,723.714286,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
1433,2700,1,-4651.666667,1,0,0,0,-1,0,0,...,0,0,0,1,0,0,0,0,0,0


In [None]:
#keep only important features but it doesnt work
important_columns = ['uiHoldRockDuringNightWaking', 'uiMedicalAsthma',
       'uiMedicalDisclaimerAgreed', 'uiMedicalEczema', 'uiMedicalNoDisclose',
       'uiMedicalSPDorAPD', 'uiMedicalSnoring', 'uiMedicalTongueLipTie',
       'uiNightTerrorNightWaking', 'uiNightmareNightWaking',
       'uiOnlyBreastFeedtoSleep', 'uiOtherSiblings', 'uiOwnRoomOwnBedNight',
       'uiOwnSpaceHandOnChildBedTime', 'uiOwnSpaceHandOnChildNap',
       'uiPacifierNap', 'uiParentNextNotTouchingBedTime',
       'uiParentNextTouchingBedTime', 'uiMedicalAllergies',
       'uiParentsSleepwithChildDuringNightWaking', 'uiMedicalASD',
       'uiItchyNightWaking', 'uiContinueCoSleepNB', 'uiDiaperLeakNightWaking',
       'uiDinnerTime', 'uiDrinksfromCup', 'uiEnergeticBeforeNightSleep',
       'uiEnvironmentNightWaking', 'uiFallsAsleepUnassistedNap',
       'uiFallsBackAsleepDuringNightWaking', 'uiGiveWaterDuringNightWaking',
       'uiGoestoParentsDuringNightWaking', 'uiHappyPleasantAfterNaps',
       'useDesired', 'uiHomeNoParentsDay', 'uiHungryNightWaking',
       'uiInMotionBassinetBedTime', 'uiInMotionBassinetNap',
       'uiInMotionBedTime', 'uiLyingOnSomeoneNap']

X_important = X1[important_columns]

In [None]:
X['email']

In [None]:
if 'consultant' in X1.columns:
    X2 = X1.drop(columns='consultant')
else:
    X2 = X1
for c in X2.columns:
    if 'consultant_' in c:
        X2 = X2.drop(columns=c)
    if 'diffSection_' in c:
        X2 = X2.drop(columns=c)

if 'bad_schedule' in X2.columns:
    X2['bad_schedule'] = X2['bad_schedule']*1
if 'has_warning' in X2.columns:
    X2['has_warning'] = X2['has_warning']*1
X2 = X2.fillna(-5)
from scipy.spatial.distance import pdist, squareform
dist = pdist(X2.values, metric='correlation')
X2_dist = squareform(dist)

In [None]:
X2_dist
import matplotlib.pyplot as plt
fig, ax = plt.subplots()
im = ax.imshow(X2_dist)
plt.show()

In [None]:
#X3 = X2[['uiFallsAsleepUnassistedBedTime', 'uiNightWakingsNB', 'nrecs', 'nweeks', 'nw_number', 'bad_schedule', 'has_warning']]
X3 = X[['uiFallsAsleepUnassistedBedTime', 'uiNightWakingsNB', 'nrecs', 'nweeks', 'nw_number', 'has_warning']]


In [None]:
idx = (X['has_warning']) | (X['bad_schedule'])
y1 = y.copy()
y1[idx] = 0

In [None]:
idx_consultant = (X1['consultant'].isin(['Amy', 'Daniella', 'Kristal', 'Eileen', 'Liz', 'Jessica', 'Heather', 'Jen', 'Amber']))
X_good = X[idx_consultant]
y1_good = y1[idx_consultant]

In [None]:
X4 = X_good
pd.options.display.max_colwidth = 100
X4['output'] = y1_good
X4.loc[(~X4['anything_else'].isnull()) & (X4['output'] == 1), ['anything_else','output', 'email']]

In [None]:
idx_change = (y1_good == 0)
zero_words = X_good['nw_number'] == 0
no_warnings = (X_good['has_warning'] == 0) & (X_good['bad_schedule']==0)
pd.set_option("display.min_rows", 20)

for diff_sec in diff_sections:
    total_change_sec = sum(idx_change&zero_words&no_warnings) - sum(X_good[idx_change&zero_words&no_warnings][diff_sec])
    if total_change_sec > 20:
        print(diff_sec, total_change_sec)
        sec_change = X_good[diff_sec] == 0
        final_idx = (sec_change & idx_change) & zero_words & no_warnings
        print(X_good[final_idx][['cid', 'consultant']])
        
    

In [None]:
for c in X1['consultant'].unique():
    c_idx = X1['consultant'] == c
    print(c, sum(y1[c_idx]), len(y1[c_idx]), sum(y1[c_idx])/len(y1[c_idx]))

In [None]:
X3

In [None]:
#params = {'subsample': 0.8, 'scale_pos_weight': 1.75, 'min_child_weight': 1, 'max_depth': 12, 'gamma': 8, 'colsample_bytree': 1.0}
#params = {'subsample': 0.8, 'scale_pos_weight': 1.75, 'min_child_weight': 1, 'max_depth': 10, 'gamma': 5}
params = {'subsample': 1.0, 'scale_pos_weight': 2.5, 'min_child_weight': 1, 'max_depth': 6, 'gamma': 5}
parasm = {'subsample': 1.0, 'scale_pos_weight': 1, 'min_child_weight': 3.0, 'max_depth': 6, 'gamma': 5}
#params = {}
clf_xg_con3, fpr_all_con3, tpr_all_con3, X_train_tmp1, y_train_tmp1, X_test, y_test = ROC_All(X3, y1, params, [], True)


In [None]:
fpr_all, tpr_all, y_pred_score = xgb_test(clf_xg_con2_all, X_train2_all, y_train2_all)
fig, ax = plt.subplots()
ax.plot(fpr_all, tpr_all, label='updated model')
ax.plot([0, 1], [0, 1])
ax.legend()

In [None]:
sorted_idx = clf_xg_con2_all.feature_importances_.argsort()
X2.columns[sorted_idx][:40]


In [None]:
#params = {'subsample': 1.0, 'scale_pos_weight': 2.5, 'min_child_weight': 2, 'max_depth': 5, 'gamma': 5}
#params = {'subsample': 1.0, 'scale_pos_weight': 1.35, 'min_child_weight': 1, 'max_depth': 6, 'gamma': 5}
params = {'subsample': 0.6, 'scale_pos_weight': 1, 'min_child_weight': 2.0, 'max_depth': 5, 'gamma': 5}
params = {'subsample': 0.8, 'scale_pos_weight': 2, 'min_child_weight': 1, 'max_depth': 3, 'gamma': 0, 'colsample_bytree': 0.4}

#X2_sub = X2[important]
clf_xg_con2_all, fpr_all_con2_all, tpr_all_con2_all, X_train2_all, y_train2_all, X_test2_all, y_test2_all = ROC_All(X2, y1, params, [], True)



In [None]:
#do testing against the current model
import pickle
model = pickle.load(open("angular_portal/xgb/NEWBORN_EXP.bin", "rb"))

def test(model, X_test, y_test):
    model_columns = model._Booster.feature_names
    print(model_columns)
    X_df = X_test.copy()
    X_df_columns = set(X_df.columns)
    for c in model_columns:
        if c not in X_df_columns:
            X_df[c] = -5
    
    if 'gender' in X_df_columns:
        X_df['gender_encode'] = 0
        X_df.loc[X_df['gender'] == 'M', 'gender_encode'] = 1
        X_df['gender'] = X_df['gender_encode']

    X_for_predict = X_df[model_columns]
    X_for_predict = X_for_predict.apply(pd.to_numeric)
    y_predict_score = model.predict_proba(X_for_predict)[:,1]
    y_predict_test = y_predict_score > 0.6
    print(confusion_matrix(y_test, y_predict_test))
    
    fpr, tpr, thresh = roc_curve(y_test, y_predict_score)
    return fpr, tpr, y_predict_score


fpr_all, tpr_all, y_pred_score = test(model, X2, y1)
fp_idx = (y_pred_score > 0.6) & (y1 < 0.5)
#do the fpr, tpr for different consultants
fig, ax = plt.subplots()
ax.plot(fpr_all, tpr_all, label='old model')
ax.plot(fpr_all_con2_all, tpr_all_con2_all, label='new model')

ax.plot([0, 1], [0, 1])

In [None]:
model2 = pickle.load(open("NEWBORN_MODEL_JULY27_2021.bin", "rb"))
fpr_all_con2_all, tpr_all_con2_all, y_pred_score = test(model2, X2, y1)
#do the fpr, tpr for different consultants
fig, ax = plt.subplots()
ax.plot(fpr_all, tpr_all, label='old model')
ax.plot(fpr_all_con2_all, tpr_all_con2_all, label='new model')
ax.legend()
ax.plot([0, 1], [0, 1])

fp_idx = (y_pred_score > 0.5) & (y1 < 0.5)
X[fp_idx]['email']

In [None]:
25/(25+41), 12/(25+41)

In [None]:
v = clf_xg_con2_all.predict_proba(X_test2_all)[:, 1]


In [None]:
num = 0
for idx, (yt, yhat) in enumerate(zip(y_test2_all, v)):
    if yt == 0 and yhat < 0.3:
        print(idx, yt, yhat)
        num += 1
        
num

In [None]:
loc = 470
print(y_test2_all[loc], v[loc])
X_df = X_test2_all.iloc[loc:loc+1, :].copy()
print(X.loc[X_df.index[0], ['email', 'consultant']])
from eli5 import show_prediction
show_prediction(clf_xg_con2_all, X_df, feature_names=X_df.columns.values, show_feature_values=True, targets=[True])

In [None]:
from eli5 import show_weights
show_weights(clf_xg_con2_all)

In [None]:
params = {'subsample': 0.6, 'scale_pos_weight': 4, 'min_child_weight': 5.0, 'max_depth': 10, 'gamma': 5}

clf_xg_con2_all, fpr_all_con2_all, tpr_all_con2_all, _, _, X_test2_all, y_test2_all = ROC_All(X2, y1, params, [], True)


In [None]:
X_test2_all[v > 0.5]

In [None]:
X_df

In [None]:
X_test2_all.reset_index(drop=True)

In [None]:
X_df = X_test2_all[X_test2_all['old_index'] == 173]
from eli5 import show_prediction
show_prediction(clf_xg_con2_all, X_df, feature_names=X_df.columns.values, show_feature_values=True)

In [None]:
import pickle
pickle.dump(clf_xg_con2_all, open('MODEL_USING_CONSULTANT_SURVEY.bin', 'wb'))


In [None]:
fpr_all, tpr_all, y_pred_score = xgb_test(clf_xg_con2_all, X_test2_tmp_tf2, y_test2_tmp_tf2)
fig, ax = plt.subplots()
ax.plot(fpr_all, tpr_all, label='updated model')
ax.plot(fpr_all_con2_tmp_tf2, tpr_all_con2_tmp_tf2, label='trained on just cases with NW <= 6')
ax.plot([0, 1], [0, 1])
ax.legend()

In [None]:
params = {'subsample': 0.8, 'scale_pos_weight': 2.75, 'min_child_weight': 2.0, 'max_depth': 8, 'gamma': 5}
#jen_idx = X1['consultant'] == 'Kristal'
#zero_words = (X_good['nw_number'] <= 6) & (X_good['sleep_training_len'] <= 5) & (X_good['other_len'] <= 5)
#zero_words = X_good['nw_number'] <= 6;
#zero_words = zero_words & (X_good['anything_else'].isnull())
#zero_words = (X_good['nw_number'] <= 6) & (X_good['anything_else'].isnull())
zero_words = (X_good['nw_number'] <= 6)
X2_good = X2[idx_consultant]
X1_jen = X2_good[zero_words]
y1_jen = y1_good[zero_words]

clf_xg_con2_tmp_tf2, fpr_all_con2_tmp_tf2, tpr_all_con2_tmp_tf2, X_train, y_train, X_test2_tmp_tf2, y_test2_tmp_tf2 = ROC_All(X1_jen, y1_jen, params, [], True)


In [None]:
 def diff_linesToWords(text1, text2, delimiter=re.compile('\n')):
    """
        Split two texts into an array of strings.  Reduce the texts to a string
        of hashes where each Unicode character represents one line.

        95% of this function code is copied from `diff_linesToChars` on:
            https://github.com/google/diff-match-patch/blob/895a9512bbcee0ac5a8ffcee36062c8a79f5dcda/python3/diff_match_patch.py#L381

        Copyright 2018 The diff-match-patch Authors.
        https://github.com/google/diff-match-patch
        Licensed under the Apache License, Version 2.0 (the "License");
        you may not use this file except in compliance with the License.
        You may obtain a copy of the License at
          http://www.apache.org/licenses/LICENSE-2.0

        Args:
            text1: First string.
            text2: Second string.
            delimiter: a re.compile() expression for the word delimiter type

        Returns:
            Three element tuple, containing the encoded text1, the encoded text2 and
            the array of unique strings.  The zeroth element of the array of unique
            strings is intentionally blank.
    """
    lineArray = []  # e.g. lineArray[4] == "Hello\n"
    lineHash = {}   # e.g. lineHash["Hello\n"] == 4

    # "\x00" is a valid character, but various debuggers don't like it.
    # So we'll insert a junk entry to avoid generating a null character.
    lineArray.append('')

    def diff_linesToCharsMunge(text):
        """Split a text into an array of strings.  Reduce the texts to a string
        of hashes where each Unicode character represents one line.
        Modifies linearray and linehash through being a closure.
        Args:
            text: String to encode.
        Returns:
            Encoded string.
        """
        chars = []
        # Walk the text, pulling out a substring for each line.
        # text.split('\n') would would temporarily double our memory footprint.
        # Modifying text would create many large strings to garbage collect.
        lineStart = 0
        lineEnd = -1
        while lineEnd < len(text) - 1:
            lineEnd = delimiter.search(text, lineStart)

            if lineEnd:
                lineEnd = lineEnd.start()

            else:
                lineEnd = len(text) - 1

            line = text[lineStart:lineEnd + 1]

            if line in lineHash:
                chars.append(chr(lineHash[line]))
            else:
                if len(lineArray) == maxLines:
                    # Bail out at 1114111 because chr(1114112) throws.
                    line = text[lineStart:]
                    lineEnd = len(text)
                lineArray.append(line)
                lineHash[line] = len(lineArray) - 1
                chars.append(chr(len(lineArray) - 1))
            lineStart = lineEnd + 1
        return "".join(chars)

    # Allocate 2/3rds of the space for text1, the rest for text2.
    maxLines = 666666
    chars1 = diff_linesToCharsMunge(text1)
    maxLines = 1114111
    chars2 = diff_linesToCharsMunge(text2)
    return (chars1, chars2, lineArray)
    
def diff_line(text1, text2):
    dmp = diff_match_patch()
    a = diff_linesToWords(text1, text2, delimiter=re.compile('[.!?] '))
    lineText1 = a[0]
    lineText2 = a[1]
    lineArray = a[2] # .lineArray;

    print(lineArray)
    diffs = dmp.diff_main(lineText1, lineText2, True);
    print(diffs)
    dmp.diff_charsToLines(diffs, lineArray);
   
    #dmp.diff_cleanupSemantic(diffs)
    return diffs

def cleanhtml2(raw_html):
    soup = BeautifulSoup(raw_html)
    for elem in soup.find_all(["p", "div", "br"]):
        elem.replace_with("\n")
    text = soup.get_text()
    text = '  '.join([a for a in text.split('\n') if a.strip()])
    text = ' '.join([a for a in text.split() if a.strip()])
    cleantext = unicodedata.normalize("NFKD", text)
    #return ''.join(cleantext.splitlines())
    return cleantext
    
def get_diff(cid, request_key, output, dbrecs):
    #get the req_key
    added = []
    deleted = []
    card_replaced = []
    added_sentences = defaultdict(list)
    deleted_sentences = defaultdict(list)
    sentenced_replaced = defaultdict(list)
    found_rec = None
    for rec_date, rec in dbrecs[cid].items():
        if 'rec_signature' not in rec:
            continue
        if rec['rec_signature']['creation_sig']['request_key'] == request_key:
            found_rec = rec
            break
    if found_rec is None:
        return added, deleted, card_replaced, added_sentences, deleted_sentences
    
    if 'auto_gen' not in found_rec:
        return added, deleted, card_replaced, added_sentences, deleted_sentences
    source_id = found_rec['auto_gen']['rec_signature']['_id']
    for o in output:
        if o[0]['description'] and 'div' in o[0]['description']:
            continue
        if o[1]['description'] and 'div' in o[1]['description']:
            continue
        
        if o[0]['description'] == '':
            deleted.append(o[1]['description'])
            continue
        
        #check if source sig is right
        skip = False
        for card in found_rec['rec_json']:
            if card['description'] == o[0]['description'] and card['rec_sig']['source'] == 'custom_scheduler':
                skip = True
                break
            if card['description'] == o[0]['description'] and card['type'] == 'Schedule':
                skip = True
                break
            
            if o[1]['description'] == '' and card['description'] == o[0]['description'] and not card['rec_sig']['source'].startswith(source_id):
#                 try:
                    
#                     added.append('+'.join(card['rectag']))                    
#                 except:
#                     added.append(o[0]['description'])
                added.append(o[0]['description'])
                skip = True
                break
            if card['description'] == o[0]['description'] and not card['rec_sig']['source'].startswith(source_id):
                skip = True
                card_replaced.append(o[0]['description'])
                break
        if skip:
            continue
        #check if it is swapped
        if (o[2]['add'] + o[2]['sub']) > 50:
            text0 = None
            text1 = None
            for card in found_rec['rec_json']:
                if card['description'] == o[0]['description']:
                    text0 = card['htmltext']
            
            for card in found_rec['auto_gen']['rec_json']:
                if card['description'] == o[0]['description']:
                    text1 = card['text']
            
            if text0 == None or text1 == None:
                continue
            
            if cid != 'fi6PZbB5MqDf1Ia2r4i4':
                continue
            text0 = cleanhtml2(text0)
            text1 = cleanhtml2(text1)
            diffs = diff_line(text1, text0)
            if cid == 'fi6PZbB5MqDf1Ia2r4i4':
                print(text0)
                print(diffs)
            for d in diffs:
                if d[0] == 1:
                    added_sentences[o[0]['description']].append(d[1].strip())
                if d[0] == -1:
                    deleted_sentences[o[0]['description']].append(d[1].strip())
                    
    return added, deleted, card_replaced, added_sentences, deleted_sentences

In [None]:
no_free_text_idx = (X_good['nw_number'] <= 5) & (X_good['sleep_training_len'] <= 5) & (X_good['anything_else_length'] <= 5) & (X_good['other_len'] == 0)
X_good_no_freetxt = X_good.copy() #X_good[no_free_text_idx].copy()
X_good_no_freetxt['output'] = y1_good

X_good_no_freetxt['section_added'] = ''
X_good_no_freetxt['section_deleted'] = ''
X_good_no_freetxt['section_replaced'] = ''

for idx, r in X_good_no_freetxt[X_good_no_freetxt['output'] == 0].iterrows():
    cid = r['cid']
    req_key = r['req_key']
    output = None
    for t in compare_triage_list1:
        if not t:
            continue
        if t['cid'] == cid and t['creation_sig']['request_key'] == req_key:
            output = t['output']
            break
    if output is None:
        continue
        
    added, deleted, card_replaced, added_sentences, deleted_sentences = get_diff(cid, req_key, output, dbrecs)
    if added:
        if '(optional) wean from the pacifier at sleep times' in '&&'.join([c.strip().lower() for c in added]):
            print(idx, r['cid'])
        X_good_no_freetxt.at[idx, 'section_added'] = '+'.join([c.strip().lower() for c in added])
    if deleted:
        X_good_no_freetxt.at[idx, 'section_deleted'] = ','.join([c.strip().lower() for c in deleted])
    
    if card_replaced:
        X_good_no_freetxt.at[idx, 'section_replaced'] = ','.join([c.strip().lower() for c in card_replaced])
        
    if added_sentences:
        for k, v in added_sentences.items():
            X_good_no_freetxt.at[idx, 'sentences_add_'+k] = ','.join(v)


In [None]:
rec_json = dbrecs['fi6PZbB5MqDf1Ia2r4i4']['1587936774']['rec_json']
for card in rec_json:
    if card['description'] == 'Explanation of schedule':
        break

soup = BeautifulSoup(card['htmltext'])
for elem in soup.find_all(['p', 'div', 'br']):
    elem.replace_with("\n")
    
text = soup.get_text()
text = '\n'.join([a for a in text.split('\n') if a.strip()])
text
#cleantext = unicodedata.normalize("NFKD", cleantext)

In [None]:
X_good_no_freetxt[~X_good_no_freetxt['sentences_add_Explanation of schedule'].isnull()][['cid', 'email','sentences_add_Explanation of schedule']]

In [None]:
nap_columns = [c for c in X_good_no_freetxt.columns if 'nap' in c.lower() and 'diffSection' not in c]
tmp_df2.loc[tmp_df2['faun4+faun20+faun13+faun14+faun19'] == 1, ['consultant', 'cid'] + nap_columns]
tmp_df2.loc[tmp_df2['faun4+faun20+faun13+faun14+faun19'] == 1, ['consultant', 'email', 'cid', 'catParentSleepHelpBedTime', 'catNapTimeDurInconsistent', 'catParentSleepHelpNaps']]


In [None]:
print(X_good_no_freetxt.loc[2407:2408]['section_added'].str.contains('(optional) wean from the pacifier at sleep times'))
sum(X_good_no_freetxt['section_added'].str.lower().str.contains('(optional) wean from the pacifier at sleep times'))

In [None]:
d = set()
for x in X_good_no_freetxt['section_added'].str.split('+').values:
    for a in x:
        if a == '(optional) wean from the pacifier at sleep times':
            print(x)
        d.add(a)
d

In [None]:
for a in d:
    if not a:
        continue
    print("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
    changed_idx = X_good_no_freetxt['output'] == 0
    institute_nap_hour = X_good_no_freetxt['section_added'].str.lower().str.contains(a.lower(), regex=False)
    print(a, ":", sum(institute_nap_hour))
    print("changed:", sum(changed_idx), a + " changed:", sum(changed_idx & institute_nap_hour))
    consultant = True #True | X_good_no_freetxt['consultant'] == 'Liz'
    institute_nap_hour = consultant & institute_nap_hour
    print(sum(changed_idx & institute_nap_hour))
    print(X_good_no_freetxt.loc[(changed_idx & institute_nap_hour) & priority_naps, ['uiCannotStandCrying', 'catShortNaps', 'catParentSleepHelpNaps', 'uiPriorityNaps']].sum())
    X_good_no_freetxt.loc[(changed_idx & institute_nap_hour) & priority_naps, ['cid', 'age', 'consultant', 'section_added', 'uiCannotStandCrying', 'catShortNaps', 'catParentSleepHelpNaps', 'uiPriorityNaps']]


In [None]:
changed_idx = X_good_no_freetxt['output'] == 0
institute_nap_hour = X_good_no_freetxt['section_added'].str.lower().str.contains("middle of the night wakings", regex=False)
print("changed:", sum(changed_idx), "changed with nap hour:", sum(changed_idx & institute_nap_hour))
consultant = True #True | X_good_no_freetxt['consultant'] == 'Liz'
institute_nap_hour = consultant & institute_nap_hour
print(sum(changed_idx & institute_nap_hour))
print(X_good_no_freetxt.loc[(changed_idx & institute_nap_hour) & priority_naps, ['uiCannotStandCrying', 'catShortNaps', 'catParentSleepHelpNaps', 'uiPriorityNaps']].sum())
X_good_no_freetxt.loc[(changed_idx & institute_nap_hour) & priority_naps, ['email', 'cid', 'age', 'consultant', 'section_added', 'catReqNightAssistance', 'catShortNaps', 'catParentSleepHelpNaps', 'uiPriorityNaps']]

In [None]:
y_pred = clf_xg_con2_tmp_tf2.predict_proba(X_test2_tmp_tf2)
tmp = (y_test2_tmp_tf2 == 0) & (y_pred[:, 1] > 0.7)
idx = X_test2_tmp_tf2[tmp].index
X_good1 = X_good.loc[idx]
consultant = 'Amy'
daniella = X_good1['consultant'] == consultant
X_good3 = X_good[X_good['consultant'] == consultant].copy()
X_good3['output'] = y1_good[X_good['consultant'] == consultant]
print(len(X_good1[daniella]))
X_good1[daniella][['email', 'cid', 'uid']]


In [None]:
tmp_case = X_test2_tmp_tf2.loc[637]
show_prediction(clf_xg_con2_tmp_tf2, tmp_case, feature_names=X_test2_tmp_tf2.columns.values, show_feature_values=True, targets=[True])

In [None]:
show_prediction(clf_xg_con2_tmp_tf2, X_test2_tmp_tf2.iloc[8:9], feature_names=X_test2_tmp_tf2.columns.values, show_feature_values=True, targets=[True])

In [None]:
params = {'subsample': 1.0, 'scale_pos_weight': 8.5, 'min_child_weight': 3.0, 'max_depth': 6, 'gamma': 5}
clf_xg_con2_tmp_tf2, fpr_all_con2_tmp_tf2, tpr_all_con2_tmp_tf2, _, _, X_test2_tmp_tf2, y_test2_tmp_tf2 = ROC_All(X2, y1, params, [], True)


In [None]:
#choose a different scorer that increases recall
from sklearn.metrics import fbeta_score, make_scorer
ftwo_scorer = make_scorer(fbeta_score, beta=5)

import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import roc_auc_score, confusion_matrix, roc_curve
from matplotlib import pyplot as plt

def xgb_using_cv(x_train, y_train, param_comb=100):
    clf_xg = XGBClassifier(learning_rate=0.05, n_estimators=1000, 
                           objective='binary:logistic', batchseed=0, nthread=1)
    params = {
        'min_child_weight': [1, 2.0, 3.0],
        'gamma': [0, 1, 2.5, 5],
        'subsample': [0.6, 0.8, 1.0],
        'max_depth': [6, 8, 10, 12],
        'scale_pos_weight': [1.5, 2, 2.5, 2.75]
        }

    folds = 3
   
    skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = 10)

    random_search = RandomizedSearchCV(clf_xg, param_distributions=params, n_iter=param_comb, 
                                       scoring=ftwo_scorer, n_jobs=4, cv=skf.split(x_train, y_train), 
                                       verbose=3, random_state=0)
    random_search.fit(x_train, y_train)
    print(random_search.best_params_)
    #now we fit the classifier on the training data
    clf_xg.set_params(**random_search.best_params_)
    clf_xg.set_params(nthread=4)
    clf_xg.fit(x_train, y_train)
    return clf_xg

def xgb_test(clf_xg, x_test, y_test):
    y_predict_test = clf_xg.predict(x_test)
    
    print(confusion_matrix(y_test, y_predict_test))
    y_predict_score = clf_xg.predict_proba(x_test)[:,1]
    fpr, tpr, thresh = roc_curve(y_test, y_predict_score)
    return fpr, tpr, y_predict_score


def clean_data(X, y, consultants):
    df1 = X.fillna(-5)

    for c in df1.columns:
        if 'free_text' in c or 'night_start_mis' in c or 'sleep_training' in c:
            continue
        if 'diffSection' in c: 
            df1 = df1.drop(columns=c)
        if 'consultant_' in c:
            consultant = c.split('_')[1]
            if consultants and consultant not in consultants:
                df1 = df1.drop(columns=c)
            if not consultants:
                df1 = df1.drop(columns=c)
            
    if consultants:
        consultant_indices = df1['consultant'].isin(consultants)
        df1 = df1[consultant_indices]
        y2 = y[consultant_indices]
    else:
        y2 = y
    
    if 'consultant' in df1.columns:
        df1 = df1.drop(columns='consultant')
    df1 = df1.apply(pd.to_numeric)
    
    return df1, y2


def split_data(X, y, use_random=True):
    np.random.seed(10)
    if use_random:
        mask = np.random.rand(len(X)) < 0.75
    else:
        mask = np.arange(0, len(X)) < 0.8*len(X)
    
    X_train, y_train = X[mask], y[mask]
    X_test, y_test = X[~mask], y[~mask]
    return X_train, y_train, X_test, y_test

def ROC_All(X1, y1, params, consultants=[], random_split=True):
    X, y = clean_data(X1, y1, consultants)
    X_train, y_train, X_test, y_test = split_data(X, y, random_split)
    if params:
        clf_xg = XGBClassifier(learning_rate=0.05, n_estimators=1000, 
                           objective='binary:logistic', seed=0, nthread=8)
        clf_xg.set_params(**params)
        clf_xg.fit(X_train, y_train)
    else:
        clf_xg = xgb_using_cv(X_train, y_train, 100)
    fpr_all, tpr_all, y_pred_score = xgb_test(clf_xg, X_test, y_test)
    
    #do the fpr, tpr for different consultants
    fig, ax = plt.subplots()
    ax.plot(fpr_all, tpr_all, label='consultant_feature')
    ax.plot([0, 1], [0, 1])
    
    for c in ['Amy', 'Kristal', 'Eileen', 'Liz', 'Jen', 'Amber', 'Heather', 'Jessica']:
        if 'consultant_'+c not in X_test.columns:
            continue
        indices = X_test['consultant_'+c] == 1
        if len(y_test[indices]) == 0:
            continue
        fpr, tpr, _ = roc_curve(y_test[indices], y_pred_score[indices])
        ax.plot(fpr, tpr, label=c)
    ax.legend(loc='lower right')
    return clf_xg, fpr_all, tpr_all, X, y, X_test, y_test



In [None]:
params = {}
clf_xg_con2_tmp_tf2, fpr_all_con2_tmp_tf2, tpr_all_con2_tmp_tf2, _, _, X_test2_tmp_tf2, y_test2_tmp_tf2 = ROC_All(X2, y1, params, [], True)


In [None]:
len(y[y == 1])/len(y)

In [None]:
fig, ax = plt.subplots()
ax.plot(fpr_all_con2_tmp_tf, tpr_all_con2_tmp_tf, label='correct tf features')
ax.plot(fpr_all_con2, tpr_all_con2, label='incorrect tf features')
ax.legend()

In [None]:
test = X2.iloc[[366]]
features = clf_xg_con2._Booster.feature_names
test = test.apply(pd.to_numeric)
clf_xg_con2.predict_proba(test)

In [None]:
import pickle
pickle.dump(clf_xg_con2_tmp_tf, open('FIRST_TIME_CASE_PREDICT_CHANGE_CORRECT_TRIAGE_FEATURES.bin', 'wb'))


In [None]:
import pickle
import pandas as pd
import xgboost as xgb

import logging
logging.basicConfig()
logger = logging.getLogger("")

class XGBRecClassifier(object):
    def __init__(self, saved_model_filepath):
        """ 
        Initialize the class.
        
        Parameters
        -----------
        saved_model_filepath - Path to the pickle XGB model
        """
        
        self.model = pickle.load(open(saved_model_filepath, 'rb'))
    
    def _parse_request(self, rec):
        rec_request_free_text2 = []
        if 'questionnaire' not in rec:
            return rec_request_free_text2
        
        user_q = rec['questionnaire']
        for q_key, ques in user_q.items():
            if 'answer' in ques:
                ans = ques['answer']
                if "multi" in ques['type']:
                    for multi in ans:
                        if "_other" in multi['label']:
                            other_answer = multi['encode']
                            if other_answer != 'n/a' and other_answer != 'none' and other_answer != 'no' and other_answer != '':
                                rec_request_free_text2.append((q_key, multi['label'], other_answer))
                elif "label" in ques:
                    if"_other" in ans['label']:
                        other_answer = ans['encode'].strip().lower()
                        if other_answer != 'n/a' and other_answer != 'none' and other_answer != 'no' and other_answer != '':
                            rec_request_free_text2.append((q_key, ans['label']), other_answer)
                elif "textarea" in ques['type']:
                    textarea_answer = ans['encode'].strip().lower()
                    if textarea_answer != 'n/a' and textarea_answer != 'none' and textarea_answer != 'no' and textarea_answer != '':
                        rec_request_free_text2.append((q_key, 'textarea', textarea_answer))
        return rec_request_free_text2
    
    
    def _has_warning(self, rec_json):
        for card in rec_json:
            if (card['type'] not in ['Schedule', 'Divider'] and
                card['description'].strip() not in ['Explanation of schedule', 'Sleep Profile', 'Additional Notes']):

                if card.get('warn', 0) == 1:
                    return 1
        return 0
    
    def _bad_schedule(self, rec_json):
        for card in rec_json:
            if card['type'] == 'Schedule':
                for entry in card['entries']:
                    if entry['description'].lower() != 'note':
                        continue
                    sentences = ['fail', 'failed', 
                                 "total sleep hours in the schedule is less than the minimum required sleep",
                                 "schedule does not follow user's reported parameters", "total night hours is"]
                    for sen in sentences:
                        if sen in entry['time'].lower():
                            return 1
        return 0
            
    def predict(self, full_rec, rec_request, triage_features):
        """
        For a given recommendation, recommendation request, and triage features
        determine if the recommendation is EXP and the probability of it being EXP.
        Probability ranges from 0 to 1, with 1 being 100% confident.
        
        Parameters
        ------------
        full_rec - The full recommendation including 'auto_gen', 'rec_json', 'rec_signature'
        rec_request - The request that is associated with the full_rec
        triage_features - Dictionary of features that shows up in triage like SA, VIP, etc.

        Returns
        --------
        isEXP - bool
           Whether the recommendation is EXP
        prob - float
           Confidence of recommendation being EXP

        """
        
        try:
            X = full_rec['rec_signature']['searchArray']
            X['other_len'] = 0
            X['sleep_training_len'] = 0
            rec_req_free_txt2 = self._parse_request(rec_request)
            
            if rec_req_free_txt2:
                X['free_text'] = 1
                sleep_training_note = ''
       
                for (qkey, label, ans) in rec_req_free_txt2:
                    if qkey == '30173308':
                        sleep_training_note = ans
                
                other_ans = []
                for (qkey, label, ans) in rec_req_free_txt2:
                    if '_other' in label:
                        other_ans.append(ans)
                X['other_len'] = len(' '.join(other_ans))
                X['sleep_training_len'] = len(sleep_training_note)
                      
            if 'anything_else' in X:
                X['anything_else_length'] = len(X['anything_else'])
            else:
                X['anything_else_length'] = 0
            for k, v in triage_features.items():
                X[k] = v
            
            X['bad_schedule'] = self._bad_schedule(full_rec['rec_json'])
            X['has_warning'] = self._has_warning(full_rec['rec_json'])
            
            X_df = pd.DataFrame([X])
            #FIXME: this part is a hack because ideally the feature we have matches what is in the model
            model_columns = self.model._Booster.feature_names
            X_df_columns = set(X_df.columns)
            for c in model_columns:
                if c not in X_df_columns:
                    X_df[c] = -5
            
            X_df['gender_encode'] = 0
            X_df.loc[X_df['gender'] == 'M', 'gender_encode'] = 1
            X_df['gender'] = X_df['gender_encode']
            X_df = X_df[model_columns]
            
           
            X_df = X_df.apply(pd.to_numeric)
            probability = self.model.predict_proba(X_df).flatten()[1] 
            if X['bad_schedule'] == 1 or X['has_warning'] == 1:
                return False, round(probability, 2), X_df
            else:
                return probability > 0.5, round(probability, 2), X_df
            
        except Exception as e:
            logger.exception("Error calling predict")
            return False, 0, None



In [None]:
#get triages that have been generated and is first time user
#get current data that are in the queue and see which can be send out without changes
def GetGeneratedTriages():
    ps_connection = pool.getconn()
    ps_cursor = ps_connection.cursor(cursor_factory = psycopg2.extras.NamedTupleCursor)

    q = """  
        SELECT t.*, hist.id, hist.details, hist.agent, hist.time as hist_time, hist.duration, hist.notes 
        FROM triage as t RIGHT JOIN triage_history as hist on t.triage_key = hist.triage_key 
        WHERE t.status = %s AND t.age >= 4 ORDER BY t.time desc;
        """

    ps_cursor.execute(q, ('generated',))

    triage_dict = {}
    for rec in ps_cursor:
        t = rec._asdict()
        key = t['triage_key']
        if key not in triage_dict:
            triage_dict[key] = {'uid': t['uid'],
                                'cid': t['cid'],
                                'age': t['age'],
                                'sql_key': key,
                                'status': t['status'],
                                'due': t['due'],
                                'time': t['time'],
                                'turnaround_time': t['turnaround_time'],
                                'rec_request_key': t['rec_request_key'],
                                'request': t['request'],
                               }
            if t['assignee']:
                triage_dict[key]['assignee'] = t['assignee']
            if t['generate']:
                triage_dict[key]['generate'] = t['generate']
                
            triage_dict[key]['history'] = {}
    
        if t['details'] == 'recs_history':
            triage_dict[key]['recs_history'] = t['notes']
        else:            
            history = triage_dict[key]['history']
            history_id = t['id']
            history[history_id] = {'time': t['hist_time']}
            if t['agent']:
                history[history_id]['agent'] = t['agent']
            if t['notes']:
                history[history_id]['notes'] = t['notes']
            if t['details']:
                history[history_id]['details'] = t['details']
            if t['duration']:
                history[history_id]['duration'] = t['duration']
        
    pool.putconn(ps_connection)
    
    #we also need to add in recs_history
    return triage_dict

reorder_triages = GetGeneratedTriages()

In [None]:
first_time_triage, req_key_to_triage, triage_to_consultant = GetFirstReq(reorder_triages)

In [None]:
len(first_time_triage)

In [None]:
def update_triage_history(triage_key, isEXP, prob, agent='XGB'):
    id_ = dn.Delorean().epoch
    post_time = int(id_)
    id_ = post_time
    
    if not isEXP:
        return
    notes = 'EXP={0:.2f}'.format(prob)
    q = ('INSERT INTO triage_history (triage_key, id, details, agent, time, notes) VALUES '
         '(%(key)s, %(id)s, %(details)s, %(agent)s, %(time)s, %(notes)s)')
    params = {}
    params['key'] = triage_key
    params['id'] = id_
    params['details'] = 'XGB Prediction'
    params['agent'] = agent
    params['time'] = post_time
    params['notes'] = notes
    print(params)
    try:
        ps_connection = pool.getconn()
        ps_cursor = ps_connection.cursor(cursor_factory = psycopg2.extras.DictCursor)
        ps_cursor.execute(q, params)
        ps_connection.commit()
        ps_cursor.close()
        
        if (ps_cursor.rowcount > 0):
            return True
        else:
            return False
    except Exception as e:
        logger.exception("error updating triage history")
    finally:
        pool.putconn(ps_connection)


In [None]:
def del_triage_history(triage_key, id_):
    q = "DELETE FROM triage_history where triage_key={} and id = '{}' and agent='XGB'".format(triage_key, id_)
    try:
        ps_connection = pool.getconn()
        ps_cursor = ps_connection.cursor(cursor_factory = psycopg2.extras.DictCursor)
        ps_cursor.execute(q)
        ps_connection.commit()
        ps_cursor.close()
        print(q)
    except Exception as e:
        logger.exception("error deleting")
    finally:
        pool.putconn(ps_connection)
        
def del_xgb(triages):
    num_deleted = 0
    for triage_key, triage in triages.items():
        skip = True
        id_ = ''
        for k, v in triage['history'].items():
            if 'agent' in v and 'details' in v and v['agent'] == 'XGB' and v['details'] == 'XGB Prediction':
                id_ = str(k)
                skip = False
                break
        
        if not skip:
            print(triage_key, id_, triage['cid'])
            del_triage_history(triage_key, id_)
            num_deleted += 1
        
xgb_triages = GetGeneratedTriages()
del_xgb(xgb_triages)

In [None]:
X_df = X_test2_all.copy()
xgbclassifier = XGBRecClassifier("FIRST_TIME_CASE_PREDICT_CHANGE_CORRECT_TRIAGE_FEATURES.bin")
model = xgbclassifier.model
model_columns = xgbclassifier.model._Booster.feature_names
X_df_columns = set(X_df.columns)
for c in model_columns:
    if c not in X_df_columns:
        print(c)
        X_df[c] = -5
            
X_df['gender_encode'] = 0
X_df.loc[X_df['gender'] == 'M', 'gender_encode'] = 1
X_df['gender'] = X_df['gender_encode']
X_df = X_df[model_columns]
            
X_df = X_df.apply(pd.to_numeric)
probability = xgbclassifier.model.predict_proba(X_df)[:, 1] 

fpr, tpr, thresh = roc_curve(y_test2_all, probability)
fig, ax = plt.subplots()
ax.plot(fpr, tpr, label='old model')
ax.plot(fpr_all_con2_all, tpr_all_con2_all, label='new model')
ax.legend()

In [None]:
import time
xgbclassifier = XGBRecClassifier("FIRST_TIME_CASE_PREDICT_CHANGE_CORRECT_TRIAGE_FEATURES.bin")
new_triages = GetGeneratedTriages()
first_time_triage, req_key_to_triage, triage_to_consultant = GetFirstReq(new_triages)

num = 0
for sql_key in first_time_triage:
    triage = new_triages[sql_key]
    skip = False
    for k, v in triage['history'].items():
        if 'agent' in v and v['agent'] == 'XGB':
            skip = True
            break
#     if skip:
#         continue

    if 'cid' not in triage or triage['cid'] == None: 
        continue
    
    if not triage['cid'].startswith('yQPQ'):
        continue
        
    rec_request = db.reference('rec_requests/{uid}/{rec_req_key}'.format(uid=triage['uid'], 
                                                                         rec_req_key=triage['rec_request_key']))
    rec_request = rec_request.get()
    full_recs = db.reference('draft_recommendations/{cid}'.format(cid=triage['cid'])).get()
    if not full_recs:
        continue
    num_rec = 0
    try:
        for recdate, rec in full_recs.items():
            num_rec += 1
    except Exception as e:
        print(triage, triage['cid'])
        continue
    
    if num_rec > 1:
        logger.warning("not first time user")
        continue
    if num_rec == 1:
        #perform classification
        _, triage_features = case_classifier(triage)
        isEXP, prob, X_df = xgbclassifier.predict(rec, rec_request, triage_features)
        if isEXP:
            print(isEXP, prob, triage, triage_features)
            time.sleep(2)
            #update_triage_history(triage['sql_key'], isEXP, prob)
            num += 1
        

In [None]:
from eli5 import show_prediction
show_prediction(xgbclassifier.model, X_df, feature_names=X_df.columns.values, show_feature_values=True)