In [1]:
import pandas as pd
import numpy as np
from pandasql import sqldf

In [2]:
def avg_time_between_occurrences(data, include_start=True):
    """Summarise, per event id, the average gap between its occurrences.

    Args:
        data: DataFrame with ``event_id`` and ``event_time_dbl`` columns.
            Rows are used in their existing order; nothing is sorted here.
        include_start: When true, a timestamp of ``0`` is placed before the
            first occurrence so the gap leading up to it is summed as well.
            When false, only gaps between real occurrences count and an
            event seen once gets ``NaN``.

    Returns:
        DataFrame with one row per distinct event id (ascending) and the
        columns ``event_id``, ``avg_time_between`` and ``number`` (the
        occurrence count).
    """
    results = {'event_id': [], 'avg_time_between': [], 'number': []}
    for eid in sorted(set(data.event_id)):
        # Work on a plain array: the filtered frame keeps the original row
        # labels, so label-based lookups by position would hit the wrong
        # rows (or raise KeyError).
        times = data.loc[data.event_id == eid, 'event_time_dbl'].to_numpy()
        count = len(times)
        results['event_id'].append(eid)
        results['number'].append(count)

        if include_start:
            gaps = np.diff(np.concatenate(([0], times)))
            # NOTE(review): ``count`` gaps are summed here but the divisor is
            # ``count - 1`` (min 1); kept as-is for backward compatibility --
            # confirm whether dividing by ``count`` was intended.
            avg = gaps.sum() / max(count - 1, 1)
        elif count > 1:
            avg = np.diff(times).sum() / (count - 1)
        else:
            # A single occurrence has no gap to measure.
            avg = np.nan
        results['avg_time_between'].append(avg)
    return pd.DataFrame(results)

In [3]:
def avg_time_between_ids(data, start_id, end_id, start_first=True, end_first=False):
    """Average elapsed time from a ``start_id`` event to an ``end_id`` event.

    Only rows whose ``event_id`` equals ``start_id`` or ``end_id`` are
    considered, in their existing row order.

    Args:
        data: DataFrame with ``event_id`` and ``event_time_dbl`` columns.
        start_id: Event id that opens an interval.
        end_id: Event id that closes an interval.
        start_first: When true, the first start seen is kept while waiting
            for an end; when false, each later start replaces it.
        end_first: When true, the interval closes at the first end after the
            start. When false, the interval keeps extending to the latest
            end seen, and closes when the next start arrives or the data
            runs out.

    Returns:
        The mean interval length, or ``NaN`` when no complete start/end
        pair exists.
    """
    relevant = data[(data.event_id == start_id) | (data.event_id == end_id)]

    start_time = None  # None means "no open interval"
    end_time = None    # None means "no candidate end yet" (end_first=False only)
    durations = []

    for event_id, event_time in zip(relevant.event_id, relevant.event_time_dbl):
        if event_id == start_id:
            if start_time is not None and end_time is not None and not end_first:
                # A new start finalises the interval that was being extended.
                durations.append(end_time - start_time)
                start_time = end_time = None
            if not start_first or start_time is None:
                start_time = event_time
        elif event_id == end_id:
            if start_time is None:
                # End without a preceding start: nothing to measure.
                continue
            if end_first:
                durations.append(event_time - start_time)
                start_time = end_time = None
            else:
                end_time = event_time

    # Flush the interval still pending when the data ends; without this the
    # last pair is lost and a single pair yields no result at all.
    if start_time is not None and end_time is not None:
        durations.append(end_time - start_time)

    if not durations:
        return np.nan
    return sum(durations) / len(durations)

In [4]:
def events_between_times(data, start=None, end=None, event_ids=None, span=None):
    """Count events in a time window, optionally as a rate per ``span``.

    Args:
        data: DataFrame with ``event_id`` and ``event_time_dbl`` columns.
        start: Inclusive lower time bound; ``None`` means no lower bound.
        end: Inclusive upper time bound; ``None`` means no upper bound.
        event_ids: Iterable of event ids to keep; ``None`` keeps every id.
        span: When given, the count is divided by the number of ``span``
            units between the earliest and latest *remaining* event.

    Returns:
        The number of matching rows when ``span`` is ``None``; otherwise
        ``count / ((latest - earliest) / span)``.
    """
    if event_ids is not None:
        # Vectorised membership test; also keeps the columns intact when
        # ``data`` is empty (an empty Python list mask would select zero
        # columns instead of zero rows).
        data = data[data.event_id.isin(list(event_ids))]
    if start is not None:
        data = data[data.event_time_dbl >= start]
    # The rate is measured over the observed range, so the requested bounds
    # are replaced by the earliest/latest timestamps that survived filtering.
    start = np.min(data.event_time_dbl)
    if end is not None:
        data = data[data.event_time_dbl <= end]
    end = np.max(data.event_time_dbl)

    count = data.shape[0]
    if span is None:
        return count
    # NOTE(review): when all remaining events share one timestamp the
    # observed range is zero and this evaluates to inf/NaN -- confirm
    # callers expect that.
    return count / ((end - start) / span)

In [5]:
def events_between_times_squared(data, large_span, start=None, end=None, event_ids=None, span=None):
    """Run ``events_between_times`` over consecutive windows of ``large_span``.

    Windows are half-open, ``[start + k*large_span, start + (k+1)*large_span)``,
    and one is produced for every window start that is ``<= end``. For
    integer timestamps this selects the same rows as the inclusive
    ``[i, i + large_span - 1]`` windows; for fractional timestamps it leaves
    no gap between windows.

    Args:
        data: DataFrame with ``event_id`` and ``event_time_dbl`` columns.
        large_span: Positive width of each window, in time units.
        start: First window start; defaults to the earliest timestamp.
        end: Last admissible window start; defaults to the latest timestamp.
        event_ids: Forwarded to ``events_between_times``.
        span: Forwarded to ``events_between_times``.

    Returns:
        List with one ``events_between_times`` result per window.

    Raises:
        ValueError: If ``large_span`` is not positive.
    """
    if large_span <= 0:
        raise ValueError("large_span must be positive")
    if start is None:
        start = np.min(data.event_time_dbl)
    if end is None:
        end = np.max(data.event_time_dbl)

    nums = []
    window_index = 0
    window_start = start
    # A manual loop instead of range(): range() only accepts integers, while
    # timestamps (and therefore the default bounds) may be floats.
    while window_start <= end:
        # Multiply instead of accumulating to avoid float drift.
        window_end = start + (window_index + 1) * large_span
        in_window = (data.event_time_dbl >= window_start) & (data.event_time_dbl < window_end)
        nums.append(events_between_times(data[in_window], event_ids=event_ids, span=span))
        window_index += 1
        window_start = window_end
    return nums

In [6]:
def get_minigame_data(dataset):
    """Annotate an event log with per-minigame summary columns.

    The rows of a copy of ``dataset`` are walked in order while a running
    state is maintained. Event ids drive the state:

    * ``1004`` sets the minigame start time to the row's timestamp.
    * ``1000`` increments the restart counter.
    * ``1005`` adds ``new_skill_point - old_skill_point`` to the star tally.
    * ``1001`` and ``1002`` close the minigame: that row receives the
      elapsed time, restart count, star tally and number of events seen
      since the previous close, with quit/completed flags of ``0``/``1``
      for ``1001`` and ``1``/``0`` for ``1002``. The state is then reset.

    For ``1000``, ``1001``, ``1002`` and ``1005`` the row's own timestamp is
    used as the start time when none is set. Every row that does not close
    a minigame gets ``NaN`` in all six new columns.

    Args:
        dataset: DataFrame with ``event_id`` and ``event_time_dbl`` columns,
            plus ``new_skill_point``/``old_skill_point`` for ``1005`` rows.

    Returns:
        A copy of ``dataset`` with the ``minigame_*`` columns appended.
    """
    data = dataset.copy()

    tracked = {
        "minigame_time_elapsed": [],
        "minigame_restarts": [],
        "minigame_quit": [],
        "minigame_additional_stars": [],
        "minigame_events": [],
        "minigame_completed": [],
    }
    blank = (np.nan,) * len(tracked)

    def record(values):
        # One value per output column, in the column order of ``tracked``.
        for bucket, value in zip(tracked.values(), values):
            bucket.append(value)

    started_at = None
    restart_count = 0
    stars_gained = 0
    events_seen = 0

    for _, row in data.iterrows():
        events_seen += 1
        event = row["event_id"]

        if event == 1001 or event == 1002:
            if started_at is None:
                started_at = row["event_time_dbl"]
            gave_up = 1 if event == 1002 else 0
            record((
                row["event_time_dbl"] - started_at,
                restart_count,
                gave_up,
                stars_gained,
                events_seen,
                1 - gave_up,
            ))
            # The minigame is closed; start counting afresh.
            started_at = None
            restart_count = stars_gained = events_seen = 0
            continue

        if event == 1004:
            started_at = row["event_time_dbl"]
        elif event == 1000:
            if started_at is None:
                started_at = row["event_time_dbl"]
            restart_count += 1
        elif event == 1005:
            if started_at is None:
                started_at = row["event_time_dbl"]
            stars_gained += row["new_skill_point"] - row["old_skill_point"]
        record(blank)

    for column, values in tracked.items():
        data[column] = values
    return data

In [7]:
def minigame_agg(dataset, player_id=None):
    """Aggregate the per-minigame columns into one row per player.

    If ``dataset`` lacks the ``minigame_time_elapsed`` column it is first
    passed through ``get_minigame_data``. Only rows with a non-null
    ``minigame_time_elapsed`` contribute to a player's totals.

    Args:
        dataset: DataFrame with a ``player_id`` column and either the
            ``minigame_*`` columns or the raw columns ``get_minigame_data``
            needs to derive them.
        player_id: Restrict the result to this player; ``None`` reports
            every distinct ``player_id`` in order of first appearance.

    Returns:
        DataFrame with the columns ``player_id``, ``minigames_attempted``,
        ``minigames_completed``, ``minigame_total_time``,
        ``minigame_total_restarts``, ``minigame_total_quits``,
        ``minigame_total_stars`` and ``minigame_total_events``. A player
        without finished minigames gets a row of zeros; input without any
        player yields an empty frame with these columns.
    """
    data = dataset  # never mutated here, so no defensive copy is needed
    if "minigame_time_elapsed" not in data.columns:
        data = get_minigame_data(data)
    ids = data.player_id.unique() if player_id is None else [player_id]

    # output column -> source column that is summed into it
    totals = {
        "minigames_completed": "minigame_completed",
        "minigame_total_time": "minigame_time_elapsed",
        "minigame_total_restarts": "minigame_restarts",
        "minigame_total_quits": "minigame_quit",
        "minigame_total_stars": "minigame_additional_stars",
        "minigame_total_events": "minigame_events",
    }
    finished = data.dropna(subset=["minigame_time_elapsed"])

    # Collect plain dicts and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and was quadratic before that.
    rows = []
    for player in ids:
        sub_data = finished[finished["player_id"] == player]
        row = {"player_id": player, "minigames_attempted": sub_data.shape[0]}
        for out_column, source_column in totals.items():
            # skipna=False keeps the behaviour of a plain running sum: a NaN
            # in a finished row propagates to the total.
            row[out_column] = sub_data[source_column].sum(skipna=False)
        rows.append(row)

    return pd.DataFrame(rows, columns=["player_id", "minigames_attempted", *totals])

In [1]:
def elbow_plot(data, maxK=10, seed_centroids=None):
    """Fit KMeans for k = 1 .. maxK-1 and plot inertia (SSE) against k.

    Args:
        data: DataFrame of features. A ``clusters`` column is written to it
            holding the labels of the most recent fit; an existing
            ``clusters`` column is not used as a feature.
        maxK: Exclusive upper bound for the number of clusters tried.
        seed_centroids: Optional pandas object whose first ``k`` entries
            (``.head(k)``) are reshaped to ``(k, 1)`` and used as initial
            centroids. NOTE(review): that shape only fits single-feature
            data -- confirm against callers.

    Returns:
        None. The elbow curve is shown with matplotlib.
    """
    # Fit on a view without the label column so the labels written below
    # never become an input feature of the next, larger k.
    features = data.drop(columns=["clusters"], errors="ignore")

    sse = {}
    for k in range(1, maxK):
        print("k: ", k)
        if seed_centroids is not None:
            seeds = np.asarray(seed_centroids.head(k)).reshape((k, 1))
            model = KMeans(n_clusters=k, max_iter=500, n_init=100, random_state=0, init=seeds)
        else:
            model = KMeans(n_clusters=k, max_iter=300, n_init=100, random_state=0)
        kmeans = model.fit(features)
        data["clusters"] = kmeans.labels_
        sse[k] = kmeans.inertia_

    # Draw once, after every k has been evaluated.
    plt.figure()
    plt.plot(list(sse.keys()), list(sse.values()))
    plt.show()
    return

In [None]:
def runModel(model, xt, yt, xv, yv):
    """Fit ``model``, print validation metrics, save a ROC plot, cross-validate.

    Steps, in order: fit on ``(xt, yt)``; predict on ``xv``; print a
    confusion matrix, accuracy and weighted recall/precision/F1 for the
    predictions obtained by thresholding the second ``predict_proba``
    column at ``Y_PRED_PROB_THRESH``; print the classification report for
    ``model.predict``; print ROC AUC; save the ROC curve to
    ``f"{model}.png"``; print recall-scored cross-validation results for
    ``cv=CROSS_VAL_SPLIT`` and for a ``StratifiedShuffleSplit``.

    Args:
        model: Estimator exposing ``fit``, ``predict`` and ``predict_proba``.
        xt: Training features.
        yt: Training targets.
        xv: Validation features.
        yv: Validation targets.

    Returns:
        None. Results are printed and the ROC figure is written to disk.
    """
    model.fit(xt, yt)
    y_pred = model.predict(xv)
    # Second column of predict_proba; presumably the positive-class
    # probability of a binary classifier -- TODO confirm.
    y_pred_prob = model.predict_proba(xv)[:, 1]
    # Threshold once and reuse for every thresholded metric below.
    y_pred_thresh = y_pred_prob > Y_PRED_PROB_THRESH

    labels = np.unique(yv)
    cm = confusion_matrix(yv, y_pred_thresh, labels=labels)
    print(pd.DataFrame(cm, index=labels, columns=labels))
    print(f"{accuracy_score.__name__} : {accuracy_score(yv, y_pred_thresh)}")
    for func in [recall_score, precision_score, f1_score]:
        print(f"{func.__name__} :  {func(yv, y_pred_thresh, average = 'weighted')}")

    # NOTE(review): the report uses model.predict (the estimator's own
    # decision rule), not the Y_PRED_PROB_THRESH predictions used above.
    print(metrics.classification_report(yv, y_pred))

    # ROC metrics reuse the probabilities computed above.
    auc = roc_auc_score(yv, y_pred_prob)
    print(f"ROC AUC : {auc:.3f}")
    plt.rcParams["figure.figsize"]=(10, 5)
    plt.figure()
    m_fpr, m_tpr, _ = roc_curve(yv, y_pred_prob)
    plt.plot(m_fpr, m_tpr, color="darkorange", lw=3)
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    # The file name is the model's string representation.
    plt.savefig(f"{model}.png", dpi=150, bbox_inches="tight")

    # Cross-validation
    scores = cross_val_score(model, xt, yt, cv=CROSS_VAL_SPLIT, scoring="recall")
    print(f"cross validation : {scores}\nmean : {scores.mean()}")

    # Cross-validation splitter as a cv parameter.
    # NOTE(review): only random_state is passed, so the splitter's default
    # number of splits and test size apply -- confirm TEST_SIZE and
    # CROSS_VAL_SPLIT were meant to be left out.
    shuffle_split = StratifiedShuffleSplit(
        random_state=RAND,
    )
    scores = cross_val_score(model, xt, yt, cv=shuffle_split, scoring="recall")
    print(f"(shuffled-split) cross validation : {scores}\nmean : {scores.mean()}")