In [None]:
import os

!pip install matplotlib -q

import pandas as pd

## Data loading

In [None]:
!ls $HOME/ds-bowl-from-scratch/raw-data

In [None]:
# Resolve the user's home directory portably: expanduser('~') still works on
# systems where the HOME environment variable is not defined (e.g. Windows),
# where os.environ['HOME'] would raise KeyError.
home = os.path.expanduser('~')
home

In [None]:
# Build the raw-data directory path with os.path (pathlib would work as well).
# expanduser('~') replaces os.environ['HOME'] so the lookup does not raise
# KeyError on systems where HOME is not defined.
home = os.path.join(os.path.expanduser('~'), 'ds-bowl-from-scratch', 'raw-data')
print(home)

In [None]:
os.listdir(home)

In [None]:
# Load every CSV file in the raw-data directory into a dict keyed by file name.
raw_data = {}

for fi in os.listdir(home):
    # Match on the extension: a plain substring test ('csv' in fi) would also
    # accept names such as 'csv_notes.txt' and hand them to read_csv.
    if fi.endswith('.csv'):
        print(fi)
        raw_data[fi] = pd.read_csv(os.path.join(home, fi))

## Basic data inspection

Look at the shape and columns of every CSV we loaded:

In [None]:
# For every loaded table: print its file name with its shape, then its column
# index, then a spacer line.
for csv_name in raw_data:
    table = raw_data[csv_name]
    print(csv_name, table.shape)
    print(table.columns)
    print(' ')

## Inspection of the target

Four classes

In [None]:
# Keep a handle on the label table and show the distinct target values.
labels = raw_data['train_labels.csv']
set(labels['accuracy_group'])

In [None]:
labels.loc[:, 'accuracy_group'].hist()

Number of installs

In [None]:
def inspect_df(data, csv):
    """
    print the number and the fraction of unique values in all cols of a df

    For every column this prints, in order: the column name, the count of
    distinct values, that count divided by the number of rows, and a spacer.

    data (dict) - maps csv file names to DataFrames
    csv (str) - key in `data` of the DataFrame to inspect
    """
    df = data[csv]
    num_rows = df.shape[0]
    for col in df.columns:
        print(col)
        # nunique(dropna=False) counts all missing values as ONE distinct value;
        # len(set(...)) counts every NaN separately because NaN != NaN.
        uniq = df.loc[:, col].nunique(dropna=False)
        print('num. uniques {}'.format(uniq))
        # An empty table has no rows to take a fraction of: print 0.0 instead
        # of raising ZeroDivisionError.
        print(uniq / num_rows if num_rows else 0.0)
        print(' ')
        
inspect_df(raw_data, 'train_labels.csv')

In [None]:
raw_data.keys()

## Event data

`train_labels.csv` has the correct labels for some of our game sessions:

In [None]:
raw_data['train_labels.csv'].iloc[0, :]

In [None]:
def extract_game_session(data, idx):
    """Return one row of the label table as a dict.

    data (dict) - must hold the 'train_labels.csv' and 'train.csv' DataFrames
    idx (int) - positional (iloc) index of the row in train_labels.csv

    Asserts that the row's game_session also occurs in train.csv.
    """
    label_row = data['train_labels.csv'].iloc[idx, :]
    known_sessions = data['train.csv'].loc[:, 'game_session'].values
    assert label_row.loc['game_session'] in known_sessions
    return label_row.to_dict()

# Pull one labelled session to experiment with. 900 is a positional row index
# into train_labels.csv (looks like an arbitrary example row - confirm).
gs = extract_game_session(raw_data, 900)
gs

In [None]:
import json

def get_event_data(data, game_session_dict):
    """ collect the decoded event payloads of one game session

    data (dict) - must hold the 'train.csv' DataFrame
    game_session_dict (dict) - needs the keys 'game_session' and 'title'

    Only events with one event code are kept: 4110 when the title contains
    'Bird Measurer', 4100 for every other title.

    returns list of decoded json objects, one per matching event
    """
    events = data['train.csv']
    in_session = events.loc[:, 'game_session'] == game_session_dict['game_session']
    session_events = events[in_session]

    code = 4110 if 'Bird Measurer' in game_session_dict['title'] else 4100

    has_code = session_events.loc[:, 'event_code'] == code
    payloads = session_events[has_code].loc[:, 'event_data']

    return [json.loads(payload) for payload in list(payloads)]

get_event_data(raw_data, gs)

3: the assessment was solved on the first attempt

2: the assessment was solved on the second attempt

1: the assessment was solved after 3 or more attempts

0: the assessment was never solved

In [None]:
def get_accuracy_label(corr, incorr):
    """ accuracy group (0-3) from the correct & incorrect attempt lists

    corr, incorr are summed, so each is an iterable of numbers.

    3 - exactly one correct, no incorrect
    2 - exactly one correct, exactly one incorrect
    1 - exactly one correct, two or more incorrect
    0 - everything else (including any correct total other than 1)
    """
    n_correct, n_incorrect = sum(corr), sum(incorr)

    # Without exactly one correct answer none of the solved groups apply.
    if n_correct != 1:
        return 0

    if n_incorrect == 0:
        return 3
    if n_incorrect == 1:
        return 2
    if n_incorrect >= 2:
        return 1
    return 0
    
def test_get_acc():
    """ test get_accuracy_label """
    corr, incorr = [1], [0]
    expected = 3
    assert get_accuracy_label(corr, incorr) == expected

    corr, incorr = [0], [10]
    expected = 0
    assert get_accuracy_label(corr, incorr) == expected
    
test_get_acc()

def extract_event_data(events):
    """ split decoded events into correct and incorrect tallies

    events - iterable of dicts, each with a 'correct' entry (read for truthiness)

    returns (corr, incorr): two lists holding one 1 per correct / incorrect event
    """
    # Walk the events once so a one-shot iterable behaves like a list.
    outcomes = [bool(event['correct']) for event in events]
    corr = [1 for was_correct in outcomes if was_correct]
    incorr = [1 for was_correct in outcomes if not was_correct]
    return corr, incorr

def check_labels(game_session_dict, event_data):
    """ checks that the labels we generate are the same as in train_labels

    game_session_dict (dict) - needs the keys 'num_correct', 'num_incorrect'
        and 'accuracy_group'
    event_data (list) - decoded events, passed on to extract_event_data

    returns the generated accuracy label, so callers can store it
    raises AssertionError when a count or the label disagrees with the dict
    """
    corr, incorr = extract_event_data(event_data)
    num_corr, num_incorr = sum(corr), sum(incorr)
    assert num_corr == game_session_dict['num_correct'], \
        'num_correct: generated {}, expected {}'.format(num_corr, game_session_dict['num_correct'])
    assert num_incorr == game_session_dict['num_incorrect'], \
        'num_incorrect: generated {}, expected {}'.format(num_incorr, game_session_dict['num_incorrect'])
    label = get_accuracy_label(corr, incorr)
    assert label == game_session_dict['accuracy_group'], \
        'accuracy_group: generated {}, expected {}'.format(label, game_session_dict['accuracy_group'])
    # Hand the verified label back instead of implicitly returning None.
    return label
    
# Fetch this session's events explicitly: no variable named `event_data` has
# been assigned yet at this point of a fresh top-to-bottom run.
check_labels(gs, get_event_data(raw_data, gs))

### Check that our labelling works by cross-checking it against `train_labels.csv`:

In [None]:
# Re-derive the label of the first five labelled sessions and verify each one.
data = []
for i in range(5):
    gs = extract_game_session(raw_data, i)
    out = get_event_data(raw_data, gs)
    # check_labels asserts that the label generated from the events equals
    # gs['accuracy_group'], so once it passes that value is the verified label.
    # Its return value is deliberately not relied on here.
    check_labels(gs, out)
    label = gs['accuracy_group']

    data.append({
        'game_session': gs['game_session'],
        'label': label
    })

### Label rows in `train.csv` that aren't labelled in `train_labels.csv`

In [None]:
# Generate a label for each of the first five rows of train.csv.
data = []
# The table lookup does not change between iterations, so do it once.
tr = raw_data['train.csv']
for row in range(5):
    train_row = tr.iloc[row, :]
    gs_id = {
        'game_session': train_row.loc['game_session'],
        'title': train_row.loc['title']
    }

    event_data = get_event_data(raw_data, gs_id)
    corr, incorr = extract_event_data(event_data)
    label = get_accuracy_label(corr, incorr)

    data.append({
        # Record the session of the row being labelled (gs_id). `gs` is a
        # leftover from an earlier cell and refers to a different session.
        'game_session': gs_id['game_session'],
        'label': label
    })

## Garbage below

In [None]:
import json

json.loads(raw_data['train.csv'].loc[:, 'event_data'].iloc[1000])

In [None]:
raw_data['sample_submission.csv'].head()

In [None]:
len(set(raw_data['train_labels.csv'].loc[:, 'installation_id']))

In [None]:
raw_data['train.csv'].shape

In [None]:
len(set(raw_data['train.csv'].loc[:, 'game_session']))

In [None]:
len(set(raw_data['train.csv'].loc[:, 'installation_id']))

In [None]:
# Keep only the rows whose accuracy_group is 3 and preview them.
df = raw_data['train_labels.csv']
mask = df.loc[:, 'accuracy_group'] == 3
sub = df[mask]
sub.head()

In [None]:
set(sub.loc[:, 'num_correct'])

In [None]:
set(df.loc[:, 'num_correct'])

In [None]:
df.columns

In [None]:
df.head()

In [None]:
# hardness of games

In [None]:
!pip install seaborn -q
import seaborn as sns

# The loaded tables live in `raw_data`; `data` is a list of label records by
# now and cannot be indexed with a file name.
sns.pairplot(raw_data['train_labels.csv'])

In [None]:
# Boolean mask of the rows where num_correct equals 1 (displayed below).
mask = df.loc[:, 'num_correct'].eq(1)
mask

In [None]:
# Rows with at least one incorrect attempt; together with `mask` this selects
# the rows whose num_incorrect values are plotted as a histogram.
mask2 = df.loc[:, 'num_incorrect'].gt(0)
df.loc[mask & mask2, 'num_incorrect'].hist()

In [None]:
max(df.loc[mask & mask2, :].loc[:, 'num_incorrect'])