In [1]:
import numpy as np
import pandas as pd
import json
import matplotlib.pyplot as plt

In [2]:
%%time
root = "/Users/a117/Desktop/kaggle/data-science-bowl-2019/data/"
train = pd.read_csv(root + "train.csv")
train_labels = pd.read_csv(root + "train_labels.csv")

# Function to reduce DF size

def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if (c_min > np.iinfo(np.int8).min) and  \
                        (c_max < np.iinfo(np.int8).max):
                    df[col] = df[col].astype(np.int8)

                elif (c_min > np.iinfo(np.int16).min) and \
                        (c_max < np.iinfo(np.int16).max):
                    df[col] = df[col].astype(np.int16)

                elif (c_min > np.iinfo(np.int32).min) and \
                        (c_max < np.iinfo(np.int32).max):
                    df[col] = df[col].astype(np.int32)

                elif (c_min > np.iinfo(np.int64).min) and \
                        (c_max < np.iinfo(np.int64).max):
                    df[col] = df[col].astype(np.int64)

            else:
                if (c_min > np.finfo(np.float16).min) and \
                        (c_max < np.finfo(np.float16).max):
                    df[col] = df[col].astype(np.float16)

                elif (c_min > np.finfo(np.float32).min) and \
                        (c_max < np.finfo(np.float32).max):
                    df[col] = df[col].astype(np.float32)

                elif (c_min > np.finfo(np.float64).min) and \
                        (c_max < np.finfo(np.float64).max):
                    df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2

    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% '
                      'reduction)'.format(
                        end_mem, 100 * (start_mem - end_mem) / start_mem))

    return df

# 减少内存使用
train = reduce_mem_usage(train)
train_labels = reduce_mem_usage(train_labels)

Mem. usage decreased to 778.73 Mb (18.2% reduction)
Mem. usage decreased to  0.49 Mb (48.2% reduction)
CPU times: user 49.5 s, sys: 18.8 s, total: 1min 8s
Wall time: 1min 15s


# 以前100000行数据为例探索算法

In [3]:
%%time
train_example = train[:100000]

train_columns = train.columns

CPU times: user 3.03 ms, sys: 3.74 ms, total: 6.77 ms
Wall time: 5.18 ms


# 原来的方法

In [4]:
def clean_train(df, new_df):
    for index, row in df.iterrows():
        if row['event_code'] == 4100 and row['title'].startswith(('Cart', 'Cauldron', 'Chest', 'Mushroom')):
            new_df = new_df.append(row)
        elif row['event_code'] == 4110 and row['title'].startswith('Bird'):
            new_df = new_df.append(row)
    
    return new_df

In [5]:
%%time
train_example_4100or4110assessment = pd.DataFrame(columns=train_columns)
train_example_4100or4110assessment = clean_train(train_example, train_example_4100or4110assessment)
len(train_example_4100or4110assessment)

CPU times: user 15 s, sys: 91.4 ms, total: 15.1 s
Wall time: 15.3 s


333

In [6]:
%%time
train_example_result = train_example[((train_example.event_code == 4100)
              & (train_example.title.str.startswith(('Cart', 'Cauldron', 'Chest', 'Mushroom'))))
             |((train_example.event_code == 4110)
              & (train_example.title.str.startswith('Bird')))]

len(train_example_result)

CPU times: user 102 ms, sys: 4.53 ms, total: 107 ms
Wall time: 110 ms


333

# 整体时间测试

In [7]:
%%time
train_new = train[((train.event_code == 4100)
              & (train.title.str.startswith(('Cart', 'Cauldron', 'Chest', 'Mushroom'))))
             |((train.event_code == 4110)
              & (train.title.str.startswith('Bird')))]
len(train_new)

CPU times: user 9.91 s, sys: 294 ms, total: 10.2 s
Wall time: 10.3 s


41549

# 尝试对event_data进行拆分

In [8]:
%%time
dict_event_data = {index: json.loads(row['event_data']) for index, row in train_example.iterrows()}

len(dict_event_data)

CPU times: user 15.1 s, sys: 301 ms, total: 15.4 s
Wall time: 16 s


100000

In [None]:
%%time
dict_event_data_all = {index: json.loads(row['event_data']) for index, row in train.iterrows()}

len(dict_event_data_all)

In [None]:
dict_event_data_all

In [None]:
%%time
dict_data_len = {keys: len(values) for keys, values in dict_event_data.items()}

dict_data_len_all = {keys: len(values) for keys, values in dict_event_data_all.items()}

In [None]:
%%time
sorted(dict_data_len_all.items(), key=lambda item:item[1], reverse=True)

In [None]:
dict_event_data_all[1788]

In [None]:
dict_event_data_all[22203]

In [None]:
dict_event_data_all[75502]

In [None]:
dict_event_data_all[80149]

每个event_data中的keys并不相同，比如1788和22203，虽然都有12个key，但1788中有dinosaur、house等，22203中则有weight、destination等。当然也有几部分是相同的，比如correct、event_count、game_time、event_code。

我想要探索event_data中不同行的字典数据中的keys出现的频次，找出一些共有的keys来，作为模型的候选考虑因素。

In [None]:
count_keys = {}
for key, value in dict_event_data.items():
    for k, v in value.items():
        if k not in count_keys.keys():
            count_keys[k] = 1
        else:
            count_keys[k] += 1
            
count_keys

In [None]:
count_keys_all = {}
for key, value in dict_event_data_all.items():
    for k, v in value.items():
        if k not in count_keys_all.keys():
            count_keys_all[k] = 1
        else:
            count_keys_all[k] += 1
            
count_keys_all

In [None]:
%%time
sorted(count_keys_all.items(), key=lambda item:item[1], reverse=True)

In [None]:
%matplotlib notebook
fig, ax = plt.subplots()
b = ax.barh(range(len(count_keys_all.keys())), count_keys_all.values())

In [None]:
train.head()

In [None]:
%%time
new_columns = ['event_id', 'installation_id', 'game_session', 
               'timestamp', 'title', 'type', 'world', 'event_code', 
               'event_count', 'game_time', 'event_data']

new_train = train[new_columns]
new_train.sort_values(by=['installation_id', 'game_session', 'timestamp']).head(50)

In [None]:
%%time
new_train_labels = train_labels[['installation_id', 'game_session', 'title', 'num_correct',
                                'num_incorrect', 'accuracy', 'accuracy_group']]

new_train_labels.sort_values(by=['installation_id', 'game_session']).head(50)

In [None]:
len(new_train[(new_train['installation_id'] == '0006a69f')
         & (new_train['event_code'] == 4100)])