# 载入train文件

In [1]:
import pandas as pd
import numpy as np

In [5]:
%%time
root = "/Users/a117/Desktop/kaggle/data-science-bowl-2019/data/"
train = pd.read_csv(root + "train.csv")

CPU times: user 54.1 s, sys: 45.3 s, total: 1min 39s
Wall time: 2min 21s


In [3]:
# Function to reduce DF size

def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if (c_min > np.iinfo(np.int8).min) and  \
                        (c_max < np.iinfo(np.int8).max):
                    df[col] = df[col].astype(np.int8)

                elif (c_min > np.iinfo(np.int16).min) and \
                        (c_max < np.iinfo(np.int16).max):
                    df[col] = df[col].astype(np.int16)

                elif (c_min > np.iinfo(np.int32).min) and \
                        (c_max < np.iinfo(np.int32).max):
                    df[col] = df[col].astype(np.int32)

                elif (c_min > np.iinfo(np.int64).min) and \
                        (c_max < np.iinfo(np.int64).max):
                    df[col] = df[col].astype(np.int64)

            else:
                if (c_min > np.finfo(np.float16).min) and \
                        (c_max < np.finfo(np.float16).max):
                    df[col] = df[col].astype(np.float16)

                elif (c_min > np.finfo(np.float32).min) and \
                        (c_max < np.finfo(np.float32).max):
                    df[col] = df[col].astype(np.float32)

                elif (c_min > np.finfo(np.float64).min) and \
                        (c_max < np.finfo(np.float64).max):
                    df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2

    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% '
                      'reduction)'.format(
                        end_mem, 100 * (start_mem - end_mem) / start_mem))

    return df

In [6]:
# 减少内存使用
train = reduce_mem_usage(train)

Mem. usage decreased to 778.73 Mb (18.2% reduction)


# 清洗函数

In [7]:
def choose_4100or4110assessment(df, new_df):
    for index, row in df.iterrows():
        if row['event_code'] == 4100 or row['event_code'] == 4110:
            if row['type'] == 'Assessment':
                new_df = new_df.append(row)

    return new_df

In [8]:
def delete_4100_bird(df):
    delete_index = []
    for index, row in df.iterrows():
        if row['title'].startswith('Bird') and row['event_code'] == 4100:
            delete_index.append(index)

    return df.drop(index=delete_index)

In [22]:
def clean_train(df, new_df):
    for index, row in df.iterrows():
        if row['event_code'] == 4100 and row['title'].startswith(('Cart', 'Cauldron', 'Chest', 'Mushroom')):
            new_df = new_df.append(row)
        elif row['event_code'] == 4110 and row['title'].startswith('Bird'):
            new_df = new_df.append(row)
    
    return new_df

# 先用前10000个数据做样本

In [26]:
train_example = train[:10000]

train_columns = train.columns

train_example_4100or4110assessment = pd.DataFrame(columns=train_columns)
train_example_test = pd.DataFrame(columns=train_columns)

In [27]:
%%time
train_example_4100or4110assessment = choose_4100or4110assessment(train_example, train_4100or4110assessment)
train_example_4100or4110assessment = delete_4100_bird(train_example_4100or4110assessment)

len(train_example_4100or4110assessment)

CPU times: user 1.53 s, sys: 12.5 ms, total: 1.55 s
Wall time: 1.56 s


24

In [28]:
%%time
train_example_test = clean_train(train_example, train_example_test)
len(train_example_test)

CPU times: user 1.44 s, sys: 7.13 ms, total: 1.45 s
Wall time: 1.47 s


24

# 整体

In [41]:
train_4100or4110assessment = pd.DataFrame(columns=train_columns)

In [42]:
%%time
train_4100or4110assessment = clean_train(train, train_4100or4110assessment)

# filename = '/Users/a117/Desktop/kaggle/data-science-bowl-2019/data/' \
#            'train_4100or4110assessment.csv'
# train_4100or4110assessment.to_csv(path_or_buf=filename)

CPU times: user 36min 46s, sys: 57.3 s, total: 37min 43s
Wall time: 38min 13s


In [44]:
len(train_4100or4110assessment)

41549

# 增加True和False列

In [46]:
%%time
root = "/Users/a117/Desktop/kaggle/data-science-bowl-2019/data/"
train_assessment = pd.read_csv(root + "train_4100or4110assessment.csv")

CPU times: user 157 ms, sys: 29.7 ms, total: 187 ms
Wall time: 200 ms


In [48]:
train_assessment.head()

Unnamed: 0.1,Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title,type,world
0,2228,25fa8af4,901acc108f55a5a1,2019-08-06T05:22:32.357Z,"{""correct"":true,""stumps"":[1,2,4],""event_count""...",0006a69f,44,4100,31011,Mushroom Sorter (Assessment),Assessment,TREETOPCITY
1,2709,17113b36,77b8ee947eb84b4e,2019-08-06T05:35:54.898Z,"{""correct"":false,""caterpillars"":[11,8,3],""even...",0006a69f,29,4110,35771,Bird Measurer (Assessment),Assessment,TREETOPCITY
2,2715,17113b36,77b8ee947eb84b4e,2019-08-06T05:36:01.927Z,"{""correct"":false,""caterpillars"":[11,8,11],""eve...",0006a69f,35,4110,42805,Bird Measurer (Assessment),Assessment,TREETOPCITY
3,2720,17113b36,77b8ee947eb84b4e,2019-08-06T05:36:06.512Z,"{""correct"":false,""caterpillars"":[11,8,5],""even...",0006a69f,40,4110,47388,Bird Measurer (Assessment),Assessment,TREETOPCITY
4,2725,17113b36,77b8ee947eb84b4e,2019-08-06T05:36:09.739Z,"{""correct"":false,""caterpillars"":[11,8,7],""even...",0006a69f,45,4110,50605,Bird Measurer (Assessment),Assessment,TREETOPCITY


In [53]:
train_assessment.rename(columns={'Unnamed: 0':'original_index'}, inplace=True)
train_assessment.head()

Unnamed: 0,original_index,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title,type,world
0,2228,25fa8af4,901acc108f55a5a1,2019-08-06T05:22:32.357Z,"{""correct"":true,""stumps"":[1,2,4],""event_count""...",0006a69f,44,4100,31011,Mushroom Sorter (Assessment),Assessment,TREETOPCITY
1,2709,17113b36,77b8ee947eb84b4e,2019-08-06T05:35:54.898Z,"{""correct"":false,""caterpillars"":[11,8,3],""even...",0006a69f,29,4110,35771,Bird Measurer (Assessment),Assessment,TREETOPCITY
2,2715,17113b36,77b8ee947eb84b4e,2019-08-06T05:36:01.927Z,"{""correct"":false,""caterpillars"":[11,8,11],""eve...",0006a69f,35,4110,42805,Bird Measurer (Assessment),Assessment,TREETOPCITY
3,2720,17113b36,77b8ee947eb84b4e,2019-08-06T05:36:06.512Z,"{""correct"":false,""caterpillars"":[11,8,5],""even...",0006a69f,40,4110,47388,Bird Measurer (Assessment),Assessment,TREETOPCITY
4,2725,17113b36,77b8ee947eb84b4e,2019-08-06T05:36:09.739Z,"{""correct"":false,""caterpillars"":[11,8,7],""even...",0006a69f,45,4110,50605,Bird Measurer (Assessment),Assessment,TREETOPCITY


In [59]:
train_assessment.loc[0, 'event_data']

'{"correct":true,"stumps":[1,2,4],"event_count":44,"game_time":31011,"event_code":4100}'

In [60]:
import json

In [68]:
json.loads(train_assessment.loc[0, 'event_data'])['correct']

True

In [74]:
%%time
for i in train_assessment.index:
    if json.loads(train_assessment.loc[i, 'event_data'])['correct'] is True:
        train_assessment.loc[i, 'True'] = 1
        train_assessment.loc[i, 'False'] = 0
    elif json.loads(train_assessment.loc[i, 'event_data'])['correct'] is False:
        train_assessment.loc[i, 'True'] = 0
        train_assessment.loc[i, 'False'] = 1

CPU times: user 48.7 s, sys: 380 ms, total: 49.1 s
Wall time: 49.3 s


In [75]:
train_assessment.head(100)

Unnamed: 0,original_index,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title,type,world,True,False
0,2228,25fa8af4,901acc108f55a5a1,2019-08-06T05:22:32.357Z,"{""correct"":true,""stumps"":[1,2,4],""event_count""...",0006a69f,44,4100,31011,Mushroom Sorter (Assessment),Assessment,TREETOPCITY,1.0,0.0
1,2709,17113b36,77b8ee947eb84b4e,2019-08-06T05:35:54.898Z,"{""correct"":false,""caterpillars"":[11,8,3],""even...",0006a69f,29,4110,35771,Bird Measurer (Assessment),Assessment,TREETOPCITY,0.0,1.0
2,2715,17113b36,77b8ee947eb84b4e,2019-08-06T05:36:01.927Z,"{""correct"":false,""caterpillars"":[11,8,11],""eve...",0006a69f,35,4110,42805,Bird Measurer (Assessment),Assessment,TREETOPCITY,0.0,1.0
3,2720,17113b36,77b8ee947eb84b4e,2019-08-06T05:36:06.512Z,"{""correct"":false,""caterpillars"":[11,8,5],""even...",0006a69f,40,4110,47388,Bird Measurer (Assessment),Assessment,TREETOPCITY,0.0,1.0
4,2725,17113b36,77b8ee947eb84b4e,2019-08-06T05:36:09.739Z,"{""correct"":false,""caterpillars"":[11,8,7],""even...",0006a69f,45,4110,50605,Bird Measurer (Assessment),Assessment,TREETOPCITY,0.0,1.0
5,2730,17113b36,77b8ee947eb84b4e,2019-08-06T05:36:13.951Z,"{""correct"":false,""caterpillars"":[11,8,4],""even...",0006a69f,50,4110,54822,Bird Measurer (Assessment),Assessment,TREETOPCITY,0.0,1.0
6,2733,17113b36,77b8ee947eb84b4e,2019-08-06T05:36:17.407Z,"{""correct"":false,""caterpillars"":[11,8,4],""even...",0006a69f,53,4110,58280,Bird Measurer (Assessment),Assessment,TREETOPCITY,0.0,1.0
7,2738,17113b36,77b8ee947eb84b4e,2019-08-06T05:36:21.390Z,"{""correct"":false,""caterpillars"":[11,8,2],""even...",0006a69f,58,4110,62256,Bird Measurer (Assessment),Assessment,TREETOPCITY,0.0,1.0
8,2743,17113b36,77b8ee947eb84b4e,2019-08-06T05:36:26.296Z,"{""correct"":false,""caterpillars"":[11,8,1],""even...",0006a69f,63,4110,67164,Bird Measurer (Assessment),Assessment,TREETOPCITY,0.0,1.0
9,2750,17113b36,77b8ee947eb84b4e,2019-08-06T05:36:32.187Z,"{""correct"":false,""caterpillars"":[11,8,1],""even...",0006a69f,70,4110,73056,Bird Measurer (Assessment),Assessment,TREETOPCITY,0.0,1.0


In [78]:
%whos

Variable                             Type         Data/Info
-----------------------------------------------------------
a                                    dict         n=5
choose_4100or4110assessment          function     <function choose_4100or41<...>ssessment at 0x220f87158>
clean_train                          function     <function clean_train at 0x1806a8268>
delete_4100_bird                     function     <function delete_4100_bird at 0x10d0e90d0>
filename                             str          /Users/a117/Desktop/kaggl<...>_4100or4110assessment.csv
i                                    int          41548
json                                 module       <module 'json' from '/Lib<...>hon3.7/json/__init__.py'>
np                                   module       <module 'numpy' from '/Li<...>kages/numpy/__init__.py'>
pd                                   module       <module 'pandas' from '/L<...>ages/pandas/__init__.py'>
reduce_mem_usage                     function     <function