In [1]:
import pandas as pd
import json

In [2]:
# Input file: every non-blank line is parsed as one standalone JSON document.
LOG_FILE = "game_overs_2014_05_21.txt"
# NOTE(review): OUTPUT_FILE is not referenced by any cell visible here — confirm it is still needed.
OUTPUT_FILE= "game_overs_compact.csv"

# Collect the parsed records in a plain list and build the DataFrame once at
# the end, instead of growing a frame row by row.
logs = []
# JSON text is UTF-8, so state the encoding rather than relying on the
# platform default.
with open(LOG_FILE, 'r', encoding='utf-8') as log_file:
    for log in log_file:
        # A blank line (e.g. a trailing empty line) holds no record and would
        # make json.loads raise, so skip it.
        if not log.strip():
            continue
        row = json.loads(log)
        logs.append(row)
df = pd.DataFrame(logs)

# Slicing

In [3]:
# Restrict the data to version 1.0 of the application.
# `ver` is rendered as text into a new `version` column first, so the
# comparison below is done against the string "1.0".
df = (
    df
    .assign(version=df['ver'].astype(str))
    .loc[lambda frame: frame['version'] == "1.0"]
)

In [4]:
# Keep a row only when both conditions hold:
#   * it belongs to one of the first 50 levels, and
#   * its end_reason is not "quit".
df = df.loc[
    lambda frame: frame['level_number'].le(50)
    & ~frame['end_reason'].isin(['quit'])
]

In [5]:
# Keep an explicit allow-list of columns; every column not listed is discarded.
# Per the original note, the discarded columns either do not apply to the
# current version or hold debug/dev data.
#
# An earlier variant of this cell dropped columns by name instead:
#   application, msg, type, ver, headstart, levelpack,
#   n_evil_masks, n_fountains, n_freezers, n_frozen_spikyplants,
#   n_trampoline_free, n_pineapple_value, n_pineapples,
#   bonus_bb2.tigerstrike, collected_bb2.tigerstrike,
#   bonus_bb2.snakeoil, collected_bb2.snakeoil,
#   n_chilis, n_tiger_strikes, n_tigers,
#   happening_before_death, level
df = df.loc[:, [
    'device_id',
    'ts',
    'level_number',
    'end_reason',
    'lives_left',
    'n_bananavalue',
    'n_distance',
    'n_specialcollectibles',
    'n_swings',
    'pathtrace',
    'stars',
    'swings_left',
    'time_used',
    'total_duration',
    'score',
]]

# Cleaning

In [6]:
# Discard rows that repeat an earlier row in every column; the first
# occurrence of each duplicated row is the one that stays.
df = df.loc[~df.duplicated(keep='first')]

# Summary statistics, shown as the cell output, to spot suspicious ranges
# before cleaning.
df.describe()

Unnamed: 0,ts,level_number,lives_left,n_bananavalue,n_distance,n_specialcollectibles,n_swings,stars,swings_left,time_used,total_duration,score
count,782561.0,782561.0,782561.0,782561.0,782561.0,782560.0,782560.0,782561.0,782561.0,782561.0,782561.0,782560.0
mean,1400738000.0,12.461257,68607.94,79.075409,283.841021,0.03505,9.229953,1.291405,0.512084,38.000148,45.359144,1314.369662
std,6094903.0,9.551357,12137640.0,141.405982,223.004633,0.387486,7.309185,1.435501,1.512776,29.732864,30.399311,1507.377595
min,1049.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0
25%,1400643000.0,5.0,2.0,0.0,93.0,0.0,2.0,0.0,0.0,10.0,21.0,100.0
50%,1400653000.0,10.0,4.0,0.0,232.0,0.0,8.0,0.0,0.0,31.0,38.0,940.0
75%,1400660000.0,18.0,5.0,115.0,425.0,0.0,14.0,3.0,0.0,61.0,66.0,1816.0
max,2095238000.0,50.0,2147484000.0,2818.0,2118.0,10.0,60.0,3.0,21.0,898.0,2322.0,28229.0


In [7]:
# Keep a row only when both conditions hold:
#   * n_distance is non-negative. A missing (NaN) n_distance fails the
#     comparison, so those rows are removed as well.
#   * lives_left is at most 50, which removes the outliers.
df = df.loc[
    lambda frame: (frame['n_distance'] >= 0) & (frame['lives_left'] <= 50)
]

# Transforming

In [9]:
# Convert the Unix timestamp (in seconds) into a datetime column.
# .assign returns a new frame, so the column is never written into a filtered
# slice of an earlier frame (the pattern that raises SettingWithCopyWarning).
df = df.assign(date_time=pd.to_datetime(df['ts'], unit='s'))

# Remove rows with a wrong date: keep only rows whose calendar day is
# 2014-05-21. Normalizing to midnight keeps the comparison vectorized instead
# of building a Python `date` object for every row.
# NOTE(review): the day boundary is evaluated on the timezone-naive values
# produced by to_datetime — confirm that is the intended notion of "day".
df = df[df['date_time'].dt.normalize() == pd.Timestamp('2014-05-21')]

# The raw timestamp is redundant once date_time exists.
df = df.drop(columns=['ts'])

In [10]:
# Write the resulting frame to a zstd-compressed Parquet file using the
# pyarrow engine.
df.to_parquet(
    path='game_overs_compact.parquet',
    engine='pyarrow',
    compression='zstd',
)