In [1]:
import pandas as pd
import json

In [2]:
LOG_FILE = "game_overs_2014_05_21.txt"
OUTPUT_FILE= "game_overs_compact.csv"

# The log is read as JSON Lines: each non-empty line is parsed as one JSON
# document and becomes one row of the DataFrame.
# The encoding is pinned to UTF-8 (the standard JSON encoding) so parsing does
# not depend on the platform's default locale.
with open(LOG_FILE, 'r', encoding='utf-8') as file:
    # Whitespace-only lines (e.g. a trailing newline at the end of the file)
    # are skipped; json.loads would otherwise raise JSONDecodeError on them.
    logs = [json.loads(line) for line in file if line.strip()]
df = pd.DataFrame(logs)

# Slicing

In [3]:
# Restrict the data to version 1.0 of the application.
# 'ver' is cast to text so the comparison is made against the string "1.0".
df['version'] = df['ver'].astype(str)
is_version_1_0 = df['version'].eq("1.0")
df = df[is_version_1_0]

In [4]:
# Keep only the first 50 levels and discard rows whose end_reason is "quit".
within_first_levels = df['level_number'].le(50)
ended_by_quit = df['end_reason'].isin(['quit'])
df = df[within_first_levels & ~ended_by_quit]

In [5]:
# Drop columns that do not apply to the current version or that hold debug/dev data.
unused_columns = [
    'application', 'msg', 'type', 'ver', 'headstart', 'levelpack',
    'n_evil_masks', 'n_fountains', 'n_freezers', 'n_frozen_spikyplants',
    'n_trampoline_free', 'n_pineapple_value', 'n_pineapples',
    'bonus_bb2.tigerstrike', 'collected_bb2.tigerstrike',
    'bonus_bb2.snakeoil', 'collected_bb2.snakeoil',
    'n_chilis', 'n_tiger_strikes', 'n_tigers',
    'happening_before_death', 'level',
]
df = df.drop(columns=unused_columns)

# Cleaning

In [6]:
# Discard fully duplicated rows, then display summary statistics for the result.
is_repeated_row = df.duplicated()
df = df[~is_repeated_row]
df.describe()

Unnamed: 0,nr,ts,framerate,level_number,lives_left,n_bananabunches,n_bananas,n_bananavalue,n_distance,n_ground_hits,...,n_spikyplants,n_swings,n_trampoline_cost,score,session,stars,swings_left,time_used,timestamp_from_start_of_session,total_duration
count,782561.0,782561.0,782561.0,782561.0,782561.0,782561.0,782561.0,782561.0,782561.0,782560.0,...,782560.0,782560.0,782560.0,782560.0,782561.0,782561.0,782561.0,782561.0,782561.0,782561.0
mean,523.745049,1400738000.0,67.512014,12.461257,68607.94,0.052712,65.312213,79.075409,283.841021,1.552225,...,0.000951,9.229953,0.002591,1314.369662,9.65042,1.291405,0.512084,38.000148,249.342083,45.359144
std,649.209691,6094903.0,3.894441,9.551357,12137640.0,0.560573,111.503771,141.405982,223.004633,1.523614,...,0.05998,7.309185,0.071017,1507.377595,14.64801,1.435501,1.512776,29.732864,1191.002921,30.399311
min,16.0,1049.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0
25%,108.0,1400643000.0,68.0,5.0,2.0,0.0,0.0,0.0,93.0,0.0,...,0.0,2.0,0.0,100.0,1.0,0.0,0.0,10.0,104.0,21.0
50%,284.0,1400653000.0,68.0,10.0,4.0,0.0,0.0,0.0,232.0,1.0,...,0.0,8.0,0.0,940.0,4.0,0.0,0.0,31.0,185.0,38.0
75%,689.0,1400660000.0,68.0,18.0,5.0,0.0,101.0,115.0,425.0,2.0,...,0.0,14.0,0.0,1816.0,12.0,3.0,0.0,61.0,324.0,66.0
max,36314.0,2095238000.0,72.0,50.0,2147484000.0,20.0,1419.0,2818.0,2118.0,62.0,...,10.0,60.0,4.0,28229.0,262.0,3.0,21.0,898.0,518584.0,2322.0


In [7]:
# Keep rows with a non-negative distance and at most 50 lives left;
# larger lives_left values are treated as outliers.
non_negative_distance = df['n_distance'].ge(0)
plausible_lives = df['lives_left'].le(50)
df = df[non_negative_distance & plausible_lives]

# Transforming

In [9]:
# Convert the 'ts' column (interpreted as seconds since the epoch) to datetime.
# assign() returns a new frame instead of writing a column into the result of
# the earlier boolean filters, which avoids pandas' SettingWithCopyWarning.
df = df.assign(date_time=pd.to_datetime(df['ts'], unit='s'))

# Keep only rows dated 2014-05-21; anything else is treated as a wrong date.
# Normalizing to midnight keeps the comparison on the datetime64 column
# (vectorized) instead of building a column of Python date objects.
df = df[df['date_time'].dt.normalize() == pd.Timestamp('2014-05-21')]

In [10]:
# Write the cleaned frame without the index column. The path comes from the
# OUTPUT_FILE constant defined alongside LOG_FILE rather than a duplicated literal.
df.to_csv(OUTPUT_FILE, index=False)