In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read in the Player Information data
pi_df = pd.read_csv("PlayerInformationFeatures.csv")

In [3]:
# Read the Social Interaction Diversity dataset, that has mapping of which player is human vs bot.
sid_df = pd.read_csv("SocialInteractionDiversityFeatures.csv")

In [5]:
# We need only the three columns.
sid_df = sid_df[["Actor", "A_Acc", "Type"]]

In [8]:
sid_df['Player'] = sid_df['Actor'].astype(str) + '_'+ sid_df['A_Acc'].astype(str)

In [10]:
# We only need Player and its type
sid_df = sid_df[["Player", "Type"]]

In [13]:
player_class_grpby = sid_df.groupby(by=["Player"])

In [15]:
# Lets test
player_class_grpby.get_group('10002_6129182')

Unnamed: 0,Player,Type
250,10002_6129182,Human


In [20]:
player_class_df = sid_df.groupby(by=["Player"]).agg(lambda val1: val1).reset_index()

In [21]:
player_class_df.head(2)

Unnamed: 0,Player,Type
0,10002_6129182,Human
1,100050_7732344,Human


In [24]:
pi_df.head(2)

Unnamed: 0,Act_time,Actor,A_Acc,loc_x,loc_y,loc_z,etc_str1,etc_num2
0,2010-04-09 00:00:02.310,376398,10534989,1686,1535,119,210.206.,10
1,2010-04-09 00:00:03.623,343471,8291880,2463,2062,578,116.36.1,50


In [26]:
# The Act_time is Login time, so is taken as an object.
# etc_str1 looks like its part of IP address. So its taken as object
pi_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5687619 entries, 0 to 5687618
Data columns (total 8 columns):
Act_time    object
Actor       int64
A_Acc       int64
loc_x       int64
loc_y       int64
loc_z       int64
etc_str1    object
etc_num2    int64
dtypes: int64(6), object(2)
memory usage: 347.1+ MB


In [27]:
# We do not have any null values
pi_df.isnull().any()

Act_time    False
Actor       False
A_Acc       False
loc_x       False
loc_y       False
loc_z       False
etc_str1    False
etc_num2    False
dtype: bool

In [28]:
# We do not have NA values as well.
pi_df.isna().any()

Act_time    False
Actor       False
A_Acc       False
loc_x       False
loc_y       False
loc_z       False
etc_str1    False
etc_num2    False
dtype: bool

# From the above two, we can see that the dataset does not have any missing values.

In [29]:
# Lets find if a player (Actor) has multiple accounts. This is to determine if we can just choose Actor to uniquely
# identify a player or Actor + A_Acc as the combination to uniuely identify a player.

In [30]:
# Below is an example to check the length of A_Acc that is associated with an Actor. The length of 1 means that an
# Actor is associated with one account only.
len(pi_df[pi_df.Actor == 372311]['A_Acc'].unique())

1

In [31]:
# We will perform the same validation on all Actors. 
actor_account_grp_by = pi_df[['Actor', 'A_Acc']].groupby(by=['A_Acc'])

In [32]:
a = actor_account_grp_by['Actor'].nunique()
#for grp_name, grp in actor_account_grp_by.groups():
#    if len(np.unique(grp['A_Acc'])) > 1:
#        print(grp_name)

In [33]:
a.head(2)

A_Acc
4698     2
15272    4
Name: Actor, dtype: int64

In [34]:
actor_account_grp_by.get_group(4698)

Unnamed: 0,Actor,A_Acc
1662380,93092,4698
1662577,25761,4698
3946187,25761,4698


# From the above, we can see that the account 4698 is not associated with one Actor. Hence, we will have to make use of Actor + A_Acc as a unique combination to identify a player.

In [35]:
# We will combine Actor and A_Acc and use that as primary key to identify a player

In [36]:
pi_df['Player'] = pi_df['Actor'].astype(str) + '_'+ pi_df['A_Acc'].astype(str)

In [37]:
pi_df.head(2)

Unnamed: 0,Act_time,Actor,A_Acc,loc_x,loc_y,loc_z,etc_str1,etc_num2,Player
0,2010-04-09 00:00:02.310,376398,10534989,1686,1535,119,210.206.,10,376398_10534989
1,2010-04-09 00:00:03.623,343471,8291880,2463,2062,578,116.36.1,50,343471_8291880


In [38]:
pi_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5687619 entries, 0 to 5687618
Data columns (total 9 columns):
Act_time    object
Actor       int64
A_Acc       int64
loc_x       int64
loc_y       int64
loc_z       int64
etc_str1    object
etc_num2    int64
Player      object
dtypes: int64(6), object(3)
memory usage: 390.5+ MB


In [39]:
# Now we will add the Player Type column to the pi_df

In [41]:
new_pi_df = pi_df.join(sid_df.set_index("Player"), on="Player")

In [96]:
new_pi_df.info()
#new_pi_df.head(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5687619 entries, 0 to 5687618
Data columns (total 11 columns):
Act_time      object
Actor         int64
A_Acc         int64
loc_x         int64
loc_y         int64
loc_z         int64
etc_str1      object
etc_num2      int64
Player        object
Type          object
login_freq    float64
dtypes: float64(1), int64(6), object(4)
memory usage: 477.3+ MB


In [62]:
new_pi_df[(new_pi_df.Type != 'Human') & (new_pi_df.Type != 'Bot')].count()

Act_time    243648
Actor       243648
A_Acc       243648
loc_x       243648
loc_y       243648
loc_z       243648
etc_str1    243648
etc_num2    243648
Player      243648
Type             0
dtype: int64

# We have lot of rows for which we dont have the classification. We might need to include the values from NetworkInteractionMeasures as well.

# Analysing player login frequency

In [78]:
player_login_freq_grpby = new_pi_df[['Player', 'Act_time']].groupby(by=['Player'])

In [86]:
#player_login_freq_grpby.count().head(2)
#tmp = player_login_freq_grpby.count().sort_values(by=['Act_time'],ascending=False)
tmp = player_login_freq_grpby.count()

In [97]:
tmp.info()
tmp.index.name
tmp.head(2)
#new_pi_df['login_freq'] = tmp['Act_time']

<class 'pandas.core.frame.DataFrame'>
Index: 97354 entries, 10002_6129182 to 99996_7764469
Data columns (total 1 columns):
Act_time    97354 non-null int64
dtypes: int64(1)
memory usage: 4.0+ MB


Unnamed: 0_level_0,Act_time
Player,Unnamed: 1_level_1
10002_6129182,173
100035_7764486,1


In [100]:
new1 = new_pi_df.join(tmp, on="Player")

ValueError: columns overlap but no suffix specified: Index(['Act_time'], dtype='object')

# The below analysis assumes that column etc_str1 is  the first three octet of IP address. The column etc_num2 is the last octet of the IP address. We need to combine them into one and probably make a new column out of it.
# However, on doing the below steps, this seems as incorrect assumption. The etc_str1 is the only column having the IP address information. The missing octets is the missing information and can not be projected to a value. The column etc_num2 is really the level to which the player goes to while playing the game. 

In [73]:
def get_ip_from_values(octet123, octet4):
    ip = None
    octet123 = str(octet123)
    octet4 = str(octet4)
    if octet123.endswith("."):
        octet123 = octet123[0:len(octet123)-1]
    octet_list = octet123.split(".")
    #print(octet_list)
    if len(octet_list) == 3:
        return octet_list[0] + "." + octet_list[1] + "." + octet_list[2] + "." + octet4
    elif len(octet_list) == 2:
        # This means that octet2 is missing, so assume 0 for octet 2.
        return octet_list[0] + "." + octet_list[1] + ".0." + octet4
    elif len(octet_list) == 1:
        # This means that octet2 and octet 1 is missing, so assume 0 for octet 2 and octet 1.
        return octet_list[0] + ".0.0." + octet4

In [74]:
# Test out the function
print(get_ip_from_values("1.2.3", "4"))
print(get_ip_from_values("1.2", "4"))
print(get_ip_from_values("1", "4"))
print(get_ip_from_values("1.2.", 4))
print(get_ip_from_values("1.2.3.", 4))
print(get_ip_from_values("1.", 4))

1.2.3.4
1.2.0.4
1.0.0.4
1.2.0.4
1.2.3.4
1.0.0.4


In [75]:
# Now we will combine the two columns into one

In [76]:
#test = pd.DataFrame({'col1': ['1.2.3', '1.2', '1'], 'col2': [1,2,3]})

In [19]:
# Here we are taking only the etc_str1 and etc_num2 columns. For each of the row(variable s in lambda fn), we
# apply the get_ip_from_values and get the output IP
pi_df["ip_address"] = pi_df[['etc_str1', 'etc_num2']].apply(lambda s: get_ip_from_values(s[0], s[1]), axis=1)

In [77]:
pi_df.head(5)

Unnamed: 0,Act_time,Actor,A_Acc,loc_x,loc_y,loc_z,etc_str1,etc_num2,ip_address,Player
0,2010-04-09 00:00:02.310,376398,10534989,1686,1535,119,210.206.,10,210.206.0.10,37639810534989
1,2010-04-09 00:00:03.623,343471,8291880,2463,2062,578,116.36.1,50,116.36.1.50,3434718291880
2,2010-04-09 00:00:04.187,338523,10091973,212,194,107,122.35.8,42,122.35.8.42,33852310091973
3,2010-04-09 00:00:04.187,424234,6602247,1714,1868,255,218.237.,27,218.237.0.27,4242346602247
4,2010-04-09 00:00:04.827,296513,9098427,1244,1562,214,119.192.,47,119.192.0.47,2965139098427


In [78]:
def validate_ip(ipaddress):
    ipaddress = str(ipaddress)
    octets = ipaddress.split(".")
    try:
        for octet in octets:
            if int(octet) > 255:
                return False
        return True
    except Exception:
        #print(ipaddress)
        return False

In [79]:
print(validate_ip('210.206.0.10'))

True


In [80]:
invalid_ip_df = pi_df[pi_df.ip_address.isnull()]

In [81]:
invalid_ip_df.head(5)

Unnamed: 0,Act_time,Actor,A_Acc,loc_x,loc_y,loc_z,etc_str1,etc_num2,ip_address,Player
989,2010-04-09 00:13:55.420,362935,10400260,1686,1535,119,59.3.5.3,10,,36293510400260
1798,2010-04-09 00:27:51.043,362935,10400260,1686,1535,119,59.3.5.3,10,,36293510400260
2671,2010-04-09 00:43:35.827,362935,10400260,1686,1535,119,59.3.5.3,10,,36293510400260
3441,2010-04-09 00:57:45.013,362935,10400260,1686,1535,119,59.3.5.3,10,,36293510400260
4226,2010-04-09 01:11:29.373,362935,10400260,1686,1535,119,59.3.5.3,10,,36293510400260


In [82]:
invalid_ip_df.Player.value_counts().head(5)

40852210811282    12231
43920811016837     8924
36293510400260      540
2250418871463       347
2529348871463       228
Name: Player, dtype: int64

In [26]:
invalid_ip_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 24518 entries, 989 to 5687468
Data columns (total 9 columns):
Act_time      24518 non-null object
Actor         24518 non-null int64
A_Acc         24518 non-null int64
loc_x         24518 non-null int64
loc_y         24518 non-null int64
loc_z         24518 non-null int64
etc_str1      24518 non-null object
etc_num2      24518 non-null int64
ip_address    0 non-null object
dtypes: int64(6), object(3)
memory usage: 1.9+ MB


In [83]:
pi_df_player_ips_grpby = pi_df[['Player', 'ip_address']].groupby(by=['Player'])

In [63]:
pi_df_player_ips_grpby.get_group('20396130143')['etc_str1'].unique()

NameError: name 'pi_df_player_ips_grpby' is not defined

In [122]:
pi_df_player_ips_grpby.get_group('22385811587').count()
#invalid_ip_df[invalid_ip_df['Actor'] == 1047]

Player        9
ip_address    9
dtype: int64

In [73]:
tmp = pi_df[pi_df.Player == '1049_6275719']
print(np.sum(tmp.loc_x), np.sum(tmp.loc_y), np.sum(tmp.loc_z),)


73293 25120 28157


In [None]:
# 20396130143   --- Logged in only once.    Total play time:   10827
# 26386298485 -- three entries over 2 days.  Total play time : 12353   

In [None]:
pi_df[pi_df.]

# Finding Players login frequency

55

Unnamed: 0,Act_time,Actor,A_Acc,loc_x,loc_y,loc_z,etc_str1,etc_num2,ip_address


dtype('int64')

In [110]:
# First, group by Actor
actor_login_time_grp_by = pi_df[['Actor', 'A_Acc', 'Act_time']].groupby(by=['Actor', 'A_Acc'])

In [111]:
actor_login_time_grp_by.count().sort_values(by='Act_time', ascending=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,Act_time
Actor,A_Acc,Unnamed: 2_level_1
372311,10501244,15247
374318,10501244,15242
374321,10501244,15242
374319,10501244,15238
374322,10501244,15237
408522,10811282,13032
318771,9883148,12708
376398,10534989,12628
424111,10960451,12494
403485,10765729,12460
