In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the CSV (path is relative to the notebook's working directory) into the
# DataFrame that the rest of the notebook works on.
pi_df = pd.read_csv("PlayerInformationFeatures.csv")

In [3]:
pi_df.head(5)

Unnamed: 0,Act_time,Actor,A_Acc,loc_x,loc_y,loc_z,etc_str1,etc_num2
0,2010-04-09 00:00:02.310,376398,10534989,1686,1535,119,210.206.,10
1,2010-04-09 00:00:03.623,343471,8291880,2463,2062,578,116.36.1,50
2,2010-04-09 00:00:04.187,338523,10091973,212,194,107,122.35.8,42
3,2010-04-09 00:00:04.187,424234,6602247,1714,1868,255,218.237.,27
4,2010-04-09 00:00:04.827,296513,9098427,1244,1562,214,119.192.,47


In [4]:
pi_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5687619 entries, 0 to 5687618
Data columns (total 8 columns):
Act_time    object
Actor       int64
A_Acc       int64
loc_x       int64
loc_y       int64
loc_z       int64
etc_str1    object
etc_num2    int64
dtypes: int64(6), object(2)
memory usage: 347.1+ MB


In [6]:
# We do not have any null values
pi_df.isnull().any()

Act_time    False
Actor       False
A_Acc       False
loc_x       False
loc_y       False
loc_z       False
etc_str1    False
etc_num2    False
dtype: bool

In [7]:
pi_df.isna().any()

Act_time    False
Actor       False
A_Acc       False
loc_x       False
loc_y       False
loc_z       False
etc_str1    False
etc_num2    False
dtype: bool

In [8]:
# From the above two, we can see that the dataset does not have any missing values.

In [9]:
# The column etc_str1 holds the first three octets of the IP address, and the column etc_num2 holds the
# last octet. We need to combine the two into a single value and store it in a new column.

In [52]:
def get_ip_from_values(octet123, octet4):
    """Join a partial IP prefix and a final octet into one dotted string.

    Args:
        octet123: Leading part of the address, holding one to three
            dot-separated components and optionally one trailing dot
            (e.g. ``"210.206."``). Converted with ``str()`` first.
        octet4: Last octet of the address. Converted with ``str()`` first.

    Returns:
        The four-part dotted string, with ``"0"`` filled in for every
        component missing from the prefix, or ``None`` when the prefix
        splits into more than three components.
    """
    prefix = str(octet123)
    last_octet = str(octet4)

    # Only a single trailing dot is dropped; any further dots stay in place.
    if prefix.endswith("."):
        prefix = prefix[:-1]

    parts = prefix.split(".")
    if len(parts) > 3:
        # More components than a three-octet prefix can hold: no address is built.
        return None

    # Pad the prefix out to three components, using 0 for each missing one.
    padding = ["0"] * (3 - len(parts))
    return ".".join(parts + padding + [last_octet])

In [53]:
# Test out the function
print(get_ip_from_values("1.2.3", "4"))
print(get_ip_from_values("1.2", "4"))
print(get_ip_from_values("1", "4"))
print(get_ip_from_values("1.2.", 4))
print(get_ip_from_values("1.2.3.", 4))
print(get_ip_from_values("1.", 4))

1.2.3.4
1.2.0.4
1.0.0.4
1.2.0.4
1.2.3.4
1.0.0.4


In [None]:
# Now we will combine the two columns into one

In [34]:
#test = pd.DataFrame({'col1': ['1.2.3', '1.2', '1'], 'col2': [1,2,3]})

In [54]:
# Build the full IP address for every row from its etc_str1 / etc_num2 pair.
# The two columns are iterated in lockstep instead of going through a row-wise
# DataFrame.apply: this avoids building a Series object for every row and the
# positional s[0] / s[1] lookups on a label-indexed Series, which pandas has
# deprecated. The resulting values are the same.
pi_df["ip_address"] = [
    get_ip_from_values(octet123, octet4)
    for octet123, octet4 in zip(pi_df["etc_str1"], pi_df["etc_num2"])
]

In [55]:
pi_df.head(5)

Unnamed: 0,Act_time,Actor,A_Acc,loc_x,loc_y,loc_z,etc_str1,etc_num2,ip_address
0,2010-04-09 00:00:02.310,376398,10534989,1686,1535,119,210.206.,10,210.206.0.10
1,2010-04-09 00:00:03.623,343471,8291880,2463,2062,578,116.36.1,50,116.36.1.50
2,2010-04-09 00:00:04.187,338523,10091973,212,194,107,122.35.8,42,122.35.8.42
3,2010-04-09 00:00:04.187,424234,6602247,1714,1868,255,218.237.,27,218.237.0.27
4,2010-04-09 00:00:04.827,296513,9098427,1244,1562,214,119.192.,47,119.192.0.47


In [68]:
def validate_ip(ipaddress):
    """Check whether a value is a well-formed dotted-quad IPv4 address.

    Args:
        ipaddress: Value to check. It is converted with ``str()`` first, so
            non-string input such as ``None`` is simply reported as invalid.

    Returns:
        ``True`` when the text consists of exactly four dot-separated parts,
        each made only of the digits 0-9 and with a value from 0 to 255;
        ``False`` otherwise.
    """
    octets = str(ipaddress).split(".")

    # An IPv4 address has exactly four octets. Without this check, inputs such
    # as "1.2.3" or "1.2.3.4.5" would be reported as valid.
    if len(octets) != 4:
        return False

    for octet in octets:
        # Accept plain digits only. int() alone would also accept signs and
        # surrounding whitespace ("-1", " 7"), letting negative octets through.
        if not octet or any(ch not in "0123456789" for ch in octet):
            return False

        # Leading zeros do not change the value; the length guard keeps an
        # absurdly long digit string from ever being converted.
        significant = octet.lstrip("0")
        if len(significant) > 3 or int(significant or "0") > 255:
            return False

    return True

In [71]:
print(validate_ip('210.206.0.10'))

True


In [75]:
# Keep only the rows whose ip_address is null. get_ip_from_values returns None
# when etc_str1 splits into more than three dot-separated parts, so those are
# the rows expected here.
# NOTE(review): validate_ip is not applied at this point, so addresses that were
# built but contain out-of-range octets are not included — confirm this is intended.
invalid_ip_df = pi_df[pi_df.ip_address.isnull()]

In [77]:
invalid_ip_df.Actor.value_counts()

408522    12231
439208     8924
362935      540
225041      347
252934      228
442068      209
368708      173
427679      136
427674      127
427672      122
340481      121
427677      119
4396        115
5802        113
98207        86
409066       86
194759       86
290168       79
440269       72
446535       66
437740       28
422980       28
468378       27
162605       24
29300        24
418569       19
130133       18
169993       18
441434       18
441165       17
          ...  
129991        1
347041        1
444913        1
393704        1
170331        1
85886         1
462630        1
384796        1
379374        1
211280        1
342000        1
22261         1
407966        1
98825         1
407356        1
273310        1
438973        1
401839        1
297290        1
100506        1
59446         1
427722        1
366219        1
447776        1
203705        1
207279        1
348176        1
444289        1
142988        1
101833        1
Name: Actor, Length: 192