In [22]:
""" 

    Check which IPs are common to train & test.

Assign rows to 1, 2, 3 according to ip.
1: only train
2: only test
3: common in train & test
"""

import pandas as pd
from tqdm import tqdm

TRAIN_CSV = 'input/train.csv'
TEST_V0_CSV = 'input/test_v0.csv'

dtypes = {
    'ip':'uint32',
    'app': 'uint16',
    'device': 'uint16',
    'os': 'uint16',
    'channel': 'uint16',
    'is_attributed': 'uint8'
}

def check_overlapping(column):    
    df_train = pd.read_csv(TRAIN_CSV, dtype=dtypes, usecols=[column])
    df_test = pd.read_csv(TEST_V0_CSV, dtype=dtypes, usecols=[column])
    train_values = set(list(df_train.loc[:, column].values))
    test_values = set(list(df_test.loc[:, column].values))
    
    x1 = train_values - test_values
    x2 = test_values - train_values
    x3 = train_values & test_values
    return x1, x2, x3
    
    
def build_mapping(train_only, test_only, common):
    print('load train')
    train_df = pd.read_csv(TRAIN_CSV, usecols=['ip'])
    
    print('load test')
    test_df = pd.read_csv(TEST_V0_CSV, usecols=['ip']) 

    df = train_df.append(test_df) 
    print('.')
    
    ips = df['ip'].values
    mapped = [0]* len(ips)
    
    for idx, x in tqdm(enumerate(ips), mininterval=5):
        if x in train_only:
            mapped[idx] = 1
        elif x in test_only:
            mapped[idx] = 2 
        elif x in common:
            mapped[idx] = 3
    return mapped


In [23]:
train_only, test_only, common = check_overlapping('ip')

In [24]:
len(train_only), len(test_only), len(common)

(238365, 87383, 39031)

In [25]:
mapped = build_mapping(train_only, test_only, common)

load train
load test


0it [00:00, ?it/s]

.


242441395it [18:11, 222054.01it/s]


In [26]:
type(mapped)

list

In [28]:
import pickle
with open('cache/ip_mapping.pkl', 'wb') as f:
    pickle.dump([mapped, train_only, test_only, common], f)

1

In [1]:
import pickle
import os

with open('cache/ip_mapping.pkl', 'rb') as f:
    mapped, _, _, _ = pickle.load(f)

In [2]:

with open(os.path.join('cache/train_test_base.pkl'), 'rb') as f:        
    df = pickle.load(f)        
    df = df.reset_index(drop=True)


In [3]:
ips = df[['ip']].values

In [4]:
import numpy as np
mapped = np.array(mapped)



In [6]:
print(np.sum((mapped==3).astype(np.uint8)))
print('.')

201189954
.


In [5]:
len(ips)

242441395

In [None]:
df_train = pd.read_csv(TRAIN_CSV, dtype=dtypes, usecols=[column], parse_dates=['click_time'])
df_test = pd.read_csv(TEST_V0_CSV, dtype=dtypes, usecols=[column], parse_dates=['click_time'])

df_train['day'] = df_train.click_time.dt.day.astype('uint8')
df_train89 = df_train[df_train.day.isin([8, 9])]

def check_overlapping(df1, df2, column):    
    x1_values = set(list(x1.loc[:, column].values))
    x2_values = set(list(x2.loc[:, column].values))
    
    x1 = x1_values - x2_values
    x2 = x2_values - x1_values
    x3 = x1_values & x2_values
    return x1, x2, x3

print(check_overlapping(df_train89, df_test, 'ip'))