### Requirements
#### Folder heirarchy for running these codes
- analytics-vidhya (top level folder)
    - data
        - click-prediction
            - data files in csv here, all outputs of different stages are also mv into here
    - click-prediction
        - Ipython notebook files here

#### Software requirements
- Anaconda Ipython distribution
- MySQL for geocode data analysis
- XGBoost compiled for respective platform
- Any other libs will be mentioned on the go


In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib as mplt
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from IPython.core.interactiveshell import InteractiveShell
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import explained_variance_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
%matplotlib inline
import seaborn as sns
import math
import gc
import ipaddress

import sys
from pathlib import Path
d = Path().resolve().parent.parent
sys.path.insert(0, str(d))
import util.utils as utils
import util.plot_utils as plot_utils



plt.rcParams["figure.figsize"] = (12,4)

from xgboost import XGBClassifier
import xgboost as xgb
from sklearn.metrics import accuracy_score



InteractiveShell.ast_node_interactivity = "all"
np.set_printoptions(precision=2)

In [2]:
project_name="click-prediction"
def get_file_location(filename):
    return "../data/%s/%s" % (project_name,filename)

In [3]:

dtypes_train = {"ID":"int64",
"Country":"object",
"Carrier":"float64",
"TrafficType":"object",
"ClickDate":"object",
"Device":"object",
"Browser":"object",
"OS":"object",
"RefererUrl":"object",
"UserIp":"object",
"ConversionStatus":"bool",
"ConversionDate":"object",
"ConversionPayOut":"float64",
"publisherId":"object",
"subPublisherId":"object",
"advertiserCampaignId":"float64",
"Fraud":"float64"}

dtypes_test = {"ID":"int64",
"Country":"object",
"Carrier":"float64",
"TrafficType":"object",
"ClickDate":"object",
"Device":"object",
"Browser":"object",
"OS":"object",
"RefererUrl":"object",
"UserIp":"object",
"publisherId":"object",
"subPublisherId":"object",
"advertiserCampaignId":"float64",
"Fraud":"float64"}
# test size = 25,548,872
# train size = 63,367,217

In [5]:


def fast_read_and_append(file_path,chunksize,fullsize,dtypes):
    # in chunk reading be careful as pandas might infer a columns dtype as different for diff chunk
    # As such specifying a dtype while reading by giving params to read_csv maybe better
    # Label encoding will fail if half the rows for same column is int and rest are str
    # In case of that already happened then df_test["publisherId"] = df_test["publisherId"].apply(str)
    df = pd.DataFrame()
    iterations = 0
    total_needed_iters = math.ceil(fullsize/chunksize)
    for x in pd.read_csv(file_path, chunksize=chunksize,low_memory=False,dtype=dtypes):
        print("iterations= %s out of %s" %  (iterations,total_needed_iters))
        df = pd.concat([df, x], ignore_index=True)
        iterations += 1
    return df
    

In [6]:

df=fast_read_and_append("../data/%s/train.csv" % project_name,1000000,63367217,dtypes_train)



iterations= 0 out of 64
iterations= 1 out of 64
iterations= 2 out of 64
iterations= 3 out of 64
iterations= 4 out of 64
iterations= 5 out of 64
iterations= 6 out of 64
iterations= 7 out of 64
iterations= 8 out of 64
iterations= 9 out of 64
iterations= 10 out of 64
iterations= 11 out of 64
iterations= 12 out of 64
iterations= 13 out of 64
iterations= 14 out of 64
iterations= 15 out of 64
iterations= 16 out of 64
iterations= 17 out of 64
iterations= 18 out of 64
iterations= 19 out of 64
iterations= 20 out of 64
iterations= 21 out of 64
iterations= 22 out of 64
iterations= 23 out of 64
iterations= 24 out of 64
iterations= 25 out of 64
iterations= 26 out of 64
iterations= 27 out of 64
iterations= 28 out of 64
iterations= 29 out of 64
iterations= 30 out of 64
iterations= 31 out of 64
iterations= 32 out of 64
iterations= 33 out of 64
iterations= 34 out of 64
iterations= 35 out of 64
iterations= 36 out of 64
iterations= 37 out of 64
iterations= 38 out of 64
iterations= 39 out of 64
iterations

In [7]:
df.shape
df.dtypes

(63367217, 17)

ID                        int64
Country                  object
Carrier                 float64
TrafficType              object
ClickDate                object
Device                   object
Browser                  object
OS                       object
RefererUrl               object
UserIp                   object
ConversionStatus           bool
ConversionDate           object
ConversionPayOut        float64
publisherId              object
subPublisherId           object
advertiserCampaignId    float64
Fraud                   float64
dtype: object

In [9]:
df_test=fast_read_and_append("../data/%s/test.csv" % project_name,500000,24510750,dtypes_test)

iterations= 0 out of 50
iterations= 1 out of 50
iterations= 2 out of 50
iterations= 3 out of 50
iterations= 4 out of 50
iterations= 5 out of 50
iterations= 6 out of 50
iterations= 7 out of 50
iterations= 8 out of 50
iterations= 9 out of 50
iterations= 10 out of 50
iterations= 11 out of 50
iterations= 12 out of 50
iterations= 13 out of 50
iterations= 14 out of 50
iterations= 15 out of 50
iterations= 16 out of 50
iterations= 17 out of 50
iterations= 18 out of 50
iterations= 19 out of 50
iterations= 20 out of 50
iterations= 21 out of 50
iterations= 22 out of 50
iterations= 23 out of 50
iterations= 24 out of 50
iterations= 25 out of 50
iterations= 26 out of 50
iterations= 27 out of 50
iterations= 28 out of 50
iterations= 29 out of 50
iterations= 30 out of 50
iterations= 31 out of 50
iterations= 32 out of 50
iterations= 33 out of 50
iterations= 34 out of 50
iterations= 35 out of 50
iterations= 36 out of 50
iterations= 37 out of 50
iterations= 38 out of 50
iterations= 39 out of 50
iterations

In [10]:
df_test.shape

(25548873, 14)

In [11]:
# transforms for space efficiency
def transform_1(df):
    df["Carrier"] = df["Carrier"].astype(int)
    

transform_1(df)
transform_1(df_test)


def transform_2(df):
    df["advertiserCampaignId"] = df["advertiserCampaignId"].fillna(-997)
    df["advertiserCampaignId"] = df["advertiserCampaignId"].astype(int)
    

transform_2(df)
transform_2(df_test)

def transform_3(df):
    df["Fraud"] = df["Fraud"].fillna(-997)
    df["Fraud"] = df["Fraud"].astype(int)
    
transform_3(df)
transform_3(df_test)
    

In [12]:
def label_encode_field(df,df_test,field):
    df[field] = df[field].fillna('-997')
    df_test[field] = df_test[field].fillna('-997')
    encoded_df = df[field].append(df_test[field])
    encoded_df.fillna('-1')
    label_encoder = LabelEncoder()
    encoder = label_encoder.fit(encoded_df.values)
    encoded_t=encoder.transform(df_test[field].fillna("-997").values)
    df_test[field] = encoded_t
    encoded_df=encoder.transform(df[field].fillna("-997").values)
    df[field] = encoded_df
    return encoder


def store_encoder_as_file(le,column_name):
    ids =np.arange(0,len(list(le.classes_))).astype(int)
    my_encoding = pd.DataFrame(list(le.classes_), ids, columns = [column_name])
    print(my_encoding.shape)
    my_encoding.to_csv(get_file_location("%s-encoding.csv" % column_name), index_label = ["id"])

In [13]:
# df_test["publisherId"] = df_test["publisherId"].apply(str)


store_encoder_as_file(label_encode_field(df,df_test,"Country"),"Country")
store_encoder_as_file(label_encode_field(df,df_test,"TrafficType"),"TrafficType")


store_encoder_as_file(label_encode_field(df,df_test,"Device"),"Device")


store_encoder_as_file(label_encode_field(df,df_test,"Browser"),"Browser")



(233, 1)
(3, 1)
(1344, 1)
(42, 1)


In [14]:




store_encoder_as_file(label_encode_field(df,df_test,"OS"),"OS")


store_encoder_as_file(label_encode_field(df,df_test,"RefererUrl"),"RefererUrl")


store_encoder_as_file(label_encode_field(df,df_test,"UserIp"),"UserIp")


(22, 1)
(404286, 1)
(14385484, 1)


In [15]:


store_encoder_as_file(label_encode_field(df,df_test,"publisherId"),"publisherId")
store_encoder_as_file(label_encode_field(df,df_test,"subPublisherId"),"subPublisherId")

(9443, 1)
(8596, 1)


In [16]:
df.to_csv(get_file_location("train-step-1.csv"))

In [17]:
df_test.to_csv(get_file_location("test-step-1.csv"))