### Data source and method for IP
[Source Maxmind](http://dev.maxmind.com/geoip/geoip2/geolite2/)

Download city csv data [link](http://geolite.maxmind.com/download/geoip/database/GeoLite2-City-CSV.zip)

Used Files
- GeoLite2-City-Blocks-IPv4.csv ()
- GeoLite2-City-Locations-en.csv ()

Create Table for Mysql
```
CREATE TABLE `geocodes_data` (
  `network` varchar(64) DEFAULT NULL,
  `geoname_id` bigint(11) NOT NULL,
  `postal_code` varchar(256) DEFAULT NULL,
  KEY `geoname_id` (`geoname_id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;


CREATE TABLE `geocodes` (
  `geoname_id` bigint(11) NOT NULL,
  `country_iso_code` varchar(8) DEFAULT NULL,
  `city_name` varchar(1024) DEFAULT NULL,
  `subdivision_1_iso_code` varchar(256) DEFAULT NULL,
  UNIQUE KEY `geoname_id` (`geoname_id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;

```
After importing these files in mysql run
```
select gd.network as network, gc.geoname_id as geoname_id,gc.country_iso_code as country_iso_code, gc.city_name as city,gc.`subdivision_1_iso_code` as region_code, gd.postal_code as postal_code
from geocodes gc inner join geocodes_data gd 
where gc.geoname_id=gd.geoname_id
```

then download the file generated as csv for python processing, name the file as `geocodes_joined.csv`

scp the file to server `scp geocodes_joined.csv.gz ahemf@server.com:/home/ahemf/mygit/data-science/analytics-vidhya/data/click-prediction`

#### Improving Country data
- we have added some addition country data using ips to geocode mapping
- for that update the encoding file `Country-encoding.csv` by adding a line `-1,GB`

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib as mplt
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from IPython.core.interactiveshell import InteractiveShell
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import explained_variance_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
%matplotlib inline
import seaborn as sns
import math

import sys
from pathlib import Path
d = Path().resolve().parent.parent
sys.path.insert(0, str(d))
import util.utils as utils
import util.plot_utils as plot_utils
import gc



plt.rcParams["figure.figsize"] = (12,4)

from xgboost import XGBClassifier
import xgboost as xgb
from sklearn.metrics import accuracy_score
import ipaddress

InteractiveShell.ast_node_interactivity = "all"
np.set_printoptions(precision=2)

In [2]:
project_name="click-prediction"
def get_file_location(filename):
    return "../data/%s/%s" % (project_name,filename)

In [4]:
df_ip2l=pd.read_csv(get_file_location("IP2LOCATION-LITE-DB11.CSV"))

In [7]:
df_ip2l.columns = ["start","end","Country","Country_full","region","city","lat","long","postal_code","tz"]


Unnamed: 0,start,end,Country,Country_full,region,city,lat,long,postal_code,tz
0,16777216,16777471,AU,Australia,Queensland,Brisbane,-27.46794,153.02809,4000,+10:00
1,16777472,16778239,CN,China,Fujian,Fuzhou,26.06139,119.30611,350004,+08:00
2,16778240,16778495,AU,Australia,Victoria,Melbourne,-37.814,144.96332,8010,+10:00
3,16778496,16779263,AU,Australia,-,-,-33.86785,151.20732,-,+10:00
4,16779264,16781311,CN,China,Guangdong,Guangzhou,23.11667,113.25,510140,+08:00


In [8]:
df_ip2l.drop(['Country_full'], axis = 1, inplace = True)
df_ip2l.drop(['lat'], axis = 1, inplace = True)
df_ip2l.drop(['long'], axis = 1, inplace = True)
df_ip2l.head()

Unnamed: 0,start,end,Country,region,city,postal_code,tz
0,16777216,16777471,AU,Queensland,Brisbane,4000,+10:00
1,16777472,16778239,CN,Fujian,Fuzhou,350004,+08:00
2,16778240,16778495,AU,Victoria,Melbourne,8010,+10:00
3,16778496,16779263,AU,-,-,-,+10:00
4,16779264,16781311,CN,Guangdong,Guangzhou,510140,+08:00


In [14]:
df_ip2l_processed = pd.DataFrame()
for index, row in df_ip2l.iterrows():
    df_row = pd.DataFrame()
    for ipaddr in ipaddress.summarize_address_range(ipaddress.IPv4Address(row["start"]),ipaddress.IPv4Address(row["end"])):
        df_cur=pd.DataFrame(data={"ip":[ipaddr.exploded],"Country":[row["Country"]],"region":[row["region"]],"city":[row["city"]],
                                  "postal_code":[row["postal_code"]],"tz":[row["tz"]]})
        df_row = pd.concat([df_row,df_cur],ignore_index=True)
    if index % 100000==0:
        print(index)
    df_ip2l_processed = pd.concat([df_ip2l_processed,df_row],ignore_index=True)
    
df_ip2l_processed.head()
        
        

0
100000


KeyboardInterrupt: 

In [3]:
def fast_read_and_append(file_path,chunksize,fullsize,sample):
    # in chunk reading be careful as pandas might infer a columns dtype as different for diff chunk
    # As such specifying a dtype while reading by giving params to read_csv maybe better
    # Label encoding will fail if half the rows for same column is int and rest are str
    # In case of that already happened then df_test["publisherId"] = df_test["publisherId"].apply(str)
    df = pd.DataFrame()
    iterations = 0
    total_needed_iters = math.ceil(fullsize/chunksize)
    for x in pd.read_csv(file_path, chunksize=chunksize,low_memory=False):
        x = x.sample(frac=sample)
        print("iterations= %s out of %s" %  (iterations,total_needed_iters))
        df = pd.concat([df, x], ignore_index=True)
        iterations += 1
    gc.collect()
    return df

In [7]:
df_conv_status = pd.read_csv("../data/%s/%s" % (project_name,"train-true_5.csv"))
df_fraud= pd.read_csv("../data/%s/%s" % (project_name,"train-fraud_5.csv"))
df = fast_read_and_append("../data/%s/%s" % (project_name,"train_5.csv"),500000,64000000,0.6)

# df_net = pd.read_csv("../data/%s/%s" % (project_name,"network-step-4-encoding.csv"))
df_subnet= pd.read_csv("../data/%s/%s" % (project_name,"subnet-step-4-encoding.csv"))

iterations= 0 out of 128
iterations= 1 out of 128
iterations= 2 out of 128
iterations= 3 out of 128
iterations= 4 out of 128
iterations= 5 out of 128
iterations= 6 out of 128
iterations= 7 out of 128
iterations= 8 out of 128
iterations= 9 out of 128
iterations= 10 out of 128
iterations= 11 out of 128
iterations= 12 out of 128
iterations= 13 out of 128
iterations= 14 out of 128
iterations= 15 out of 128
iterations= 16 out of 128
iterations= 17 out of 128
iterations= 18 out of 128
iterations= 19 out of 128
iterations= 20 out of 128
iterations= 21 out of 128
iterations= 22 out of 128
iterations= 23 out of 128
iterations= 24 out of 128
iterations= 25 out of 128
iterations= 26 out of 128
iterations= 27 out of 128
iterations= 28 out of 128
iterations= 29 out of 128
iterations= 30 out of 128
iterations= 31 out of 128
iterations= 32 out of 128
iterations= 33 out of 128
iterations= 34 out of 128
iterations= 35 out of 128
iterations= 36 out of 128
iterations= 37 out of 128
iterations= 38 out of 

In [33]:
df_test= pd.read_csv("../data/%s/%s" % (project_name,"test_5.csv"))

In [8]:
df.shape

(38020330, 19)

In [9]:
gc.collect()

14

In [10]:
df_geo=pd.read_csv("../data/%s/%s" % (project_name,"geocodes_joined.csv"))

  interactivity=interactivity, compiler=compiler, result=result)


In [11]:
df_geo["subnet"] = df_geo["network"].map(lambda x: ".".join(x.split(".")[:3]))

In [12]:
df_geo_needed=df_geo[df_geo["subnet"].isin(df_subnet["subnet-step-4"])]

In [13]:
df_subnet["subnet"]=df_subnet["subnet-step-4"]

In [14]:
df_geo_merged=df_geo_needed.merge(df_subnet,on="subnet")

In [15]:
df_geo_merged.drop(['network'], axis = 1, inplace = True)
df_geo_merged.drop(['geoname_id'], axis = 1, inplace = True)
# df_geo_merged.drop(['country_name'], axis = 1, inplace = True)
# df_geo_merged.drop(['subnet-step-4'], axis = 1, inplace = True)


df_geo_merged["subnet"] = df_geo_merged["id"]
df_geo_merged.drop(['id'], axis = 1, inplace = True)

In [16]:
df_country = pd.read_csv("../data/%s/%s" % (project_name,"Country-encoding.csv"))

In [17]:
df_country["country_iso_code"] = df_country["Country"]

In [18]:
df_geo_merged=df_geo_merged.merge(df_country,on="country_iso_code",how="left")

In [19]:
df_geo_merged["id"].fillna(-997,inplace=True)
df_geo_merged["Country"] = df_geo_merged["id"].astype(int)
df_geo_merged.drop(['id'], axis = 1, inplace = True)

In [20]:

df_geo_merged["Country"] = df_geo_merged["Country"].astype(int)

In [21]:
df_geo_merged.drop(['country_iso_code'], axis = 1, inplace = True)

In [22]:
df_geo_merged.drop(['subnet-step-4'], axis = 1, inplace = True)

In [27]:
def geo_add(df,df_geo):
    df = df.merge(df_geo,on="subnet",how="left")
    df = df.drop_duplicates(subset="ID")
    
    # replace by loc 
    c1=df["Country_x"].isin([0,1])
    c2=~df["Country_y"].isnull()
    print(c1.shape)
    df.loc[(c1) & (c2),"Country_x"] = df.loc[(c1) & (c2),"Country_y"]
#     for i, row in dft.iterrows():
#         country = row["Country_x"]
#         if (country<2) & (country>=0) & (~np.isnan(row["Country_y"])):
#             country = row["Country_y"]
#             dft.set_value(i,'Country_x',country)
    df["Country"] = df["Country_x"]
    print(df[(df["Country_x"]<2) & (df["Country_x"]>=0)].shape)
    df.drop(['Country_x'], axis = 1, inplace = True)
    df.drop(['Country_y'], axis = 1, inplace = True)
    return df
    
    

In [28]:
df_conv_status = geo_add(df_conv_status,df_geo_merged)
df_fraud = geo_add(df_fraud,df_geo_merged)




(34524,)
(18, 24)
(396,)
(0, 24)


In [29]:
df = geo_add(df,df_geo_merged)

(38020330,)
(28808, 24)


In [34]:
df_test = geo_add(df_test,df_geo_merged)

(25548873,)
(11417, 22)


In [31]:
df.to_csv(get_file_location("train_6.csv"),index=False)
df_conv_status.to_csv(get_file_location("train-true_6.csv"),index=False)
df_fraud.to_csv(get_file_location("train-fraud_6.csv"),index=False)


In [35]:
df_test.to_csv(get_file_location("test_6.csv"),index=False)

In [30]:
df.head()

Unnamed: 0,ID,Carrier,TrafficType,Device,Browser,OS,RefererUrl,ConversionPayOut,publisherId,subPublisherId,...,conv_status,dayofweek,hour,dayofyear,network,subnet,city,region_code,postal_code,Country
0,50910305,-1,2,589,1,1,2,0.0,256,0,...,0,0,22,240,625,28790,Chittagong,B,4000.0,19.0
1,1392717,25,1,601,4,1,2,0.0,8658,0,...,0,0,5,233,22406,1017985,,,,204.0
2,62101070,-1,1,388,2,1,1764,0.0,4660,0,...,0,2,19,242,28218,1242420,Guwahati,AS,781102.0,99.0
11,21625906,-1,2,388,4,1,2,0.0,678,0,...,0,2,22,235,735,34348,,,,19.0
12,14672381,-1,0,388,2,1,2,0.0,678,0,...,0,2,2,235,33934,1570253,Vienne,ARA,38200.0,72.0


In [36]:
len(df.dtypes)
len(df_conv_status.dtypes)
len(df_test.dtypes)

22

22

20

In [None]:
df_geo_merged.tail()

NameError: name 'df' is not defined

In [None]:
df_geo.head()