In [None]:

import zipfile
import os
import csv
import gc
import pickle
import dill
from datetime import datetime
import pandas as pd

import geopy
from geopy.geocoders import Nominatim

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, confusion_matrix, roc_curve, make_scorer
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_validate

import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.ensemble import IsolationForest
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt

In [None]:

# Load the merged sessions/hits table (see DataPreparation.ipynb for more details).
df = pd.read_csv('./data/sessions_hits_merged.csv')

# Hold out 20% of the rows, stratified on the target so both parts keep the same class ratio.
# The fixed random_state makes the split (and every metric derived from it) reproducible
# after a kernel restart.
train, test = train_test_split(df, test_size=0.2, stratify=df['target'], random_state=42)



In [None]:

# Load prepared geo data.

def _load_lookup(path):
    """Read a CSV file into a dict mapping its first column to its second column.

    Both fields are stripped of surrounding whitespace. Rows with fewer than two
    fields (for example blank lines) are skipped instead of raising IndexError.
    """
    # newline='' lets the csv module handle line endings itself, as its documentation recommends.
    with open(path, encoding='UTF-8', newline='') as f:
        return {row[0].strip(): row[1].strip() for row in csv.reader(f) if len(row) >= 2}


# Looked up by country name in get_coordinates to replace invalid city values.
capitals_dict = _load_lookup('./data/capitals-list.csv')

# Looked up by country name in get_region to obtain the region label.
country_dict = _load_lookup('./data/countries of the world.csv')

# City table merged on its 'city' column; keep one row per city so the merge cannot multiply rows.
cities = pd.read_csv('./data/cities_mod.csv').drop_duplicates('city')

In [None]:

def filter_cols(data):
    """Return a new frame without the raw identifier, geo, date/time, device and utm columns.

    Every listed column must be present in ``data``; otherwise ``DataFrame.drop`` raises KeyError.
    The input frame itself is left unchanged.
    """
    unwanted = (
        'utm_keyword',
        'device_model',
        'client_id',
        'session_id',
        'geo_city',
        'geo_country',
        'visit_time',
        'visit_date',
        'device_os',
        'geo_region',
        'utm_adcontent',
        'utm_campaign',
        'utm_source',
        'utm_medium',
    )
    return data.drop(columns=list(unwanted))

In [None]:

def get_coordinates(data):
    """Return a new frame with ``lat``/``lng`` coordinates attached for each row's ``geo_city``.

    Steps:
      1. ``geo_country``: '(not set)' and missing values are replaced with 'Russia' (the default country).
      2. ``geo_city``: values equal to '(not set)' or containing four consecutive digits are replaced
         with ``capitals_dict[geo_country]``, or with the country name itself when it has no entry.
      3. The module-level ``cities`` table is left-merged on the city name to obtain ``lat``/``lng``.
      4. Cities that still have no ``lat`` are looked up through the Nominatim geocoder
         (one network request per distinct city); cities it cannot resolve keep missing coordinates.

    The input frame is not modified: the cleaned columns are written to a copy, so the
    caller's train/test frames keep their raw values.

    NOTE(review): geocoder exceptions (timeouts, service errors) are not caught and propagate to the caller.
    """
    country = data['geo_country'].replace('(not set)', 'Russia').fillna('Russia')

    city = data['geo_city']
    # na=False: a missing city is not treated as an invalid one.
    invalid_city = (city == '(not set)') | city.str.contains(r'\d{4}', na=False)
    capital_or_country = country.map(lambda name: capitals_dict.get(name, name))

    # assign() works on a copy, leaving the caller's frame untouched.
    data = data.assign(geo_country=country, geo_city=city.mask(invalid_city, capital_or_country))

    # merge the prepared list of city coordinates
    data = data.merge(cities, left_on='geo_city', right_on='city', how='left', copy=False)

    # add missing values via geolocator
    unresolved = data.loc[data['lat'].isna(), 'geo_city'].dropna().unique()
    if len(unresolved) > 0:
        geolocator = Nominatim(user_agent="Geolocation", timeout=10)

        coord_dict = {}
        for name in unresolved:
            location = geolocator.geocode(name)
            if location:
                coord_dict[name] = (location.latitude, location.longitude)

        if coord_dict:
            geocoded_lat = data['geo_city'].map({name: point[0] for name, point in coord_dict.items()})
            geocoded_lng = data['geo_city'].map({name: point[1] for name, point in coord_dict.items()})
            # Only rows whose coordinate is still missing are filled.
            data['lat'] = data['lat'].fillna(geocoded_lat)
            data['lng'] = data['lng'].fillna(geocoded_lng)

    return data.drop('city', axis=1)


In [None]:

def get_region(data):
    """Add a ``geo_region`` column looked up from ``geo_country`` (modifies ``data`` in place).

    The module-level ``country_dict`` is first extended with the entries below; Russia and
    Belarus map to themselves. A country without an entry keeps its own name as the region.
    """
    country_dict.update({
        'Russia': 'Russia',
        'Belarus': 'Belarus',
        'Czechia': 'EASTERN EUROPE',
        'Montenegro': 'EASTERN EUROPE',
        'Kosovo': 'EASTERN EUROPE',
        'North Macedonia': 'EASTERN EUROPE',
        'South Korea': 'ASIA (EX. NEAR EAST)',
        'North Korea': 'ASIA (EX. NEAR EAST)',
        'Myanmar (Burma)': 'ASIA (EX. NEAR EAST)',
        'Wallis & Futuna': 'OCEANIA',
    })

    def region_of(country):
        # Fall back to the country name when no region is known.
        return country_dict.get(country, country)

    data['geo_region'] = data['geo_country'].map(region_of)
    return data

In [None]:

def process_date_time(data):
    """Parse ``visit_date``/``visit_time`` and derive calendar features (modifies ``data`` in place).

    Adds:
      * ``visit_dayofweek`` - weekday of ``visit_date`` (Monday = 0 ... Sunday = 6)
      * ``visit_day``       - day of month of ``visit_date``
      * ``visit_hour``      - hour of ``visit_time``

    Month and year are not extracted: the data covers a single year and not all months are present.
    The ``.dt`` accessors replace the previous per-row ``apply`` lambdas.
    """
    visit_date = pd.to_datetime(data['visit_date'])
    data['visit_date'] = visit_date
    data['visit_dayofweek'] = visit_date.dt.weekday
    data['visit_day'] = visit_date.dt.day

    visit_time = pd.to_datetime(data['visit_time'])
    data['visit_time'] = visit_time
    data['visit_hour'] = visit_time.dt.hour

    return data

In [None]:

def add_categories(data):
    """Group rare categories and derive binary traffic flags (modifies ``data`` in place).

    * ``device_browser``: values outside the known list become 'other'; 'Safari (in-app)' is folded into 'Safari'.
    * ``device_brand``: missing and '(not set)' become 'unknown'; values outside the known list become 'other'.
    * ``paid_traffic``: 0 when ``utm_medium`` is one of the organic mediums, otherwise 1.
    * ``social_net``: 1 when ``utm_source`` is one of the listed source codes, otherwise 0.
    * ``top_3_campaigns``: 1 when ``utm_campaign`` is one of the three listed campaigns, otherwise 0.
    """
    # The repeated "keep known values, otherwise 'other'" steps could be folded into a loop; maybe later.
    known_browsers = ['Chrome', 'Safari', 'YaBrowser', 'Safari (in-app)', 'Android Webview',
                      'Samsung Internet', 'Opera', 'Edge', 'Firefox']
    browser = data['device_browser']
    browser = browser.where(browser.isin(known_browsers), 'other')
    data['device_browser'] = browser.replace('Safari (in-app)', 'Safari')

    known_brands = ['Apple', 'unknown', 'Samsung', 'Xiaomi', 'Huawei', 'Realme']
    brand = data['device_brand'].fillna('unknown').replace('(not set)', 'unknown')
    data['device_brand'] = brand.where(brand.isin(known_brands), 'other')

    organic_mediums = ['organic', 'referral', '(none)', '(not set)']
    data['paid_traffic'] = (~data['utm_medium'].isin(organic_mediums)).astype('int64')

    social_sources = ['QxAxdyPLuQMEcrdZWdWb', 'MvfHsxITijuriZxsqZqt', 'ISrKoXQCxqqYvAZICvjs',
                      'IZEXUFLARCUMynmHNBGo', 'PlbkrSYoHuZBWfYjYnfw', 'gVRrcxiDQubJiljoTbGm']
    data['social_net'] = data['utm_source'].isin(social_sources).astype('int64')

    top_campaigns = ['LTuZkdKfxRGVceoWkVyg', 'LEoPHuyFvzoNfnzGgfcd', 'FTjNLDyTrXaWYgZymFkV']
    data['top_3_campaigns'] = data['utm_campaign'].isin(top_campaigns).astype('int64')

    return data


In [None]:

def add_freqs(data):
    """Add a ``<column>_freq`` feature for several columns (modifies ``data`` in place).

    Each new column holds the share of rows (rounded to 4 decimals) that carry the same value
    as the row itself; missing values are counted as a value of their own.

    The new column names are derived from the source names, so the two lists cannot drift apart,
    and the lookup goes through ``Series.map`` so that missing values in numeric columns are
    matched instead of raising KeyError.

    NOTE(review): the shares are computed from whatever frame is passed in, so a one-row frame
    yields 1.0 for every feature - confirm this is acceptable at prediction time.
    """
    cols = ['utm_adcontent', 'utm_source', 'utm_medium', 'geo_region', 'geo_city', 'visit_dayofweek',
            'visit_day', 'visit_hour', 'device_browser', 'device_brand', 'device_screen_resolution']

    n_rows = len(data.index)
    for col in cols:
        share = (data[col].value_counts(dropna=False) / n_rows).round(4)
        data[f'{col}_freq'] = data[col].map(share)

    return data


In [None]:


def multiply_res(data):
    """Replace each 'WIDTHxHEIGHT' screen resolution string with the integer product WIDTH * HEIGHT.

    Modifies ``data`` in place and returns it. Every value must be a string whose first two
    'x'-separated parts are integers.
    """
    def pixel_count(resolution):
        parts = resolution.split('x')
        return int(parts[0]) * int(parts[1])

    data['device_screen_resolution'] = data['device_screen_resolution'].map(pixel_count)
    return data


In [None]:

def treat_outliers(data):
    """Cap ``device_screen_resolution`` and ``visit_number`` at their 1.5 * IQR fences.

    For each column, values below ``q25 - 1.5 * iqr`` are raised to that bound and values above
    ``q75 + 1.5 * iqr`` are lowered to that bound. Modifies ``data`` in place and returns it.

    ``Series.clip`` is used instead of ``.loc`` assignment so that float bounds can be applied to
    an integer column without an incompatible-dtype assignment.
    """
    for col in ('device_screen_resolution', 'visit_number'):
        q25 = data[col].quantile(0.25)
        q75 = data[col].quantile(0.75)
        iqr = q75 - q25
        data[col] = data[col].clip(lower=q25 - 1.5 * iqr, upper=q75 + 1.5 * iqr)

    return data

In [None]:

# Feature-construction stage: every step is a plain function wrapped in a FunctionTransformer.
# add_freqs runs before add_categories and multiply_res, so the frequency features are computed
# on the raw browser / brand / resolution values.
constructor = Pipeline([
    ('get coordinates', FunctionTransformer(func=get_coordinates)),
    ('get region', FunctionTransformer(func=get_region)),
    ('process date', FunctionTransformer(func=process_date_time)),
    ('add frequencies', FunctionTransformer(func=add_freqs)),
    ('add categories', FunctionTransformer(func=add_categories)),
    ('screen resolution', FunctionTransformer(func=multiply_res)),
])


In [None]:

# Final clean-up stage: cap outliers first, then drop the columns listed in filter_cols.
cleaner = Pipeline([
    ('outliers', FunctionTransformer(func=treat_outliers)),
    ('filter cols', FunctionTransformer(func=filter_cols)),
])



In [None]:

# Numeric columns are standardized; the three categorical columns are one-hot encoded.
numerical_transformer = Pipeline(steps=[('scaler', StandardScaler())])

# `sparse_output` replaces the `sparse` argument, which was deprecated in scikit-learn 1.2
# (the release that also introduced the `set_output` API used below) and later removed.
# Categories unseen during fit are encoded as all zeros (handle_unknown='ignore').
categorical_transformer = Pipeline(steps=[('OHE', OneHotEncoder(handle_unknown='ignore', sparse_output=False))])

# All remaining columns are passed through unchanged; the output is a pandas DataFrame
# whose column names carry no transformer prefix.
encoder = ColumnTransformer(remainder='passthrough', verbose_feature_names_out=False, transformers=[
    ('num', numerical_transformer, [
        'visit_number', 'device_screen_resolution', 'lat', 'lng',
        'visit_dayofweek', 'visit_day', 'visit_hour',
        'utm_adcontent_freq', 'utm_source_freq', 'utm_medium_freq',
        'geo_region_freq', 'geo_city_freq',
        'visit_dayofweek_freq', 'visit_day_freq', 'visit_hour_freq',
        'device_browser_freq', 'device_brand_freq', 'device_screen_resolution_freq',
    ]),
    ('cat', categorical_transformer, ['device_category', 'device_brand', 'device_browser']),
]).set_output(transform='pandas')

In [None]:

# Let's create some more numeric features based on IsolationForest models.
# The forests need the constructed and encoded features, so run those two stages first.
interrim_pipe = Pipeline(steps=[
    ('constructor', constructor),
    ('encoder', encoder),
])

train_iso = interrim_pipe.fit_transform(X=train)


In [None]:

# Preview the first five constructed/encoded rows.
train_iso.head(5)

Unnamed: 0,visit_number,device_screen_resolution,lat,lng,visit_dayofweek,visit_day,visit_hour,utm_adcontent_freq,utm_source_freq,utm_medium_freq,geo_region_freq,geo_city_freq,visit_dayofweek_freq,visit_day_freq,visit_hour_freq,device_browser_freq,device_brand_freq,device_screen_resolution_freq,device_category_desktop,device_category_mobile,device_category_tablet,device_brand_Apple,device_brand_Huawei,device_brand_Realme,device_brand_Samsung,device_brand_Xiaomi,device_brand_other,device_brand_unknown,device_browser_Android Webview,device_browser_Chrome,device_browser_Edge,device_browser_Firefox,device_browser_Opera,device_browser_Safari,device_browser_Samsung Internet,device_browser_YaBrowser,device_browser_other,session_id,client_id,visit_date,visit_time,utm_source,utm_medium,utm_campaign,utm_adcontent,utm_keyword,device_os,device_model,geo_country,geo_city,target,geo_region,paid_traffic,social_net,top_3_campaigns
0,-0.139954,-0.548662,0.088367,-0.173755,1.613481,-0.579728,-0.915791,-1.471254,-1.360319,-1.77251,0.170042,1.026596,-1.444444,-0.423308,-0.797651,0.851323,-1.135945,-1.177134,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6936046170891716847.1639287023.1639287023,1614924000.0,2021-12-12,2023-08-23 08:30:23,vFcAhRxLfOWKhvxjELkx,organic,okTXSMadDkjvntEHzIjp,LLfCasrxQzJIyuldcuWy,aXQzDWsJuGXeBXexNHjc,Android,,Russia,Moscow,0,Russia,0,0,0
1,-0.054228,-0.485753,1.023769,-0.551096,-0.399892,-0.807138,1.042055,-0.729673,-1.223439,0.371485,0.170042,-0.419103,-0.343638,-0.731225,0.371521,-1.667279,-0.14454,-0.953705,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5022319931903866765.1636564037.1636564037,1169350000.0,2021-11-10,2023-08-23 20:07:17,QxAxdyPLuQMEcrdZWdWb,cpc,NLWjXuYiXlKrFJfSWfKt,,JTBldRAXvttfVmCNgppl,Android,,Russia,Saint Petersburg,0,Russia,1,1,0
2,-0.139954,-0.389172,-0.04674,1.055374,0.103451,1.466957,0.552593,0.903689,1.329334,1.152588,0.170042,-1.126158,0.069164,1.088286,0.503259,0.851323,-0.451404,0.92683,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4138513132372217773.1633013678.1633013678,963572700.0,2021-09-30,2023-08-23 17:00:00,ZpYIoDJMcFzVoPFsHGJL,banner,gecBYcKZCPMcVYdSSzKP,JNHcPlZPxEMWDnRiyoBf,,,,Russia,Chelyabinsk,0,Russia,1,0,0
3,-0.139954,-0.328794,0.088367,-0.173755,-1.406579,0.557319,-1.078945,-0.729673,-0.89388,0.371485,0.170042,1.026596,1.113679,-0.185372,-1.546908,-1.689116,-0.14454,0.115035,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,4244105010723100759.1637554264.1637554264,988157700.0,2021-11-22,2023-08-23 07:11:04,BHcvLfOaCWvWTykYqHVe,cpc,,,ZNhSIPchlbnKwZWcsKzz,Android,,Russia,Moscow,0,Russia,1,0,0
4,-0.054228,-0.376899,-0.14128,0.773574,-0.903236,0.216205,0.878901,0.903689,1.329334,1.152588,0.170042,-1.111756,1.307571,-0.325334,0.387988,0.851323,-0.451404,-0.354168,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,270360620890506005.1634659715.1634659715,62948240.0,2021-10-19,2023-08-23 19:00:00,ZpYIoDJMcFzVoPFsHGJL,banner,gecBYcKZCPMcVYdSSzKP,JNHcPlZPxEMWDnRiyoBf,,,,Russia,Ufa,0,Russia,1,0,0


In [None]:

def train_iso_features(data, random_state=None):
    """Fit one single-feature IsolationForest per selected column and pickle it to ./models/<feature>.pkl.

    Only rows with ``target == 0`` are used for fitting. The files written here are the ones
    read back by ``add_iso_features``.

    Parameters
    ----------
    data : DataFrame containing ``target`` and every column named in ``features``.
    random_state : optional seed forwarded to each IsolationForest; the default (None)
        keeps the previous unseeded behaviour.
    """
    # Make sure the output directory exists before the first dump.
    os.makedirs('./models', exist_ok=True)

    train_normal = data[data['target'] == 0]
    features = ['utm_adcontent_freq', 'utm_source_freq', 'paid_traffic', 'utm_medium_freq', 'top_3_campaigns',
                'device_screen_resolution_freq', 'device_browser_freq', 'lat', 'lng', 'social_net',
                'geo_city_freq', 'visit_hour_freq', 'visit_day', 'device_screen_resolution']
    for feat in features:
        isf = IsolationForest(random_state=random_state).fit(train_normal[[feat]])
        filename = './models/' + str(feat) + '.pkl'
        with open(filename, 'wb') as f:
            pickle.dump(isf, f)

train_iso_features(train_iso)


In [None]:

def add_iso_features(data):
    """Add one IsolationForest score column ('<feature>isf') per selected feature.

    For each feature the model pickled at ./models/<feature>.pkl is loaded and its
    ``score_samples`` output on that single column is stored. Modifies ``data`` in place
    and returns it.
    """
    features = ('utm_adcontent_freq', 'utm_source_freq', 'paid_traffic', 'utm_medium_freq', 'top_3_campaigns',
                'device_screen_resolution_freq', 'device_browser_freq', 'lat', 'lng', 'social_net',
                'geo_city_freq', 'visit_hour_freq', 'visit_day', 'device_screen_resolution')

    for feat in features:
        model_path = './models/' + str(feat) + '.pkl'
        with open(model_path, 'rb') as model_file:
            # NOTE: unpickling can execute arbitrary code - only load model files you created yourself.
            forest = pickle.load(model_file)
        data[str(feat) + 'isf'] = forest.score_samples(data[[feat]])
    return data

In [None]:

# Full preprocessing chain: construct features -> scale/encode -> add IsolationForest scores
# -> cap outliers and drop the raw columns.
preprocessor = Pipeline(steps=[
    ('constructor', constructor),
    ('encoder', encoder),
    ('iso', FunctionTransformer(func=add_iso_features)),
    ('cleaner', cleaner),
])


In [None]:
# Optional: cache the preprocessed train and test frames as CSV files for faster modelling (uncomment to run).
#newdf = preprocessor.fit_transform(train)
#newdf.to_csv('./new_train_with_iso.csv', index=False)
#newtest = preprocessor.fit_transform(test)
#newtest.to_csv('./new_test_with_iso.csv', index=False)

In [None]:

# Separate the target from the features for both splits.
y_train = train['target']
y_test = test['target']
x_train = train.drop(columns='target')
x_test = test.drop(columns='target')

# The fixed random_state makes repeated fits on the same data produce the same model.
model = GradientBoostingClassifier(n_estimators=200, random_state=42)




In [None]:

# End-to-end pipeline: raw session rows in, classifier predictions out.
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', model),
])

In [None]:

# Fit the preprocessing steps and the classifier on the training split.
# From here on `model` refers to what pipe.fit returns, not to the bare classifier.
model = pipe.fit(X=x_train, y=y_train)


In [None]:

# Compute ROC AUC on the test split from the second column of predict_proba.
# (The printed message reads: "ROC AUC metric value on the test sample".)
probs = model.predict_proba(x_test)[:, 1]
clf_auc_test = roc_auc_score(y_test, probs)
print('значение метрики ROC AUC на тестовой выборке %.3f' % (clf_auc_test))

# Compute the same metric on the training split for comparison.
# (The printed message reads: "ROC AUC metric value on the training sample".)
probs = model.predict_proba(x_train)[:, 1]
clf_auc = roc_auc_score(y_train, probs)
print('значение метрики ROC AUC на обучающей выборке %.3f' % (clf_auc))


  data.visit_time = pd.to_datetime(data.visit_time)
  data.visit_time = pd.to_datetime(data.visit_time)


значение метрики ROC AUC на тестовой выборке 0.679
значение метрики ROC AUC на обучающей выборке 0.679


In [None]:

# Retrain the IsolationForest models behind the added features on the whole dataset.
train_iso = interrim_pipe.fit_transform(X=df)
train_iso_features(data=train_iso)



In [None]:

# Features and target for the final fit on the whole dataset.
y = df['target']
x = df.drop(columns='target')



In [None]:

# Retrain the resulting pipeline (preprocessing + classifier) on the whole dataset.
pipe.fit(X=x, y=y)

  data.visit_time = pd.to_datetime(data.visit_time)


In [None]:


# Bundle the fitted pipeline with descriptive metadata and serialize it with dill.
# `datetime` comes from the import cell at the top of the notebook, so it is not re-imported here.
# NOTE: the stored test score is the value computed earlier, when the pipeline was fitted on the
# training split only; `pipe` itself has since been refitted on the whole dataset.
object_to_dump = {
    'model': pipe,
    'metadata': {
        'author': 'O.K.',
        'version': 1,
        'date': datetime.now(),
        'type': type(pipe.named_steps["classifier"]).__name__,
        'test ROC AUC score ': clf_auc_test,
    },
}

filename = './models/pipe.pkl'
# Make sure the target directory exists before writing.
os.makedirs(os.path.dirname(filename), exist_ok=True)
with open(filename, 'wb') as file:
    dill.dump(object_to_dump, file)

In [None]:

# Check that it works: read the dumped object back from disk.
# NOTE: dill, like pickle, can execute arbitrary code on load - only load files you created yourself.
# After this cell `model` holds the loaded object (the dict with 'model' and 'metadata' keys).
filename = './models/pipe.pkl'
with open(filename, mode='rb') as file:
    model = dill.load(file)


In [None]:

# Draw one random raw row to eyeball the input format.
test1 = x.sample(n=1)
test1

Unnamed: 0,session_id,client_id,visit_date,visit_time,visit_number,utm_source,utm_medium,utm_campaign,utm_adcontent,utm_keyword,device_category,device_os,device_brand,device_model,device_screen_resolution,device_browser,geo_country,geo_city
117932,1321277687297916222.1628093758.1628093758,307633900.0,2021-08-04,19:00:00,1,fDLlAcSmythWSCVMvqvL,(none),LTuZkdKfxRGVceoWkVyg,JNHcPlZPxEMWDnRiyoBf,,desktop,,,,800x600,Chrome,Russia,Moscow


In [None]:

# Write five random raw rows to ./data/1.json ... ./data/5.json (one JSON record per file)
# and print each row's session_id.
for i in range(1, 6):
    filename = f'./data/{i}.json'
    test1 = x.sample(n=1)
    print(test1['session_id'])
    test1.to_json(filename, orient='records', lines=True)

1050812    5792447167676083502.1636625712.1636625712
Name: session_id, dtype: object
524055    3267050412456643359.1640336247.1640336247
Name: session_id, dtype: object
396216    2655842674632106052.1623531591.1623531657
Name: session_id, dtype: object
1390912    7423446648842157826.1632435970.1632435970
Name: session_id, dtype: object
385511    2607473594758608104.1633776916.1633776916
Name: session_id, dtype: object
