In [1]:
# Imports
from __future__ import print_function
import pandas as pd
import numpy as np
from sklearn import svm
import pprint
import utils
from sklearn.externals import joblib

In [2]:
# Load data, parsing dates and renaming column names.
# `pd.datetime` was removed in pandas 2.0 and read_csv's `date_parser` argument is
# deprecated, so the timestamps are parsed after loading with pd.to_datetime and an
# explicit format, which works on both old and new pandas releases.
mdir = 'data/Integral_data_set.tsv'
TIME_FORMAT = '%Y-%m-%d %H:%M:%S'


def dateparse(x):
    """Parse a timestamp string, or a Series of them, written as TIME_FORMAT."""
    return pd.to_datetime(x, format=TIME_FORMAT)


# The file has no header row, so the column names are assigned by hand below
df = pd.read_csv(mdir, sep='\t', header=None)
df.columns = ['time', 'ip', 'browser', 'user', 'url', 'impressions', 'plugins', 'position', 'latency']
df['time'] = dateparse(df['time'])

In [3]:
# Quick look at the loaded frame: overall dimensions first, then the leading rows
frame_shape = df.shape
print("The shape of data is {}".format(frame_shape))
df.head()

The shape of data is (235083, 9)


Unnamed: 0,time,ip,browser,user,url,impressions,plugins,position,latency
0,2014-08-25,393.414.443.469,Safari/Webkit,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8)...,http://www.domain.com.au,0.0,,"(0,0,1280,629)",0.0
1,2014-08-25,393.414.443.469,Safari/Webkit,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8)...,http://www.domain.com.au,0.0,,"(0,0,1280,629)",0.0
2,2014-08-25,325.441.386.395,Unknown,Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi...,http://www.mangareader.net,,,,
3,2014-08-25,325.441.386.395,Unknown,Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi...,http://www.mangareader.net,,,,
4,2014-08-25,325.441.386.395,Unknown,Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi...,http://www.mangareader.net,,,,


In [4]:
# Time span covered by the data: earliest and latest timestamp
timestamps = df['time']
first_seen, last_seen = timestamps.min(), timestamps.max()
print("Min timestamp {} \n Max timestamp {}".format(first_seen, last_seen))

Min timestamp 2014-08-25 00:00:00 
 Max timestamp 2014-08-25 23:59:58


In [5]:
# Number of distinct IP addresses versus number of rows
# dropna=False counts NaN as a value, exactly like len(Series.unique())
n_unique_ips = df['ip'].nunique(dropna=False)
print("{} unique IPs from total {} data points".format(n_unique_ips, len(df)))

8216 unique IPs from total 235083 data points


In [6]:
# Number of distinct browser names versus number of rows
# dropna=False counts NaN as a value, exactly like len(Series.unique())
n_unique_browsers = df['browser'].nunique(dropna=False)
print("{} unique Browsers from total {} data points".format(n_unique_browsers, len(df)))

6 unique Browsers from total 235083 data points


In [7]:
# Number of distinct user agent strings (the 'user' column) versus number of rows
# dropna=False counts NaN as a value, exactly like len(Series.unique())
n_unique_agents = df['user'].nunique(dropna=False)
print("{} unique user agent string from total {} data points".format(n_unique_agents, len(df)))

4779 unique user agent string from total 235083 data points


In [8]:
# Statistics about the urls: number of distinct urls versus number of rows
# dropna=False counts NaN as a value, exactly like len(Series.unique())
n_unique_urls = df['url'].nunique(dropna=False)
print("{} unique URLs from total {} data points".format(n_unique_urls, len(df)))

11779 unique URLs from total 235083 data points


In [9]:
# Ground truth "fraud" urls: the only urls for which a fraud label is known
gnd_t = [
    'http://www.featureplay.com',
    'http://videos1.uvidi.com',
    'http://spryliving.com',
    'http://greatxboxgames.com',
    'http://www.mmabay.co.uk',
    'http://video.workingmothertv.com',
    'http://besthorrorgame.com',
    'http://dailyparent.com',
    'http://superior-movies.com',
    'http://yourhousedesign.com',
    'http://video.outdoorlife.tv',
    'http://drumclub.info',
    'http://video.cycleworld.tv',
    'http://hmnp.us',
    'http://go.nlinevideos.com',
]
# dropna=False counts NaN as a value, exactly like len(Series.unique())
n_distinct_urls = df['url'].nunique(dropna=False)
print("There are {} Ground truth fraud labels among {} unique labels".format(len(gnd_t), n_distinct_urls))

There are 15 Ground truth fraud labels among 11779 unique labels


In [10]:
# Fraud data: rows whose url is one of the ground-truth fraud urls,
# ordered by timestamp so they can be inspected chronologically
is_fraud_url = df['url'].isin(gnd_t)
fraud = df.loc[is_fraud_url].sort_values(by='time')
print("There are {} total number of fraud data points".format(len(fraud)))
fraud.head()

There are 2179 total number of fraud data points


Unnamed: 0,time,ip,browser,user,url,impressions,plugins,position,latency
279,2014-08-25 00:02:08,574.491.567.341,Internet Explorer,Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7....,http://go.nlinevideos.com,1.0,,"(0,0,1024,673)",3496.0
311,2014-08-25 00:02:23,476.494.399.426,Chrome,Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.3...,http://spryliving.com,0.0,3.0,"(0,0,1024,706)",0.0
1186,2014-08-25 00:11:03,324.338.423.496,Chrome,Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi...,http://greatxboxgames.com,0.0,1.0,,
1360,2014-08-25 00:12:38,525.537.550.349,Internet Explorer,Mozilla/5.0 (compatible; MSIE 9.0; Windows NT ...,http://video.outdoorlife.tv,1.0,,"(0,0,1280,1024)",144.0
1365,2014-08-25 00:12:40,525.537.550.349,Internet Explorer,Mozilla/5.0 (compatible; MSIE 9.0; Windows NT ...,http://video.outdoorlife.tv,1.0,,"(0,0,1280,1024)",129.0


In [11]:
# Number of ground-truth fraud rows versus all rows in the 00:15-00:20 window.
# Both bounds are exclusive: rows stamped exactly at t_start or t_end are not counted.
t_start = pd.Timestamp('2014-08-25 00:15:00')
t_end = pd.Timestamp('2014-08-25 00:20:00')
fraud_in_window = (fraud['time'] > t_start) & (fraud['time'] < t_end)
rows_in_window = (df['time'] > t_start) & (df['time'] < t_end)
t_5 = fraud[fraud_in_window]
t_5_total = df[rows_in_window]
print("{} / {} fraud entres in given 5 minute window of ground truth fraud labels".format(len(t_5), len(t_5_total)))

4 / 575 fraud entres in given 5 minute window of ground truth fraud labels


In [12]:
# Number of ground-truth fraud rows versus all rows in the 00:25-00:30 window.
# Both bounds are exclusive: rows stamped exactly at t_start or t_end are not counted.
t_start = pd.Timestamp('2014-08-25 00:25:00')
t_end = pd.Timestamp('2014-08-25 00:30:00')
fraud_in_window = (fraud['time'] > t_start) & (fraud['time'] < t_end)
rows_in_window = (df['time'] > t_start) & (df['time'] < t_end)
t_5 = fraud[fraud_in_window]
t_5_total = df[rows_in_window]
print("{} / {} fraud entres in given 5 minute window of ground truth fraud labels".format(len(t_5), len(t_5_total)))

5 / 583 fraud entres in given 5 minute window of ground truth fraud labels


## Use of time-stamps as features
Given that we have ground truth information for only 15 URLs, 5 / 583 fraud entries in a single 5-minute window cannot be discounted; hence timestamps can also be used as a feature, by segmenting time into small windows. Using timestamps leads to a better overall test accuracy, and a better accuracy on the (majority voting) metric defined for this task. You can exclude them by setting `use_time = False`; there is a substantial (>20%) decrease in accuracy in that scenario. The timestamps are one-hot encoded into "5-minute" bucket windows, which was a heuristic choice. This gives 288 "5-minute" windows for a given day. Choosing a window shorter than 5 minutes will improve the model but will explode the feature space, while a window longer than 5 minutes won't be able to capture as much information.

In [13]:
# Show the docstring of the helper that the "Encoded Dates" cell uses to bucket the timestamps
help(utils.encodeDates)

Help on function encodeDates in module utils:

encodeDates(df, feat, column, freq='5min')
    Helper function to convert timestamps into a one-hot encoded bucekts with frequency "freq"
    Input: {df, feat, column, freq}
    -- df =  Input Dataframe 
    -- feat = List to store one hot features generated,
    -- column = Name of feature in df
    -- freq = Frequency with which to segment the data into "freq" long periods
    
    Output: {feat} 
    -- feat = Returns a list with appended feature generated



## Model Features
For a model to understand text data, we can either (1) model the text as word vectors (word2vec, GloVe, etc.), or (2) treat it as categorical features. As the ratio of the number of unique feature values to the total number of data points is very low, we have chosen to stick with approach 2.
## Features Not Included in model 1.x
IP address, browser and user string are used as features in this model to capture the variability of the url in the data. The data from impressions, plugins, position and latency is very dirty, with a lot of NaNs, and we lack the heuristic knowledge to handle those NaNs properly (we can't just fill in zeros, as this would severely bias the model), so these features were not selected for the model. However, using these features "properly" will surely improve the model, which is a task for model 2.x after getting more insight into how these features impact fraud/non-fraud data.

In [14]:
# Show the docstring of the encoding helper applied below to the url, ip, browser and user columns
help(utils.getEncoded)

Help on function getEncoded in module utils:

getEncoded(df, feat, column, out_name, oh=False, clip=False, write_out=False, thresh=False)
    Helper function to obtain a clipped-one-hot encoded vector, clipping decided by "thresh" 
    Input: {df, feat, column, out_name, oh, clip, write_out}
    -- df =  Input Dataframe 
    -- feat = List to store one hot features generated,
    -- column = Column name of feature in df
    -- out_name = name of column for the resulting data to be stored in df based on write_out
    -- oh = Boolean, set it True to obtain one-hot encoded vectors, calls oneHot()
    -- clip = Clips the number of features to be less than a threshold(thresh) see oneHot() for details
    -- write_out = Boolean, set to True, if you want the resulting feature to be stored in the df inplace
    -- thresh = {thresh = False; thresholds at mean value, 
                          integer value} 
    
    Output: {df, feat, le} 
    -- df =  Input Dataframe with features written inp

In [15]:
# Feature Engineering
# `feat` is the list handed to every encoder call below to collect the generated features
feat = []
# The url column is modelled as Y for this task: with write_out=True the encoded
# value is stored in df['y'], and the fitted label encoder is kept as le_y.
# oh and clip stay at their defaults (False), as in the getEncoded signature.
df, feat, le_y = utils.getEncoded(df, feat, column='url', out_name='y', write_out=True)
df.head()

Unnamed: 0,time,ip,browser,user,url,impressions,plugins,position,latency,y
0,2014-08-25,393.414.443.469,Safari/Webkit,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8)...,http://www.domain.com.au,0.0,,"(0,0,1280,629)",0.0,7330
1,2014-08-25,393.414.443.469,Safari/Webkit,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8)...,http://www.domain.com.au,0.0,,"(0,0,1280,629)",0.0,7330
2,2014-08-25,325.441.386.395,Unknown,Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi...,http://www.mangareader.net,,,,,8979
3,2014-08-25,325.441.386.395,Unknown,Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi...,http://www.mangareader.net,,,,,8979
4,2014-08-25,325.441.386.395,Unknown,Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi...,http://www.mangareader.net,,,,,8979


In [16]:
# Encoding IP addresses: one-hot (oh=True) with clipping (clip=True); write_out=False,
# so only the feature list is kept from the call
_, feat, _ = utils.getEncoded(df, feat, column='ip', out_name='f1', oh=True, clip=True, write_out=False)
ip_features = feat[0]
print("Shape of feature vector is {}".format(ip_features.shape))
print("Peek into features \n {}".format(ip_features[:5, :]))

Shape of feature vector is (235083, 1301)
Peek into features 
 [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [17]:
# Encoding browser: one-hot without clipping, since there are only 6 browsers
_, feat, _ = utils.getEncoded(df, feat, column='browser', out_name='f2', oh=True, clip=False, write_out=False)
browser_features = feat[1]
print("Shape of feature vector is {}".format(browser_features.shape))
print("Peek into features \n {}".format(browser_features[:5, :]))

Shape of feature vector is (235083, 6)
Peek into features 
 [[0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 1.]]


In [18]:
# Encoding user (the user agent string): one-hot (oh=True) with clipping (clip=True)
_, feat, _ = utils.getEncoded(df, feat, column='user', out_name='f3', oh=True, clip=True, write_out=False)
user_features = feat[2]
print("Shape of feature vector is {}".format(user_features.shape))
print("Peek into features \n {}".format(user_features[:5, :]))

Shape of feature vector is (235083, 375)
Peek into features 
 [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [19]:
# Encoded Dates
use_time = True  # Set this to False to leave the timestamp features out
if use_time:
    # Adds the one-hot time buckets (default freq='5min') as the fourth entry of feat
    feat = utils.encodeDates(df, feat, column='time')
    time_features = feat[3]
    print("Shape of feature vector is {}".format(time_features.shape))
    print("Peek into features \n {}".format(time_features[:5, :]))

Shape of feature vector is (235083, 287)
Peek into features 
 [[1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]]


In [20]:
# Feature concatenation: stack the per-column feature matrices side by side
X = np.hstack(feat)
n_points, n_features = X.shape
print("There are {} datapoints, and {} features in the final feature vector".format(n_points, n_features))

There are 235083 datapoints, and 1969 features in the final feature vector


In [21]:
# Convert ground truth fraud urls using the same label encoder used for 'y'
gnd_le = le_y.transform(gnd_t)
# Encoded url label of every row; passed on to the split and evaluation helpers
gnd = df['y']
# Label all fraud data points as 1, and the rest as class -1.
# Series.as_matrix() was removed in pandas 1.0, so the +1/-1 NumPy array is
# built directly with np.where, which works on every pandas version.
Y = np.where(gnd.isin(gnd_le), 1, -1)

In [22]:
# About the data: first five rows of the feature matrix and their labels
head_X, head_Y = X[:5], Y[:5]
print("Peek X matrix \n {}".format(head_X))
print("Peek Y matrix \n {}".format(head_Y))

Peek X matrix 
 [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
Peek Y matrix 
 [-1 -1 -1 -1 -1]


## Structuring the Problem
### Model
As ground truth information is given for only a very few fraud labels, we predict the fraud data points using a one-class SVM that, once trained, predicts +1 for fraud labels and -1 for non-fraud labels.
### Feeding the Data
As ground truth information is provided for only a few fraud labels, the training-testing split is done such that a fixed percentage (per=0.05) of the data points with +1 labels is put into the testing set, and the remaining +1 data points are used in the training set in addition to a fixed number (negative=10000) of -1 samples that are drawn at random. Please note that the test set contains only +1 samples.

In [23]:
# Show the docstring of the helper that the "Split Data" cell uses to build the train/test sets
help(utils.split_training_testing)

Help on function split_training_testing in module utils:

split_training_testing(X, Y, gnd, negative=10000, per=0.05)
    Helper function to split data into training and testing set
    Train = "1 - per" fraction of randomly drawn +1 samples and "negative" number of randomly drawn -1 samples
    Test = "per" fraction of randomly drawn +1 samples 
    Input: {X, Y, gnd, negative, per}
    -- X = Feature matrice: Input features generated by stacking getEncoded() output on multiple features
    -- Y = Labels of corresponding X matrix
    -- gnd = Class labels of all data points in X from LabelEncoder()
    -- negative = Int, Number of negative class samples to be sampled
    -- per = Percentage of +1 samples in testing set among all +1 samples. 
    
    Output: {X_train, y_train, gnd_tr, X_test, y_test, gnd_te} 
    -- X_train =  Training set
    -- y_train = training labels -- +1 for fraud data points, 0 for others    
    -- gnd_tr = Class labels for samples in training set generated f

In [24]:
# Split Data
# The helper draws its samples at random (see help(utils.split_training_testing)), so
# the RNG is seeded first to keep the split and the accuracies reported below reproducible.
# NOTE(review): assumes utils samples through NumPy's global RNG — confirm; if it uses
# another source of randomness, the seed has to be threaded through utils instead.
SEED = 0
np.random.seed(SEED)
X_train, y_train, gnd_tr, X_test, y_test, gnd_te = utils.split_training_testing(X, Y, gnd, negative=10000, per=0.05)
print("Final shape of X_train is {}, y_train is {}".format(X_train.shape, y_train.shape))
print("Final shape of X_test is {}, y_test is {}".format(X_test.shape, y_test.shape))

Final shape of X_train is (12071, 1969), y_train is (12071,)
Final shape of X_test is (108, 1969), y_test is (108,)


In [25]:
# Model
train = False  # Set it to True to train your own model
if train:
    # Fit a fresh one-class SVM on the training features and save it next to the shipped model
    clf = svm.OneClassSVM(nu=0.1, kernel="rbf", gamma=0.1)
    clf.fit(X_train)
    joblib.dump(clf, 'model/new_model.pkl')
else:
    # NOTE(review): joblib.load unpickles the file, which can execute arbitrary code;
    # only load model files from a source you trust.
    clf = joblib.load('model/model.pkl')
y_pred_test = clf.predict(X_test)

In [26]:
# Accuracy test -- fraction of individual test samples that the model predicts as +1
predicted_positive = (y_pred_test == 1)
acc_test = predicted_positive.sum() / float(len(y_pred_test))
print(acc_test)

0.9629629629629629


In [27]:
# Show the docstring of the majority-voting helper used in the next accuracy test
help(utils.voting)

Help on function voting in module utils:

voting(y_pred_test, gnd_te)
    Helper function to judge the accuracy of model on test set using majority voting on distinct y values
    Input: {y_pred_test, gnd_te}
    -- y_pred_test = Prediction of model on y_test 
    -- gnd_te = Class labels for samples in testing set generated from LabelEncoder()
    
    Output: {acc_vot} 
    -- acc_vot =  Model Accuracy on Test Set



In [28]:
# Accuracy test -- accuracy per distinct url in the test set, decided by majority voting
acc_vot = utils.voting(y_pred_test=y_pred_test, gnd_te=gnd_te)
print(acc_vot)

1.0


In [29]:
# Show the docstring of the helper that turns predictions on X into the list of fraud urls
help(utils.evaluate)

Help on function evaluate in module utils:

evaluate(y_pred_X, gnd, thresh, le_y)
    Helper function to evaluate the model on input data to produce fraud list
    Input: {y_pred_X, gnd, thresh, le_y}
    -- y_pred_X = Prediction of model on X 
    -- gnd = Class labels for samples in X generated from LabelEncoder()
    -- thresh = threshold for majority voting over which samples are treated as fraud
    -- le_y = Label encoder used for converting 'y' in range 0 to num_classes - 1 
    
    Output: {fraud_list} 
    -- fraud_list =  List of urls recieving substantial fraudulent traffic



In [30]:
# Evaluate the model on X
y_pred_X = clf.predict(X)
# Majority-voting threshold handed to utils.evaluate as its `thresh` argument
VOTE_THRESH = 25
# Series.as_matrix() was removed in pandas 1.0; .values yields the same NumPy array
# on every pandas version.
fraud_list = utils.evaluate(y_pred_X, gnd.values, VOTE_THRESH, le_y)
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(len(fraud_list))

838


In [31]:
# Pretty-print every url in fraud_list (the output is long: one line per url)
pp.pprint(fraud_list)

[   'http://247gamer.net',
    'http://247sports.com',
    'http://997now.cbslocal.com',
    'http://abcnews.go.com',
    'http://about.com',
    'http://ad-fake.spotify.com',
    'http://ad.adnet.de',
    'http://ad.kaskus.co.id',
    'http://ad.kiosked.com',
    'http://ad.mandlegears.com',
    'http://ad.payclick.it',
    'http://ad.ranker.com',
    'http://ad1.guardianlv.com',
    'http://ads.adultswim.com',
    'http://ads.bibme.org',
    'http://ads.bossip.com',
    'http://ads.cnn.com',
    'http://ads.localyokelmedia.com',
    'http://ads.mp.mydas.mobi',
    'http://ads.ondemandkorea.com',
    'http://ads.optplay.com',
    'http://ads.proboards.com',
    'http://ads.web.aol.com',
    'http://adsdelivery1.com',
    'http://adserver.todaysgrind.com',
    'http://adv.diariodelweb.it',
    'http://adv.tech24.biz',
    'http://aktuel.mynet.com',
    'http://allrecipes.com',
    'http://americanoverlook.com',
    'http://americanprofile.com',
    'http://answers.com',
    'http://aol