## General imports

In [1]:
import pandas as pd
import numpy as np
import itertools
import calendar
import random
import time
import json

import warnings
warnings.filterwarnings('ignore')

from tqdm import tqdm
from datetime import date
from functools import reduce

## sklearn imports

In [2]:
from sklearn.metrics import accuracy_score, log_loss, precision_score, recall_score, f1_score
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import LabelEncoder

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis



In [3]:
df = pd.read_csv('data/TestData.csv')
df['Date'] = pd.to_datetime(df['Date'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 13 columns):
Date             2000 non-null datetime64[ns]
Hour             2000 non-null int64
ExternalId       2000 non-null object
Domain           2000 non-null object
PlacementId      2000 non-null int64
Placements       2000 non-null int64
Impressions      2000 non-null int64
FillRate         2000 non-null float64
BuyCpm           2000 non-null float64
SellCpm          2000 non-null float64
ApbCpm           2000 non-null float64
Margin           2000 non-null float64
MarginPercent    2000 non-null float64
dtypes: datetime64[ns](1), float64(6), int64(4), object(2)
memory usage: 203.2+ KB


## Reduce Dataframe by specific column

In [4]:
df.head()

Unnamed: 0,Date,Hour,ExternalId,Domain,PlacementId,Placements,Impressions,FillRate,BuyCpm,SellCpm,ApbCpm,Margin,MarginPercent
0,2016-08-01 04:00:00,4,BC-US-8|13807760,myfox8.com,5410238,2,0,0.0,4.502,2.754,0.001,-0.009,0.0
1,2016-07-31 23:00:00,23,BC-US-8|13807760,hotnewhiphop.com,506724,0,1,0.0,0.0,2.871,0.001,0.0,0.0
2,2016-07-31 23:00:00,23,BC-US-8|13807760,drudgereport.com,6340489,1,0,0.0,1.32,3.366,0.001,-0.001,0.0
3,2016-07-31 16:00:00,16,BC-US-8|13807760,bloodyelbow.com,3640217,0,1,0.0,0.0,4.842,0.001,0.0,0.0
4,2016-07-31 02:00:00,2,BC-US-14|13859978,zillow.com,6397031,3,0,0.0,1.2,1.863,2.012,-0.004,0.0


In [5]:
class DataFrameReducer:
    def __init__(self, df, index_label):
        self.df = df
        self.index_label = index_label
        self.__columns = [index_label]
        self.__attempt_flag = False
        self.result = None
        
    def drop_columns(self, labels, axis=1):
        self.df = self.df.drop(labels=labels, axis=axis)
    
    def reduce_by_func(self, label, func=sum):
        """ Set func (i.e. sum or np.mean)
        """
        self.__columns.append(label)
        
        lst = [(self.df[i:i+1][self.index_label].values[0], self.df[i:i+1][label].values[0]) for i in range(len(self.df))]
        lst = np.array([(k, func(list(e for _, e in list(v)))) for k, v in itertools.groupby(sorted(lst), key=lambda x : x[0])])
        if self.__attempt_flag:
            lst = np.delete(lst, np.s_[0:1], axis=1)
            self.result = np.append(self.result, lst, axis=1)
        else:
            self.__attempt_flag = True
            self.result = lst
        return pd.DataFrame(self.result, columns=self.__columns)
    
    def to_df(self):
        return pd.DataFrame(self.result, columns=self.__columns)

In [6]:
reduced = DataFrameReducer(df, 'Date')

In [7]:
reduced.reduce_by_func('BuyCpm', np.mean)
reduced.reduce_by_func('SellCpm', np.mean)
reduced.reduce_by_func('ApbCpm', np.mean)
reduced.reduce_by_func('Placements', sum)
reduced.reduce_by_func('Impressions', sum)
reduced.reduce_by_func('Margin', sum)

df1 = reduced.to_df()

In [8]:
print(f'Dataframe size {df1.shape}')
df1.head()

Dataframe size (31, 7)


Unnamed: 0,Date,BuyCpm,SellCpm,ApbCpm,Placements,Impressions,Margin
0,2016-07-28T15:00:00.000000000,0.999672,4.61092,0.0186481,108092,22177,40.56
1,2016-07-28T16:00:00.000000000,0.688829,5.71346,0.129743,68,70,0.063
2,2016-07-28T17:00:00.000000000,0.531364,5.66468,0.001,19,20,0.01
3,2016-07-28T18:00:00.000000000,0.704125,5.1165,0.001,35,18,0.035
4,2016-07-28T19:00:00.000000000,0.527227,5.60864,0.001,27,15,-0.015


In [9]:
reduced = DataFrameReducer(df, 'Domain')

In [10]:
reduced.reduce_by_func('BuyCpm', np.mean)
reduced.reduce_by_func('SellCpm', np.mean)
reduced.reduce_by_func('ApbCpm', np.mean)
reduced.reduce_by_func('Placements', sum)
reduced.reduce_by_func('Impressions', sum)
reduced.reduce_by_func('Margin', sum)

df2 = reduced.to_df()

In [11]:
print(f'Dataframe size {df2.shape}')
df2.head()

Dataframe size (1402, 7)


Unnamed: 0,Domain,BuyCpm,SellCpm,ApbCpm,Placements,Impressions,Margin
0,ads.proboards.com,0.9205,6.075,0.001,23,1,-0.028
1,allrecipes.com,0.0,3.879,0.001,0,2,0.0
2,america.aljazeera.com,0.771333333333,6.048,0.001,2,1,-0.002
3,amyshealthybaking.com,1.262,5.607,0.001,1,0,-0.001
4,bet.com,0.0,6.705,0.001,0,22,0.0


## Label encoding and classifier chosing

In [12]:
df = pd.read_csv('data/train.csv', sep='|')

df['created'] = pd.to_datetime(df['created'])
df['weekday'] = df['created'].apply(lambda x: calendar.day_name[x.weekday()])
df['num_features'] = df['features'].apply(len)

df = df.drop(labels=['features', 'created'], axis=1)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49352 entries, 0 to 49351
Data columns (total 7 columns):
bathrooms         49352 non-null float64
bedrooms          49352 non-null int64
interest_level    49352 non-null object
price             49352 non-null float64
num_photos        49352 non-null int64
weekday           49352 non-null object
num_features      49352 non-null int64
dtypes: float64(2), int64(3), object(2)
memory usage: 2.6+ MB


In [13]:
df.head()

Unnamed: 0,bathrooms,bedrooms,interest_level,price,num_photos,weekday,num_features
0,1.5,3,medium,3000.0,5,Friday,2
1,1.0,2,low,5465.0,11,Sunday,73
2,1.0,1,high,2850.0,8,Sunday,85
3,1.0,1,low,3275.0,3,Monday,29
4,1.0,4,low,3350.0,3,Thursday,11


In [14]:
class DataEncoder:
    
    def __init__(self, df):
        self.df = df
        self.onehot_encoded = False
    
    def le_encoding(self, labels):
        if not isinstance(labels, list):
            raise TypeError
        self.le_labels = labels
        self.le_encoders = {label: LabelEncoder().fit(self.df[label]) for label in self.le_labels}

    def encode_all_le(self):
        try:
            for label in self.le_labels:
                df[label] = self.le_encoders[label].transform(df[label])   
            return df
        except ValueError:
            print('Firstly decode labels to origin values')
            return df
    
    def decode_all_le(self):
        try:
            for label in self.le_labels:
                df[label] = self.le_encoders[label].inverse_transform(df[label])   
            return df
        except ValueError:
            print('Firstly encode labels')
            return df
    
    def onehot_encoding(self, labels):
        if not self.onehot_encoded:
            self.onehot_encoded = True
            return pd.get_dummies(df, columns=labels)
        else:
            print('One-hot Encoding already applied')
            return df

In [15]:
de = DataEncoder(df)

In [16]:
de.le_encoding(['interest_level', 'weekday'])

In [17]:
de.encode_all_le().head()

Unnamed: 0,bathrooms,bedrooms,interest_level,price,num_photos,weekday,num_features
0,1.5,3,2,3000.0,5,0,2
1,1.0,2,1,5465.0,11,3,73
2,1.0,1,0,2850.0,8,3,85
3,1.0,1,1,3275.0,3,1,29
4,1.0,4,1,3350.0,3,4,11


In [18]:
de.decode_all_le().head()

Unnamed: 0,bathrooms,bedrooms,interest_level,price,num_photos,weekday,num_features
0,1.5,3,medium,3000.0,5,Friday,2
1,1.0,2,low,5465.0,11,Sunday,73
2,1.0,1,high,2850.0,8,Sunday,85
3,1.0,1,low,3275.0,3,Monday,29
4,1.0,4,low,3350.0,3,Thursday,11


In [19]:
df = de.onehot_encoding(['weekday'])

In [20]:
df.head()

Unnamed: 0,bathrooms,bedrooms,interest_level,price,num_photos,num_features,weekday_Friday,weekday_Monday,weekday_Saturday,weekday_Sunday,weekday_Thursday,weekday_Tuesday,weekday_Wednesday
0,1.5,3,medium,3000.0,5,2,1,0,0,0,0,0,0
1,1.0,2,low,5465.0,11,73,0,0,0,1,0,0,0
2,1.0,1,high,2850.0,8,85,0,0,0,1,0,0,0
3,1.0,1,low,3275.0,3,29,0,1,0,0,0,0,0
4,1.0,4,low,3350.0,3,11,0,0,0,0,1,0,0


In [21]:
X, y = df.drop(labels=['interest_level'], axis=1), df['interest_level']
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size = 0.3, 
                                                    random_state = random.randint(0, 1024), 
                                                    stratify=y)
del X, y
print(f'train size: \t{X_train.shape}\ntest size: \t{X_test.shape}')

train size: 	(34546, 12)
test size: 	(14806, 12)


In [22]:
classifiers = [
    KNeighborsClassifier(n_jobs=-1),
#     SVC(probability=True), # too long
    DecisionTreeClassifier(),
    RandomForestClassifier(n_jobs=-1),
    ExtraTreesClassifier(n_jobs=-1),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    GaussianNB(),
    LinearDiscriminantAnalysis(),
    QuadraticDiscriminantAnalysis()]

In [23]:
results = {'classifiers': []}

for clf in tqdm(classifiers):
    result = dict()
    start_time = time.time()
    clf.fit(X_train, y_train)
    result['training_time'] = round(time.time() - start_time, 4)
    train_predictions = clf.predict(X_test)
    train_predictions_proba = clf.predict_proba(X_test)
    
    result['name'] = clf.__class__.__name__
    result['log_loss'] = round(log_loss(y_test, train_predictions_proba), 4)
    result['accuracy'] = round(accuracy_score(y_test, train_predictions), 4)
    if len(np.unique(train_predictions)) == 2:
        avarage = 'binary'
    else:
        average = 'weighted'
    result['precission'] = round(precision_score(y_test, train_predictions, average=average), 4)
    result['recal'] = round(recall_score(y_test, train_predictions, average=average), 4)
    result['f1-score'] = round(f1_score(y_test, train_predictions, average=average), 4)
    results['classifiers'].append(result)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:11<00:00,  1.10s/it]


In [24]:
print(json.dumps(results, indent=2))

{
  "classifiers": [
    {
      "training_time": 0.0942,
      "name": "KNeighborsClassifier",
      "log_loss": 3.6911,
      "accuracy": 0.6616,
      "precission": 0.6099,
      "recal": 0.6616,
      "f1-score": 0.6259
    },
    {
      "training_time": 0.1692,
      "name": "DecisionTreeClassifier",
      "log_loss": 12.9507,
      "accuracy": 0.6195,
      "precission": 0.6279,
      "recal": 0.6195,
      "f1-score": 0.6236
    },
    {
      "training_time": 0.2627,
      "name": "RandomForestClassifier",
      "log_loss": 2.6752,
      "accuracy": 0.6722,
      "precission": 0.6326,
      "recal": 0.6722,
      "f1-score": 0.6467
    },
    {
      "training_time": 0.263,
      "name": "ExtraTreesClassifier",
      "log_loss": 3.9067,
      "accuracy": 0.6549,
      "precission": 0.6191,
      "recal": 0.6549,
      "f1-score": 0.6332
    },
    {
      "training_time": 2.295,
      "name": "AdaBoostClassifier",
      "log_loss": 1.072,
      "accuracy": 0.6976,
      "preci

In [25]:
df_results = pd.DataFrame(results['classifiers'])

In [26]:
df_results

Unnamed: 0,accuracy,f1-score,log_loss,name,precission,recal,training_time
0,0.6616,0.6259,3.6911,KNeighborsClassifier,0.6099,0.6616,0.0942
1,0.6195,0.6236,12.9507,DecisionTreeClassifier,0.6279,0.6195,0.1692
2,0.6722,0.6467,2.6752,RandomForestClassifier,0.6326,0.6722,0.2627
3,0.6549,0.6332,3.9067,ExtraTreesClassifier,0.6191,0.6549,0.263
4,0.6976,0.618,1.072,AdaBoostClassifier,0.6221,0.6976,2.295
5,0.705,0.6304,0.6661,GradientBoostingClassifier,0.6422,0.705,5.7104
6,0.6545,0.6072,0.8457,GaussianNB,0.573,0.6545,0.0462
7,0.6917,0.583,0.7445,LinearDiscriminantAnalysis,0.5895,0.6917,0.1001
8,0.5906,0.6042,0.9497,QuadraticDiscriminantAnalysis,0.6212,0.5906,0.0802
