# url-prediction-model
SanjayKAroraPhD@gmail.com <br>
October 2018

## Description
This script validates multiple classifiers to predict correct firm urls.  It assumes as input a matrix of known firm urls and potential matches from MS Bing.  Variables include the search result # (i.e., rank), length of the candidate url, matches of words derived from the known firm name and name of the url, etc. 

## Change log
v2 updates the script to use group k-fold cross-validation and produces evaluation metrics for each class of models.  Group k-fold validation is needed because individual search results, the unit of observation in the input file, are grouped by firm (i.e., when searching for a firm name in Bing, several individual results are returned, so we want to train and test the model on whole groups of search results rather than treating each result as an observation that is completely independent of the others).  For more information, see http://scikit-learn.org/stable/modules/cross_validation.html#cross-validation-iterators-for-grouped-data 

In [1]:
# import data processing and other libraries
import csv
import sys
import requests
import re
import pprint
import pymongo
import traceback
from time import sleep
import requests
import pandas as pd
from IPython.display import display
import time
import numpy as np

In [40]:
# import sklearn
from sklearn.model_selection import train_test_split
from sklearn import svm
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict

In [50]:
# Load the labeled Bing search results; the bare name on the last line makes
# the notebook render the frame.
training_csv_path = '/Users/sarora/dev/EAGER/data/training/urls/bing-firm-url-train-v5.csv'
train_df = pd.read_csv(training_csv_path)
train_df

Unnamed: 0,firm,firm_length,url,name_clnd,name_length,hit_url,hit_url_length,rank,matches,public,acquired_merged,outcome
0,honeywell international inc.,9,https://www.honeywell.com/,Honeywell - Official Site,25,https://www.honeywell.com/,26,1,1,0,0,1
1,honeywell international inc.,9,https://www.honeywell.com/,Honeywell International Inc. Company Profile |...,54,http://www.hoovers.com/company-information/cs/...,111,3,1,0,0,0
2,honeywell international inc.,9,https://www.honeywell.com/,HON:New York Stock Quote - Honeywell Internati...,58,https://www.bloomberg.com/quote/HON:US,38,4,1,1,0,0
3,honeywell international inc.,9,https://www.honeywell.com/,HON Stock Price - Honeywell International Inc....,62,https://www.marketwatch.com/investing/stock/hon,47,5,1,2,0,0
4,honeywell international inc.,9,https://www.honeywell.com/,HON Stock Price & News - Honeywell Internation...,56,https://quotes.wsj.com/HON,26,6,1,3,0,0
5,honeywell international inc.,9,https://www.honeywell.com/,Is Honeywell International Inc. a Buy? -- The ...,57,https://www.fool.com/investing/2018/10/06/is-h...,83,7,1,4,0,0
6,imds corporation,4,http://www.imds-ohio.com/,IMDS Corporation,16,http://www.imds-ohio.com/,25,1,1,0,0,1
7,imds corporation,4,http://www.imds-ohio.com/,Integrated Marketing & Distribution Services C...,50,http://imds.com.ph/,19,3,0,0,0,0
8,imds corporation,4,http://www.imds-ohio.com/,IMDS Information Pages - New to IMDS,36,https://public.mdsystem.com/web/imds-public-pa...,58,4,1,0,0,0
9,imds corporation,4,http://www.imds-ohio.com/,IMDS Information Pages - News 2016,34,https://public.mdsystem.com/web/imds-public-pa...,59,5,1,0,0,0


In [51]:
# Assemble the inputs for grouped k-fold cross-validation.

# Feature matrix: everything except the text identifier columns and the label.
X = train_df.drop(['firm', 'url', 'name_clnd', 'hit_url', 'outcome'], axis=1)

# Label vector: the 'outcome' column as a flat 1-D array.
y = train_df['outcome'].values

# Assign groups based on 'firm': one integer id per distinct firm, so every
# search result returned for the same firm carries the same id.
groups = train_df.groupby('firm').ngroup().values

# Grouped 3-fold splitter (used together with `groups` in later cells).
gkf = GroupKFold(n_splits=3)

# Show the class counts so the imbalance is visible in the cell output.
display("Outcomes are unbalanced.")
unique, counts = np.unique(y, return_counts=True)
display(dict(zip(unique, counts)))

'Outcomes are unbalanced.'

{0: 671, 1: 99}

In [52]:
# Candidate models to compare.  Each display label is kept next to its
# estimator so the two cannot drift out of step; the parallel `names` and
# `classifiers` lists used by later cells are derived from the pairs.
model_specs = [
    ("Nearest Neighbors", KNeighborsClassifier(3)),
    ("Linear SVM", SVC(kernel="linear", C=0.025)),
    ("RBF SVM", SVC(gamma=2, C=1)),
    ("Gaussian Process", GaussianProcessClassifier(1.0 * RBF(1.0))),
    ("Decision Tree", DecisionTreeClassifier(max_depth=5)),
    ("Random Forest",
     RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)),
    ("Neural Net", MLPClassifier(alpha=1)),
    ("AdaBoost", AdaBoostClassifier()),
    ("Naive Bayes", GaussianNB()),
    ("SVC", SVC(gamma=0.001, C=100.)),
    ("QDA", QuadraticDiscriminantAnalysis()),
]

names = [label for label, _ in model_specs]
classifiers = [estimator for _, estimator in model_specs]

In [71]:
# Metrics table: one row per classifier label, with a float64 'Accuracy'
# column initialised to zero that the evaluation cell fills in.
eval_df = pd.DataFrame({"Name": names})
eval_df["Accuracy"] = np.zeros(len(eval_df), dtype=np.float64)
display(eval_df)

# Predictions table: starts with no columns and one row per training
# observation (index 0..n-1); each classifier later adds a column.
pred_df = pd.DataFrame(index=range(len(train_df)))

Unnamed: 0,Name,Accuracy
0,Nearest Neighbors,0.0
1,Linear SVM,0.0
2,RBF SVM,0.0
3,Gaussian Process,0.0
4,Decision Tree,0.0
5,Random Forest,0.0
6,Neural Net,0.0
7,AdaBoost,0.0
8,Naive Bayes,0.0
9,SVC,0.0


In [61]:
# build evaluation outputs (currently limited to accuracy)
# Each classifier is scored with grouped k-fold cross-validation and the mean
# of its per-fold scores is written into the matching row of eval_df.
for row, (name, clf) in enumerate(zip(names, classifiers)):
    display(name)
    # `groups` is handed to the GroupKFold splitter so folds are cut by firm.
    scores = cross_val_score(clf, X, y, cv=gkf, groups=groups)
    avg_score = np.mean(scores)
    # DataFrame.set_value was deprecated in pandas 0.21 and removed in 1.0;
    # .at is the supported label-based scalar setter.
    eval_df.at[row, 'Accuracy'] = avg_score

display(eval_df)
# Copy the table to the system clipboard for pasting elsewhere.
eval_df.to_clipboard()

'Nearest Neighbors'

'Linear SVM'

'RBF SVM'

'Gaussian Process'

'Decision Tree'

'Random Forest'

'Neural Net'

'AdaBoost'

'Naive Bayes'

'SVC'

'QDA'

Unnamed: 0,Name,Accuracy
0,Nearest Neighbors,0.906452
1,Linear SVM,0.958495
2,RBF SVM,0.874021
3,Gaussian Process,0.961038
4,Decision Tree,0.964965
5,Random Forest,0.96232
6,Neural Net,0.898771
7,AdaBoost,0.957183
8,Naive Bayes,0.937591
9,SVC,0.96233


## Cross-validation prediction
This section produces outputs to examine the efficacy of each model. The CSV file generated below can be imported into Excel, and the analyst can then review which firms seem more or less likely to have accurate predicted URLs (based on the training data).

In [72]:
# predict across classifiers
# For every classifier, obtain cross-validated predictions for all training
# rows (folds are cut with the same grouped splitter and firm groups) and store
# them as a column of pred_df named after the classifier.
for name, clf in zip(names, classifiers):
    display(name)
    y_hat = cross_val_predict(clf, X, y, cv=gkf, groups=groups)
    # y_hat has one entry per row of X, in row order, so it lines up with
    # pred_df's 0..n-1 index.
    pred_df[name] = y_hat

'Nearest Neighbors'

'Linear SVM'

'RBF SVM'

'Gaussian Process'

'Decision Tree'

'Random Forest'

'Neural Net'

'AdaBoost'

'Naive Bayes'

'SVC'

'QDA'

In [96]:
# Simple voting: each classifier's prediction for a row counts as its vote.
# Row-wise sum of the per-classifier prediction columns.
votes_per_row = pred_df.sum(axis=1)

vote_df = pred_df.copy()
vote_df['Votes'] = votes_per_row
# Carry the true label and the firm group id alongside the votes.
vote_df['Outcome'] = train_df['outcome']
vote_df['Group'] = groups

In [102]:
# pick from each group the top vote getter
# Boolean mask: True where a row's vote count equals the maximum within its
# group.  The string alias 'max' selects pandas' own group-wise max; passing
# the builtin `max` raises a FutureWarning on recent pandas releases.
idx = vote_df.groupby('Group')['Votes'].transform('max') == vote_df['Votes']
results_df = vote_df[idx]

# filter out 0-vote getting observations
results_df = results_df[results_df['Votes'] > 0]

# Count the surviving rows per group; a count above 1 means several candidates
# tied for the top (non-zero) vote in that group.
display("Votes sometimes might be tied for non-zero values.")
unique, counts = np.unique(results_df['Group'], return_counts=True)
# Materialise the (group, count) pairs: a bare zip() is a one-shot iterator in
# Python 3 and cannot be displayed or re-read.
group_dup_list = list(zip(unique, counts))
# display(group_dup_list)

# merge with original training data: inner join on the row index, so only the
# selected rows are kept and each gains its 'Votes' column.
results_small_df = results_df[['Votes']]
results_merged_df = train_df.merge(
    results_small_df, left_index=True, right_index=True, how='inner')
results_merged_df

'Votes sometimes might be tied for non-zero values.'

Unnamed: 0,firm,firm_length,url,name_clnd,name_length,hit_url,hit_url_length,rank,matches,public,acquired_merged,outcome,Votes
0,honeywell international inc.,9,https://www.honeywell.com/,Honeywell - Official Site,25,https://www.honeywell.com/,26,1,1,0,0,1,9
6,imds corporation,4,http://www.imds-ohio.com/,IMDS Corporation,16,http://www.imds-ohio.com/,25,1,1,0,0,1,9
12,honeywell international inc.,9,https://www.honeywell.com/,Honeywell - Official Site,25,https://www.honeywell.com/,26,1,1,0,0,1,9
19,"ziptronix, inc.",9,https://www.xperi.com/,Invensas,8,https://www.invensas.com/,25,2,0,0,1,0,3
47,sensor-kinesis corporation,14,http://sensor-kinesis.com/,Sensor-Kinesis Corp. | The Leader in Advanced ...,59,http://sensor-kinesis.com/,26,1,2,0,0,1,7
54,gen-probe incorporated,9,https://www.hologic.com/,Hologic - Official Site,23,https://www.hologic.com/,24,1,0,0,0,1,9
62,"asm america, inc.",11,http://www.asm.com/,ASM International - Official Site,33,http://www.asm.com/,19,1,1,0,0,1,7
69,"fenwal, inc.",6,https://www.fresenius-kabi.com/us/,Kidde Fenwal,12,https://kidde-fenwal.com/,25,2,1,0,0,0,3
74,"nantero, inc.",7,http://nantero.com/,Nantero NRAM - Memory technology that is incre...,57,http://nantero.com/,19,1,1,0,0,1,9
81,"princeton optronics, inc.",19,http://www.princetonoptronics.com/,Applications - ams,18,https://ams.com/applications,28,1,0,0,0,0,9


In [104]:
# Persist the selected rows (training columns plus 'Votes') as a CSV file.
output_csv_path = '/Users/sarora/dev/EAGER/data/training/urls/bing-firm-url-out-v5.csv'
results_merged_df.to_csv(output_csv_path)