In [6]:
import os
import sys
import re
import csv
import time
import random
import pickle
import argparse
import ahocorasick
import numpy as np
import pandas as pd
import tensorflow as tf

from collections import OrderedDict

from nltk.corpus import stopwords, words, names
stops = stopwords.words("english")

In [13]:
def read_csv_to_list(filename):
	with open(filename) as f:  # relevant english words
		reader = csv.reader(f)
		csv_list = list(reader)
	csv_list = [c[0] for c in csv_list]
	return(csv_list)

def init_automaton(string_list):
	"""Make Aho-Corasick automaton from a list of strings"""
	A = ahocorasick.Automaton()
	for idx, s in enumerate(string_list):
		A.add_word(s, (idx, s))
	return A

def check_strings(A, search_list, string_to_search):
	"""Use Aho Corasick algorithm to produce boolean list indicating
	prescence of strings within a longer string"""
	index_list = []
	for item in A.iter(string_to_search):
		index_list.append(item[1][0])

	output_list = np.array([0] * len(search_list))
	output_list[index_list] = 1
	return output_list.tolist()

def build_url_feature_vector(A_company, search_list, string_to_search):
	"""Presence of search_list words in string, along with length of string"""
	feature_vector = check_strings(A_company, search_list, string_to_search)
	feature_vector.append(len(string_to_search))
	return feature_vector

In [8]:
# Load list of keywords
words_list = read_csv_to_list('data/word_feature_list.csv')
words_list = [w for w in words_list if w not in stops if len(w) > 1]
url_endings_list = read_csv_to_list('data/domains_endings.csv')
words_list = words_list + url_endings_list
del url_endings_list

A = init_automaton(words_list)
A.make_automaton()

In [10]:
## Test URL list
links_df = pd.read_csv('data/links_dataframe.csv')
url_list = links_df['url'].tolist()
url_list = [l.replace("http://", "").replace("https://", "") for l in url_list if type(l) is str if l[-4:] not in [".png", ".jpg", ".pdf", ".txt"]]
url_list = url_list[:200]

In [21]:
t0 = time.time()
next_state_list = [build_url_feature_vector(A, words_list, l) for l in url_list]
print(time.time()-t0)

0.10703611373901367


In [23]:
url_df = pd.DataFrame.from_dict({"url":url_list})
url_df.apply(build_url_feature_vector, args=())

Unnamed: 0,url
0,www.gjassociates.co.uk/service/36/Restructuring
1,www.sterling-accountants.com/our-services/sole...
2,www.farrarsmith.co.uk/services
3,www.facebook.com/pages/Account-Ants-Darlington...
4,www.chesapeake.co.uk/services/personal-services
5,www.sheffieldlawsociety.co.uk/apply-for-member...
6,www.sableinternational.com/wealth
7,www.tribeuk.co.uk/work/microlise/
8,marcusbaum.co.uk/services/matrimonial/
9,www.cpaglobal.com/patent-portfolio-analysis-di...
