In [1]:
import sys
import csv
import ahocorasick
import numpy as np
import pandas as pd

from nltk.corpus import stopwords, words, names
stops = stopwords.words("english")

from collections import OrderedDict
from operator import add

In [2]:
def init_automaton(string_list):
	"""Make Aho-Corasick automaton from a list of strings"""
	A = ahocorasick.Automaton()
	for idx, s in enumerate(string_list):
		A.add_word(s, (idx, s))
	return A

def check_strings(A, search_list, string_to_search):
	"""Use Aho Corasick algorithm to produce boolean list indicating
	prescence of strings within a longer string"""
	index_list = []
	for item in A.iter(string_to_search):
		index_list.append(item[1][0])

	output_list = np.array([0] * len(search_list))
	output_list[index_list] = 1
	return output_list.tolist()

def progress_bar(value, endvalue, bar_length=20):
    """Print progress bar to the console"""
    percent = float(value) / endvalue
    arrow = '-' * int(round(percent * bar_length)-1) + '>'
    spaces = ' ' * (bar_length - len(arrow))
    sys.stdout.write("\rPercent complete: [{0}] {1}%".format(arrow + spaces, int(round(percent * 100))))
    sys.stdout.flush()

In [3]:
##-------------------- Read in data
# Company i.e. reward URLs
companies_df = pd.read_csv('../data/domains_clean.csv')
companies_df = companies_df[companies_df['vert_code'] <= 69203]
companies_df = companies_df[companies_df['vert_code'] >= 69101]
reward_urls = companies_df['url'].tolist()

In [4]:
##------------------------ Read in words list and build automaton
word_list = words.words() + names.words()
word_list = [w for w in word_list if w not in stops]
word_list = [w for w in word_list if len(w) > 1]
A = init_automaton(word_list)
A.make_automaton()

In [5]:
# url_list = url_list[:100]
word_count_vec = [0] * len(word_list)
for idx, url in enumerate(reward_urls):
    progress_bar(idx+1, len(reward_urls))
    word_count_vec = list(map(add, word_count_vec, np.array(check_strings(A, word_list, url))))

Percent complete: [------------------->] 100%

In [6]:
df_dict = OrderedDict()
df_dict['word'] = word_list
df_dict['count'] = word_count_vec
df = pd.DataFrame.from_dict(df_dict)

df = df[df['count'] > 0]
len_filter = df['word'].str.len() > 2
df = df[len_filter]
df = df.sort_values('count', ascending=False)
print(len(df))
df.head(n=10)

3665


Unnamed: 0,word,count
44006,count,493
235709,account,486
197825,tan,361
94528,ing,300
235726,ant,212
236070,law,211
130850,ons,195
236423,tin,190
184367,sol,184
203418,ting,179


In [10]:
rm_words = ['ing', 'ers', 'tin', 'cit', 'tor', 'untin', 'els', 'ich']
final_word_list = [w for w in df['word'].tolist() if w not in rm_words]

f = open('data/word_.csv', 'w')
w = csv.writer(f, delimiter = ',')
w.writerows([x.split(',') for x in data])
f.close()

TypeError: Mismatch between array dtype ('<U14') and format specifier ('%.18e')