In [307]:
import pandas as pd
import numpy as np 
from collections import Counter
from MarkovChain import MarkovChain
import splitter
import tldextract
from scipy.sparse import csr_matrix
from sqlalchemy import create_engine
import pickle

In [None]:
alexa_df = pd.read_csv('resources/top-1m.csv', names=['rank', 'domain'])

In [None]:
alexa_df = pd.DataFrame.from_dict([
    {'domain': 'helloworld.com'},
    {'domain': 'helloninja.com'},
    {'domain': 'lovecats.com'},
    {'domain': 'lovedogs.com'},
    {'domain': 'lovewomen.com'},
    {'domain': 'ninjawomen.com'},
    {'domain': 'ninjakid.com'},
    {'domain': 'ninjaboy.com'},
    {'domain': 'ninjamen.com'},
])

In [None]:
query_string

In [10]:
query_string = '''
    SELECT DISTINCT lower(base_domain) as base_domain
    FROM domains
    WHERE label = 1
    AND base_domain NOT IN ('000webhostapp.com', 'azurewebsites.net')
    AND domain_name NOT LIKE '%%-%%'
    LIMIT 5000
'''
engine = create_engine('postgresql://postgres:mypassword@localhost:5432/')
input_df = pd.read_sql_query(query_string, engine)

In [None]:
query_string

In [53]:
input_df[input_df.base_domain.str.contains('free')]

Unnamed: 0,base_domain
96,livingfreeintl.com
164,onlydutyfree.ru
178,freedynamicdns.org
242,freehost.pl
1027,free.fr
1162,icloudfreedom.com
1404,freeziana.com
1581,freeget.net
1604,freehostia.com
1745,freeimautomationtools.com


In [4]:
len(input_df)

5000

In [65]:
word_statistics, states_set, words_in_domain_counter = domain_to_word_features(list(input_df['base_domain']))

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900


In [285]:
word = 'you'
word_statistics[word]

{'appeareance': 7,
 'index': Counter({0: 2, 2: 2, 3: 1, 1: 2}),
 'sentence_length': Counter({4: 2, 3: 4, 7: 1}),
 'transitions': Counter({'you': 0,
          'videos': 1,
          'it': 1,
          'hating': 1,
          'me': 1,
          'code': 1})}

In [303]:
words_transitions_prob = extract_transitions_probabilities(word_statistics, states_set)

In [304]:
model = convert_probabilities_to_model(words_transitions_prob)

In [309]:
with open('5k_model.pkl', 'wb') as f:
    pickle.dump((word_statistics, states_set, words_in_domain_counter), f)

In [305]:
word = 'free'
for i in range(10):
    print(create_random_domain_name(model, word_statistics, word))

freegamelabs
freeget001
freegamefish
freemovies
freenadef
freelc
freegamefish
freeiani
freehosta
freehosta


In [282]:
create_random_domain_name(model, word_statistics, word)

current_state: en
keys for random choise of current word: [8, 3, 4, 6, 9, 5, 7, 2]
current_word_sentence_length: 6
current_state: y
keys for random choise of current word: [3, 4, 5, 7, 2, 8]
current_word_sentence_length: 3
domain_name: iteny
word_list: ['it', 'en', 'y']


'iteny'

In [302]:
def create_random_domain_name(model, word_statistics, initial_state):
    word_list = [initial_state]
    current_state = initial_state
    sentence_length = 1
    while sentence_length < 25 and len(word_statistics[current_state]['transitions']) > 0:
        sentence_length += 1
        current_state = model.next_state(current_state)
        word_list.append(current_state)

        p = convert_counter_to_probabilities(word_statistics[current_state]['sentence_length'])
        
#         print (f'current_state: {current_state}')
#         print (f'keys for random choise of current word: {list(p.keys())}')
        
        current_word_sentence_length = np.random.choice(list(p.keys()), p=list(p.values()))
        
#         print (f'current_word_sentence_length: {current_word_sentence_length}')
        
        if current_word_sentence_length <= sentence_length:
            break
            
    domain_name = ''.join(word_list)
#     print (f'domain_name: {domain_name}')
#     print (f'word_list: {word_list}')
    return domain_name


def domain_to_word_features(domain_list):
    states_set = set()
    words_in_domain_counter = Counter()
    word_statistics = {}

    domain_index = 0
    for domain in domain_list:
        debug = True if 'free' in domain else False
            
        if domain_index % 100 == 0:
            print (domain_index)

        domain_index += 1
        exr = tldextract.extract(domain)
        words = splitter.split(exr.domain) #.replace('-','')) #.replace('2', 'to').replace('4', 'for'))
        words = [word.lower() for word in words]
       
        words_in_domain_counter.update([len(words)])
        word_index = 0
        if type(words) == list and len(words) > 1:
            for word in words:
                if word not in word_statistics:
                    word_statistics[word] = {}
                    word_statistics[word]['appeareance'] = 0
                    word_statistics[word]['index'] = Counter()
                    word_statistics[word]['sentence_length'] = Counter()
                    word_statistics[word]['transitions'] = Counter()

                word_statistics[word]['appeareance'] += 1
                word_statistics[word]['sentence_length'].update([len(words)])
                word_statistics[word]['index'].update([word_index])
                word_index += 1

            next_word = words[-1]
            for word in reversed(words[:-1]):
                word_statistics[word]['transitions'].update({word: 0})
                word_statistics[word]['transitions'].update({next_word: 1})
                states_set.add(word)
                states_set.add(next_word)
    return word_statistics, states_set, words_in_domain_counter
                    
                    
def convert_counter_to_probabilities(transitions_counter, round_ndigits=8):    
    transitions_probabilities = {}
    word_sum = sum(transitions_counter.values())

    for next_word, val in transitions_counter.items():
        transitions_probabilities[next_word] = round(val / word_sum, round_ndigits)
    row_sum = round(sum(transitions_probabilities.values()), round_ndigits)
    
    if row_sum != 1:
        error = round((row_sum - 1) * (10 ** round_ndigits))
        sign = -1 if error > 0 else 1
        adj_count = np.abs(error)
        for key in list(transitions_probabilities.keys())[1:]:
            transitions_probabilities[key] += sign * (10 ** (-1 * round_ndigits))
            adj_count -= 1
            if adj_count == 0:
                break
                
    return transitions_probabilities


def extract_transitions_probabilities(word_statistics, states_set):
    words_transitions_prob = {}
    for word in states_set:
        words_transitions_prob[word] = convert_counter_to_probabilities(word_statistics[word]['transitions'])
        
    for state in states_set:
        if state not in words_transitions_prob:
            words_transitions_prob[state] = {}
    
    return words_transitions_prob


def convert_probabilities_to_model(words_transitions_prob):
    df = pd.DataFrame.from_dict(words_transitions_prob).T
    df.fillna(0, inplace=True)
    transition_matrix = df[df.index].to_numpy()
    model = MarkovChain(transition_matrix=transition_matrix, states=list(df.index))
    return model


In [None]:
word_statistics['you']['transitions']

In [None]:
for i in range(10):
    print (model.next_state('free'))

In [None]:
create_random_domain_name(name_chain, 'you')

In [None]:
for i in range(transition_matrix.shape[0]):
    print(transition_matrix[i])
    transition_matrix[i] /= transition_matrix[i].sum()
    print(transition_matrix[i])
    print('----------------------------------------------------')

In [None]:
name_chain = MarkovChain(transition_matrix=transition_matrix, states=list(words_transitions_prob.keys()))

In [None]:
transition_matrix.shape

In [None]:
len(list(words_transitions_prob.keys()))

In [None]:
name_chain.next_state(current_state='you')

In [None]:
for val in words_transitions_prob['the'].values():
    print (val)

In [None]:
sum(words_transitions_prob['the'].values())

In [None]:
for key, trans in words_transitions_prob.items():
    print (key, sum(trans.values()))

In [None]:
transition_matrix

In [None]:
sign

In [None]:
words_transitions_prob['you']

In [None]:
d= {'a': 2, 'b': 4}

In [None]:
for key in d:
    print (key)

In [None]:
10 ** -1 * round_ndigits

In [None]:
np.power(10, -6)

In [None]:
sum(words_transitions_prob['porn'].values())

In [None]:
round((1.0002 - 1) * 10000)

In [None]:
df = pd.DataFrame.from_dict(words_transitions_prob)
df.fillna(0, inplace=True)
transition_matrix = df.to_numpy()

In [None]:
sum(df['you'])

In [None]:
list(words_transitions_prob.keys())

In [None]:
transition_matrix.shape

In [None]:
name_chain = MarkovChain(transition_matrix=transition_matrix, states=list(words_transitions_prob.keys()))
name_chain.next_state(current_state='hello')
# predictions = []
# for i in range(10000):
#     y = weather_chain.next_state(current_state='Sunny')
#     y = weather_chain.next_state(current_state=y)
#     predictions.append(y)

# # predictions = weather_chain.generate_states(current_state='Rainy', no=10000)
# Counter(predictions)

In [None]:
df = pd.DataFrame.from_dict(words_transitions)

In [None]:
df.fillna(0, inplace=True)

In [None]:
d = Counter({'a': 2, 'b': 5, 'c': 1, 'd': 2})

In [None]:
d

In [None]:
row_sum = sum(d.values())

In [None]:
for key, val in d.items():
    d[key] = val / row_sum

In [None]:
d['a']

In [None]:
df[df['porn'] > 0]