In [None]:
import os
from collections import Counter

import numpy as np
import pandas as pd
import splitter
import tldextract
from scipy.sparse import csr_matrix
from sqlalchemy import create_engine

from MarkovChain import MarkovChain

In [None]:
alexa_df = pd.read_csv('resources/top-1m.csv', names=['rank', 'domain'])

In [None]:
# Small hand-written set of domains, stored in a single 'domain' column.
alexa_df = pd.DataFrame({
    'domain': [
        'helloworld.com',
        'helloninja.com',
        'lovecats.com',
        'lovedogs.com',
        'lovewomen.com',
        'ninjawomen.com',
        'ninjakid.com',
        'ninjaboy.com',
        'ninjamen.com',
    ],
})

In [None]:
query_string

In [None]:
# Fetch up to 3000 distinct lower-cased base domains with label = 1.
# The expression is aliased back to `base_domain`: without the alias the
# result column is named after the function (`lower`), and the later
# `input_df['base_domain']` lookup fails.
# NOTE(review): the doubled percent signs are presumably an escape for the
# DB driver's %-style parameter markers - confirm before changing them.
query_string = '''
    SELECT DISTINCT lower(base_domain) AS base_domain
    FROM domains
    WHERE label = 1
    AND base_domain NOT IN ('000webhostapp.com', 'azurewebsites.net')
    AND domain_name NOT LIKE '%%-%%'
    LIMIT 3000
'''
# The connection URL is read from the environment so no password is stored
# in the notebook. The fallback URL carries no password; libpq-based drivers
# then look at PGPASSWORD / ~/.pgpass.
engine = create_engine(
    os.environ.get('DATABASE_URL', 'postgresql://postgres@localhost:5432/')
)
input_df = pd.read_sql_query(query_string, engine)

In [None]:
query_string

In [None]:
input_df.sample(5)

In [None]:
len(input_df)

In [None]:
word_statistics, states_set, words_in_domain_counter = domain_to_word_features(list(input_df['base_domain']))

In [None]:
word = 'free'

In [None]:
words_transitions_prob = extract_transitions_probabilities(word_statistics)

In [None]:
model = convert_probabilities_to_model(words_transitions_prob)

In [None]:
# create_random_domain_name takes (model, word_statistics, initial_state).
create_random_domain_name(model, word_statistics, word)

In [None]:
# Inspect the outgoing transition distribution of the current seed word.
convert_counter_to_probabilities(word_statistics[word]['transitions'])

In [None]:
word='you'

In [279]:
create_random_domain_name(model, word_statistics, word)

next_word_sentence_length: 4
next_word_sentence_length: 4
next_word_sentence_length: 3
domain_name: youvideoshatingvideos
word_list: ['you', 'videos', 'hating', 'videos']


'youvideoshatingvideos'

In [None]:
##TODO
def create_random_domain_name(model, word_statistics, initial_state, max_words=25):
    """Generate a domain name by walking the word-level Markov chain.

    Starting at ``initial_state`` the chain is advanced one word at a time,
    each step starting from the word produced by the previous step. After
    every new word a length is sampled from that word's ``'sentence_length'``
    counter; generation stops as soon as the sampled length does not exceed
    the number of words generated so far.

    Args:
        model: object exposing ``next_state(current_state)`` which returns
            the next word.
        word_statistics: dict mapping each word to a dict holding at least
            the ``'sentence_length'`` and ``'transitions'`` counters.
        initial_state: the first word of the generated name.
        max_words: hard upper bound on the number of words in the name.

    Returns:
        The generated words concatenated without a separator.

    Raises:
        KeyError: if a visited word has no entry in ``word_statistics``.
    """
    word_list = [initial_state]
    current_state = initial_state
    while len(word_list) < max_words:
        # A word with no recorded outgoing transitions has nothing to sample
        # from, so the walk ends there instead of asking the model.
        if sum(word_statistics[current_state]['transitions'].values()) == 0:
            break

        # Step from the *current* word, not from the seed word.
        current_state = model.next_state(current_state)
        word_list.append(current_state)

        p = convert_counter_to_probabilities(word_statistics[current_state]['sentence_length'])
        next_word_sentence_length = np.random.choice(list(p.keys()), p=list(p.values()))
        print (f'next_word_sentence_length: {next_word_sentence_length}')
        if next_word_sentence_length <= len(word_list):
            break
    domain_name = ''.join(word_list)
    print (f'domain_name: {domain_name}')
    print (f'word_list: {word_list}')
    return domain_name

def domain_to_word_features(domain_list):
    """Collect per-word statistics from a list of domain names.

    Each domain is reduced to its registered-domain label with
    ``tldextract.extract`` and split into words with ``splitter.split``.
    Only domains that split into a list of two or more words contribute to
    the per-word statistics and to the state set.

    Args:
        domain_list: iterable of domain name strings.

    Returns:
        A tuple ``(word_statistics, states_set, words_in_domain_counter)``:

        * ``word_statistics`` maps each word to a dict with keys
          ``'appeareance'`` (occurrence count), ``'index'`` (Counter of the
          positions the word was seen at), ``'sentence_length'`` (Counter of
          the word counts of the domains it was seen in) and
          ``'transitions'`` (Counter of the words that directly follow it).
        * ``states_set`` is the set of all words that take part in at least
          one transition, as source or target.
        * ``words_in_domain_counter`` counts ``len(words)`` over all domains,
          including the ones that were skipped.
    """
    states_set = set()
    words_in_domain_counter = Counter()
    word_statistics = {}

    for domain_index, domain in enumerate(domain_list):
        # Progress indicator: print the running index every 100 domains.
        if domain_index % 100 == 0:
            print (domain_index)

        exr = tldextract.extract(domain)
        words = splitter.split(exr.domain)

        words_in_domain_counter.update([len(words)])
        if not (isinstance(words, list) and len(words) > 1):
            continue

        for word_index, word in enumerate(words):
            if word not in word_statistics:
                word_statistics[word] = {
                    'appeareance': 0,
                    'index': Counter(),
                    'sentence_length': Counter(),
                    'transitions': Counter(),
                }

            stats = word_statistics[word]
            stats['appeareance'] += 1
            stats['sentence_length'].update([len(words)])
            stats['index'].update([word_index])

        # Record a transition from every word to its immediate successor.
        for word, next_word in zip(words, words[1:]):
            transitions = word_statistics[word]['transitions']
            # Register the word itself as a key without changing its count;
            # presumably so that every source word also appears as a target
            # column when the transition table is assembled.
            transitions.setdefault(word, 0)
            transitions[next_word] += 1
            states_set.add(word)
            states_set.add(next_word)
    return word_statistics, states_set, words_in_domain_counter
                    
                    
def convert_counter_to_probabilities(transitions_counter, round_ndigits=8):
    """Turn a mapping of counts into a mapping of rounded probabilities.

    Every count is divided by the total and rounded to ``round_ndigits``
    decimals. Rounding can leave the sum a few units of the last decimal
    away from 1; that residue is moved onto the largest entries, one unit
    each, so no entry can become negative and zero-count entries stay at 0.

    Args:
        transitions_counter: mapping ``key -> count`` (e.g. a ``Counter``).
        round_ndigits: number of decimals kept for each probability.

    Returns:
        A dict ``key -> probability`` with the same keys, in the same order,
        as ``transitions_counter``. An empty input gives ``{}``; if all
        counts are zero every probability is ``0.0``.
    """
    total = sum(transitions_counter.values())
    if total == 0:
        # Nothing to normalise; avoid dividing by zero.
        return {key: 0.0 for key in transitions_counter}

    probabilities = {
        key: round(count / total, round_ndigits)
        for key, count in transitions_counter.items()
    }

    # Rounding error of the row, in units of the last kept decimal.
    residual_units = round((sum(probabilities.values()) - 1) * 10 ** round_ndigits)
    if residual_units:
        step = 10 ** (-round_ndigits)
        if residual_units > 0:
            step = -step
        # Correct the largest entries first: they are always at least one
        # unit in size, so subtracting a unit cannot make them negative.
        largest_first = sorted(probabilities, key=probabilities.get, reverse=True)
        for key in largest_first[:abs(residual_units)]:
            probabilities[key] = round(probabilities[key] + step, round_ndigits)

    return probabilities

#words_transitions_prob[word] = convert_counter_probabilities(word_statistics[word]['transitions'])

def extract_transitions_probabilities(word_statistics):
    """Build the per-word transition probability table.

    The result depends only on the ``word_statistics`` argument and not on
    any notebook-global state.

    Args:
        word_statistics: dict mapping each word to a dict that holds a
            ``'transitions'`` counter of follow-up words.

    Returns:
        A dict mapping every word in ``word_statistics`` to the dict
        returned by ``convert_counter_to_probabilities`` for its
        ``'transitions'`` counter. Words without recorded transitions map to
        an empty dict.
    """
    return {
        word: convert_counter_to_probabilities(stats['transitions'])
        for word, stats in word_statistics.items()
    }


def convert_probabilities_to_model(words_transitions_prob):
    """Assemble a ``MarkovChain`` from a nested transition-probability dict.

    Args:
        words_transitions_prob: dict ``source word -> {target word: prob}``.

    Returns:
        A ``MarkovChain`` built from a square matrix whose rows and columns
        both follow the key order of ``words_transitions_prob``. Missing
        transitions are filled with 0.
    """
    states = list(words_transitions_prob)
    transition_table = (
        # from_dict puts the outer keys on the columns; transpose so that
        # each row holds the outgoing probabilities of one source word.
        pd.DataFrame.from_dict(words_transitions_prob).T
        # Force a square states x states layout. A state that never occurs
        # as a target still gets an (all-zero) column instead of a KeyError.
        .reindex(index=states, columns=states)
        .fillna(0)
        .astype(float)
    )
    return MarkovChain(transition_matrix=transition_table.to_numpy(), states=states)


In [None]:
word_statistics['you']['transitions']

In [None]:
for i in range(10):
    print (model.next_state('free'))

In [None]:
# create_random_domain_name takes (model, word_statistics, initial_state).
create_random_domain_name(name_chain, word_statistics, 'you')

In [None]:
for i in range(transition_matrix.shape[0]):
    print(transition_matrix[i])
    transition_matrix[i] /= transition_matrix[i].sum()
    print(transition_matrix[i])
    print('----------------------------------------------------')

In [None]:
name_chain = MarkovChain(transition_matrix=transition_matrix, states=list(words_transitions_prob.keys()))

In [None]:
transition_matrix.shape

In [None]:
len(list(words_transitions_prob.keys()))

In [None]:
name_chain.next_state(current_state='you')

In [None]:
for val in words_transitions_prob['the'].values():
    print (val)

In [None]:
sum(words_transitions_prob['the'].values())

In [None]:
for key, trans in words_transitions_prob.items():
    print (key, sum(trans.values()))

In [None]:
transition_matrix

In [None]:
sign

In [None]:
words_transitions_prob['you']

In [None]:
d= {'a': 2, 'b': 4}

In [None]:
for key in d:
    print (key)

In [None]:
10 ** -1 * round_ndigits

In [None]:
np.power(10, -6)

In [None]:
sum(words_transitions_prob['porn'].values())

In [None]:
round((1.0002 - 1) * 10000)

In [None]:
# NOTE(review): from_dict defaults to orient='columns', so the outer keys of
# words_transitions_prob become the columns of df and the inner keys become
# its index. This is the transpose of the layout used in
# convert_probabilities_to_model (which applies .T), and the matrix is not
# restricted to a square states x states shape - confirm this is intended
# before passing it to MarkovChain.
df = pd.DataFrame.from_dict(words_transitions_prob)
df.fillna(0, inplace=True)
transition_matrix = df.to_numpy()

In [None]:
sum(df['you'])

In [None]:
list(words_transitions_prob.keys())

In [None]:
transition_matrix.shape

In [None]:
name_chain = MarkovChain(transition_matrix=transition_matrix, states=list(words_transitions_prob.keys()))
name_chain.next_state(current_state='hello')
# predictions = []
# for i in range(10000):
#     y = weather_chain.next_state(current_state='Sunny')
#     y = weather_chain.next_state(current_state=y)
#     predictions.append(y)

# # predictions = weather_chain.generate_states(current_state='Rainy', no=10000)
# Counter(predictions)

In [None]:
df = pd.DataFrame.from_dict(words_transitions)

In [None]:
df.fillna(0, inplace=True)

In [None]:
d = Counter({'a': 2, 'b': 5, 'c': 1, 'd': 2})

In [None]:
d

In [None]:
row_sum = sum(d.values())

In [None]:
for key, val in d.items():
    d[key] = val / row_sum

In [None]:
d['a']

In [None]:
df[df['porn'] > 0]