In [88]:
import pandas as pd
import os
import json

all_jsons = []

for f in [f for f in os.listdir('json_horoscopes/') if 'json' in f]:
    with open(f, 'r') as js:
        all_jsons.append(json.load(js))

def merge_dicts(*dict_args):
    '''
    Given any number of dicts, shallow copy and merge into a new dict,
    precedence goes to key value pairs in latter dicts.
    '''
    result = {}
    for dictionary in dict_args:
        result.update(dictionary)
    return result

In [89]:
all_horoscopes = merge_dicts(*all_jsons)

In [90]:
import re
for date in all_horoscopes:
    print(date)
    for zodiac in all_horoscopes[date]:
        try:
            new_zodiac = re.match(r'\w+', zodiac)
            if new_zodiac:
                new_zodiac = new_zodiac.group()
            all_horoscopes[date][new_zodiac] = all_horoscopes[date].pop(zodiac)
        except:
            print(date, zodiac)

01.09.2015
17.05.2016
12.03.2016
29.08.2016
11.06.2014
11.06.2014 None


RuntimeError: dictionary changed size during iteration

In [91]:
raw_corpus = pd.DataFrame.from_dict(all_horoscopes, orient='index')

raw_corpus = raw_corpus.stack().reset_index()[0].values

raw_corpus = [d for d in raw_corpus if d !='\xa0']

raw_corpus = [d.replace('\xa0', ' ') for d in raw_corpus]

In [92]:
import re

def split_into_sentences(doc):
#     return re.findall(r'[A-Z].*?\.', doc)
    return tokenize.sent_tokenize(corpus[0], language='czech')


def prepare_corpus(corpus):
    prepared = []
    for doc in corpus:
        prepared.extend(nltk.sent_tokenize(doc, language='czech'))
    return prepared

def extract_first_word(sentence):
    try:
        first_word = re.compile(r'^\w+')
        return first_word.match(sentence).group()
    except:
        print("Fail {}".format(sentence))

In [93]:
import nltk

In [94]:
corpus = prepare_corpus(raw_corpus)

first_words = [extract_first_word(sentence) for sentence in corpus]

first_words = pd.Series(first_words)

first_words_counts = first_words.to_frame().reset_index().groupby(0).agg(len)

first_words_counts.index.name = None

first_words_counts['index'] = first_words_counts['index'].astype(int)

first_words_counts['probability'] = first_words_counts['index'].apply(lambda x: x/first_words_counts['index'].sum())

first_words_counts['cumsum'] = first_words_counts['probability'].cumsum()

first_word_probabilities = first_words_counts.reset_index().rename(columns={'level_0':'word1'})#.drop('index', 1).to_dict()['probability']

Fail   
Fail   
Fail – 22.7.)
Fail . 
Fail – 21.6.)
Fail   
Fail – 22.7.)
Fail . 
Fail – 22.9.)
Fail       
Fail . 
Fail   
Fail – 23.10.)
Fail  Nedovolte, aby vám popletl hlavu známý se svojí pohádkou o báječném místě.
Fail    
Fail  Nenaříkejte si na přemíru potíží, rozhlédněte se kolem sebe.
Fail   
Fail   
Fail         
Fail – 22.9.)


In [98]:
first_word_probabilities.head()

Unnamed: 0,word1,index,probability,cumsum
0,A,3,0.00012,0.00012
1,Absence,1,4e-05,0.00016
2,Absolvujete,2,8e-05,0.000239
3,Aby,3,0.00012,0.000359
4,Abyste,7,0.000279,0.000639


In [102]:
from nltk import bigrams

from collections import defaultdict

words = defaultdict(lambda: defaultdict(int))

fdist = nltk.FreqDist()

for sentence in corpus:
    fdist.update(nltk.bigrams([w for w in nltk.word_tokenize(sentence, language='czech') if w != ',']))

word_counts = pd.DataFrame.from_dict(dict(fdist), orient='index').reset_index()#.rename(columns={'index', 'bigram'})

word_counts[['word1', 'word2']] = word_counts['index'].apply(pd.Series)

word_counts = word_counts[['word1','word2',0]]

word_counts = word_counts.rename(columns={0:'count'})

word_counts['count'] = word_counts['count'].astype(int)

word_probabilities = word_counts.groupby('word1').apply(lambda group: group['count']/sum(group['count']))

word_counts = word_counts.reset_index().set_index(['word1', 'index'])

word_counts['probabilities'] = word_probabilities

word_counts = word_counts.reset_index().drop(['index','count'], 1)

word_counts['cumsum'] = word_counts.groupby('word1')['probabilities'].cumsum()

bigrams_probs = word_counts.copy()

In [103]:
first_word_probabilities.head()

Unnamed: 0,word1,index,probability,cumsum
0,A,3,0.00012,0.00012
1,Absence,1,4e-05,0.00016
2,Absolvujete,2,8e-05,0.000239
3,Aby,3,0.00012,0.000359
4,Abyste,7,0.000279,0.000639


In [104]:
bigrams_probs.head()

Unnamed: 0,word1,word2,probabilities,cumsum
0,neurazili,.,0.75,0.75
1,se,později,0.000779,0.000779
2,Je,na,0.045455,0.045455
3,Přestane,fungovat,0.125,0.125
4,výhružek,a,1.0,1.0


In [24]:
import random

def get_first_word(first_word_probabilities):
    val = random.random()
    return first_word_probabilities.query('cumsum >= {}'.format(val)).iloc[0]['word1']

In [105]:
get_first_word(first_word_probabilities)

'Zatoužíte'

In [106]:
def get_next_word(previous_word, bigrams_probs):
    val = random.random()
    subset = bigrams_probs.query('word1 == "{}"'.format(previous_word))
    return subset.query('cumsum >= {}'.format(val)).iloc[0]['word2']
    
#     print("Radnom val {}".format(val))
#     cumsum = 0
#     for word, prob in final_probs[previous_word].items():
#         cumsum += prob
# #         print("cumsum {}".format(cumsum))
# #         print("prob {}".format(prob))
#         if val <= cumsum:
#             return word

def create_sentence(first_word_probabilities, bigrams_probs):
    sentence = get_first_word(first_word_probabilities)
    previous_word = sentence
    size = 1
    while previous_word != '.':
        previous_word = get_next_word(previous_word, bigrams_probs)
        sentence += ' {}'.format(previous_word)
        size += 1
    return sentence.replace(r' .', '. ')

In [107]:
create_sentence(first_word_probabilities, bigrams_probs)

'Rozzlobí vás nezištné gesto. '

In [108]:
def create_horoscope(size):
    horoscope = ''
    for _ in range(size):
        horoscope += create_sentence(first_word_probabilities, bigrams_probs)
    return horoscope

In [109]:
for _ in range(10):
    print(create_horoscope(5))
    print("#"*79)

Kdyby vás postavení Jupiteru vás zdálo že jste se otevírají před cizími lidmi kteří zápasí s kolegy či chalupy pozdě. Můžete se nedivte že ve vašem citovém vztahu se zmást tím co vysvětlovat že to projít obchody. Nepřehánějte to jen bavit neustálé potíže vás bude mít hlavu budete bezpečnější. Nenechte se mohou vyřešit nějakou společenskou institucí. Nebuďte zbytečně ztrácet půdu na uklouznutí bude rozumné kupovat za čas demonstrovat sílu udělat po půlnoci a jejich nedávnému návrhu. 
###############################################################################
Naopak budete v pořádku. Správně tušíte že na jiné. Zkuste to nějak zvláště večer. Zvažte jestli si zklamání. Mnohé nasvědčuje tomu dojít trpělivost a vibrace Saturnu varuje že nic znamenat začátek by nesmysl. 
###############################################################################
S odpovědí neváhejte ani minutku. Šedivý možná výjimečný den snadno hořlavými chemickými prostředky. Nenaříkejte nad vlastní žárlivostí. Nech

In [110]:
bigrams_probs.to_csv('bigrams_probs.csv', encoding='utf-8')

In [111]:
first_word_probabilities.to_csv('first_world_probabilities.csv', encoding='utf-8')

In [15]:
import pandas as pd
from random import random

In [7]:
first_word_probabilities = pd.read_csv('first_world_probabilities.csv', encoding='utf-8', index_col=0)
bigrams_probs = pd.read_csv('bigrams_probs.csv', encoding='utf-8', index_col=0)

In [8]:
first_word_probabilities.head()

Unnamed: 0,word1,index,probability,cumsum
0,A,3,0.00012,0.00012
1,Absence,1,4e-05,0.00016
2,Absolvujete,2,8e-05,0.000239
3,Aby,3,0.00012,0.000359
4,Abyste,7,0.000279,0.000639


In [19]:
random()

0.3908665177832632

In [12]:
bigrams_probs.query('word1 == "Absolvujete"')

Unnamed: 0,word1,word2,probabilities,cumsum
33823,Absolvujete,rozhovor,0.5,0.5
82605,Absolvujete,spoustu,0.5,1.0


In [17]:
bigrams_probs.query('word1 == "rozhovor"')

Unnamed: 0,word1,word2,probabilities,cumsum
3483,rozhovor,točící,0.0125,0.0125
9399,rozhovor,vám,0.0125,0.025
10472,rozhovor,kolegů,0.0125,0.0375
21384,rozhovor,ohledně,0.0125,0.05
30720,rozhovor,mezi,0.0125,0.0625
49283,rozhovor,aniž,0.0125,0.075
51197,rozhovor,může,0.0125,0.0875
61372,rozhovor,budete,0.0375,0.125
64646,rozhovor,se,0.0375,0.1625
67390,rozhovor,na,0.0125,0.175


In [43]:
random()

0.38293381670506255

In [45]:
bigrams_probs.query('word1 == "kolegů"')

Unnamed: 0,word1,word2,probabilities,cumsum
890,kolegů,si,0.016,0.016
2332,kolegů,není,0.008,0.024
4821,kolegů,se,0.04,0.064
8178,kolegů,přátel,0.04,0.104
11106,kolegů,a,0.192,0.296
20340,kolegů,naleznete,0.008,0.304
22202,kolegů,s,0.008,0.312
26307,kolegů,bránících,0.008,0.32
30139,kolegů,vám,0.008,0.328
30455,kolegů,i,0.008,0.336


# Filozof

In [113]:
import numpy as np

In [132]:
import re
for date in all_horoscopes:
    print(date)
    for zodiac in all_horoscopes[date]:
        try:
            new_zodiac = re.search(r'\w+', zodiac)
            if new_zodiac:
                new_zodiac = new_zodiac.group().strip()
            all_horoscopes[date][new_zodiac] = all_horoscopes[date].pop(zodiac)
        except:
            print(date, zodiac)

01.09.2015
17.05.2016
12.03.2016
12.03.2016 None
29.08.2016
11.06.2014
11.06.2014 None
16.07.2014
22.12.2015
22.12.2015 None
26.08.2014
07.01.2016
05.09.2015
20.12.2014
16.08.2015
28.10.2016
30.06.2015
18.10.2015
18.10.2015 None
30.09.2015
10.03.2015
07.06.2014
29.07.2015
29.07.2015 None
30.09.2016
23.06.2015
23.06.2015 None
09.10.2016
16.03.2015
23.06.2014
30.11.2015
22.01.2015
04.02.2015
16.06.2014
24.05.2016
24.05.2016 None
20.02.2015
14.06.2016
07.06.2016
07.11.2016
11.08.2016
11.08.2016 None
01.02.2015
15.05.2016
15.05.2016 None
14.01.2015
14.01.2015 None
17.12.2015
25.10.2015
25.10.2015 None
10.12.2015
10.12.2015 None
13.09.2015
23.10.2014
15.10.2014
15.10.2014 None
23.04.2015
22.10.2015
22.10.2015 None
30.01.2016
30.01.2016 None
12.04.2016
12.04.2016 None
01.06.2014
01.06.2014 None
06.08.2015
07.09.2014
16.07.2015
16.07.2015 None
17.01.2016
30.09.2014
30.09.2014 None
27.08.2014
27.08.2014 None
02.09.2016
02.09.2016 None
13.01.2016
06.06.2016
06.06.2016 None
22.09.2015
31.08.2016

In [255]:
horoscopes = pd.DataFrame.from_dict(all_horoscopes, orient='index')

In [256]:
horoscopes_clean = horoscopes_clean.apply(lambda col: col.str.strip().replace(r'\\xa0',' '), 1)

In [257]:
data = horoscopes_clean[["LEV", "BLÍŽENCI"]].copy()

In [278]:
data_clean = data[(data.LEV != '') & (data["BLÍŽENCI"] != '')]

In [281]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
count = CountVectorizer()

LEV_words = np.array(data_clean.LEV.values)
bag = count.fit_transform(LEV_words)

In [286]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer().fit_transform(data_clean.LEV.values)
# no need to normalize, since Vectorizer will return normalized tf-idf
pairwise_similarity = tfidf * tfidf.T

In [288]:
import matplotlib.pyplot as plt
%matplotlib inline

In [307]:
TfidfVectorizer?