In [1]:
from random import randrange, random
import numpy as np
from scipy.sparse import vstack
from scipy.sparse.csr import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

In [2]:
# Lift pandas' display limits so frames are rendered without truncating
# columns, rows or long cell contents.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [3]:
# Number of topics (clusters) the generated documents are meant to be reduced to.
num_topics = 2

In [4]:
# Seed vocabulary for the "cold war" topic.
cold_war_terms = [
    'soviet',
    'cold',
    'war',
    'nato',
    'communist',
    'capitalist',
    'political',
    'western',
    'powers',
    'propaganda',
]

# Seed vocabulary for the "economic policy" topic.
economic_policy_terms = [
    'financial',
    'taxation',
    'monetary',
    'policy',
    'economic',
    'interest',
    'rate',
    'money',
    'supply',
    'bank',
]

In [5]:
# Per-topic mixing weights. Element 0 is used below as the probability of
# drawing a word from the topic's own term list; element 1 is presumably the
# complementary probability of drawing from the other topic's list.
cold_war_topics = [0.9, 0.1]
economic_policy_topics = [0.9, 0.1]

In [6]:
def _sample_topic_words(own_terms, other_terms, own_weight, n_draws=10, verbose=False):
    """Draw a small, duplicate-free vocabulary for one topic.

    Each of the ``n_draws`` draws picks a term uniformly from ``own_terms``
    with probability ``own_weight`` and otherwise uniformly from
    ``other_terms``.  Terms seen for the first time are kept: own-topic terms
    are pushed to the front of the result, other-topic terms are appended to
    the back.  Repeated terms are ignored.

    Parameters
    ----------
    own_terms, other_terms : list of str
        Candidate terms for this topic and for the competing topic.
    own_weight : float
        Probability of drawing from ``own_terms``.
    n_draws : int
        Number of draws to perform (the result may be shorter because of
        repeats).
    verbose : bool
        When true, print every drawn term, including repeats.

    Returns
    -------
    list of str
        The distinct terms that were drawn.
    """
    words = []
    for _ in range(n_draws):
        # Same RNG call order as a hand-written loop: first the topic coin
        # flip, then the index of the term inside the chosen list.
        from_own = random() <= own_weight
        pool = own_terms if from_own else other_terms
        w = pool[randrange(len(pool))]
        if verbose:
            print(w)
        if w in words:
            continue
        if from_own:
            words.insert(0, w)
        else:
            words.append(w)
    return words


# The cold-war draws are echoed so the sampling process can be followed.
cold_war_words = _sample_topic_words(
    cold_war_terms, economic_policy_terms, cold_war_topics[0], verbose=True
)
economic_policy_words = _sample_topic_words(
    economic_policy_terms, cold_war_terms, economic_policy_topics[0]
)

political
nato
political
soviet
powers
nato
political
rate
propaganda
powers


In [7]:
# Show the distinct words that were sampled for each topic.
print(cold_war_words)
print(economic_policy_words)

['propaganda', 'powers', 'soviet', 'nato', 'political', 'rate']
['economic', 'bank', 'policy', 'monetary', 'financial', 'taxation', 'nato']


In [8]:
# Draw a random word distribution for each topic from a flat Dirichlet and
# turn it into a cumulative distribution, so that a uniform random number can
# be mapped to a word index with np.searchsorted.
cold_war_den = np.random.dirichlet(np.ones(len(cold_war_words)),size=1)
cold_war_dist = np.cumsum(cold_war_den)
# Floating-point summation can leave the last cumulative value marginally
# below 1; pin it to exactly 1.0 so a draw close to 1 can never be mapped to
# an index past the end of the word list.
cold_war_dist[-1] = 1.0

economic_policy_den = np.random.dirichlet(np.ones(len(economic_policy_words)),size=1)
economic_policy_dist = np.cumsum(economic_policy_den)
economic_policy_dist[-1] = 1.0

In [9]:
# Single-draw demo: pick a uniform random number and look up the first
# cumulative bucket that reaches it; that bucket's position selects the word.
n = random()
index = np.searchsorted(cold_war_dist,n)
index, cold_war_words[index]

(2, 'soviet')

In [10]:
def _generate_documents(words, dist, n_docs=5, n_words=10):
    """Generate synthetic documents by sampling words from one topic.

    Parameters
    ----------
    words : list of str
        Vocabulary of the topic.
    dist : array-like
        Cumulative word distribution aligned with ``words``.
    n_docs : int
        Number of documents to generate.
    n_words : int
        Number of words per document.

    Returns
    -------
    list of str
        ``n_docs`` documents, each a space-separated string of ``n_words``
        sampled words.
    """
    generated = []
    for _doc in range(n_docs):
        sentence = []
        for _word in range(n_words):
            # Map a uniform draw onto a word through the cumulative distribution.
            n = random()
            index = np.searchsorted(dist, n)
            sentence.append(words[index])
        generated.append(' '.join(sentence))
    return generated


# generate 5 documents from each category (10 in total); `topics` records the
# category of every document: 0 = cold war, 1 = economic policy
cold_war_documents = _generate_documents(cold_war_words, cold_war_dist)
economic_policy_documents = _generate_documents(economic_policy_words, economic_policy_dist)

documents = cold_war_documents + economic_policy_documents
topics = [0] * len(cold_war_documents) + [1] * len(economic_policy_documents)

In [11]:
# Print every generated document, one per line.
for s in documents:
    print(s)

political soviet soviet rate political rate powers political propaganda political
political rate political powers political powers political political nato rate
rate political powers political powers political rate rate rate powers
political rate propaganda political powers political soviet nato powers nato
political rate political political political propaganda propaganda political political political
policy nato bank economic nato monetary bank nato financial financial
economic economic nato economic bank nato bank nato monetary bank
bank bank nato bank economic nato economic policy nato nato
policy nato policy monetary bank bank bank monetary nato bank
policy bank bank nato nato bank nato nato bank monetary


In [12]:
# Bag-of-words vectorizer, created with its default settings.
vectorizer = CountVectorizer()

In [13]:
# Learn the vocabulary from the generated documents and build the
# document-by-term count matrix.
X_train = vectorizer.fit_transform(documents)

In [14]:
# Keep a dense snapshot of the per-document term counts; X_train itself is
# overwritten later when rows are merged into clusters.
init = X_train.todense()
init[0]

matrix([[0, 0, 0, 0, 0, 0, 4, 1, 1, 2, 2]])

In [15]:
# Vocabulary learned by the vectorizer; position i names column i of the count matrix.
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out() - confirm which version this notebook targets.
vectorizer.get_feature_names()

['bank',
 'economic',
 'financial',
 'monetary',
 'nato',
 'policy',
 'political',
 'powers',
 'propaganda',
 'rate',
 'soviet']

In [16]:
import numpy
def delete_rows_csr(mat, indices):
    """
    Return ``mat`` without the rows listed in ``indices``.

    ``mat`` is indexed with a boolean row mask, so it has to support that
    kind of indexing (a CSR sparse matrix does; other sparse formats may need
    ``.tocsr()`` first -- no type check is performed here).  ``indices`` can
    be any iterable of row positions; listing a row twice is harmless.
    """
    row_count = mat.shape[0]
    keep = numpy.ones(row_count, dtype=bool)
    # Flag every requested row for removal, then select the survivors.
    keep[list(indices)] = False
    return mat[keep]

In [17]:
# Greedy agglomerative merge: repeatedly fold the two most similar rows of
# X_train into a single row (summing their term counts) until only
# `num_topics` rows are left.
while X_train.shape[0] > num_topics:
    cs = np.round(cosine_similarity(X_train, Y=None, dense_output=True),2)

    # Find the most similar pair of *distinct* rows.  Only the upper triangle
    # is scanned: the matrix is symmetric and the diagonal (self-similarity)
    # must be ignored.  Skipping the diagonal by position rather than by
    # value means rows whose similarity rounds to 1.0 are still merged.
    # NOTE: `max` shadows the builtin; the name is kept because a later cell
    # prints it together with `ii`, `jj` and `cs`.
    max = 0
    jj = 0
    ii = 0
    n_rows = cs.shape[0]
    for i in range(n_rows):
        for j in range(i + 1, n_rows):
            if cs[i][j] > max:
                max = cs[i][j]
                ii = i
                jj = j

    if ii == jj:
        # No pair of rows has a positive similarity, so there is nothing
        # sensible to merge.  Stop here instead of looping forever.
        print('no similar rows left to merge; stopping at', X_train.shape[0], 'rows')
        break

    # Merged row = element-wise sum of the two term-count rows.
    nr = (X_train[ii] + X_train[jj])

    X_train = delete_rows_csr(X_train, [ii, jj])

    # ii < jj always holds here, so ii is still a valid insert position after
    # the two rows have been removed.
    merged = np.insert(X_train.todense(), ii, nr.todense(), 0)

    X_train = csr_matrix(merged)
    print(X_train.shape[0])

9
8
7
6
5
4
3
2


In [18]:
#len(X_train.todense())

In [19]:
# Inspect what the merge loop left behind: the indices and similarity of the
# last merged pair (`max` is the loop's variable here, not the builtin), the
# final cluster-by-term counts and the last similarity matrix computed.
print(ii,jj,max)
print(X_train.todense())
print(cs)

0 1 0.77
[[ 0  0  0  0  3  0 22  8  4 10  3]
 [16  6  2  5 16  5  0  0  0  0  0]]
[[1.   0.77 0.09]
 [0.77 1.   0.  ]
 [0.09 0.   1.  ]]


In [20]:
# and then continue like this until you meet the number of topics

In [21]:
# Sanity check of the result: take a dense view of the merged
# cluster-by-term count matrix so it can be inspected.
clusters = X_train.todense()

In [22]:
# Display the merged counts: one row per cluster, one column per vocabulary term.
clusters

matrix([[ 0,  0,  0,  0,  3,  0, 22,  8,  4, 10,  3],
        [16,  6,  2,  5, 16,  5,  0,  0,  0,  0,  0]])

In [23]:
# Cluster term counts as a DataFrame (rows = clusters, columns = term positions).
df = pd.DataFrame(X_train.todense())
# Disabled alternative: min-max scaling of each column.
#df=(df-df.min())/(df.max()-df.min())

In [24]:
# Transpose so that terms become rows and clusters become columns.
# NOTE: this reassigns `df` in place, so re-running the cell flips it back.
df = df.T

In [25]:
# Divide every column by its sum: counts become each term's share within the cluster.
df = df/df.sum()

In [26]:
# Label every row with the vocabulary term it stands for.
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out() - confirm which version this notebook targets.
df['terms'] = vectorizer.get_feature_names()

In [27]:
# Rank terms by their share in cluster 0, breaking ties by their share in
# cluster 1 (both descending).
df.sort_values(by=[0, 1], ascending=False)

Unnamed: 0,0,1,terms
6,0.44,0.0,political
9,0.2,0.0,rate
7,0.16,0.0,powers
8,0.08,0.0,propaganda
4,0.06,0.32,nato
10,0.06,0.0,soviet
0,0.0,0.32,bank
1,0.0,0.12,economic
3,0.0,0.1,monetary
5,0.0,0.1,policy


In [28]:
# Print the generated documents again for comparison with the term ranking.
for s in documents:
    print(s)

political soviet soviet rate political rate powers political propaganda political
political rate political powers political powers political political nato rate
rate political powers political powers political rate rate rate powers
political rate propaganda political powers political soviet nato powers nato
political rate political political political propaganda propaganda political political political
policy nato bank economic nato monetary bank nato financial financial
economic economic nato economic bank nato bank nato monetary bank
bank bank nato bank economic nato economic policy nato nato
policy nato policy monetary bank bank bank monetary nato bank
policy bank bank nato nato bank nato nato bank monetary


In [29]:
# Term counts of the first document, taken from the snapshot made before merging.
init[0]

matrix([[0, 0, 0, 0, 0, 0, 4, 1, 1, 2, 2]])

In [30]:
# Column 0 of df: the term shares of cluster 0.
df[0]

0     0.00
1     0.00
2     0.00
3     0.00
4     0.06
5     0.00
6     0.44
7     0.16
8     0.08
9     0.20
10    0.06
Name: 0, dtype: float64

In [31]:
# Debug check: what type does a single row of the document snapshot have?
type(init[0])

numpy.matrix

In [32]:
# Debug check: what type does a single densified cluster row have?
type(X_train.todense()[0])

numpy.matrix

In [33]:
# Calculate the cosine similarity of each original document to each of the
# two merged clusters.
#
# The cluster matrix is densified once and both operands are converted to
# plain ndarrays: np.matrix input is rejected by recent scikit-learn
# releases.  A single cosine_similarity(documents, clusters) call yields a
# (n_documents, n_clusters) array, so no per-document loop is needed.
doc_vectors = np.asarray(init)
cluster_vectors = np.asarray(X_train.todense())
doc_cluster_sim = cosine_similarity(doc_vectors, cluster_vectors)

# Column k holds every document's similarity to cluster k.
sim0 = list(doc_cluster_sim[:, 0])
sim1 = list(doc_cluster_sim[:, 1])

In [37]:
# One row per generated document together with its similarity to cluster 0
# (C0) and to cluster 1 (C1).
df_documents = pd.DataFrame(
    {
        'document': documents,
        'C0': sim0,
        'C1': sim1,
    }
)

In [43]:
# Express each similarity as a share of the document's total similarity to
# both clusters, so that the two weights of a document add up to 1.
# NOTE(review): 'sim_1w' looks like a typo for 'sim1_w'; the column name is
# left unchanged so nothing that reads it breaks.
similarity_total = df_documents['C0'] + df_documents['C1']
df_documents['sim0_w'] = df_documents['C0'] / similarity_total
df_documents['sim_1w'] = df_documents['C1'] / similarity_total

In [44]:
# Display every document with its raw and normalised cluster similarities.
df_documents

Unnamed: 0,document,C0,C1,sim0_w,sim_1w
0,political soviet soviet rate political rate powers political propaganda political,0.946219,0.0,1.0,0.0
1,political rate political powers political powers political political nato rate,0.978486,0.111836,0.897428,0.102572
2,rate political powers political powers political rate rate rate powers,0.853713,0.0,1.0,0.0
3,political rate propaganda political powers political soviet nato powers nato,0.899046,0.291633,0.75507,0.24493
4,political rate political political political propaganda propaganda political political political,0.896271,0.0,1.0,0.0
5,policy nato bank economic nato monetary bank nato financial financial,0.077061,0.911353,0.077964,0.922036
6,economic economic nato economic bank nato bank nato monetary bank,0.065129,0.916579,0.066342,0.933658
7,bank bank nato bank economic nato economic policy nato nato,0.083894,0.959911,0.080373,0.919627
8,policy nato policy monetary bank bank bank monetary nato bank,0.043419,0.893472,0.046344,0.953656
9,policy bank bank nato nato bank nato nato bank monetary,0.078804,0.964587,0.075527,0.924473
