"""
Dependencies: numpy 1.17, pandas 0.25

userUserNetwork = W; Weighted adjacency matrix, probability user u respond to user v
treads = R
posts = P : user u, creation t, text x
documents = N
questionTendency = average; number of questions by total posts by user u in thread r for topic k
seeking (question) = S; QuestionTendency * log of 1+posts*length
disseminating (answer) = D; 1-Seeking
dictionary = X
topics = K
postTopics (theta) = [0,1]^N*K
topicWords = [0,1]^K*X
SIDR = phi; proportion of seeking by u on topic k by probability for user v responds to user u on topic k
DISR = psi; proportion of disseminating by u on topic k by probability for user u responds to user v on topic k
Benefit = B; utility obtained by user u for topic k; seeking*log of prob v to u on topic k
alpha = marginal benefit of teaching
smoothing = sigma
c_S, c_D = tightness parameters
step = lambda
t = threshold; error

Compute User-User Network
1-Smooth to ensure user responds to each post at most once
QuestionTendency = proportion of questions per topic per thread per weighted-average Q for u

Seeking and Dissemination
1:Extract forum topics: remove stopwords, urls, stem, lemmatize 
2:Infer if post is question or answer: first post and; other post or; has question mark or 5W1H or 1G
3:Compute S and D

Projected Gradient Descent
C_s = participation rate for seeking
C_d = participation rate for dissemination
alpha = learning step
"""

In [None]:
thread = pd.read_json(r'/home/davidlemay/Documents/social_learning_network_analysis/Coursera MOOCs/courses/designingcities-001/thread250-0',orient='records',typ='series',convert_dates=False)

In [None]:
thread = {}
with open ('/home/davidlemay/Documents/social_learning_network_analysis/Coursera MOOCs/courses/designingcities-001/thread25-0', 'r') as f:
    thread = json.loads(f.read())

In [None]:
json.load(open('/home/davidlemay/Documents/social_learning_network_analysis/Coursera MOOCs/courses/designingcities-001/thread25-0','r'))

In [215]:
import pandas as pd
import numpy as np
#import proxmin as px
import json, os

In [242]:
#Import Coursera course discussion forum files, concatenate threads
directory = r'/home/davidlemay/Documents/social_learning_network_analysis/Coursera MOOCs/courses/designingcities-001/'
threads = []
for f in os.listdir(directory):
    thread = json.load(open(os.path.join(directory,f),'r'))
    posts = pd.DataFrame(thread['posts'])['post_text'].dropna()
    comments = pd.DataFrame(thread['comments'])
    threads.append(posts)
    if len(comments) > 0:
        comments['post_text'] = comments['comment_text'].dropna()
        threads.append(comments)
    else:
        continue
df = pd.concat(threads,ignore_index=True,sort=False)

In [150]:
type(df.loc[10028,'post_text'])

str

In [143]:
total_posts = 0
for x in threads:
    total_posts += len(x)
print(total_posts)

10033


In [298]:
df['post_text'][2066]

'Hola Camilo!<br />Si seria interesante!.. estamos hablando para compartir puntos de vista. &nbsp;&nbsp;'

In [243]:
#pd.to_datetime(df['post_time'],unit='s')
df['new_post_time'] = df['post_time'].transform(lambda x: pd.to_datetime(x,unit='s'))

In [244]:
import re

def q_a(x):
    qWords = re.compile(r'[\w\W]*(who|what|where|when|why|how|\?)[\w\W]*')
    if re.search(qWords, x):
        return 'question'
    else:
        return 'answer'
        
df['q_a'] = df['post_text'].apply(lambda x: q_a(x) if type(x) == str else np.nan)

In [None]:
# EdX transforms
#df['_id'] = df['_id'].transform(lambda x: x['$oid'])
#df['parent_id'] = df['parent_id'].transform(lambda x: x['$oid'] if type(x) == dict else np.nan)

In [None]:
# EdX transform
#df.set_index('_id', inplace=True)
#df['parent_author_id'] = pd.Series(df['parent_id'].apply(lambda x: df.loc[x, 'author_id'] if type(x) == str else 0), dtype='Int64')
#df.reset_index(inplace=True)

In [245]:
df.sort_values(['thread_id','post_time'],inplace=True)
df['parent_id'] = df['user_id'].shift(-1)

In [None]:
#df['post_text'] = df['post_text'].transform(lambda x: x if type(str) else "")

In [246]:
# _type : either CommentThread (i.e., initiation) or Comment (i.e., reply)
InitiationsReplies = pd.crosstab(df['user_id'], df['q_a'])
#InitiationsReplies.head()

In [247]:
# https://github.com/susanli2016/NLP-with-Python/blob/master/LDA_news_headlines.ipynb
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import nltk
#nltk.download('wordnet')

In [248]:
stemmer = SnowballStemmer('english')
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [249]:
len(df['post_text'])

5242

In [None]:
#print(WordNetLemmatizer().lemmatize(gensim.utils.simple_preprocess(df['post_text'][1])[7], pos='v'))

In [None]:
#gensim.utils.simple_preprocess(df['post_text'][1])

In [250]:
processed_docs = df['post_text'].map(lambda x: preprocess(x) if type(x) == str else [])

In [251]:
processed_docs[:10]

24      [welcom, design, citi, thank, student, hope, e...
2065    [hola, laura, nombr, camilo, pineda, barranqui...
2066    [hola, camilo, seria, interesant, estamo, habl...
4952              [love, citi, beauti, movement, chicago]
4953                           [amaz, think, jane, jacob]
2067    [buena, hora, siguen, llegando, colombiano, es...
2068    [singapor, paddi, resid, origin, india, live, ...
2069    [nice, singapor, meet, point, lion, citi, smal...
2365         [friend, moscow, right, jakarta, nice, meet]
2366              [love, mumbai, februari, greet, poland]
Name: post_text, dtype: object

In [252]:
dictionary = gensim.corpora.Dictionary(processed_docs)

In [None]:
#dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=100000)

In [253]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

In [254]:
len(processed_docs)

5242

In [255]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.030*"nbsp" + 0.016*"cours" + 0.015*"citi" + 0.015*"plan" + 0.013*"http" + 0.009*"live" + 0.007*"thank" + 0.007*"href" + 0.006*"target" + 0.006*"peopl"
Topic: 1 
Words: 0.029*"nbsp" + 0.017*"citi" + 0.014*"http" + 0.010*"design" + 0.009*"assign" + 0.008*"work" + 0.008*"cours" + 0.007*"href" + 0.006*"peopl" + 0.006*"think"
Topic: 2 
Words: 0.022*"grade" + 0.020*"nbsp" + 0.019*"assign" + 0.017*"peopl" + 0.010*"sketch" + 0.009*"present" + 0.008*"plan" + 0.008*"mark" + 0.007*"review" + 0.007*"page"
Topic: 3 
Words: 0.031*"citi" + 0.028*"nbsp" + 0.009*"like" + 0.007*"thank" + 0.007*"work" + 0.007*"urban" + 0.006*"build" + 0.006*"http" + 0.006*"cours" + 0.006*"design"
Topic: 4 
Words: 0.102*"nbsp" + 0.013*"citi" + 0.009*"span" + 0.009*"https" + 0.008*"imag" + 0.007*"peopl" + 0.006*"amazonaw" + 0.006*"http" + 0.006*"coursera" + 0.006*"forum"
Topic: 5 
Words: 0.051*"nbsp" + 0.017*"assign" + 0.015*"coursera" + 0.013*"https" + 0.010*"cours" + 0.009*"citi" + 0.007*"hope" + 0.007

In [256]:
topicScores = [lda_model.get_document_topics(x) for x in bow_corpus]

In [257]:
topics = pd.DataFrame([max(x, key=lambda y: y[1]) for x in topicScores])

In [258]:
df.insert(2, 'topics', topics[0])
#df.drop('topics', 1)
#topics.shape

In [264]:
len(df.post_text)

5242

In [261]:
df.columns

Index([                   0,     '_reporter_link',             'topics',
          '_user_full_name', '_user_profile_link',        '_user_title',
         '_viewer_can_edit',   '_viewer_can_vote',          'anonymous',
             'comment_text',            'deleted',                 'id',
                  'is_spam',               'link',            'post_id',
                'post_text',          'post_time',          'text_type',
                'thread_id',         'user_agent',            'user_id',
                    'votes',      'new_post_time',                'q_a',
                'parent_id'],
      dtype='object')

In [281]:
"""Create the adjacency matrix using _id for index and column labels"""
userNetwork = pd.crosstab(df.user_id,df.parent_id)
idx = userNetwork.columns.union(userNetwork.index)
userNetwork = userNetwork.reindex(index=idx, columns=idx, fill_value=0)

In [282]:
userNetwork = pd.DataFrame(userNetwork.sort_index(axis=0).sort_index(axis=1))

In [267]:
#userNetwork.drop(0, axis=0, inplace=True)
#userNetwork.drop(0, axis=1, inplace=True)
#userNetwork.shape

(584, 584)

In [283]:
totalResponses = userNetwork.sum(axis=1)
totalPosts = userNetwork.sum(axis=0).transpose()

In [284]:
w_UserNetwork = userNetwork*totalResponses/totalPosts

In [285]:
w_UserNetwork.shape

(585, 585)

In [286]:
w_UserNetwork.mean(0).head()

0.0        0.249573
4004.0     0.001709
4249.0     0.001709
9991.0     0.006838
37799.0    0.001709
dtype: float64

In [287]:
userTopics = pd.crosstab(df['user_id'], [df['q_a'], df['topics']])

In [288]:
userTopics.head()

q_a,answer,answer,answer,answer,answer,answer,answer,answer,answer,answer,question,question,question,question,question,question,question,question,question,question
topics,0,1,2,3,4,5,6,7,8,9,0,1,2,3,4,5,6,7,8,9
user_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2
0.0,64,3,2,9,4,0,1,2,1,2,44,4,0,5,1,1,3,0,0,1
4004.0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
4249.0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
9991.0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
37799.0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0


In [289]:
postTopics = userTopics.sum(0)/userTopics.sum(0).sum()

In [290]:
postTopics

q_a       topics
answer    0         0.365063
          1         0.032427
          2         0.009414
          3         0.078975
          4         0.038703
          5         0.016213
          6         0.010460
          7         0.008368
          8         0.027197
          9         0.008891
question  0         0.262029
          1         0.020397
          2         0.008891
          3         0.044456
          4         0.020921
          5         0.009937
          6         0.010983
          7         0.003138
          8         0.017782
          9         0.005753
dtype: float64

In [291]:
postingTendency = userTopics['question']*postTopics.sum()+userTopics['answer']*postTopics.sum()
questionTendency = (userTopics['question']*postTopics.sum())/postingTendency.sum()

In [292]:
questionTendency.head()

topics,0,1,2,3,4,5,6,7,8,9
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0.0,0.036697,0.039604,0.0,0.021186,0.008772,0.02,0.073171,0.0,0.0,0.035714
4004.0,0.000834,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4249.0,0.000834,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9991.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
37799.0,0.000834,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [293]:
#userTopics.columns = userTopics.columns.droplevel(0)
disseminating = 1-questionTendency * np.log(1+postingTendency)
seeking = questionTendency * np.log(1+postingTendency)

In [294]:
seeking.head()

topics,0,1,2,3,4,5,6,7,8,9
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0.0,0.17216,0.082354,0.0,0.057374,0.015717,0.013863,0.117764,0.0,0.0,0.049511
4004.0,0.000578,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4249.0,0.000578,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9991.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
37799.0,0.000578,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [295]:
threshold = .001
alpha = .4
beta = 0.8
c_S = 1.25
c_D = 0.75
step = 0.1
rho = 1
N = w_UserNetwork.shape[0]
z1 = lambda1 = np.zeros(seeking.T.shape).astype('float64')
z2 = lambda2 = np.zeros(disseminating.shape).astype('float64')
W = W_hat = W_obs = w_UserNetwork.fillna(0).astype('float64').to_numpy()
W_opt = W_prime = np.zeros(W.shape)
D = disseminating.fillna(0).astype('float64').to_numpy()
S = seeking.fillna(0).astype('float64').to_numpy()

def SIDR(s,d,w):
    phi = np.zeros(s.shape)
    for u in range(0, s.shape[0]):
        for k in range(0, s.shape[1]):
            phi[u,k] = s[u,k]/1+(w[:,u].sum()*d[:,k].sum())
    return phi

def DISR(s,d,w):
    psi = np.zeros(d.shape)
    for u in range(0, d.shape[0]):
        for k in range(0, d.shape[1]):
            psi[u,k] = d[u,k]/1+(w[u,:].sum()*s[:,k].sum())
    return psi
    
#SIDR.replace(to_replace=0,value=1)
#DISR.replace(to_replace=0,value=1)
P = (S/(1+(c_S*SIDR(S,D,W)))).T
Q = (D/(1+(c_D*DISR(S,D,W))))

In [296]:
np.seterr(all="raise")

def benefit(X):
    b = np.zeros(D.shape)
    for u in range(0, D.shape[0]):
        for k in range(0, D.shape[1]):
            b[u,k] = (S[u,k]*np.log(1+X[:,u].sum()*D[:,k].sum()) + alpha*D[u,k]*np.log(1+X[u,:].sum()*S[:,k].sum()))
    return b

def proj(X):
    """Projection step"""
    return np.clip(np.subtract(X,np.diag(np.diag(X))),0,1)
    
def grad(X):
    """Proximal gradient step"""
    return (np.add((rho*np.dot(D,(np.add(np.subtract(np.dot(D.T,X),P),np.subtract(z1,lambda1))))),
            (rho*np.dot((np.subtract(np.subtract(np.dot(X,S),Q),np.add(z2,lambda2))),S.T))))

g = benefit(W).sum().sum()
g_hat = 1
i = 0

while (g - g_hat)/np.abs(g_hat) >= threshold:
    i += 1
    for u in range(0, W.shape[0]):
        for v in range(0, W.shape[1]):
            W_prime[u,v] = (
                (D[u].sum()*S[v].sum()/(1+(W[:,v].sum()*D.sum().sum()))
                +alpha*D[u].sum()*S[v].sum()/(1+(W[u,:].sum()*S.sum().sum())))
                /N
            )
    W_hat = (W + step * W_prime)
    W = proj(grad(W_hat))
    g_hat = g
    g = benefit(W).sum().sum()
    z1 = np.clip(np.add(np.add((-1*np.dot(D.T,W)),P),lambda1),None,0)
    z2 = np.clip(np.add(np.subtract(np.dot(W,S),Q),lambda2),None,0)
    lambda1 -= np.subtract(np.add(np.dot(D.T,W),P),z1)
    lambda2 += np.subtract(np.subtract(np.dot(W,S),Q),z2)
    step *= beta

print("Iterations: "+str(i))
print("Observed learning benefit: "+str(benefit(W_obs).sum().sum()/N))
print("Optimized learning benefit: "+str(g_hat/N))

Iterations: 2
Observed learning benefit: 2.7070190253010744
Optimized learning benefit: 22.026077479222476


In [None]:
from numpy import linalg as la

In [None]:
np.max(la.eig(W_obs)[0]).real

In [None]:
np.max(la.eig(W)[0]).real