"""
Dependencies: numpy 1.17, pandas 0.25

userUserNetwork = W; Weighted adjacency matrix, probability user u respond to user v
treads = R
posts = P : user u, creation t, text x
documents = N
questionTendency = average; number of questions by total posts by user u in thread r for topic k
seeking (question) = S; QuestionTendency * log of 1+posts*length
disseminating (answer) = D; 1-Seeking
dictionary = X
topics = K
postTopics (theta) = [0,1]^N*K
topicWords = [0,1]^K*X
SIDR = phi; proportion of seeking by u on topic k by probability for user v responds to user u on topic k
DISR = psi; proportion of disseminating by u on topic k by probability for user u responds to user v on topic k
Benefit = B; utility obtained by user u for topic k; seeking*log of prob v to u on topic k
alpha = marginal benefit of teaching
smoothing = sigma
c_S, c_D = tightness parameters
step = lambda
t = threshold; error

Compute User-User Network
1-Smooth to ensure user responds to each post at most once
QuestionTendency = proportion of questions per topic per thread per weighted-average Q for u

Seeking and Dissemination
1:Extract forum topics: remove stopwords, urls, stem, lemmatize 
2:Infer if post is question or answer: first post and; other post or; has question mark or 5W1H or 1G
3:Compute S and D

Projected Gradient Descent
C_s = participation rate for seeking
C_d = participation rate for dissemination
alpha = learning step
"""

In [1]:
import pandas as pd
import numpy as np
import json

In [2]:
#Import EdX course discussion forum Mongo DB file
df = pd.read_json('~/Downloads/TeachersCollegeX-BDE1x-2T2015-prod.mongo', lines=True)

In [3]:
df.head()

Unnamed: 0,_id,votes,visible,abuse_flaggers,historical_abuse_flaggers,parent_ids,at_position_list,body,course_id,_type,...,created_at,parent_id,endorsement,thread_type,context,comment_count,title,commentable_id,closed,last_activity_at
0,{'$oid': '5607c2bd9714b09f27002354'},"{'up': [], 'down': [], 'up_count': 0, 'down_co...",True,[],[],[],[],Welcome :-),course-v1:TeachersCollegeX+BDE1x+2T2015,Comment,...,{'$date': '2015-09-27T10:19:41.247+0000'},,,,,,,,,
1,{'$oid': '5604598c01772b8fd70020cc'},"{'up': [], 'down': [], 'up_count': 0, 'down_co...",True,[],[],[],[],I'm still waiting on mine as well. Any updates...,course-v1:TeachersCollegeX+BDE1x+2T2015,Comment,...,{'$date': '2015-09-24T20:14:04.905+0000'},,,,,,,,,
2,{'$oid': '5600ba739714b0505d001f3a'},"{'up': [], 'down': [], 'up_count': 0, 'down_co...",True,[],[],[],[],Hi Asma. Are you living and going to school in...,course-v1:TeachersCollegeX+BDE1x+2T2015,Comment,...,{'$date': '2015-09-22T02:18:27.702+0000'},,,,,,,,,
3,{'$oid': '55e8453f9714b0800a00100e'},"{'up': [], 'down': [], 'up_count': 0, 'down_co...",True,[],[],[{'$oid': '5595ad87a9a36ea584000eb6'}],[],"I will like to know you, Emily\n",course-v1:TeachersCollegeX+BDE1x+2T2015,Comment,...,{'$date': '2015-09-03T13:03:59.547+0000'},{'$oid': '5595ad87a9a36ea584000eb6'},,,,,,,,
4,{'$oid': '55e07c1401772be15a00090e'},"{'up': [], 'down': [], 'up_count': 0, 'down_co...",True,[],[],[{'$oid': '55d713ae9714b0c98d000243'}],[],You're welcome!,course-v1:TeachersCollegeX+BDE1x+2T2015,Comment,...,{'$date': '2015-08-28T15:19:48.290+0000'},{'$oid': '55d713ae9714b0c98d000243'},,,,,,,,


In [4]:
import re

def q_a(x):
    qWords = re.compile(r'[\w\W]*(who|what|where|when|why|how|\?)[\w\W]*')
    if re.search(qWords, x):
        return 'question'
    else:
        return 'answer'
        
df['q_a'] = df['body'].apply(lambda x: q_a(x))

In [5]:
df['_id'] = df['_id'].transform(lambda x: x['$oid'])

In [6]:
df['parent_id'] = df['parent_id'].transform(lambda x: x['$oid'] if type(x) == dict else np.nan)

In [7]:
df.set_index('_id', inplace=True)
df['parent_author_id'] = pd.Series(df['parent_id'].apply(lambda x: df.loc[x, 'author_id'] if type(x) == str else 0), dtype='Int64')
df.reset_index(inplace=True)

In [8]:
# _type : either CommentThread (i.e., initiation) or Comment (i.e., reply)
InitiationsReplies = pd.crosstab(df['author_id'], df['q_a'])
InitiationsReplies.head()

q_a,answer,question
author_id,Unnamed: 1_level_1,Unnamed: 2_level_1
3867,0,1
9421,1,0
11848,2,0
42003,4,0
47161,1,0


In [9]:
# https://github.com/susanli2016/NLP-with-Python/blob/master/LDA_news_headlines.ipynb
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import nltk
#nltk.download('wordnet')

In [10]:
stemmer = SnowballStemmer('english')
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [11]:
print(WordNetLemmatizer().lemmatize(gensim.utils.simple_preprocess(df['body'][1])[7], pos='v'))

update


In [12]:
gensim.utils.simple_preprocess(df['body'][1])

['still',
 'waiting',
 'on',
 'mine',
 'as',
 'well',
 'any',
 'updates',
 'thank',
 'you']

In [13]:
processed_docs = df['body'].map(preprocess)

In [14]:
processed_docs[:10]

0                                             [welcom]
1                                 [wait, updat, thank]
2             [asma, live, go, school, tunisia, right]
3                                  [like, know, emili]
4                                             [welcom]
5    [intend, cours, archiv, avail, soon, http, col...
6                       [alright, clear, thank, repli]
7    [sorri, hear, weren, abl, particip, bazaar, as...
8    [option, theaker, gephi, note, visual, graph, ...
9    [coordin, centroid, cluster, cluster, center, ...
Name: body, dtype: object

In [15]:
dictionary = gensim.corpora.Dictionary(processed_docs)

In [16]:
#dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=100000)

In [17]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

In [18]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=2)

In [19]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.027*"data" + 0.024*"cours" + 0.013*"learn" + 0.012*"time" + 0.011*"work" + 0.011*"educ" + 0.010*"student" + 0.008*"interest" + 0.008*"thank" + 0.008*"answer"
Topic: 1 
Words: 0.022*"data" + 0.014*"thank" + 0.012*"problem" + 0.012*"video" + 0.011*"student" + 0.010*"work" + 0.010*"help" + 0.009*"educ" + 0.008*"model" + 0.006*"sorri"
Topic: 2 
Words: 0.032*"data" + 0.020*"work" + 0.013*"student" + 0.013*"educ" + 0.010*"cours" + 0.009*"learn" + 0.008*"thank" + 0.008*"look" + 0.007*"observ" + 0.007*"like"
Topic: 3 
Words: 0.014*"class" + 0.013*"model" + 0.013*"data" + 0.010*"glad" + 0.010*"welcom" + 0.009*"assign" + 0.009*"cours" + 0.009*"work" + 0.009*"rapidmin" + 0.008*"week"
Topic: 4 
Words: 0.027*"thank" + 0.013*"assign" + 0.013*"answer" + 0.012*"correct" + 0.012*"work" + 0.011*"know" + 0.008*"file" + 0.007*"question" + 0.006*"number" + 0.006*"count"
Topic: 5 
Words: 0.074*"welcom" + 0.072*"class" + 0.070*"glad" + 0.023*"cours" + 0.009*"think" + 0.008*"file" + 0.008*"

In [20]:
topicScores = [lda_model.get_document_topics(x) for x in bow_corpus]

In [21]:
topics = pd.DataFrame([max(x, key=lambda y: y[1]) for x in topicScores])

In [22]:
df.insert(2, 'topics', topics[0])
#df.drop('topics', 1)
#topics.shape

In [23]:
df.head()

Unnamed: 0,_id,votes,topics,visible,abuse_flaggers,historical_abuse_flaggers,parent_ids,at_position_list,body,course_id,...,endorsement,thread_type,context,comment_count,title,commentable_id,closed,last_activity_at,q_a,parent_author_id
0,5607c2bd9714b09f27002354,"{'up': [], 'down': [], 'up_count': 0, 'down_co...",5,True,[],[],[],[],Welcome :-),course-v1:TeachersCollegeX+BDE1x+2T2015,...,,,,,,,,,answer,0
1,5604598c01772b8fd70020cc,"{'up': [], 'down': [], 'up_count': 0, 'down_co...",8,True,[],[],[],[],I'm still waiting on mine as well. Any updates...,course-v1:TeachersCollegeX+BDE1x+2T2015,...,,,,,,,,,question,0
2,5600ba739714b0505d001f3a,"{'up': [], 'down': [], 'up_count': 0, 'down_co...",8,True,[],[],[],[],Hi Asma. Are you living and going to school in...,course-v1:TeachersCollegeX+BDE1x+2T2015,...,,,,,,,,,question,0
3,55e8453f9714b0800a00100e,"{'up': [], 'down': [], 'up_count': 0, 'down_co...",4,True,[],[],[{'$oid': '5595ad87a9a36ea584000eb6'}],[],"I will like to know you, Emily\n",course-v1:TeachersCollegeX+BDE1x+2T2015,...,,,,,,,,,answer,42003
4,55e07c1401772be15a00090e,"{'up': [], 'down': [], 'up_count': 0, 'down_co...",5,True,[],[],[{'$oid': '55d713ae9714b0c98d000243'}],[],You're welcome!,course-v1:TeachersCollegeX+BDE1x+2T2015,...,,,,,,,,,answer,5542424


In [24]:
df.loc[11,'parent_id']

nan

In [25]:
"""Create the adjacency matrix using _id for index and column labels"""
userNetwork = pd.crosstab(df.author_id,df.parent_author_id)
idx = userNetwork.columns.union(userNetwork.index)
userNetwork = userNetwork.reindex(index=idx, columns=idx, fill_value=0)

In [26]:
userNetwork = pd.DataFrame(userNetwork.sort_index(axis=0).sort_index(axis=1))

In [27]:
totalResponses = userNetwork.sum(axis=1)
totalPosts = userNetwork.sum(axis=0).transpose()

In [28]:
userNetwork.head()

Unnamed: 0,0,3867,9421,11848,42003,47161,48360,57083,61457,76537,...,7802070,7819650,7827486,7838886,7898482,7927158,7934102,7979534,8150422,8317639
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3867,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9421,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11848,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
42003,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [29]:
w_UserNetwork = userNetwork*totalResponses/totalPosts

In [30]:
w_UserNetwork.mean(0).head()

0        0.000000
3867          NaN
9421          NaN
11848         NaN
42003    0.007692
dtype: float64

In [31]:
userTopics = pd.crosstab(df['author_id'], [df['q_a'], df['topics']])

In [32]:
userTopics.head()

q_a,answer,answer,answer,answer,answer,answer,answer,answer,answer,answer,question,question,question,question,question,question,question,question,question,question
topics,0,1,2,3,4,5,6,7,8,9,0,1,2,3,4,5,6,7,8,9
author_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2
3867,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
9421,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
11848,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
42003,0,0,0,0,1,1,0,0,2,0,0,0,0,0,0,0,0,0,0,0
47161,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [33]:
postTopics = userTopics.sum(0)/userTopics.sum(0).sum()

In [34]:
postTopics

q_a       topics
answer    0         0.077072
          1         0.056714
          2         0.058652
          3         0.037809
          4         0.047504
          5         0.157538
          6         0.035385
          7         0.038778
          8         0.068347
          9         0.039263
question  0         0.064954
          1         0.033446
          2         0.037324
          3         0.031508
          4         0.041687
          5         0.028599
          6         0.024721
          7         0.033446
          8         0.036840
          9         0.050412
dtype: float64

In [35]:
postingTendency = userTopics['question']*postTopics.sum()+userTopics['answer']*postTopics.sum()
questionTendency = (userTopics['question']*postTopics.sum())/postingTendency.sum()

In [36]:
questionTendency.head()

topics,0,1,2,3,4,5,6,7,8,9
author_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
3867,0.003413,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9421,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11848,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
42003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
47161,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [37]:
#userTopics.columns = userTopics.columns.droplevel(0)
disseminating = 1-questionTendency * np.log(1+postingTendency)
seeking = questionTendency * np.log(1+postingTendency)

In [38]:
seeking.head()

topics,0,1,2,3,4,5,6,7,8,9
author_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
3867,0.002366,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9421,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11848,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
42003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
47161,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [39]:
w_UserNetwork.drop(0, axis=0, inplace=True)
w_UserNetwork.drop(0, axis=1, inplace=True)
w_UserNetwork.shape

(519, 519)

In [40]:
seeking.loc[3867].sum()*disseminating.loc[9421].sum()

0.02365690036040769

In [72]:
threshold = .01
alpha = .4
c_S = 1.25
c_D = 0.75
step = 0.1
rho = 1
N = w_UserNetwork.shape[0]
z1 = lambda1 = np.zeros(seeking.T.shape).astype('float64')
z2 = lambda2 = np.zeros(disseminating.shape).astype('float64')
W = W_hat = W_obs = w_UserNetwork.fillna(0).astype('float64').to_numpy()
W_opt = W_prime = np.zeros(W.shape)
D = disseminating.fillna(0).astype('float64').to_numpy()
S = seeking.fillna(0).astype('float64').to_numpy()

def SIDR(s,d,w):
    phi = np.zeros(s.shape)
    for u in range(0, s.shape[0]):
        for k in range(0, s.shape[1]):
            phi[u,k] = s[u,k]/1+(w[:,u].sum()*d[:,k].sum())
    return phi

def DISR(s,d,w):
    psi = np.zeros(d.shape)
    for u in range(0, d.shape[0]):
        for k in range(0, d.shape[1]):
            psi[u,k] = d[u,k]/1+(w[u,:].sum()*s[:,k].sum())
    return psi
    
#SIDR.replace(to_replace=0,value=1)
#DISR.replace(to_replace=0,value=1)
P = (S/(1+(c_S*SIDR(S,D,W)))).T
Q = (D/(1+(c_D*DISR(S,D,W))))

In [73]:
np.seterr(all="raise")

def benefit(X):
    b = np.zeros(D.shape)
    for u in range(0, D.shape[0]):
        for k in range(0, D.shape[1]):
            b[u,k] = (S[u,k]*np.log(1+X[:,u].sum()*D[:,k].sum()) + alpha*D[u,k]*np.log(1+X[u,:].sum()*S[:,k].sum()))
    return b

def proj(X):
    """Projection step"""
    return np.maximum(np.minimum(np.subtract(X,np.diag(np.diag(X))),0),1)
    
def grad(X):
    """Proximal gradient step"""
    return (np.add((rho*np.dot(D,(np.add(np.subtract(np.dot(D.T,X),P),np.subtract(z1,lambda1))))),
            (rho*np.dot((np.subtract(np.subtract(np.dot(X,S),Q),np.add(z2,lambda2))),S.T))))

g_hat = benefit(W).sum().sum()
g = benefit(np.identity(W.shape[0])).sum().sum()
i = 0

while (g - g_hat)/np.abs(g_hat) >= threshold:
    i += 1
    for u in range(0, W.shape[0]):
        for v in range(0, W.shape[1]):
            W_prime[u,v] = (
                (D[u].sum()*S[v].sum()/(1+(W[:,v].sum()*D.sum().sum()))
                +alpha*D[u].sum()*S[v].sum()/(1+(W[u,:].sum()*S.sum().sum())))
                /N
            )
    W_hat = (W + step * W_prime)
    W = proj(W - grad(W_hat))
    g_hat = g
    g = benefit(W).sum().sum()
    z1 = np.maximum(np.add(np.add((-1*np.dot(D.T,W)),P),lambda1),0)
    z2 = np.maximum(np.add(np.subtract(np.dot(W,S),Q),lambda2),0)
    lambda1 -= np.subtract(np.add(np.dot(D.T,W),P),z1)
    lambda2 += np.subtract(np.subtract(np.dot(W,S),Q),z2)

print("Iterations: "+str(i))
print("Observed learning benefit: "+str(benefit(W_obs).sum().sum()/N))
print("Optimized learning benefit: "+str(benefit(W).sum().sum()/N))

Iterations: 2
Observed learning benefit: 1.4576226924368993
Optimized learning benefit: 23.3998161477623
