In [1]:
import pandas as pd
import numpy as np

posts = pd.read_csv('Posts.csv')
tags = pd.read_csv('Tags.csv', keep_default_na=False)

In [2]:
answer_posts = posts[posts['PostTypeId'] == 2][['Id', 'OwnerUserId', 'ParentId']].reset_index(drop=True)
question_posts = posts[posts['PostTypeId'] == 1][['Id', 'Tags']].reset_index(drop=True)

len(answer_posts), len(question_posts)

(178628, 63423)

In [3]:
#create a dataframe of users and number of questions they have answered without considering multiple answers bythe same person to the same question
answerer_table = answer_posts.drop_duplicates(subset=['OwnerUserId', 'ParentId']).groupby('OwnerUserId').size().sort_values(ascending = False).reset_index(name='AnswerCount')
answerer_table

Unnamed: 0,OwnerUserId,AnswerCount
0,9113.0,2838
1,177980.0,2318
2,1204.0,2042
3,123788.0,1672
4,131624.0,1602
...,...,...
26856,61188.0,1
26857,61176.0,1
26858,61136.0,1
26859,61134.0,1


In [4]:
tags_table = tags[['Id', 'TagName', 'Count']].sort_values(ascending = False, by = 'Count').reset_index(drop=True)
tags_table

Unnamed: 0,Id,TagName,Count
0,609,design,5162
1,249,c#,4931
2,76,java,4929
3,391,design-patterns,4450
4,790,architecture,3510
...,...,...,...
1673,3389,riak,1
1674,3379,modern-ui,1
1675,4719,hl7-fhir,1
1676,3286,ceylon,1


In [5]:
#filter from answerer_table where user has answered atleast 20 questions
updated_answerer_table = answerer_table[answerer_table['AnswerCount'] >= 20].reset_index(drop=True)
updated_answerer_table

Unnamed: 0,OwnerUserId,AnswerCount
0,9113.0,2838
1,177980.0,2318
2,1204.0,2042
3,123788.0,1672
4,131624.0,1602
...,...,...
1155,6425.0,20
1156,13125.0,20
1157,26149.0,20
1158,207341.0,20


In [6]:
updated_tags_table = tags_table[tags_table['Count'] >= 20].reset_index(drop=True)
updated_tags_table

Unnamed: 0,Id,TagName,Count
0,609,design,5162
1,249,c#,4931
2,76,java,4929
3,391,design-patterns,4450
4,790,architecture,3510
...,...,...,...
969,4690,vue.js,20
970,4646,serverless,20
971,1547,copy-protection,20
972,4682,azure-devops,20


# Q2

## Preprocessing 
Reducing the size of answer_posts and mering answer_posts and questions_Posts to include tags

In [7]:
nodups_answer_posts = answer_posts.drop_duplicates(subset=['OwnerUserId', 'ParentId'])
updated_answer_posts = nodups_answer_posts[nodups_answer_posts['OwnerUserId'].isin(updated_answerer_table['OwnerUserId'])].reset_index(drop = True)
# updated_answer_posts.sort_values(by = 'OwnerUserId')
merged = pd.merge(updated_answer_posts, question_posts, left_on = 'ParentId', right_on = 'Id', how = 'inner')
merged

Unnamed: 0,Id_x,OwnerUserId,ParentId,Id_y,Tags
0,3,11.0,1.0,1,|comments|anti-patterns|
1,13,4.0,1.0,1,|comments|anti-patterns|
2,56,17.0,1.0,1,|comments|anti-patterns|
3,482,148.0,1.0,1,|comments|anti-patterns|
4,1680,552.0,1.0,1,|comments|anti-patterns|
...,...,...,...,...,...
113274,452683,27032.0,301053.0,301053,|python|python-3.x|
113275,452688,136413.0,452685.0,452685,|architecture|web-development|architectural-pa...
113276,452692,177980.0,452685.0,452685,|architecture|web-development|architectural-pa...
113277,452689,292095.0,452686.0,452686,|database-design|relational-database|nosql|


In [8]:
expert_matrix = np.zeros((len(updated_answerer_table), len(updated_tags_table)))

#sorting the tags and answerers by Id cause we want the expert matrix to be in the order of IDs
updated_tags_table = updated_tags_table.sort_values(by = 'Id')
updated_answerer_table = updated_answerer_table.sort_values(by = 'OwnerUserId')

# dictionary to map user index to user id and vice versa
index_to_user = {}
for i in range(len(updated_answerer_table)):
    index_to_user[i] = updated_answerer_table.iloc[i]['OwnerUserId']
user_to_index = {v: k for k, v in index_to_user.items()}

# dictionary to map tag index to tag name and vice versa
index_to_tag = {}
for i in range(len(updated_tags_table)):
    index_to_tag[i] = updated_tags_table.iloc[i]['TagName']
tag_to_index = {v: k for k, v in index_to_tag.items()}


for i in range(len(merged)):
    user_index = user_to_index[merged.iloc[i]['OwnerUserId']]
    tagslist = merged.iloc[i]['Tags'].split('|')
    for j in range(len(tagslist)):
        if(tagslist[j] not in tag_to_index):
            continue
        tag_index = tag_to_index[tagslist[j]]
        expert_matrix[user_index][tag_index] += 1

In [9]:
for i in range(len(expert_matrix)):
    for j in range(len(expert_matrix[i])):
        if(expert_matrix[i][j] == 0):
            expert_matrix[i][j] = np.nan

expert_matrix

array([[13., nan,  6., ..., nan,  1., nan],
       [nan, nan,  8., ..., nan, nan, nan],
       [ 1., nan,  1., ..., nan, nan, nan],
       ...,
       [nan, nan, nan, ..., nan, nan, nan],
       [ 1., nan, nan, ..., nan,  5.,  1.],
       [nan, nan, nan, ..., nan, nan, nan]])

In [10]:
expert_matrix.shape

(1160, 974)

## Q3

In [11]:
utility_matrix = np.zeros(expert_matrix.shape)
for i in range(len(updated_answerer_table)):
    for j in range(len(updated_tags_table)):
        if expert_matrix[i][j] < 15:
            utility_matrix[i][j] = expert_matrix[i][j]//3
        elif expert_matrix[i][j] >= 15:
            utility_matrix[i][j] = 5
        else:
            utility_matrix[i][j] = np.nan
            
utility_matrix

array([[ 4., nan,  2., ..., nan,  0., nan],
       [nan, nan,  2., ..., nan, nan, nan],
       [ 0., nan,  0., ..., nan, nan, nan],
       ...,
       [nan, nan, nan, ..., nan, nan, nan],
       [ 0., nan, nan, ..., nan,  1.,  0.],
       [nan, nan, nan, ..., nan, nan, nan]])

In [12]:
#sum of all entries in the utility matrix
np.nansum(utility_matrix)

41228.0

In [13]:
# highest row sum
print(np.max(np.nansum(utility_matrix, axis = 1)))
# print(np.sum(utility_matrix, axis = 0).shape)

# highest column sum
print(np.max(np.nansum(utility_matrix, axis = 0)))
# print(np.sum(utility_matrix, axis = 1).shape)

1164.0
1403.0


In [14]:
# create a test matrix of size smaller than the utility matrix from the bottom 15% of the rows and bottom 15% of the columns of the utility matrix
rows = int(0.15*utility_matrix.shape[0])
cols = int(0.15*utility_matrix.shape[1])
test_matrix = utility_matrix[-rows:, -cols:]
test_matrix.shape

(174, 146)

In [15]:
np.nansum(test_matrix)

642.0

In [16]:
train_matrix = utility_matrix.copy()
train_matrix[-rows:, -cols:] = np.nan
np.nansum(train_matrix)

40586.0

In [17]:
rating_matrix = train_matrix.copy()

## Q4

In [18]:
x = len(train_matrix) - int(0.15*len(train_matrix))
y = len(train_matrix[0]) - int(0.15*len(train_matrix[0]))

def RMSE(utility_matrix ,N):

    #getting the test matrix
    rows = int(0.15*utility_matrix.shape[0])
    cols = int(0.15*utility_matrix.shape[1])
    test_matrix = utility_matrix[-rows:, -cols:]

    #getting the train matrix
    train_matrix = utility_matrix.copy()
    train_matrix[-rows:, -cols:] = np.nan

    #calculating the centered train matrix
    rating_matrix = train_matrix.copy()

    x = len(train_matrix) - int(0.15*len(train_matrix))
    y = len(train_matrix[0]) - int(0.15*len(train_matrix[0]))

    for i in range(len(train_matrix)):
        train_matrix[i] -= np.nanmean(train_matrix[i])
    similarity_matrix = np.zeros((len(train_matrix), len(train_matrix)))

    for i in range(len(train_matrix)):   
        for j in range(len(train_matrix)):
            if(i == j):
                similarity_matrix[i][j] = 1
            else:
                temp1 = train_matrix[i]
                temp2 = train_matrix[j]
                indices = np.where(~np.isnan(temp1) & ~np.isnan(temp2))
                if(len(indices[0]) == 0):
                    similarity_matrix[i][j] = np.nan
                else:
                    temp1 = temp1[indices]
                    temp2 = temp2[indices]
                    if(np.array_equal(temp1, temp2)):
                        similarity_matrix[i][j] = 1
                    elif(np.dot(temp1, temp2) == 0):
                        similarity_matrix[i][j] = 0
                    else:
                        similarity_matrix[i][j] = np.dot(temp1, temp2)/(np.linalg.norm(temp1)*np.linalg.norm(temp2))

    predicted_matrix_just_mean = np.zeros((len(test_matrix), len(test_matrix[0])))
    predicted_matrix_weighted_mean = np.zeros((len(test_matrix), len(test_matrix[0])))

    for i in range(x, len(train_matrix)):
        tempsimil = similarity_matrix[i]
        indices = np.argsort(tempsimil)[::-1]
        indices = indices[~np.isnan(tempsimil[indices])]
        for j in range(y, len(train_matrix[0])):
            temp = []
            for k in range(len(indices)):
                if(~np.isnan(rating_matrix[indices[k]][j])):
                    temp.append(indices[k])
                    if(len(temp) == N):
                        break
            justmean = 0
            for k in range(len(temp)):
                justmean += rating_matrix[temp[k]][j]
            justmean /= N
            predicted_matrix_just_mean[i-x][j-y] = justmean
            weightedmean = 0
            similarity_sum = 0
            for k in temp:
                weightedmean += rating_matrix[k][j]*similarity_matrix[i][k]
                similarity_sum += similarity_matrix[i][k]
            if(similarity_sum != 0):
                weightedmean /= similarity_sum
                predicted_matrix_weighted_mean[i-x][j-y] = weightedmean

    rmse_just_mean = 0
    rmse_weighted_mean = 0
    count = 0
    for i in range(len(test_matrix)):
        for j in range(len(test_matrix[0])):
            if(~np.isnan(test_matrix[i][j])):
                rmse_just_mean += (predicted_matrix_just_mean[i][j] - test_matrix[i][j])**2
                rmse_weighted_mean += (predicted_matrix_weighted_mean[i][j] - test_matrix[i][j])**2
                count += 1
    rmse_just_mean = (rmse_just_mean/count)**0.5
    rmse_weighted_mean = (rmse_weighted_mean/count)**0.5
    return rmse_just_mean, rmse_weighted_mean
    

In [19]:
#  User User
print(RMSE(utility_matrix, 2))
print(RMSE(utility_matrix, 3))
print(RMSE(utility_matrix, 5))

(0.7081306039820847, 0.705534035007272)
(0.6960745221792036, 0.6928177676218576)
(0.6828384805708951, 0.6801368028131318)


In [20]:
#  Tag Tag
print(RMSE(utility_matrix.transpose(), 2))
print(RMSE(utility_matrix.transpose(), 3))
print(RMSE(utility_matrix.transpose(), 5))

(0.8191314735326606, 0.8157283432373043)
(0.7974671351684951, 0.7933978589640686)
(0.776344418008056, 0.7715583149790094)


## Q5

In [21]:
# not_nan_utility_matrix = utility_matrix.copy()
# not_nan_utility_matrix[np.isnan(utility_matrix)] = 0
# not_nan_utility_matrix

In [22]:
from tqdm import tqdm

In [23]:
def matrix_factor(util_mat, K, lambda1, lambda2):
    #Initialize Q with guassian distribution of size (util_mat.shape[0], K)
    Q = np.random.randn(util_mat.shape[0], K)
    Q =  (Q + 1)/2
    P = np.random.randn(util_mat.shape[1], K)
    P =  (P + 1)/2
    learning_rate = 0.0005
    epochs = 35
    for epoch in tqdm(range(epochs)):

        for i in range(len(Q)):           
            for x in range(len(P)):
                if np.isnan(util_mat[i][x]):
                    continue
                E = 2*(util_mat[i][x] - np.dot(Q[i], P[x]))
                Q[i] = Q[i] + learning_rate*(E*P[x] - lambda1*Q[i])
                P[x] = P[x] + learning_rate*(E*Q[i] - lambda2*P[x])
    
    return Q, P

In [24]:
train_matrix = utility_matrix.copy()
rows = int(0.15*train_matrix.shape[0])
cols = int(0.15*train_matrix.shape[1])
train_matrix[-rows:, -cols:] = np.nan

test_matrix = utility_matrix[-rows:, -cols:]
not_nan_test_matrix = test_matrix.copy()
not_nan_test_matrix[np.isnan(test_matrix)] = 0

In [25]:
#number of entries that are not nan in the test matrix

rmse_count = test_matrix.shape[0]*test_matrix.shape[1] - np.isnan(test_matrix).sum()  

In [26]:
# set of ks to try
ks = [2, 5, 10]
#set of lambda1 and lambda2 to try
lambdas = [(0,0), (0.001, 0.003), (0.05, 0.05), (0.5, 0.75)]

for k in ks:
    for (lambda1, lambda2) in lambdas:
        print (k, lambda1, lambda2, " :")
        Q, P = matrix_factor(train_matrix, k, lambda1, lambda2)
        R = (Q @ P.T)[-rows:, -cols:]
        R[np.isnan(test_matrix)] = 0
        final_loss = np.linalg.norm(not_nan_test_matrix - R)/np.sqrt(rmse_count)
        print(final_loss)
        print()
        

2 0 0  :


  0%|          | 0/35 [00:00<?, ?it/s]

100%|██████████| 35/35 [01:07<00:00,  1.94s/it]


0.7620493745429467

2 0.001 0.003  :


100%|██████████| 35/35 [01:12<00:00,  2.07s/it]


0.7530203026584174

2 0.05 0.05  :


100%|██████████| 35/35 [01:11<00:00,  2.04s/it]


0.7324274721987428

2 0.5 0.75  :


100%|██████████| 35/35 [01:11<00:00,  2.05s/it]


0.7677135681106083

5 0 0  :


100%|██████████| 35/35 [01:14<00:00,  2.13s/it]


0.9479347289069318

5 0.001 0.003  :


100%|██████████| 35/35 [01:10<00:00,  2.03s/it]


0.910045487558364

5 0.05 0.05  :


100%|██████████| 35/35 [01:12<00:00,  2.08s/it]


0.8851432622541018

5 0.5 0.75  :


100%|██████████| 35/35 [01:12<00:00,  2.06s/it]


0.8281942109182915

10 0 0  :


100%|██████████| 35/35 [01:14<00:00,  2.13s/it]


1.139593008533728

10 0.001 0.003  :


100%|██████████| 35/35 [01:11<00:00,  2.04s/it]


1.1194375047386036

10 0.05 0.05  :


100%|██████████| 35/35 [01:11<00:00,  2.04s/it]


1.0908074364864637

10 0.5 0.75  :


100%|██████████| 35/35 [01:13<00:00,  2.10s/it]

0.909536198323997






## Q6

In [27]:
from surprise import Dataset, Reader, KNNBaseline
from surprise.model_selection import train_test_split
from surprise import accuracy


train_data = pd.DataFrame(rating_matrix)
test_data = pd.DataFrame(test_matrix)

train_data = train_data.stack().reset_index()
train_data.columns = ['user', 'item', 'rating']
reader = Reader(rating_scale=(train_data['rating'].min(), train_data['rating'].max()))
train_dataset = Dataset.load_from_df(train_data[['user', 'item', 'rating']], reader)
trainset = train_dataset.build_full_trainset()

test_data = test_data.stack().reset_index()
test_data.columns = ['user', 'item', 'rating']
test_dataset = Dataset.load_from_df(test_data[['user', 'item', 'rating']], reader)
testset = test_dataset.build_full_trainset().build_testset()

In [28]:
algo = KNNBaseline(k=2, min_k= 1, sim_options={'name' : 'pearson', 'user_based' : True}, verbose = False)
algo.fit(trainset)
predictions = algo.test(testset)
rmse = accuracy.rmse(predictions)
# print(f'RMSE: {rmse}')

algo = KNNBaseline(k=3, min_k= 1, sim_options={'name' : 'pearson', 'user_based' : True}, verbose = False)
algo.fit(trainset)
predictions = algo.test(testset)
rmse = accuracy.rmse(predictions)
# print(f'RMSE: {rmse}')

algo = KNNBaseline(k=5, min_k= 1, sim_options={'name' : 'pearson', 'user_based' : True}, verbose = False)
algo.fit(trainset)
predictions = algo.test(testset)
rmse = accuracy.rmse(predictions)
# print(f'RMSE: {rmse}')

RMSE: 0.9631
RMSE: 0.9392
RMSE: 0.9260


In [29]:
algo = KNNBaseline(k=2, min_k= 1, sim_options={'name' : 'pearson', 'user_based' : False}, verbose = False)
algo.fit(trainset)
predictions = algo.test(testset)
rmse = accuracy.rmse(predictions)
# print(f'RMSE: {rmse}')

algo = KNNBaseline(k=3, min_k= 1, sim_options={'name' : 'pearson', 'user_based' : False}, verbose = False)
algo.fit(trainset)
predictions = algo.test(testset)
rmse = accuracy.rmse(predictions)
# print(f'RMSE: {rmse}')

algo = KNNBaseline(k=5, min_k= 1, sim_options={'name' : 'pearson', 'user_based' : False}, verbose = False)
algo.fit(trainset)
predictions = algo.test(testset)
rmse = accuracy.rmse(predictions)
# print(f'RMSE: {rmse}')


RMSE: 0.9462
RMSE: 0.9418
RMSE: 0.9167


In [30]:
from surprise import SVD
from surprise.model_selection import GridSearchCV

Ks = [2, 5, 10]
for k in Ks:
    param = {"n_epochs": [5, 10, 25], "lr_all": [0.0001, 0.001, 0.01], "reg_all": [ 0.001, 0.01, 0.1, 1], "n_factors": [k]}
    gs = GridSearchCV(SVD, param, measures=["rmse"])
    gs.fit(train_dataset)
    
    algo = gs.best_estimator["rmse"]
    algo = algo.fit(trainset)
    rmse = accuracy.rmse(algo.test(testset))

RMSE: 0.8815
RMSE: 0.9172
RMSE: 0.9194
