In [29]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [30]:
# Load the raw ratings table (tab-separated, no header row), give the
# positional columns their names, and keep a subset of them.
ratings = (
    pd.read_csv('data/rating.txt', sep='\t', header=None)
    .rename(columns=dict(enumerate([
        'object_id', 'member_id', 'rating', 'status',
        'creation', 'last_modified', 'type', 'vertical_id',
    ])))
    .loc[:, ['object_id', 'member_id', 'rating', 'status', 'type']]
)
ratings.head()

# OBJECT_ID The ID of the object being rated. The only valid objects at present are the content_id values of the member_content table, so this table currently stores only the ratings on reviews and essays.
# MEMBER_ID The ID of the member who is rating the object.
# RATING The 1-5 rating of the object by the member (1 - Not Helpful, 2 - Somewhat Helpful, 3 - Helpful, 4 - Very Helpful, 5 - Most Helpful). [There are some 6s; treat them as 5.]
# STATUS The display status of the rating: 1 means the member has chosen not to show their rating of the object; 0 means the member does not mind showing their name beside the rating.
# CREATION The date on which the member first rated this object.
# LAST_MODIFIED The latest date on which the member modified their rating of the object.
# TYPE If and when more than just content ratings are stored in this table, this column would store the type of the object being rated.
# VERTICAL_ID The vertical_id of the review.

Unnamed: 0,object_id,member_id,rating,status,type
0,139431556,591156,5,0,1
1,139431556,1312460676,5,0,1
2,139431556,204358,5,0,1
3,139431556,368725,5,0,1
4,139431556,277629,5,0,1


In [31]:
# Content table: pipe-delimited, no header row; the three positional columns
# are named content_id, author_id and subject_id.
mc = pd.read_csv('data/mc.txt.gz', sep='|', header=None).rename(
    columns=dict(enumerate(['content_id', 'author_id', 'subject_id']))
)
mc.head()


Unnamed: 0,content_id,author_id,subject_id
0,1445594,718357,149002400000.0
1,1445595,220568,149003600000.0
2,1445596,717325,5303145000.0
3,1445597,360156,192620900000.0
4,1445598,718857,149002200000.0


In [32]:
# Signed user-to-user edges: tab-separated, no header row.
# Columns are named FromId, ToId, sign and creation.
user_ratings = pd.read_csv(
    'data/user_rating.txt.gz', sep='\t', header=None
).rename(columns=dict(enumerate(['FromId', 'ToId', 'sign', 'creation'])))
user_ratings.head()

Unnamed: 0,FromId,ToId,sign,creation
0,3287060356,232085,-1,2001/01/10
1,3288305540,709420,1,2001/01/10
2,3290337156,204418,-1,2001/01/10
3,3294138244,269243,-1,2001/01/10
4,3294138244,170692484,-1,2001/01/10


In [33]:
# Signed Epinions edge list; the column names are taken from row 3 of the
# file, whose first label carries a leading "# " that is stripped by the rename.
epinions_given = (
    pd.read_csv('data/soc-sign-epinions.txt', sep='\t', header=3)
    .rename(columns={'# FromNodeId': 'FromNodeId'})
)
epinions_given.head()

Unnamed: 0,FromNodeId,ToNodeId,Sign
0,0,1,-1
1,1,128552,-1
2,2,3,1
3,4,5,-1
4,4,155,-1


In [35]:
# check the user Ids
# One output row per id column: smallest id, largest id, number of distinct ids.
print("min\tmax\tnum")
for ids in (
    epinions_given.FromNodeId,
    epinions_given.ToNodeId,
    user_ratings.FromId,
    user_ratings.ToId,
    ratings.member_id,
    mc.author_id,
):
    print(ids.min(), ids.max(), len(ids.unique()))

min	max	num
0 131827 95318
1 131826 84601
199781 84015157124 95318
199781 83988156292 84601
199775 46381502340 120492
199775 88274210691 326983


In [43]:
# Per-user aggregates, each a Series indexed by member/author id.

# Number of ratings each member has given.
posting_freq = ratings['member_id'].value_counts().sort_index()

# Sum of the status flag per member. Selecting the column from the groupby
# always yields a Series; DataFrame.squeeze() would collapse to a scalar if
# there were only one member.
status_freq = ratings.groupby('member_id')['status'].sum()

# Mean rating given by each member. The dataset notes say the few ratings of
# 6 must be treated as 5, so cap the values before averaging.
mean_rating = ratings['rating'].clip(upper=5).groupby(ratings['member_id']).mean()

# Number of content items written by each author.
articles_freq = mc['author_id'].value_counts().sort_index()

# Status sum divided by rating count, aligned on member id. With status being
# 0/1 as described in the dataset notes, this is the share of hidden ratings.
anonymity_norm = status_freq / posting_freq

In [38]:
# convert the user ids in to simpler numbers
# Scanning id_list twice per lookup (`in` followed by `.index`) is linear in
# the number of users and made this step very slow, so the position of every
# id is precomputed once in a dict and each lookup is a single hash probe.

id_list = list(pd.concat([user_ratings['FromId'], user_ratings['ToId']]).unique())
id_position = {user_id: position for position, user_id in enumerate(id_list)}

def convert_id(ind):
    """Return the position of `ind` in `id_list`, or NaN if it is not present.

    Parameters
    ----------
    ind : hashable
        A raw user id as found in `user_ratings['FromId']` / `['ToId']`.

    Returns
    -------
    int or float
        The index of the id in `id_list`, or `np.nan` for an unknown id.
    """
    return id_position.get(ind, np.nan)

In [39]:
# mapping module
def add_col(s, ind):
    """Look up label `ind` in `s`, returning NaN when the label is missing.

    Parameters
    ----------
    s : pandas.Series (or any object exposing `.loc[label]`)
        Lookup table indexed by user id.
    ind : label
        The id to look up.

    Returns
    -------
    The value stored at `ind`, or `np.nan` if `ind` is not a valid label of `s`.
    """
    try:
        return s.loc[ind]
    except (KeyError, TypeError):
        # Only lookup failures are treated as "missing". A bare `except:` would
        # also swallow KeyboardInterrupt and hide genuine bugs as NaN.
        return np.nan

In [40]:
# add new columns for each edge
#
# Every edge gets the per-user features of both of its endpoints, stored as
# '<endpoint>_<feature>'. The earlier copy-pasted version wrote four different
# ToId features into the single column 'ToId_posting_freq', so the ToId
# status/mean-rating/article features were lost and 'ToId_posting_freq' held
# article counts. Generating the names in a loop avoids that.
node_features = {
    'posting_freq': posting_freq,
    'status_freq': status_freq,
    'mean_rating': mean_rating,
    'articles_freq': articles_freq,
    'anonymity_norm': anonymity_norm,
}

for endpoint in ('FromId', 'ToId'):
    for feature_name, feature in node_features.items():
        # Series.map looks each id up in the feature's index and yields NaN
        # for ids that are absent, without a Python-level call per row.
        user_ratings[endpoint + '_' + feature_name] = user_ratings[endpoint].map(feature)

# Keep only edges where every feature is known for both endpoints.
user_ratings = user_ratings.dropna()
user_ratings.shape

In [51]:
# Save the edge table with its added feature columns to a pickle file.
user_ratings.to_pickle(path='data/preprocessed.pkl')