In [None]:
import pandas as pd
import os
from pathlib import Path
import sys
import pandas as pd
from datetime import datetime
import numpy as np
import csv

In [None]:
# Project configuration: derive the project root from the notebook's working
# directory, pick the dataset folder for the current platform, and make the
# project and its ``utils`` folder importable.
project_name = 'clpsych'
project_path = Path(os.getcwd()).parent

if sys.platform == "win32":
    # Raw string: '\D', '\{' and '\d' are not valid escape sequences, so the
    # non-raw literal only worked by accident and triggers an invalid-escape
    # warning on recent Python versions. The resulting path is unchanged.
    data_path = r'D:\Dataset\{0}\dataset'.format(project_name)
elif sys.platform == 'darwin':
    data_path = '/Volumes/Dataset/{0}/dataset'.format(project_name)
else:
    data_path = Path(project_path, 'dataset')

utils_path = str(Path(project_path, 'utils'))
# including the project folder and the utils folder
# Each entry is checked against sys.path itself. The previous substring test
# on ''.join(sys.path) could match a different entry (e.g. '.../utils_old')
# or a span across two neighbouring entries and silently skip registration.
for extra_path in (str(project_path), utils_path):
    if extra_path not in sys.path:
        sys.path.append(extra_path)

print('project path = {0}'.format(project_path))
print('data path = {0}'.format(data_path))
print('')
print('sys.path = {0}'.format(sys.path))

In [None]:
# Sanity check: show the entries available in the dataset folder.
os.listdir(path=data_path)

In [None]:
# Load the table read from 'post_user_risk.csv'; its post id, user id and
# risk label columns are used by the joins further down.
filename = 'post_user_risk.csv'
users = pd.read_csv(Path(data_path).joinpath(filename))
print(users.shape)
users.head(n=5)

In [None]:
# Load 'risk_tbs.csv'. The file is read without a header row, so the column
# names are supplied here: a post id followed by five sentiment columns.
filename = 'risk_tbs.csv'
sentiments = pd.read_csv(
    Path(data_path).joinpath(filename),
    header=None,
    names=['post_id', 'sent_1', 'sent_2', 'sent_3', 'sent_4', 'sent_5'],
)
# Cast the ids to int; post_id is the join key used in the merges below.
sentiments['post_id'] = sentiments['post_id'].astype(int)
print(sentiments.shape)
sentiments.head(n=5)

In [None]:
# Load 'risk_tbs_num.csv'. The file is read without a header row; the two
# columns are named post_id and num_sentences.
filename = 'risk_tbs_num.csv'
sentence_number = pd.read_csv(
    Path(data_path).joinpath(filename),
    header=None,
    names=['post_id', 'num_sentences'],
)
print(sentence_number.shape)
sentence_number.head(n=5)

In [None]:
# Disabled legacy approach, kept for reference only: for every sentiment row it
# used the row's post_id as a row label into `users` and copied user_id,
# risk_label and post_id over one row at a time.
# NOTE(review): this appears to be superseded by the pd.merge(...) join in the
# following cell -- confirm, then delete this cell.
# n_dataset = sentiments.copy()
# n_dataset['user_id'] = 0
# n_dataset['risk_label'] = 0
# n_dataset['post_id'] = 0
# for sent_ele_ix, sent_ele in sentiments.iterrows():
#     current_user = users.loc[int(sent_ele['post_id'])]
#     user_id = current_user['user_id']
#     n_dataset.loc[sent_ele_ix, 'user_id'] = user_id
#     n_dataset.loc[sent_ele_ix, 'risk_label'] = current_user['risk_label']
#     n_dataset.loc[sent_ele_ix, 'post_id'] = current_user['post_id']
# n_dataset.head()

In [None]:
# Step 1: attach the sentence count to each post's sentiment totals.
sntiments_num = sentiments.merge(sentence_number, on='post_id', how='inner')

# Step 2: join with the user table. The sentiment table's post_id is matched
# against the *row index* of `users` (left_index=True), not against the
# post_id column of `users`; both post_id columns survive with _x/_y suffixes.
dataset = users.merge(
    sntiments_num,
    left_index=True,
    right_on='post_id',
    how='inner',
)
dataset.head(n=5)

In [None]:
# Keep only the columns needed downstream and give the post id coming from
# `users` (suffixed '_x' by the merge) its plain name back.
dataset = (
    dataset[['post_id_x', 'sent_1', 'sent_2', 'sent_3', 'sent_4', 'sent_5',
             'user_id', 'risk_label', 'num_sentences']]
    .rename(columns={'post_id_x': 'post_id'})
)
dataset.head(n=5)

In [None]:
# Drop posts without a risk label; the shape is printed before and after so
# the number of removed rows is visible.
print(dataset.shape)
dataset = dataset.loc[dataset['risk_label'].notna()]
print(dataset.shape)

In [None]:
# Persist the merged, labelled per-post table.
# NOTE(review): a later cell writes the normalised per-post sentiments to the
# same file name ('sentiment_per_post.csv') and overwrites this output --
# confirm which of the two files is meant to be kept.
dataset.to_csv(Path(data_path).joinpath('sentiment_per_post.csv'), index=False)

Individual post-level sentiment vector: the sum of the sentiment vectors of all sentences in a post, divided by the total number of sentences in that post (this has already been computed for each post).

In [None]:
# Build one record per post:
# [post_id, user_id, risk_label, five normalised sentiment components].
# Loop variable names are kept as-is because, in a notebook, they stay in the
# kernel namespace after the cell has run.
averaged_sentiments_list_per_post = []

for row_ix, row in dataset.iterrows():
    # Per-sentence average of the post's sentiment totals.
    relevant_dist = row[['sent_1', 'sent_2', 'sent_3', 'sent_4', 'sent_5']] / row['num_sentences']
    # Rescale so that the five components sum to 1.
    sentiment_dist = relevant_dist / relevant_dist.sum()
    # Identifier columns first, then the distribution. np.array(...) coerces
    # the mixed record to a single dtype before it is stored.
    new_row = list(row[['post_id', 'user_id', 'risk_label']]) + list(sentiment_dist.values)
    averaged_sentiments_list_per_post.append(np.array(new_row))

In [None]:
# Write the per-post records to CSV: one header row, then one line per post.
file_obj = Path(data_path).joinpath('sentiment_per_post.csv')
header = ['post_id', 'user_id', 'risk_label', 'sent_1','sent_2',
          'sent_3','sent_4','sent_5']
with open(file_obj, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile, delimiter=',')
    writer.writerow(header)
    writer.writerows(averaged_sentiments_list_per_post)

In [None]:
# Read the file back and preview the first rows to verify the export.
pd.read_csv(file_obj).head(n=5)

- All sentence-level average (micro avg.): the sum of all sentiment vectors (1×5) across all posts of a user, divided by the total number of sentences across all posts of that user ($N_{i,s}$).

In [None]:
# Micro average per user: pool the sentiment totals of all of the user's posts
# and divide by the user's total number of sentences, then rescale so the five
# components sum to 1.
# Each record is [user_id, 5 sentiment components, risk_label].
averaged_user_sentiments_list = list()

#iterate over user ids
for user in dataset['user_id'].unique():
    #get posts for current user
    user_posts = dataset[dataset['user_id'] == user]
    total_sentences = user_posts['num_sentences'].sum()
    # All sentence-level average (micro ave.)
    relevant_dist = user_posts[['sent_1','sent_2','sent_3','sent_4','sent_5']].sum() / total_sentences
    # Normalise to a distribution.
    # BUG FIX: this used to be assigned to `sentiment_dis` while the record
    # below read `sentiment_dist`, i.e. the stale value left behind by the
    # per-post loop, so every user received the last post's distribution.
    sentiment_dist = relevant_dist / relevant_dist.sum()
    # store
    user_sentiments_list = [user]
    user_sentiments_list.extend(list(sentiment_dist.values))
    # The first label seen for the user is used as the user's label.
    user_sentiments_list.append(user_posts['risk_label'].unique()[0])
    averaged_user_sentiments_list.append(np.array(user_sentiments_list))

In [None]:
# Write the per-user micro-averaged records to CSV: header row, then one line
# per user.
file_obj = Path(data_path).joinpath('sentiment_per_user_micro.csv')
header = ['user_id', 'sent_1_micro','sent_2_micro','sent_3_micro','sent_4_micro','sent_5_micro',
          'risk_label']
with open(file_obj, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile, delimiter=',')
    writer.writerow(header)
    writer.writerows(averaged_user_sentiments_list)

In [None]:
# Read the file back and preview the first rows to verify the export.
pd.read_csv(file_obj).head(n=5)

- All post-level average (macro avg.): the sum of the individual post-level sentiment vectors over all posts of a user, divided by the total number of posts of that user ($N_{i,p}$).

In [None]:
# Macro average per user: average the *individual post-level* sentiment
# vectors over the user's posts, so every post carries the same weight
# regardless of how many sentences it has.
# Each record is [user_id, 5 sentiment components, risk_label].
#
# FIX: the previous version divided the raw sentiment totals by the number of
# posts and then normalised. After normalisation that is algebraically the
# same as the micro average (both reduce to column sums / grand total), so
# the macro file carried no post-level information.
averaged_user_sentiments_list = list()

#iterate over user ids
for user in dataset['user_id'].unique():
    user_sentiments_list = list()
    #get posts for current user
    user_posts = dataset[dataset['user_id'] == user]
    total_post = user_posts.shape[0]
    # Individual post-level vectors, built like the per-post export:
    # totals / number of sentences in the post, rescaled to sum to 1 per post.
    post_level = user_posts[['sent_1','sent_2','sent_3','sent_4','sent_5']].divide(
        user_posts['num_sentences'], axis=0)
    post_level = post_level.divide(post_level.sum(axis=1), axis=0)
    # All post-level average (macro ave.): each post contributes 1 / total_post.
    relevant_dist_macro = post_level / total_post
    # add up the contributions of all posts (NaN rows, e.g. from posts with
    # zero sentences, are skipped by sum())
    dist_sum_macro = relevant_dist_macro.sum()
    # rescale so the five components sum to 1
    sentiment_dist_macro = dist_sum_macro / dist_sum_macro.sum()
    # store
    user_sentiments_list.append(user)
    user_sentiments_list.extend(list(sentiment_dist_macro.values))
    # The first label seen for the user is used as the user's label.
    user_sentiments_list.append(user_posts['risk_label'].unique()[0])
    averaged_user_sentiments_list.append(np.array(user_sentiments_list))

In [None]:
# Write the per-user macro-averaged records to CSV: header row, then one line
# per user.
file_obj = Path(data_path).joinpath('sentiment_per_user_macro.csv')
header = ['user_id', 'sent_1_macro','sent_2_macro','sent_3_macro','sent_4_macro','sent_5_macro',
          'risk_label']
with open(file_obj, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile, delimiter=',')
    writer.writerow(header)
    writer.writerows(averaged_user_sentiments_list)

In [None]:
# Read the file back and preview the first rows to verify the export.
pd.read_csv(file_obj).head(n=5)