In [3]:
import ast
import os
import pickle

import numpy as np
import pandas as pd

# Reorganize Tom's Dict Data into a DataFrame

In [None]:
#Load the three pickled dictionaries this notebook starts from.
#The files are opened with `with` so each handle is closed as soon as it is read.
#NOTE: pickle.load can execute arbitrary code - only load files from a trusted source.
with open('comment_info.p','rb') as comment_file:
    comment_info = pickle.load(comment_file)
with open('deltas_info.p','rb') as deltas_file:
    deltas_info = pickle.load(deltas_file)
with open('post_info.p','rb') as post_file:
    post_info = pickle.load(post_file)

In [None]:
#Add a depth key in Tom's original comment_info dictionary
#(depth 1 = the comment's parent is the OP itself, otherwise parent's depth + 1)
for op in comment_info.keys():
    #Depth of every comment id already visited in this thread, so that each parent
    #lookup is a single dict access instead of a rescan of all earlier comments.
    depth_by_id = {}

    for comment in comment_info[op]:
        #Drop the 3-character prefix so the parent can be compared with bare ids
        parent = comment['parent'][3:]

        if parent == op:
            comment['depth'] = 1
        elif depth_by_id.get(parent) is not None:
            comment['depth'] = depth_by_id[parent] + 1
        else:
            #The parent is not among the earlier comments of this thread (or its own
            #depth is unknown). Mark the depth as missing instead of leaving the key
            #out, which would raise a KeyError when the data is flattened later on.
            comment.setdefault('depth', None)

        #If an id occurs more than once the latest occurrence wins
        depth_by_id[comment['id']] = comment['depth']
    

In [None]:
#Reorganize data into a dictionary that will be later used as the basis for a data frame.
#(Appending to plain lists is cheaper than growing a data frame row by row.)
#Every list ends up with one entry per comment; the OP fields are repeated for each
#comment of the same thread.
data_dict = {'comment_id':[],'comment_content':[],'comment_auth':[],
             'comment_time':[], 'comment_parent':[], 'comment_depth':[],
             'op_id':[], 'op_content':[],'op_auth':[],'op_time':[]}

#(output column, key in the comment dict)
comment_fields = [('comment_id','id'), ('comment_content','text'), ('comment_auth','author'),
                  ('comment_time','time'), ('comment_parent','parent'), ('comment_depth','depth')]

for OP_id, OP_dict in post_info.items():

    OP_cont = OP_dict['text']
    OP_auth = OP_dict['author']
    OP_time = OP_dict['time']

    #A post without an entry in comment_info simply contributes no rows
    for comment in comment_info.get(OP_id, []):

        #Input Comment Data
        for column, key in comment_fields:
            data_dict[column].append(comment[key])

        #Input OP data
        data_dict['op_id'].append(OP_id)
        data_dict['op_content'].append(OP_cont)
        data_dict['op_auth'].append(OP_auth)
        data_dict['op_time'].append(OP_time)

In [None]:
#Checkpoint the column-oriented dictionary to disk; `with` closes (and flushes) the file
with open('data_dict.p','wb') as data_dict_file:
    pickle.dump(data_dict, data_dict_file)

In [None]:
###Transfer dictionary data into dataframe
#Build the frame straight from the dict of lists; `columns` fixes the column order.
data_df = pd.DataFrame(data_dict, columns=['comment_id','comment_content','comment_auth',
                                           'comment_time', 'comment_parent', 'comment_depth',
                                           'op_id', 'op_content','op_auth','op_time'])


In [None]:
#Preview the first five rows of the freshly built frame
data_df.head(n=5)

# Pre-process Non-Language Features

In [None]:
#Label: If comment received delta
#First, collect the ids of all comments that received a delta, and the ids of all
#threads (OPs) that have at least one delta recorded
delta_list = []
thread_delta_list = []

for OP in deltas_info.keys():
    thread_deltas = deltas_info[OP]

    #Record each thread once (it used to be appended once per delta)
    if len(thread_deltas) > 0:
        thread_delta_list.append(OP)

    for k in range(0,len(thread_deltas)):
        delta_list.append(thread_deltas[k][1]) #Indicates by comment

#Sets give constant-time membership tests; a list would be rescanned for every row
delta_comment_ids = set(delta_list)
delta_thread_ids = set(thread_delta_list)

#Add column that checks if each comment_id is in delta_list (1 = yes, 0 = no)
data_df['delta'] = data_df['comment_id'].isin(delta_comment_ids).astype('int64')

#Add column that checks if the OP author ever gave out a delta for that thread
data_df['delta_thread'] = data_df['op_id'].isin(delta_thread_ids).astype('int64')

In [None]:
#Feature: elapsed time from the OP's post to the commenter's post
#(comment_time minus op_time, in whatever unit the two time columns use)
data_df['time_diff'] = data_df['comment_time'].sub(data_df['op_time'])

In [None]:
#General: Fix parent column (remove first 3 characters)
#NOTE: this step is not idempotent - re-running the cell strips three more characters.
data_df['comment_parent'] = data_df['comment_parent'].str[3:]

#General: Keep only rows where
#  - the author of the comment is not the author of the OP, and
#  - the comment text is not '[deleted]'
keep_row = (data_df['comment_auth'] != data_df['op_auth']) & (data_df['comment_content'] != '[deleted]')

#General: Filter and reset the index in one step. reset_index returns a fresh frame,
#so adding columns to data_df later does not trigger pandas' SettingWithCopyWarning.
data_df = data_df[keep_row].reset_index(drop=True)

In [17]:
#Preview the first ten rows after labelling and cleaning
data_df.head(n=10)

Unnamed: 0,comment_id,comment_content,comment_auth,comment_time,comment_parent,comment_depth,op_id,op_content,op_auth,op_time,delta,delta_thread,time_diff
0,d9ujmbi,I would offer this very sub as a counterpoint ...,Ansuz07,1478827000.0,5c8xdc,1,5c8xdc,I have to say that I am very disappointed with...,ralpher313,1478826000.0,0,1,442.0
1,d9ujw71,That doesn't mean that you can't have a meanin...,Ansuz07,1478827000.0,d9ujt0e,3,5c8xdc,I have to say that I am very disappointed with...,ralpher313,1478826000.0,1,1,758.0
2,d9ukuqd,Practice a hobby when you are sick of politics...,Krieg-The-Psycho1,1478828000.0,d9uk2bm,5,5c8xdc,I have to say that I am very disappointed with...,ralpher313,1478826000.0,0,1,1845.0
3,d9uk6m0,This election has certianly been very polarizi...,Ansuz07,1478827000.0,d9uk2bm,5,5c8xdc,I have to say that I am very disappointed with...,ralpher313,1478826000.0,0,1,1087.0
4,d9ukl5h,People have always done this - men would get t...,Ansuz07,1478828000.0,d9ukg11,7,5c8xdc,I have to say that I am very disappointed with...,ralpher313,1478826000.0,0,1,1544.0
5,d9ukulr,"if you want a different kind of fight, you cou...",______NSA______,1478828000.0,d9uk2bm,5,5c8xdc,I have to say that I am very disappointed with...,ralpher313,1478826000.0,0,1,1841.0
6,d9uk2zz,Confirmed: 1 delta awarded to /u/Ansuz07 ([75∆...,DeltaBot,1478827000.0,d9uk2bm,5,5c8xdc,I have to say that I am very disappointed with...,ralpher313,1478826000.0,0,1,971.0
7,d9uv7ue,I think you may have a limited vision of meani...,InsufficientOverkill,1478840000.0,5c8xdc,1,5c8xdc,I have to say that I am very disappointed with...,ralpher313,1478826000.0,0,1,13832.0
8,d9uwsnt,"Ah I see.\n\nYes, the hostility is definitely ...",InsufficientOverkill,1478842000.0,d9uvled,3,5c8xdc,I have to say that I am very disappointed with...,ralpher313,1478826000.0,0,1,15695.0
9,d9v7did,The fact that you've chosen to start a discuss...,jello_sweaters,1478856000.0,5c8xdc,1,5c8xdc,I have to say that I am very disappointed with...,ralpher313,1478826000.0,0,1,29519.0


In [16]:
#Checkpoint of the cleaned frame: load data_df.p when it exists, otherwise create it
#from the data_df built by the cells above. This lets the notebook run top to bottom
#on a fresh kernel without un-commenting a dump line by hand.
#NOTE: delete data_df.p after changing any earlier cell, otherwise the stale copy is loaded.
#NOTE: pickle.load can execute arbitrary code - only load files from a trusted source.
if os.path.exists('data_df.p'):
    with open('data_df.p','rb') as data_df_file:
        data_df = pickle.load(data_df_file)
else:
    with open('data_df.p','wb') as data_df_file:
        pickle.dump(data_df, data_df_file)

## Calculate NLP Features

In [18]:
###Pre-process MLT dict into a python dict
#Result: MLT_dict maps a category number (1-11) to {'name': ..., 'words': [...]}.

##Process .txt in pandas (the formatting is easier)
MLT_df = pd.read_csv('MFT_dict.txt', sep = '\t', names = ['word','1','2','3'])

#Remove irrelevant rows: skip the first 14 rows (presumably the category legend at the
#top of the file - confirm against MFT_dict.txt) and rows that are entirely empty
MLT_df = MLT_df[14:len(MLT_df)]
MLT_df = MLT_df.dropna(axis=0,thresh=1)

#Reset Index
MLT_df = MLT_df.reset_index(drop=True)

MLT_df = MLT_df.fillna(0)

#Collect the category numbers of every word. The lists are built first and assigned
#as one column afterwards; writing them cell by cell with MLT_df['categories'][i] = ...
#is chained assignment and raises SettingWithCopyWarning.
category_lists = []
for i in range(0,len(MLT_df)):
    cat_list = []
    for j in ['1','2','3']:
        cell = MLT_df[j][i]
        if cell != 0:
            try:
                cat_list.append(int(cell))
            except ValueError:
                #The cell holds several whitespace-separated codes (e.g. '03 11').
                #int() copes with the leading zeros; blanking out every '0' character
                #instead would also turn category 10 into category 1.
                cat_list.extend(int(code) for code in str(cell).split())
                #Kept from the original logic: columns after such a cell are ignored
                break
    category_lists.append(cat_list)

MLT_df['categories'] = pd.Series(category_lists, index=MLT_df.index)

#Delete processed columns
MLT_df = MLT_df.drop(['1','2','3'], axis=1)

##Initialize Dict & Transfer to Dict Format
MLT_dict = {1:{'name':'harm_virtue','words':[]},
            2:{'name':'harm_vice','words':[]},
            3:{'name':'fairness_virtue','words':[]},
            4:{'name':'fairness_vice','words':[]},
            5:{'name':'ingroup_virtue','words':[]},
            6:{'name':'ingroup_vice','words':[]},
            7:{'name':'authority_virtue','words':[]},
            8:{'name':'authority_vice','words':[]},
            9:{'name':'purity_virtue','words':[]},
            10:{'name':'purity_vice','words':[]},
            11:{'name':'morality_general','words':[]}}

#A word is listed under every category it belongs to
for word, cat_list in zip(MLT_df['word'], MLT_df['categories']):
    for category in cat_list:
        MLT_dict[category]['words'].append(word)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [19]:
##Add MLT Features
#Add 1 feature for each (harm/vice, fairness/virtue, etc.), for both comment and OP text:
#   <side>_<category>      number of occurrences of the category's words in the text
#   <side>_<category>_bin  1 if that number is positive, else 0
#plus comment_MFT_usage(_bin): the comment's total over all categories.
#NOTE(review): str.count counts case-sensitive substring matches, so a word without a
#trailing asterisk is also counted inside longer words - confirm that this is intended.

first_category = True
for key in sorted(MLT_dict):   #sorted: same column order on every run
    name = MLT_dict[key]['name'].lower()

    #Get rid of the asterisk if it exists
    stems = [word[:-1] if word.endswith('*') else word for word in MLT_dict[key]['words']]

    #Count all words of the category in one pass per text column. A category without
    #words yields 0 everywhere instead of leaving the column undefined.
    for side in ('comment', 'op'):
        data_df[side + '_' + name] = data_df[side + '_content'].apply(
            lambda text: sum(text.count(stem) for stem in stems))

    #Get binary versions of each generated feature
    for side in ('comment', 'op'):
        data_df[side + '_' + name + '_bin'] = (data_df[side + '_' + name] > 0).astype('int64')

    #Keep track of what posts have any uses of the moral language category use
    if first_category:
        data_df['comment_MFT_usage'] = data_df['comment_' + name]
        first_category = False
    else:
        data_df['comment_MFT_usage'] = data_df['comment_MFT_usage'] + data_df['comment_' + name]

#Get binary version of moral language category use
data_df['comment_MFT_usage_bin'] = (data_df['comment_MFT_usage'] > 0).astype('int64')

In [20]:
##Calculate the similarity between the moral categories used by a comment and its OP.
#Per category the comment and the OP "agree" when their binary usage flags are equal
#(both use the category, or neither does). Over all categories:
#    similarity = n_agree / (n_agree + 2 * n_disagree)
#NOTE: joint absence counts as agreement, so this is the Rogers-Tanimoto form of
#matching similarity rather than the set-based Jaccard index. The 'jaccard_*' column
#names are kept because later steps refer to them.

def _agreement_totals(flag_pairs, index):
    """Tally agreement over a list of (comment_flags, op_flags) Series pairs.

    Returns (num, den): num counts the pairs whose flags are equal; den adds 1 for
    every equal pair and 2 for every unequal pair.
    """
    num = pd.Series(0, index=index)
    den = pd.Series(0, index=index)
    for comment_flags, op_flags in flag_pairs:
        agree = (comment_flags == op_flags).astype('int64')
        num = num + agree
        den = den + (2 - agree)
    return num, den

def _bin_flags(side, key):
    """Return the binary usage column of category `key` for 'comment' or 'op'."""
    return data_df[side + '_' + MLT_dict[key]['name'] + '_bin']

##Split version: treating each 'virtue' and each 'vice' as separate categories
split_pairs = [(_bin_flags('comment', key), _bin_flags('op', key)) for key in sorted(MLT_dict)]
split_num, split_den = _agreement_totals(split_pairs, data_df.index)

data_df['j_num'] = split_num
data_df['j_den'] = split_den
data_df['jaccard_sim_split'] = data_df['j_num'] / data_df['j_den']

##Same version: the 'virtue' and 'vice' side of a foundation count as one category
#Keys come in (virtue, vice) pairs 1-2, 3-4, ..., 9-10; key 11 has no partner.
same_pairs = []
for key in range(1,11,2):
    comment_any = ((_bin_flags('comment', key) + _bin_flags('comment', key + 1)) > 0).astype('int64')
    op_any = ((_bin_flags('op', key) + _bin_flags('op', key + 1)) > 0).astype('int64')
    same_pairs.append((comment_any, op_any))
same_pairs.append((_bin_flags('comment', 11), _bin_flags('op', 11)))
same_num, same_den = _agreement_totals(same_pairs, data_df.index)

#j_num / j_den end up holding the totals of this second calculation
data_df['j_num'] = same_num
data_df['j_den'] = same_den
data_df['jaccard_sim_same'] = data_df['j_num'] / data_df['j_den']



In [22]:
#Preview the first five rows with the moral-language features attached
data_df.head(n=5)

Unnamed: 0,comment_id,comment_content,comment_auth,comment_time,comment_parent,comment_depth,op_id,op_content,op_auth,op_time,...,op_purity_vice_bin,comment_morality_general,op_morality_general,comment_morality_general_bin,op_morality_general_bin,comment_MFT_usage_bin,j_num,j_den,jaccard_sim_split,jaccard_sim_same
0,d9ujmbi,I would offer this very sub as a counterpoint ...,Ansuz07,1478827000.0,5c8xdc,1,5c8xdc,I have to say that I am very disappointed with...,ralpher313,1478826000.0,...,0,1,1,1,1,1,5,7,0.833333,0.714286
1,d9ujw71,That doesn't mean that you can't have a meanin...,Ansuz07,1478827000.0,d9ujt0e,3,5c8xdc,I have to say that I am very disappointed with...,ralpher313,1478826000.0,...,0,0,1,0,1,0,3,9,0.571429,0.333333
2,d9ukuqd,Practice a hobby when you are sick of politics...,Krieg-The-Psycho1,1478828000.0,d9uk2bm,5,5c8xdc,I have to say that I am very disappointed with...,ralpher313,1478826000.0,...,0,0,1,0,1,1,3,9,0.571429,0.333333
3,d9uk6m0,This election has certianly been very polarizi...,Ansuz07,1478827000.0,d9uk2bm,5,5c8xdc,I have to say that I am very disappointed with...,ralpher313,1478826000.0,...,0,0,1,0,1,0,3,9,0.571429,0.333333
4,d9ukl5h,People have always done this - men would get t...,Ansuz07,1478828000.0,d9ukg11,7,5c8xdc,I have to say that I am very disappointed with...,ralpher313,1478826000.0,...,0,0,1,0,1,1,4,8,0.692308,0.5


In [23]:
###Create 2 versions of the file, 1 without text and unnecessary columns and 1 with:

#Build the text-free frame before writing anything: drop() raises when a column is
#missing (e.g. the cell is run a second time), so in that case neither file is
#overwritten with a frame that has already lost its text.
data_df_no_text = data_df.drop(['comment_content', 'comment_time', 'comment_parent',
                                'op_time', 'j_num', 'j_den'], axis=1)

##With Text
with open('preproc_data_w_text.p','wb') as with_text_file:
    pickle.dump(data_df, with_text_file)

##Without text
#Text and other unnecessary feats removed
with open('preproc_data.p','wb') as no_text_file:
    pickle.dump(data_df_no_text, no_text_file)

#From here on data_df is the text-free frame, as before
data_df = data_df_no_text


In [None]:
#Number of comments (rows) in the final data set
data_df['delta'].shape[0]

In [None]:
#Number of comments that received a delta (vectorised sum of the 0/1 label)
data_df['delta'].sum()