In [3]:
import json
import csv
import nltk
import numpy
import collections
import re
import time
import random
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from nltk.tokenize import regexp_tokenize, wordpunct_tokenize, blankline_tokenize
from nltk import word_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from textstat.textstat import textstat
from nltk.tag import pos_tag

# Shared NLP helpers and accumulators used by the cells below.
analyzer = SentimentIntensityAnalyzer()  # VADER polarity scorer from nltk
counter_sw = collections.Counter()  # unigram counts for SuicideWatch posts
counter_mh = collections.Counter()  # unigram counts for posts from the other subreddits
# Raw string: '\w' inside a plain literal is an invalid escape sequence
# (DeprecationWarning today, SyntaxWarning on Python 3.12+).
pattern = re.compile(r'\w+')  # regular expression for word
# NOTE: this rebinds the imported `stopwords` corpus reader to a plain set of
# words, so this line cannot be re-run without re-running the import first.
stopwords = set(stopwords.words('english'))

# load post, comment from json files (one JSON document per line)
with open("post_2014.json", 'r', encoding='utf-8') as post_file:
    data = [json.loads(row) for row in post_file]

with open("comment_2014.json", 'r', encoding='utf-8') as comment_file:
    comments = [json.loads(row) for row in comment_file]

def getTerms(sentences):
    """Tokenize text with nltk.wordpunct_tokenize and return the lower-cased
    tokens that are purely alphanumeric (punctuation tokens are dropped)."""
    return [
        piece.lower()
        for piece in nltk.wordpunct_tokenize(sentences)
        if piece.isalnum()
    ]

def getTreatmentToken():
    """Read 'token2.txt' and return one entry per line of the file.

    A line that getTerms splits into two or more terms becomes a tuple of its
    first two terms; any other line is kept as the raw, unmodified line text.
    """
    with open('token2.txt') as handle:
        raw_lines = handle.read().splitlines()

    entries = []
    for raw in raw_lines:
        terms = getTerms(raw)
        # NOTE(review): single-term (and empty) lines keep their original
        # casing/whitespace, unlike the tuples built from getTerms output --
        # confirm this is what the consumers of the list expect.
        entries.append(raw if len(terms) <= 1 else (terms[0], terms[1]))
    return entries

treatmentToken = getTreatmentToken()

# metadata of post, comments
author = [record["author"] for record in data]
unique_author = list(set(author))

# posts from SuicideWatch, and posts from the other MH subreddits
sw_post = [record["selftext"] for record in data if record["subreddit"] == "SuicideWatch"]
mh_post = [record["selftext"] for record in data if record["subreddit"] != "SuicideWatch"]

# length of each post, counted in getTerms tokens
length_sw_post = [len(getTerms(text)) for text in sw_post]
length_mh_post = [len(getTerms(text)) for text in mh_post]

# Vader polarity score of every post (dict with neg / pos / neu / compound)
sw_polar_score = [analyzer.polarity_scores(text) for text in sw_post]
mh_polar_score = [analyzer.polarity_scores(text) for text in mh_post]

# one list per polarity component, aligned with sw_post / mh_post
neg_sw_score = [score['neg'] for score in sw_polar_score]
pos_sw_score = [score['pos'] for score in sw_polar_score]
neu_sw_score = [score['neu'] for score in sw_polar_score]
compound_sw_score = [score['compound'] for score in sw_polar_score]

neg_mh_score = [score['neg'] for score in mh_polar_score]
pos_mh_score = [score['pos'] for score in mh_polar_score]
neu_mh_score = [score['neu'] for score in mh_polar_score]
compound_mh_score = [score['compound'] for score in mh_polar_score]

# unigram frequencies, ignoring English stopwords
counter_sw.update(
    term.lower()
    for text in sw_post
    for term in getTerms(text)
    if pattern.match(term) and term not in stopwords
)

counter_mh.update(
    term.lower()
    for text in mh_post
    for term in getTerms(text)
    if pattern.match(term) and term not in stopwords
)

In [4]:
# comments from SuicideWatch
sw_comments = [entry["body"] for entry in comments if entry["subreddit"] == "SuicideWatch"]

# comments from the other MH subreddits
mh_comments = [entry["body"] for entry in comments if entry["subreddit"] != "SuicideWatch"]

# length of each comment, counted in getTerms tokens
length_sw_comment = [len(getTerms(body)) for body in sw_comments]
length_mh_comment = [len(getTerms(body)) for body in mh_comments]

In [5]:
# Corpus summary: one (label, value) pair per printed line.
for label, value in (
    ("Number of posts: ", len(data)),
    ("Number of comments: ", len(comments)),
    ("Number of unique author: ", len(unique_author)),
    ("Number of SW post: ", len(sw_post)),
    ("Number of MH post: ", len(mh_post)),
    ("Number of SW comment: ", len(sw_comments)),
    ("Number of MH comment: ", len(mh_comments)),
    ("Average, median length in SW post: ", (numpy.mean(length_sw_post), numpy.median(length_sw_post))),
    ("Average, median length in MH post: ", (numpy.mean(length_mh_post), numpy.median(length_mh_post))),
    ("Average, median length in SW comment: ", (numpy.mean(length_sw_comment), numpy.median(length_sw_comment))),
    ("Average, median length in MH comment: ", (numpy.mean(length_mh_comment), numpy.median(length_mh_comment))),
):
    print(label, value)

# Polarity averages are currently disabled:
# print("Average negative polarity score in SW post: ", numpy.mean(neg_sw_score))
# print("Average negative polarity score in MH post: ", numpy.mean(neg_mh_score))
# print("Average positive polarity score in SW post: ", numpy.mean(pos_sw_score))
# print("Average positive polarity score in MH post: ", numpy.mean(pos_mh_score))
# print("Average neutral polarity score in SW post: ", numpy.mean(neu_sw_score))
# print("Average neutral polarity score in MH post: ", numpy.mean(neu_mh_score))
# print("Average compound polarity score in SW post: ", numpy.mean(compound_sw_score))
# print("Average compound polarity score in MH post: ", numpy.mean(compound_mh_score))

for label, unigram_counter in (
    ("Most common 20 unigram in SW post: ", counter_sw),
    ("Most common 20 unigram in MH post: ", counter_mh),
):
    print(label, unigram_counter.most_common(20))

Number of posts:  80581
Number of comments:  409430
Number of unique author:  34549
Number of SW post:  20025
Number of MH post:  60556
Number of SW comment:  131330
Number of MH comment:  278100
Average, median length in SW post:  (250.29538077403245, 165.0)
Average, median length in MH post:  (251.02151727326773, 164.0)
Average, median length in SW comment:  (57.249836290261172, 26.0)
Average, median length in MH comment:  (66.747504494786043, 36.0)
Most common 20 unigram in SW post:  [('like', 25290), ('want', 22485), ('know', 21755), ('life', 21213), ('feel', 20429), ('get', 17066), ('even', 15100), ('would', 14854), ('time', 14826), ('people', 14607), ('one', 14156), ('really', 14015), ('going', 11658), ('never', 11482), ('think', 11233), ('friends', 11120), ('go', 11009), ('much', 10039), ('years', 9995), ('help', 9810)]
Most common 20 unigram in MH post:  [('like', 87222), ('feel', 75385), ('know', 64496), ('get', 55209), ('want', 52594), ('time', 49249), ('really', 48359), ('li

In [6]:
#print(time.strftime("%a, %d %b %Y %H:%M:%S", time.gmtime(1392076800)))
#print(time.strftime("%a, %d %b %Y %H:%M:%S", time.gmtime(1407801599)))
#print(time.strftime("%a, %d %b %Y %H:%M:%S", time.gmtime(1407801600)))
#print(time.strftime("%a, %d %b %Y %H:%M:%S", time.gmtime(1415750399)))

# get the MH, MH->SW user class
def getTargetAuthor(data,
                    first_period=(1392076800, 1407801599),
                    second_period=(1407801600, 1415750399)):
    """Split authors of non-SuicideWatch posts into the MH and MH->SW classes.

    An author who posted outside SuicideWatch and did NOT post in SuicideWatch
    during `first_period` is:
      * MH->SW if they posted in SuicideWatch during `second_period`,
      * MH otherwise.

    Args:
        data: iterable of post dicts with "subreddit", "author" and
            "created_utc" keys ("created_utc" must be convertible with int()).
        first_period: (start, end) Unix timestamps, both INCLUSIVE. The default
            is 2014-02-11 00:00:00 .. 2014-08-11 23:59:59 UTC.
        second_period: (start, end) Unix timestamps, both INCLUSIVE. The default
            is 2014-08-12 00:00:00 .. 2014-11-11 23:59:59 UTC.

    Returns:
        (mh, mh_sw): two lists of unique author names, in arbitrary order.
    """
    first_start, first_end = first_period
    second_start, second_end = second_period

    sw_author_in_first_period = set()
    sw_author_in_second_period = set()
    mh_author = set()

    for post in data:
        if post["subreddit"] == "SuicideWatch":
            created = int(post["created_utc"])
            # Inclusive comparisons: the previous `in range(start, end)` test
            # silently dropped posts made in the last second of each period.
            if first_start <= created <= first_end:
                sw_author_in_first_period.add(post["author"])
            if second_start <= created <= second_end:
                sw_author_in_second_period.add(post["author"])
        else:
            mh_author.add(post["author"])

    # Sets make the membership tests O(1) instead of scanning author lists.
    mh_author_not_in_sw_first_period = mh_author - sw_author_in_first_period
    mh_sw = list(mh_author_not_in_sw_first_period & sw_author_in_second_period)
    mh = list(mh_author_not_in_sw_first_period - sw_author_in_second_period)

    return mh, mh_sw


In [7]:
#print(min(comment_utc))
#print(max(comment_utc))
#print(time.strftime("%a, %d %b %Y %H:%M:%S", time.gmtime(1459469045)))
#print(time.strftime("%a, %d %b %Y %H:%M:%S", time.gmtime(1493596740)))
#print(time.strftime("%a, %d %b %Y %H:%M:%S", time.gmtime(1493596743)))
#print(time.strftime("%a, %d %b %Y %H:%M:%S", time.gmtime(1485867600)))
#print(time.strftime("%a, %d %b %Y %H:%M:%S", time.gmtime(1485867599)))
#print(time.strftime("%a, %d %b %Y %H:%M:%S", time.gmtime(1459469009)))
# print(len(mh_author_not_in_sw_first_period))
# print(len(sw_author_in_first_period))
# print(len(sw_author_in_second_period))
# print(len(mh))
# print(len(mh_sw))

# Sanity check of the textstat readability metrics and of the VADER output
# type on a single example passage.
text = "Not just in terms of looks, or the burden I am, but a true monster, like, a truly despicable human being. I feel as if I'm depraved and evil, that my thoughts are twisted and psychotic, and I'm honestly scared of myself. I don't know how much, if any control I have over myself. Honestly, it'd be best for everyone if I was just killed, best for me, and everyone else."
analyzed = analyzer.polarity_scores(text)

for readability_metric in (
    textstat.flesch_reading_ease,
    textstat.automated_readability_index,
    textstat.smog_index,
    textstat.coleman_liau_index,
    textstat.dale_chall_readability_score,
    textstat.difficult_words,
    textstat.linsear_write_formula,
    textstat.gunning_fog,
    textstat.text_standard,
):
    print(readability_metric(text))

print(type(analyzed['neu']))

78.79
7.2
6.0
6.78
8.52
18
8.125
19.260845070422533
6th and 7th grade
<class 'float'>


In [8]:
def _splitRecordsByCohort(records, text_key, extra_keys, sample_authors, sw_authors):
    """Collect the non-empty, non-deleted records written by either cohort.

    Returns (sample_rows, sw_rows). Each row is
    (author, record[text_key], *[record[k] for k in extra_keys]) and rows keep
    the order in which the records appear.
    """
    sample_rows, sw_rows = [], []
    for record in records:
        writer = record["author"]
        in_sample = writer in sample_authors
        in_sw = writer in sw_authors
        if not (in_sample or in_sw):
            continue
        text = record[text_key]
        if text == '' or text == '[deleted]':
            continue
        row = (writer, text) + tuple(record[key] for key in extra_keys)
        if in_sample:
            sample_rows.append(row)
        if in_sw:
            sw_rows.append(row)
    return sample_rows, sw_rows


# return post of sampled set of MH user
def getSamplePosts(data, comments, mh, mh_sw):
    """Draw a random MH control sample and gather both cohorts' posts/comments.

    Args:
        data: list of post dicts ("author", "selftext", "title", "score",
            "num_comments").
        comments: list of comment dicts ("author", "body", "score").
        mh: list of MH authors to sample the control group from.
        mh_sw: list of MH->SW authors; the control sample has the same size.

    Returns:
        (mh_sw_post, mh_sample_post, mh_sample, mh_sw_comment, mh_sample_comment)
        where posts are (author, selftext, title, score, num_comments) tuples
        and comments are (author, body, score) tuples. Records whose text is
        '' or '[deleted]' are skipped.

    Raises:
        ValueError: from random.sample when len(mh_sw) > len(mh).

    Note:
        Authors are matched through sets (one pass over the records instead of
        one scan of every author list per record), so an author name repeated
        in `mh_sw` contributes each of their records once.
    """
    # toggle this comment to switch to 'realistic' setting
    # mh_sample = mh

    mh_sample = random.sample(mh, len(mh_sw))

    sample_authors = set(mh_sample)
    sw_authors = set(mh_sw)

    mh_sample_post, mh_sw_post = _splitRecordsByCohort(
        data, "selftext", ("title", "score", "num_comments"), sample_authors, sw_authors)
    mh_sample_comment, mh_sw_comment = _splitRecordsByCohort(
        comments, "body", ("score",), sample_authors, sw_authors)

    return mh_sw_post, mh_sample_post, mh_sample, mh_sw_comment, mh_sample_comment


In [9]:

def _textstatOrDefault(metric, text, fallback):
    """Return metric(text), or `fallback` when the textstat call raises."""
    try:
        return metric(text)
    except Exception:
        return fallback


def _linguisticFeatureRows(authors, posts, label):
    """Build one linguistic feature row per author, tagged with `label`.

    `posts` is a list of tuples whose first two items are (author, text).
    Posts for which getTerms returns no terms are ignored.
    """
    # Group the texts once so each author does not rescan every post.
    texts_by_author = collections.defaultdict(list)
    for tup in posts:
        texts_by_author[tup[0]].append(tup[1])

    rows = []
    for author in authors:
        difficult_word = 0     # sum of textstat.difficult_words over the posts
        reading_ease = 0       # sum of Flesch reading ease (higher = easier to read)
        readability_index = 0  # sum of automated readability index
        pron_fraction = 0
        noun_fraction = 0
        verb_fraction = 0
        adv_fraction = 0
        num_post = 0

        for text in texts_by_author.get(author, ()):
            terms = getTerms(text)
            if not terms:
                continue

            tagged = nltk.pos_tag(terms, tagset="universal")
            tag_counts = collections.Counter(tag for _, tag in tagged)
            total = len(tagged) or 1  # never divide by zero

            pron_fraction += tag_counts['PRON'] / total
            noun_fraction += tag_counts['NOUN'] / total
            verb_fraction += tag_counts['VERB'] / total
            adv_fraction += tag_counts['ADV'] / total

            # Same fallbacks for BOTH cohorts when textstat fails on a text.
            difficult_word += _textstatOrDefault(textstat.difficult_words, text, 0)
            reading_ease += _textstatOrDefault(textstat.flesch_reading_ease, text, 90)
            readability_index += _textstatOrDefault(textstat.automated_readability_index, text, 5)

            num_post += 1

        # Authors without any usable post keep all-zero features.
        divisor = num_post or 1
        rows.append((difficult_word / divisor,
                     reading_ease / divisor,
                     readability_index / divisor,
                     pron_fraction / divisor,
                     noun_fraction / divisor,
                     verb_fraction / divisor,
                     adv_fraction / divisor,
                     label))
    return rows


# prepare a dataset with general feature like fraction of pronoun, noun or reading ease
def prepareLinguisticDataset(mh_sw, mh_sample, mh_sw_post, mh_sample_post):
    """Per-author linguistic features for the MH->SW and MH cohorts.

    Args:
        mh_sw: MH->SW authors (rows labelled 1).
        mh_sample: sampled MH authors (rows labelled 0).
        mh_sw_post: post tuples of the MH->SW authors, (author, text, ...).
        mh_sample_post: post tuples of the MH sample, (author, text, ...).

    Returns:
        List of tuples, MH->SW rows first and MH rows after them, each
        (avg_difficult_words, avg_reading_ease, avg_readability_index,
         avg_pron_fraction, avg_noun_fraction, avg_verb_fraction,
         avg_adv_fraction, label), averaged over the author's posts.
    """
    dataset = _linguisticFeatureRows(mh_sw, mh_sw_post, 1)
    dataset.extend(_linguisticFeatureRows(mh_sample, mh_sample_post, 0))
    #random.shuffle(dataset)

    return dataset


In [87]:
# Lower-case pronoun lexicons, one list per grammatical person.
first_person_singular = 'i me my mine'.split()
first_person_plural = 'we us our ours'.split()
second_person_pronoun = 'you your yours'.split()
third_person_pronoun = 'he she it him her his hers its they them their theirs'.split()

def _interpersonalFeatureRows(authors, posts, label):
    """Build one pronoun-usage feature row per author, tagged with `label`.

    `posts` is a list of tuples whose first two items are (author, text).
    Posts for which getTerms returns no terms are ignored.
    """
    # Column order of the output row; sets give O(1) membership tests.
    pronoun_groups = (
        set(first_person_singular),
        set(first_person_plural),
        set(second_person_pronoun),
        set(third_person_pronoun),
    )

    # Group the texts once so each author does not rescan every post.
    texts_by_author = collections.defaultdict(list)
    for tup in posts:
        texts_by_author[tup[0]].append(tup[1])

    rows = []
    for author in authors:
        fraction_sums = [0, 0, 0, 0]
        num_post = 0

        for text in texts_by_author.get(author, ()):
            tokenized = getTerms(text)
            if not tokenized:
                continue
            words = [word.lower() for word in tokenized]
            for column, group in enumerate(pronoun_groups):
                hits = sum(1 for word in words if word in group)
                fraction_sums[column] += hits / len(tokenized)
            num_post += 1

        # Authors without any usable post keep all-zero features.
        divisor = num_post or 1
        rows.append(tuple(total / divisor for total in fraction_sums) + (label,))
    return rows


def prepareInterpersonalDataset(mh_sw, mh_sample, mh_sw_post, mh_sample_post, mh_sw_comment, mh_sample_comment):
    """Per-author pronoun-usage features for the MH->SW and MH cohorts.

    Args:
        mh_sw: MH->SW authors (rows labelled 1).
        mh_sample: sampled MH authors (rows labelled 0).
        mh_sw_post: post tuples of the MH->SW authors, (author, text, ...).
        mh_sample_post: post tuples of the MH sample, (author, text, ...).
        mh_sw_comment: unused; kept so existing callers keep working.
        mh_sample_comment: unused; kept so existing callers keep working.

    Returns:
        List of tuples, MH->SW rows first and MH rows after them, each
        (avg_first_person_singular_fraction, avg_first_person_plural_fraction,
         avg_second_person_fraction, avg_third_person_fraction, label),
        where each fraction is averaged over the author's posts.
    """
    dataset = _interpersonalFeatureRows(mh_sw, mh_sw_post, 1)
    dataset.extend(_interpersonalFeatureRows(mh_sample, mh_sample_post, 0))
    #random.shuffle(dataset)

    return dataset

In [11]:
def _metadataFeatureRows(authors, posts, comments, label):
    """Build one metadata feature row per author, tagged with `label`.

    `posts` holds (author, text, title, score, num_comments) tuples and
    `comments` holds (author, body, score) tuples. Items whose text yields no
    getTerms terms are ignored.
    """
    # Group once so each author does not rescan every post and comment.
    posts_by_author = collections.defaultdict(list)
    for tup in posts:
        posts_by_author[tup[0]].append(tup)
    comments_by_author = collections.defaultdict(list)
    for tup in comments:
        comments_by_author[tup[0]].append(tup)

    rows = []
    for author in authors:
        post_length = 0
        title_length = 0
        reddit_score = 0
        num_comments_to_post = 0
        num_post = 0
        for tup in posts_by_author.get(author, ()):
            body_terms = getTerms(tup[1])
            if not body_terms:
                continue
            post_length += len(body_terms)
            title_length += len(getTerms(tup[2]))
            reddit_score += int(tup[3])
            num_comments_to_post += int(tup[4])
            num_post += 1

        comment_length = 0
        comment_score = 0
        num_comment = 0
        for tup in comments_by_author.get(author, ()):
            comment_terms = getTerms(tup[1])
            if not comment_terms:
                continue
            comment_length += len(comment_terms)
            comment_score += int(tup[2])
            num_comment += 1

        # Authors without posts (or comments) keep zero averages.
        post_divisor = num_post or 1
        comment_divisor = num_comment or 1
        rows.append((post_length / post_divisor,
                     title_length / post_divisor,
                     reddit_score / post_divisor,
                     num_comments_to_post / post_divisor,
                     comment_length / comment_divisor,
                     comment_score / comment_divisor,
                     num_comment,
                     label))
    return rows


# prepare a dataset with features like number of comments, length of post/title
def prepareMetadataDataset(mh_sw, mh_sample, mh_sw_post, mh_sample_post, mh_sw_comment, mh_sample_comment):
    """Per-author activity metadata for the MH->SW and MH cohorts.

    Args:
        mh_sw: MH->SW authors (rows labelled 1).
        mh_sample: sampled MH authors (rows labelled 0).
        mh_sw_post / mh_sample_post: post tuples
            (author, text, title, score, num_comments) of each cohort.
        mh_sw_comment / mh_sample_comment: comment tuples
            (author, body, score) of each cohort.

    Returns:
        List of tuples, MH->SW rows first and MH rows after them, each
        (avg_post_length, avg_title_length, avg_reddit_score,
         avg_num_comments_to_post, avg_comment_length, avg_comment_score,
         num_comment, label). Lengths are counted in getTerms tokens.
    """
    # The MH branch used to compute an unused VADER score for every post;
    # both cohorts now go through the same helper, which does not.
    dataset = _metadataFeatureRows(mh_sw, mh_sw_post, mh_sw_comment, 1)
    dataset.extend(_metadataFeatureRows(mh_sample, mh_sample_post, mh_sample_comment, 0))
    #random.shuffle(dataset)

    return dataset


In [12]:
# Sentiment features: per-author averages of the VADER polarity scores (NLTK).
def _averageSentiment(author, posts):
    """Average the VADER polarity scores over one author's non-empty posts.

    Parameters
    ----------
    author : value compared with ``tup[0]`` of every post.
    posts : iterable of tuples whose first item is the author and whose
        second item is the post text.

    Returns
    -------
    tuple
        ``(avg_neg, avg_neu, avg_pos, avg_compound)``. An author with no
        matching post yields ``0.0`` for every score.
    """
    score_keys = ('neg', 'neu', 'pos', 'compound')
    totals = dict.fromkeys(score_keys, 0)
    num_post = 0

    for tup in posts:
        # Posts with no alphanumeric term are skipped.
        if tup[0] == author and len(getTerms(tup[1])) >= 1:
            polar_score = analyzer.polarity_scores(tup[1])
            for key in score_keys:
                totals[key] += polar_score[key]
            num_post += 1

    # Dividing by 1 when nothing matched keeps the sums (all zero) as floats
    # instead of raising ZeroDivisionError.
    divisor = num_post or 1
    return tuple(totals[key] / divisor for key in score_keys)


def prepareSentimentDataset(mh_sw, mh_sample, mh_sw_post, mh_sample_post, mh_sw_comment, mh_sample_comment):
    """Build the sentiment feature rows for both author groups.

    Parameters
    ----------
    mh_sw : iterable of authors labelled 1 (MH->SW sample).
    mh_sample : iterable of authors labelled 0 (MH sample).
    mh_sw_post, mh_sample_post : post tuples ``(author, text, ...)`` for the
        respective group.
    mh_sw_comment, mh_sample_comment : unused here; accepted so that the
        signature stays the same for existing callers.

    Returns
    -------
    list of tuple
        One ``(avg_neg, avg_neu, avg_pos, avg_compound, label)`` row per
        author: all ``mh_sw`` rows first (label 1), then all ``mh_sample``
        rows (label 0), each group in input order.
    """
    dataset = [_averageSentiment(author, mh_sw_post) + (1,) for author in mh_sw]
    dataset.extend(_averageSentiment(author, mh_sample_post) + (0,) for author in mh_sample)
    #random.shuffle(dataset)

    return dataset


In [13]:
def _treatmentTokenFlags(author, posts, first_index, num_tokens):
    """Return a 0/1 tuple marking which treatment tokens an author used.

    Parameters
    ----------
    author : value compared with ``tup[0]`` of every post.
    posts : iterable of tuples whose first item is the author and whose
        second item is the post text.
    first_index : dict mapping each treatment token (a string for a unigram,
        a 2-tuple for a bigram) to its first position in the token list.
    num_tokens : length of the token list, i.e. the width of the result.
    """
    matched = set()
    for tup in posts:
        if tup[0] == author:
            terms = getTerms(tup[1])
            if len(terms) >= 1:
                # Unigram hits, then bigram hits over the same term sequence.
                matched.update(first_index[term] for term in terms if term in first_index)
                matched.update(first_index[pair] for pair in nltk.bigrams(terms)
                               if pair in first_index)

    return tuple(1 if position in matched else 0 for position in range(num_tokens))


def prepareContentDataset(mh_sw, mh_sample, mh_sw_post, mh_sample_post, mh_sw_comment, mh_sample_comment, treatmentToken):
    """Build binary treatment-token presence rows for both author groups.

    Each row has one 0/1 flag per entry of ``treatmentToken`` (1 when the
    token occurs in at least one non-empty post of the author) followed by
    the class label.

    Parameters
    ----------
    mh_sw : iterable of authors labelled 1 (MH->SW sample).
    mh_sample : iterable of authors labelled 0 (MH sample).
    mh_sw_post, mh_sample_post : post tuples ``(author, text, ...)`` for the
        respective group.
    mh_sw_comment, mh_sample_comment : unused here; accepted so that the
        signature stays the same for existing callers.
    treatmentToken : list of tokens; strings are matched against single
        terms and tuples against bigrams of the post text.

    Returns
    -------
    list of tuple
        All ``mh_sw`` rows first (label 1), then all ``mh_sample`` rows
        (label 0), each group in input order.
    """
    # A duplicated token is only ever flagged at its first position, which
    # is what list.index() produced in the previous implementation.
    first_index = {}
    for position, token in enumerate(treatmentToken):
        first_index.setdefault(token, position)
    num_tokens = len(treatmentToken)

    dataset = [_treatmentTokenFlags(author, mh_sw_post, first_index, num_tokens) + (1,)
               for author in mh_sw]
    dataset.extend(_treatmentTokenFlags(author, mh_sample_post, first_index, num_tokens) + (0,)
                   for author in mh_sample)

    return dataset
                                                                    

In [14]:
def combineDataset(dataset_linguistic, dataset_interpersonal, dataset_metadata, dataset_sentiment, dataset_content): 
    dataset_full = []
    for i in range(len(dataset_linguistic)):
        row = dataset_linguistic[i][0:len(dataset_linguistic[i])-1] + \
              dataset_metadata[i][0:len(dataset_metadata[i])-1] + \
              dataset_interpersonal[i][0:len(dataset_interpersonal[i])-1] + \
            dataset_sentiment[i][0:len(dataset_sentiment[i])-1] + \
                dataset_content[i]
        dataset_full.append(row)
    return dataset_full

In [88]:
# Pick the target authors and their sampled posts/comments, then build one
# feature table per feature family and finally the combined table.
mh, mh_sw = getTargetAuthor(data)
(mh_sw_post, mh_sample_post, mh_sample,
 mh_sw_comment, mh_sample_comment) = getSamplePosts(data, comments, mh, mh_sw)

# Argument groups shared by the prepare*Dataset functions.
post_inputs = (mh_sw, mh_sample, mh_sw_post, mh_sample_post)
post_comment_inputs = post_inputs + (mh_sw_comment, mh_sample_comment)

dataset_linguistic = prepareLinguisticDataset(*post_inputs)
dataset_interpersonal = prepareInterpersonalDataset(*post_comment_inputs)
dataset_metadata = prepareMetadataDataset(*post_comment_inputs)
dataset_sentiment = prepareSentimentDataset(*post_comment_inputs)
dataset_content = prepareContentDataset(*(post_comment_inputs + (treatmentToken,)))
dataset_full = combineDataset(
    dataset_linguistic,
    dataset_interpersonal,
    dataset_metadata,
    dataset_sentiment,
    dataset_content,
)

In [95]:
import statsmodels.api as sm


def _splitByLabel(dataset, feature_index):
    """Split one feature column into (MH values, MH->SW values).

    The last item of every row is the class label: rows labelled 1 go to the
    MH->SW list, every other row goes to the MH list.
    """
    mh_values = []
    mh_sw_values = []
    for row in dataset:
        if row[-1] == 1:
            mh_sw_values.append(row[feature_index])
        else:
            mh_values.append(row[feature_index])
    return mh_values, mh_sw_values


def _printZTests(title, dataset):
    """Print the group means and the two-sample z-test for every feature.

    Features are all columns except the trailing label; the column count is
    taken from the first row. For each feature the MH and MH->SW means are
    printed, followed by the value returned by ``ztest_ind()``.
    """
    print(title)
    for feature_index in range(len(dataset[0]) - 1):
        mh_values, mh_sw_values = _splitByLabel(dataset, feature_index)
        result = sm.stats.CompareMeans.from_data(mh_values, mh_sw_values).ztest_ind()
        print("Mean (MH, SW): ", numpy.mean(mh_values), numpy.mean(mh_sw_values))
        print(result)


for section_title, section_dataset in (
    ("Z-test for linguistic features:", dataset_linguistic),
    ("Z-test for interaction features:", dataset_metadata),
    ("Z-test for interpersonal features:", dataset_interpersonal),
    ("Z-test for sentiment features:", dataset_sentiment),
):
    _printZTests(section_title, section_dataset)

# Column order of each dataset (one printed block per feature, in this order):
# lin = ['num_difficult_words','fleisch_reading_ease', 'automated_readability_index','pronoun_fraction', 'noun_fraction',\
#         'verb_fraction','adverb_fraction', 'is_mh_sw']
# meta = ['post_length', 'title_length','reddit_score', 'num_comments_to_post', 'comment_length', 'comment_score',\
#         'num_comment', 'is_mh_sw']
# interpersonal = ['fp_singular_fraction', 'fp_plural_fraction', 'sp_fraction', 'tp_fraction', 'is_mh_sw']
# sen = ['negative_score', 'neutral_score', 'positive_score', 'compound_score', 'is_mh_sw']

Z-test for linguistic features:
Mean (MH, SW):  56.2272793992 52.617179196
(1.11286945945803, 0.26576449835223681)
Mean (MH, SW):  66.9645640638 75.5015083001
(-2.3553664510276753, 0.018504447218113711)
Mean (MH, SW):  8.55422495306 7.14030167813
(1.4397044632639591, 0.14995103009452979)
Mean (MH, SW):  0.076785509621 0.0802390137692
(-1.5292270004686632, 0.12620817925630029)
Mean (MH, SW):  0.204161683441 0.208060426166
(-0.77941994400627623, 0.43573237919518337)
Mean (MH, SW):  0.233315027146 0.249708507797
(-3.5836452825216178, 0.00033883216042893371)
Mean (MH, SW):  0.0834225367339 0.084553588336
(-0.49429376941095438, 0.62109872474103067)
Z-test for interaction features:
Mean (MH, SW):  278.443300051 265.94557851
(0.70034976043848307, 0.48370890304224512)
Mean (MH, SW):  7.94463987029 8.31530778689
(-0.92215196949545764, 0.35644931227035048)
Mean (MH, SW):  6.04557433009 5.78329966546
(0.24248470743705972, 0.80840460503222666)
Mean (MH, SW):  3.96162655743 5.49700131544
(-3.170348

In [66]:
# print(len(mh))
# print(len(mh_sw))
# print(len(mh_sample))
# print(len(mh_sample_post))
# print(len(mh_sw_post))
# print(len(mh_sw_comment))
# print(len(mh_sample_comment))


In [89]:
import copy

# write dataset to CSV file, feature_choice is parameter to decide what dataset and the name of CSV file
def writeCSV(dataset, treatmentToken, feature_choice):
    """Write a feature dataset to a CSV file with a matching header row.

    Parameters
    ----------
    dataset : iterable of rows; each row is written as one CSV record.
    treatmentToken : list of treatment tokens, used as the column names of
        the content features. The list is not modified.
    feature_choice : int selecting the header and the output file:
        1 linguistic   -> dataset_linguistic.csv
        2 metadata     -> dataset_metadata.csv
        3 interpersonal-> dataset_interpersonal.csv
        4 sentiment    -> dataset_sentiment.csv
        5 content      -> dataset_content.csv
        6 all combined -> dataset_full.csv

    Raises
    ------
    ValueError
        If ``feature_choice`` is not one of 1-6. No file is created then.
    """
    label = ['is_mh_sw']
    linguistic = ['num_difficult_words','fleisch_reading_ease', 'automated_readability_index','pronoun_fraction', 'noun_fraction',
                  'verb_fraction','adverb_fraction']
    metadata = ['post_length', 'title_length','reddit_score', 'num_comments_to_post', 'comment_length', 'comment_score',
                'num_comment']
    interpersonal = ['fp_singular_fraction', 'fp_plural_fraction', 'sp_fraction', 'tp_fraction']
    sentiment = ['negative_score', 'neutral_score', 'positive_score', 'compound_score']
    # Copy so that the caller's token list is never extended.
    content = list(treatmentToken)

    # feature_choice -> (output filename, header row). The combined header
    # lists the families in the column order produced by combineDataset.
    outputs = {
        1: ('dataset_linguistic.csv', linguistic + label),
        2: ('dataset_metadata.csv', metadata + label),
        3: ('dataset_interpersonal.csv', interpersonal + label),
        4: ('dataset_sentiment.csv', sentiment + label),
        5: ('dataset_content.csv', content + label),
        6: ('dataset_full.csv', linguistic + metadata + interpersonal + sentiment + content + label),
    }

    if feature_choice not in outputs:
        raise ValueError('Invalid choice!')
    filename, feature = outputs[feature_choice]

    with open(filename, 'w', newline='') as file:
        file_writer = csv.writer(file)
        file_writer.writerow(feature)
        file_writer.writerows(dataset)
            


In [92]:
# Export every feature table. The 1-based position in this list is the
# feature_choice passed to writeCSV, which selects header and filename.
datasets_in_choice_order = [
    dataset_linguistic,
    dataset_metadata,
    dataset_interpersonal,
    dataset_sentiment,
    dataset_content,
    dataset_full,
]
for feature_choice, feature_dataset in enumerate(datasets_in_choice_order, start=1):
    writeCSV(feature_dataset, treatmentToken, feature_choice)