In [None]:
import csv
import math
import re
from ast import literal_eval as make_tuple

import nltk
import pandas as pd
import seaborn as sns
from matplotlib import pyplot
from matplotlib.ticker import FuncFormatter
from nltk.collocations import *
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from textblob import TextBlob

In [None]:
# Duplicate of the import in the first cell (same alias); literal_eval is used
# below to parse the stringified n-gram tuples stored in the CSV files.
from ast import literal_eval as make_tuple
# English stop words from NLTK's stopwords corpus.
# NOTE(review): not referenced by any cell visible here — confirm it is still needed.
stopword_list = stopwords.words('english')

def disputing_greater_flt(x):
    """Row filter: is the disputing frequency the largest of the three?

    Args:
        x: Mapping-like record indexed by 'disputing_freq', 'supporting_freq'
            and 'mentioning_freq' (presumably a DataFrame row passed through
            ``DataFrame.apply(..., axis=1)`` — confirm against the caller).

    Returns:
        Truthy when 'disputing_freq' is greater than or equal to both other
        frequencies (ties count as a match), falsy otherwise.
    """
    disputing = x['disputing_freq']
    beats_supporting = disputing >= x['supporting_freq']
    # `and` short-circuits: the mentioning frequency is only read when needed.
    return beats_supporting and disputing >= x['mentioning_freq']

def supporting_greater_flt(x):
    """Row filter: is the supporting frequency the largest of the three?

    Mirrors ``disputing_greater_flt`` and ``mentioning_greater_flt``: the
    supporting frequency must be compared against *both* other categories.
    (The previous version compared against 'disputing_freq' twice and never
    looked at 'mentioning_freq'.)

    Args:
        x: Mapping-like record indexed by 'supporting_freq', 'disputing_freq'
            and 'mentioning_freq' (presumably a DataFrame row passed through
            ``DataFrame.apply(..., axis=1)`` — confirm against the caller).

    Returns:
        Truthy when 'supporting_freq' is greater than or equal to both
        'disputing_freq' and 'mentioning_freq' (ties count as a match),
        falsy otherwise.
    """
    return x['supporting_freq'] >= x['disputing_freq'] and x['supporting_freq'] >= x['mentioning_freq']

def mentioning_greater_flt(x):
    """Row filter: is the mentioning frequency the largest of the three?

    Args:
        x: Mapping-like record indexed by 'mentioning_freq', 'disputing_freq'
            and 'supporting_freq' (presumably a DataFrame row passed through
            ``DataFrame.apply(..., axis=1)`` — confirm against the caller).

    Returns:
        Truthy when 'mentioning_freq' is greater than or equal to both other
        frequencies (ties count as a match), falsy otherwise.
    """
    mentioning = x['mentioning_freq']
    beats_disputing = mentioning >= x['disputing_freq']
    # `and` short-circuits: the supporting frequency is only read when needed.
    return beats_disputing and mentioning >= x['supporting_freq']


def common_flt(x, threshold=3.84):
    """Row filter: are both log-likelihood scores at or below the cut-off?

    Args:
        x: Mapping-like record indexed by 'supporting_disputing_loglikelihood'
            and 'supporting_mentioning_loglikelihood'.
        threshold: Inclusive upper bound applied to both scores. The default
            3.84 is the value that used to be hard-coded here.
            NOTE(review): 3.84 matches the chi-square critical value for one
            degree of freedom at p = 0.05 — presumably the intent; confirm.

    Returns:
        Truthy when both scores are ``<= threshold``, falsy otherwise.
    """
    return (
        x['supporting_disputing_loglikelihood'] <= threshold
        and x['supporting_mentioning_loglikelihood'] <= threshold
    )


In [None]:
from nltk.sentiment import SentimentIntensityAnalyzer
# Single shared analyzer instance; sentiment() below calls
# sia.polarity_scores() on each n-gram and reads its 'compound' entry.
sia = SentimentIntensityAnalyzer()

def make_pos(x, pos):
    """Return the part-of-speech tags of an n-gram as one string.

    Args:
        x: String containing a Python tuple literal of words; it is parsed
            with ``ast.literal_eval`` (imported as ``make_tuple``).
        pos: Accepted but not used by this function.

    Returns:
        Every tag produced by ``nltk.pos_tag`` followed by a single space,
        concatenated in token order. A non-empty result therefore ends with
        a trailing space; the result is ``''`` when there are no tokens.
    """
    words = make_tuple(x)
    tokens = nltk.word_tokenize(' '.join(words))
    tagged = nltk.pos_tag(tokens)
    # Each entry is indexed at [1] for its tag, exactly as the tagger returns it.
    return ''.join(pair[1] + ' ' for pair in tagged)
    
def find_pos(x, pos):
    """Tell whether any token of an n-gram carries the tag ``pos``.

    Args:
        x: String containing a Python tuple literal of words; it is parsed
            with ``ast.literal_eval`` (imported as ``make_tuple``).
        pos: Tag to look for; compared for equality against the tags
            returned by ``nltk.pos_tag``.

    Returns:
        ``True`` if at least one token is tagged exactly ``pos``, else ``False``.
    """
    phrase = ' '.join(make_tuple(x))
    tagged = nltk.pos_tag(nltk.word_tokenize(phrase))
    return any(pair[1] == pos for pair in tagged)

def sentiment(x):
    """Return the compound sentiment score of an n-gram.

    Args:
        x: String containing a Python tuple literal of words; it is parsed
            with ``ast.literal_eval`` (imported as ``make_tuple``).

    Returns:
        The 'compound' entry of ``sia.polarity_scores`` for the words joined
        by single spaces.
    """
    phrase = ' '.join(make_tuple(x))
    scores = sia.polarity_scores(phrase)
    return scores['compound']

def discourse_markers(x):
    """Tell whether an n-gram contains any known discourse marker.

    The check is a plain substring test against the words joined by single
    spaces, so a marker also matches inside a longer word.

    NOTE(review): ``DISCOURSE_MARKERS`` is not defined in the cells visible
    here — it must exist in the kernel before this function is called.

    Args:
        x: String containing a Python tuple literal of words; it is parsed
            with ``ast.literal_eval`` (imported as ``make_tuple``).

    Returns:
        ``True`` if at least one entry of ``DISCOURSE_MARKERS`` occurs in the
        joined text, else ``False``.
    """
    words = make_tuple(x)
    phrase = ' '.join(words)
    return any(marker in phrase for marker in DISCOURSE_MARKERS)

In [None]:
# Part-of-speech tags to look for, and the wording used for each tag in the
# plot titles and axis labels.
interesting_pos = ['RB','RBR', 'RBS', 'JJ', 'JJR', 'JJS', 'IN', 'CC']
POS = {
    "CC": "coordinating conjunctions",
    "IN": "prepositions and subordinating conjunctions",
    "JJ": "adjectives",
    "JJR": "adjectives, comparative",
    "JJS": "adjectives, superlative",
    "RB": "adverbs",
    "RBR": "adverbs, comparative",
    "RBS": "adverbs, superlative"
}

analysize = ['bigrams', 'trigrams']


def _pos_share(analysis, pos, cite_type):
    """Filter one "<cite_type>_greater" table by POS tag and return the share kept.

    Reads ``./analysis/{analysis}_{cite_type}_greater.csv``, keeps the rows
    whose n-gram (column ``analysis``) has at least one token tagged ``pos``
    according to ``find_pos``, writes those rows to
    ``./analysis/{analysis}_{pos}_{cite_type}_greater.csv`` and prints the
    fraction of rows kept.

    Args:
        analysis: Name of the n-gram column and file prefix ('bigrams' / 'trigrams').
        pos: Part-of-speech tag to look for.
        cite_type: 'supporting', 'disputing' or 'mentioning'.

    Returns:
        ``len(kept rows) / len(all rows)`` for that table.
    """
    source = pd.read_csv(f"./analysis/{analysis}_{cite_type}_greater.csv")
    matching = source[source.apply(lambda row: find_pos(row[analysis], pos), axis=1)]
    matching.to_csv(f"./analysis/{analysis}_{pos}_{cite_type}_greater.csv")
    share = len(matching) / len(source)
    print(f"{cite_type} - {analysis} - {pos}: {share} ")
    return share


for analysis in analysize:
    for pos in interesting_pos:
        # One value per citation type; kept as one-element lists because they
        # become the columns of the single-row plotting frame below.
        supporting_counts = [_pos_share(analysis, pos, 'supporting')]
        disputing_counts = [_pos_share(analysis, pos, 'disputing')]
        # Original author's note: it should be mentioning greater than both.
        mentioning_counts = [_pos_share(analysis, pos, 'mentioning')]

        df = pd.DataFrame({
            'Factor': pos,
            'Supporting citations': supporting_counts,
            'Disputing citations': disputing_counts,
            'Mentioning citations': mentioning_counts,
        })
        fig, ax1 = pyplot.subplots(figsize=(10, 10))
        # Long format (Factor / Variable / Value) so seaborn can draw one bar
        # per citation type via `hue`.
        tidy = df.melt(id_vars='Factor').rename(columns=str.title)
        sns.barplot(x='Factor', y='Value', hue='Variable', data=tidy, ax=ax1)
        ax1.set(title=f"Frequency of {POS[pos]} ({pos}) in {analysis} (%)", xlabel=f"Part of speech: {POS[pos]}", ylabel=f"Frequency of occurance(%)")
        # Format ticks through a formatter instead of freezing label strings with
        # set_yticklabels(): the labels then always match the tick positions and
        # matplotlib's fixed-label warning is avoided. Output text is unchanged.
        ax1.yaxis.set_major_formatter(
            FuncFormatter(lambda value, _tick_index: '{:,.2%}'.format(value))
        )
        sns.despine(fig)
        fig.savefig(f'./analysis/{analysis}_{pos}.png')
        # Render the figure, then release it: this loop creates 16 figures and
        # an argument-less pyplot.plot() neither displays nor closes anything.
        pyplot.show()
        pyplot.close(fig)

# bar and counts of discourse markers
# plot sentiments and correlates 

In [None]:
# For every n-gram size and citation type: score each n-gram with sentiment(),
# print the Pearson correlation between that score and the relevant
# log-likelihood column, save the table sorted by sentiment, and save a
# scatter plot of sentiment against log likelihood.
analysize = ['bigrams', 'trigrams']
# 'type' selects the input CSV; 'function' names the log-likelihood column that
# the sentiment score is compared with for that citation type.
functions = [
    {
        'name': 'Supporting',
        'type': 'supporting_greater',
        'function': 'supporting_disputing_loglikelihood'
    },
    {
        'name': 'Disputing',
        'type': 'disputing_greater',
        'function': 'supporting_disputing_loglikelihood'
    },
    {
        'name': 'Mentioning',
        'type': 'mentioning_greater',
        'function': 'supporting_mentioning_loglikelihood'
    }
]
for analysis in analysize:
    for function in functions:
        cite_type = function["type"]
        loglik_col = function['function']
        df = pd.read_csv(f"./analysis/{analysis}_{cite_type}.csv")
        df['sent'] = df.apply(lambda x: sentiment(x[analysis]), axis=1)
        print(function['name'])
        print(df[['sent', loglik_col]].corr(method = 'pearson'))
        df.sort_values('sent').to_csv(f"./analysis/{analysis}_{cite_type}_sent.csv")

        fig, ax1 = pyplot.subplots(figsize=(10, 10))
        sns.scatterplot(data=df, x='sent', y=loglik_col, ax=ax1)
        ax1.set(title=f"{function['name']} {analysis} sentiment versus log likelihood", ylabel="Log Likelihood", xlabel=f"Sentiment (-1 negative, 0 neutral, 1 positive)")
        sns.despine(fig)
        fig.savefig(f'./analysis/{analysis}_{cite_type}_sent_scatter.png')
        # Render the figure, then release it; an argument-less pyplot.plot()
        # neither displays nor closes anything, so figures piled up in the loop.
        pyplot.show()
        pyplot.close(fig)
    
    

In [None]:
def subjectivity(x):
    """Return the TextBlob subjectivity score of an n-gram.

    Args:
        x: String containing a Python tuple literal of words; it is parsed
            with ``ast.literal_eval`` (imported as ``make_tuple``).

    Returns:
        The second element of ``TextBlob(...).sentiment`` for the words joined
        by single spaces.
    """
    phrase = ' '.join(make_tuple(x))
    blob = TextBlob(phrase)
    # Index 1 of the sentiment result is the value plotted as subjectivity.
    return blob.sentiment[1]

In [None]:
# For every n-gram size and citation type: score each n-gram with
# subjectivity(), print the Pearson correlation between that score and the
# relevant log-likelihood column, save the table sorted by subjectivity, and
# save a histogram of the scores plus a scatter plot against log likelihood.
analysize = ['bigrams', 'trigrams']
# 'type' selects the input CSV; 'function' names the log-likelihood column that
# the subjectivity score is compared with for that citation type.
functions = [
    {
        'name': 'Supporting',
        'type': 'supporting_greater',
        'function': 'supporting_disputing_loglikelihood'
    },
    {
        'name': 'Disputing',
        'type': 'disputing_greater',
        'function': 'supporting_disputing_loglikelihood'
    },
    {
        'name': 'Mentioning',
        'type': 'mentioning_greater',
        'function': 'supporting_mentioning_loglikelihood'
    }
]
for analysis in analysize:
    for function in functions:
        cite_type = function["type"]
        loglik_col = function['function']
        df = pd.read_csv(f"./analysis/{analysis}_{cite_type}.csv")
        df['subj'] = df.apply(lambda x: subjectivity(x[analysis]), axis=1)
        print(function['name'])
        print(df[['subj', loglik_col]].corr(method = 'pearson'))
        df.sort_values('subj').to_csv(f"./analysis/{analysis}_{cite_type}_subj.csv")

        # Histogram: subjectivity is on the x axis and the number of n-grams on
        # the y axis, so the labels are assigned accordingly (they were swapped).
        fig, ax1 = pyplot.subplots(figsize=(10, 10))
        sns.histplot(data=df, x='subj', ax=ax1)
        ax1.set(title=f"{function['name']} {analysis} subjectivity", xlabel=f"Subjectivity (0.0 very objective, 1.0 very subjective)", ylabel=f"Count of {analysis}")
        sns.despine(fig)
        fig.savefig(f'./analysis/{analysis}_{cite_type}_hist_plot.png')
        # Render the figure, then release it; an argument-less pyplot.plot()
        # neither displays nor closes anything, so figures piled up in the loop.
        pyplot.show()
        pyplot.close(fig)

        # Scatter: subjectivity against the log-likelihood column.
        fig, ax1 = pyplot.subplots(figsize=(10, 10))
        sns.scatterplot(data=df, x='subj', y=loglik_col, ax=ax1)
        ax1.set(title=f"{function['name']} {analysis} subjectivity versus log likelihood", ylabel="Log Likelihood", xlabel=f"Subjectivity (0.0 very objective, 1.0 very subjective)")
        sns.despine(fig)
        fig.savefig(f'./analysis/{analysis}_{cite_type}_subj_scatter.png')
        pyplot.show()
        pyplot.close(fig)
    