# Log Odds Analysis

In [8]:
import os 
import sys

sys.path.append('/home/kalkiek/projects/reddit-political-affiliation/')

import re
import itertools
import random
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import random_split
from tqdm.notebook import tqdm
from collections import defaultdict, Counter

import bz2
import lzma
import json
import zstandard as zstd
from json import JSONDecodeError
# from src.data.download_flair_data import parse_submissions, parse_zst_submissions

### Load in User Predictions

In [2]:
year_month = '2019-01'

in_file = '/shared/0/projects/reddit-political-affiliation/data/word2vec/predictions/users_' + year_month + '.tsv'
user_predictions = {}

with open(in_file, 'r') as f:
    for line in f:
        user, score = line.split('\t')
        user_predictions[user] = float(score.strip())

### Get the 'fringe' users from both sides

In [4]:
left, right = {}, {}

for user, score in user_predictions.items():
    if score >= 0.75:
        right[user] = score
    elif score <= 0.25:
        left[user] = score
        
print("Number of right users: {}".format(len(right)))
print("Number of left users: {}".format(len(left)))

Number of right users: 4386802
Number of left users: 419697


### Get word frequencies for all fringe users from this month

### Code to handle the raw (compressed) data 

In [10]:
def get_file_handle(file_path):
    ext = file_path.split('.')[-1]

    if ext == "bz2":
        return bz2.open(file_path)
    elif ext == "xz":
        return lzma.open(file_path)

    raise AssertionError("Invalid extension for " + file_path + ". Expecting bz2 or xz file")
    

def get_word_frequencies(file_pointer, left_users, right_users):
    left_word_freq, right_word_freq = Counter(), Counter()
    
    for count, line in enumerate(file_pointer):
        try:
            submission = json.loads(f.readline().strip())
            username, text = submission['author'], submission['body']

            if username in left_users:
                for word in text.split(' '):
                    left_word_freq[word] += 1
            elif username in right_users:
                for word in text.split(' '):
                    right_word_freq[word] += 1

        except (JSONDecodeError, AttributeError) as e:
            print("Failed to parse line: {} with error: {}".format(line, e))

        if count % 1000000 == 0 and count > 0:
            print("Completed %d lines" % (count))

    return left_word_freq, right_word_freq


def get_word_frequencies_zst_files(fname):
    # Will implement later
    pass


file_path = '/shared/2/datasets/reddit-dump-all/RC/RC_2019-01.xz'
f = get_file_handle(file_path)
left_counts, right_counts = get_word_frequencies(f, left, right)

print("Number of words from left users: {}".format(len(left_counts)))
print("Number of words from right users: {}".format(len(right_counts)))

Completed 1000000 lines
Completed 2000000 lines
Completed 3000000 lines
Completed 4000000 lines
Completed 5000000 lines
Completed 6000000 lines
Completed 7000000 lines
Completed 8000000 lines
Completed 9000000 lines
Completed 10000000 lines
Completed 11000000 lines
Completed 12000000 lines
Completed 13000000 lines
Completed 14000000 lines
Completed 15000000 lines
Completed 16000000 lines
Completed 17000000 lines
Completed 18000000 lines
Completed 19000000 lines
Completed 20000000 lines
Completed 21000000 lines
Completed 22000000 lines
Completed 23000000 lines
Completed 24000000 lines
Completed 25000000 lines
Completed 26000000 lines
Completed 27000000 lines
Completed 28000000 lines
Completed 29000000 lines
Completed 30000000 lines
Completed 31000000 lines
Completed 32000000 lines
Completed 33000000 lines
Completed 34000000 lines
Completed 35000000 lines
Completed 36000000 lines
Completed 37000000 lines
Completed 38000000 lines
Completed 41000000 lines
Completed 42000000 lines
Completed

### Write the counts to files

In [11]:
out_file = '/shared/0/projects/reddit-political-affiliation/data/word2vec/log-odds/' + year_month + '_left.json'

with open(out_file, 'w') as fp:
    json.dump(dict(left_counts), fp)
    
out_file = '/shared/0/projects/reddit-political-affiliation/data/word2vec/log-odds/' + year_month + '_right.json'

with open(out_file, 'w') as fp:
    json.dump(dict(right_counts), fp)

## Log Odds Code

In [12]:
def LoadCounts(filename, min_count=0, stopwords=set()):
    result = defaultdict(int)
    word_counts = json.load(open(filename))
    for word, count in word_counts.items():
        if count >= min_count and word not in stopwords:
              result[word] = count
    return result
  
def LoadStopwords(filename):
    stopwords = set()
    for line in open(filename):
        for word in line.split():
            if word:
                stopwords.add(word)
    return stopwords


def ComputeLogOdds(counts1, counts2, prior):
    sigmasquared = defaultdict(float)
    sigma = defaultdict(float)
    delta = defaultdict(float)

    for word in prior.keys():
        prior[word] = int(prior[word] + 0.5)

    for word in counts2.keys():
        counts1[word] = int(counts1[word] + 0.5)
        if prior[word] == 0:
            prior[word] = 1

    for word in counts1.keys():
        counts2[word] = int(counts2[word] + 0.5)
        if prior[word] == 0:
            prior[word] = 1

    n1  = sum(counts1.values())
    n2  = sum(counts2.values())
    nprior = sum(prior.values())


    for word in prior.keys():
        if prior[word] > 0:
            l1 = float(counts1[word] + prior[word]) / (( n1 + nprior ) - (counts1[word] + prior[word]))
            l2 = float(counts2[word] + prior[word]) / (( n2 + nprior ) - (counts2[word] + prior[word]))
            sigmasquared[word] =  1/(float(counts1[word]) + float(prior[word])) + 1/(float(counts2[word]) + float(prior[word]))
            sigma[word] =  math.sqrt(sigmasquared[word])
            delta[word] = ( math.log(l1) - math.log(l2) ) / sigma[word]

    return delta

### Run Log Odds

In [13]:
from nltk.corpus import stopwords

# SETTINGS
first_file = '/shared/0/projects/reddit-political-affiliation/data/word2vec/log-odds/' + year_month + '_left.json'
second_file = '/shared/0/projects/reddit-political-affiliation/data/word2vec/log-odds/' + year_month + '_right.json'
min_count = 100
stop = stopwords.words('english')
prior = defaultdict(lambda:0)
out_file = '/shared/0/projects/reddit-political-affiliation/data/word2vec/log-odds/' + year_month + '_results.tsv'

counts1 = LoadCounts(first_file, min_count, stop)
counts2 = LoadCounts(second_file, min_count, stop)
prior = LoadCounts(args.prior, args.min_count, stopwords)

delta = ComputeLogOdds(counts1, counts2, prior)

for word, log_odds in sorted(delta.items(), key=lambda x: x[1]):
    args.out_file.write("{}\t{:.3f}\n".format(word, log_odds))

LookupError: 
**********************************************************************
  Resource [93mstopwords[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('stopwords')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mcorpora/stopwords[0m

  Searched in:
    - '/home/kalkiek/nltk_data'
    - '/opt/anaconda/nltk_data'
    - '/opt/anaconda/share/nltk_data'
    - '/opt/anaconda/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


# Visualize the Results