## Data Pre-Processing

In [1]:
import json

In [2]:
def extract_data(filename):
    data = dict()
    with open(filename) as f:
        data = json.load(f)
    return data

In [3]:
def get_posts_titles(data):
    posts = dict()

    for k in data.keys():
        state_posts = []
        for p in data[k]:
            post = {
                "selftext": p["selftext"],
                "title": p["title"]
            }
            state_posts.append(post)
        posts[k] = state_posts
    return posts

In [4]:
def get_comments(data):
    comments = dict()

    for k in data.keys():
        state = data[k]
        state_comments = []
        
        for p in state.keys():
            for c in state[p]:
                comment = {
                    "body": c["body"]
                }
                state_comments.append(comment)
        
        comments[k] = state_comments
    return comments

In [5]:
# get raw data for after and before election
raw_after_posts = extract_data('data/after_election_posts_data.json')
raw_after_comments = extract_data('data/after_election_SAMPLE_comments_data.json')
raw_before_posts = extract_data('data/before_election_posts_data.json')
raw_before_comments = extract_data('data/before_election_SAMPLE_comments_data.json')

In [6]:
# extract just the useful data from both posts and comments
after_posts = get_posts_titles(raw_after_posts)
before_posts = get_posts_titles(raw_before_posts)

In [7]:
after_comments = get_comments(raw_after_comments)
before_comments = get_comments(raw_before_comments)

## Filter political posts

In [8]:
# list of words to look for when filtering political posts
keywords = [
    "election",
    "president",
    "presidential",
    "vote ", # take this out for topic modeling, put in for pure filtering
    "voting",
    "abortion",
    "democracy",
    "immigration",
    "economy",
    "war ",
    "ukraine",
    "israel",
    "palestine",
    "climate",
    "healthcare",
    "inflation"
]

In [9]:
# filter for political posts
def filter_political_posts(all_posts):
    political_posts = dict()
    
    for k in all_posts.keys():
        posts = []
    
        for post in all_posts[k]:
            for w in keywords:
                if w in post["selftext"].lower() or w in post["title"].lower():
                    posts.append(post)
    
        political_posts[k] = posts
    return political_posts

In [10]:
after_political_posts = filter_political_posts(after_posts)
before_political_posts = filter_political_posts(before_posts)

In [11]:
# print out size of political posts per state - for testing
def count_political_posts(political_posts):
    pol_posts_size = dict()
    for k in political_posts.keys():
        pol_posts_size[k] = len(political_posts[k])
    return pol_posts_size

In [12]:
count_political_posts(after_political_posts)

{'california': 18,
 'michigan': 50,
 'colorado': 3,
 'oregon': 51,
 'hawaii': 14,
 'oklahoma': 24,
 'maryland': 42,
 'arizona': 8,
 'virginia': 99,
 'maine': 63,
 'indiana': 41,
 'iowa': 98,
 'washington': 31,
 'newhampshire': 38,
 'alaska': 41,
 'louisiana': 31,
 'vermont': 42,
 'newyork': 2,
 'arkansas': 4,
 'alabama': 7,
 'kentucky': 12,
 'southcarolina': 2,
 'georgia': 38,
 'montana': 4,
 'delaware': 14,
 'utah': 26,
 'rhodeisland': 20,
 'missouri': 123,
 'tennessee': 2,
 'nebraska': 29,
 'illinois': 45,
 'westvirginia': 11,
 'newmexico': 33,
 'mississippi': 17,
 'kansas': 36,
 'northdakota': 6,
 'idaho': 39,
 'southdakota': 19,
 'wyoming': 8,
 'nevada': 6}

In [13]:
count_political_posts(before_political_posts)

{'california': 33,
 'michigan': 183,
 'colorado': 9,
 'oregon': 83,
 'hawaii': 29,
 'oklahoma': 67,
 'maryland': 133,
 'arizona': 11,
 'virginia': 101,
 'maine': 64,
 'indiana': 80,
 'iowa': 165,
 'washington': 65,
 'newhampshire': 88,
 'alaska': 68,
 'louisiana': 69,
 'vermont': 45,
 'newyork': 14,
 'arkansas': 16,
 'alabama': 26,
 'kentucky': 4,
 'southcarolina': 54,
 'georgia': 196,
 'montana': 4,
 'delaware': 59,
 'utah': 73,
 'rhodeisland': 36,
 'missouri': 376,
 'tennessee': 25,
 'nebraska': 129,
 'illinois': 61,
 'westvirginia': 41,
 'newmexico': 41,
 'mississippi': 22,
 'kansas': 82,
 'northdakota': 17,
 'idaho': 93,
 'southdakota': 54,
 'wyoming': 10,
 'nevada': 12}

In [14]:
# filter for political comments
def filter_political_comments(all_comments):
    political_comments = dict()
    
    for k in all_comments.keys():
        comments = []
        for c in all_comments[k]:
           for w in keywords:
               if w in c["body"]:
                   comments.append(c)
        political_comments[k] = comments
    return political_comments

In [15]:
after_political_comments = filter_political_comments(after_comments)
before_political_comments = filter_political_comments(before_comments)

In [16]:
count_political_posts(after_political_comments)

{'nevada': 593, 'wyoming': 198}

In [17]:
count_political_posts(before_political_comments)

{'nevada': 459, 'wyoming': 317}

## Filter posts by candidate

In [18]:
candidate_keywords = {
    "trump": ["trump", "donald", "donald trump", "republican"],
    "harris": ["harris", "kamala", "kamala harris", "democrat"]
}

In [19]:
# filter for candidate posts
def filter_candidate_posts(all_posts):
    candidate_posts = dict()
    
    for k in all_posts.keys():
        candidate_posts[k] = dict()
        
        for candidate in candidate_keywords.keys():
            for w in candidate_keywords[candidate]:
                posts = set()
                
                for post in all_posts[k]:
                    if w in post["selftext"].lower() or w in post["title"].lower():
                        posts.add(post["selftext"] + " " + post["title"])
        
            candidate_posts[k][candidate] = list(posts)
    return candidate_posts

In [20]:
after_candidate_posts = filter_candidate_posts(after_posts)
before_candidate_posts = filter_candidate_posts(before_posts)

In [21]:
# print out size of political posts per state - for testing
def count_candidate_posts(candidate_posts):
    can_posts_size = dict()
    for k in candidate_posts.keys():
        can_posts_size[k] = dict()
        for candidate in candidate_keywords.keys():
            can_posts_size[k][candidate] = len(candidate_posts[k][candidate])
    return can_posts_size

In [22]:
count_candidate_posts(after_candidate_posts)

{'california': {'trump': 2, 'harris': 2},
 'michigan': {'trump': 13, 'harris': 12},
 'colorado': {'trump': 0, 'harris': 1},
 'oregon': {'trump': 2, 'harris': 3},
 'hawaii': {'trump': 3, 'harris': 0},
 'oklahoma': {'trump': 1, 'harris': 1},
 'maryland': {'trump': 1, 'harris': 5},
 'arizona': {'trump': 1, 'harris': 0},
 'virginia': {'trump': 8, 'harris': 19},
 'maine': {'trump': 4, 'harris': 6},
 'indiana': {'trump': 4, 'harris': 2},
 'iowa': {'trump': 11, 'harris': 13},
 'washington': {'trump': 3, 'harris': 5},
 'newhampshire': {'trump': 9, 'harris': 8},
 'alaska': {'trump': 5, 'harris': 5},
 'louisiana': {'trump': 4, 'harris': 4},
 'vermont': {'trump': 6, 'harris': 7},
 'newyork': {'trump': 1, 'harris': 0},
 'arkansas': {'trump': 0, 'harris': 1},
 'alabama': {'trump': 1, 'harris': 0},
 'kentucky': {'trump': 2, 'harris': 2},
 'southcarolina': {'trump': 1, 'harris': 0},
 'georgia': {'trump': 0, 'harris': 2},
 'montana': {'trump': 0, 'harris': 1},
 'delaware': {'trump': 4, 'harris': 3},
 

In [23]:
count_candidate_posts(before_candidate_posts)

{'california': {'trump': 0, 'harris': 1},
 'michigan': {'trump': 13, 'harris': 12},
 'colorado': {'trump': 0, 'harris': 0},
 'oregon': {'trump': 2, 'harris': 3},
 'hawaii': {'trump': 0, 'harris': 0},
 'oklahoma': {'trump': 5, 'harris': 3},
 'maryland': {'trump': 7, 'harris': 9},
 'arizona': {'trump': 0, 'harris': 0},
 'virginia': {'trump': 7, 'harris': 6},
 'maine': {'trump': 2, 'harris': 3},
 'indiana': {'trump': 11, 'harris': 6},
 'iowa': {'trump': 14, 'harris': 19},
 'washington': {'trump': 5, 'harris': 3},
 'newhampshire': {'trump': 10, 'harris': 6},
 'alaska': {'trump': 6, 'harris': 5},
 'louisiana': {'trump': 4, 'harris': 2},
 'vermont': {'trump': 5, 'harris': 4},
 'newyork': {'trump': 0, 'harris': 0},
 'arkansas': {'trump': 0, 'harris': 1},
 'alabama': {'trump': 1, 'harris': 1},
 'kentucky': {'trump': 0, 'harris': 0},
 'southcarolina': {'trump': 3, 'harris': 2},
 'georgia': {'trump': 13, 'harris': 10},
 'montana': {'trump': 0, 'harris': 0},
 'delaware': {'trump': 1, 'harris': 7}

In [24]:
# filter for candidate comments
def filter_candidate_comments(all_comments):
    candidate_comments = dict()
    
    for k in all_comments.keys():
        candidate_comments[k] = dict()
        
        for candidate in candidate_keywords.keys():
            for w in candidate_keywords[candidate]:
                comments = set()
                
                for post in all_comments[k]:
                    if w in post["body"].lower():
                        comments.add(post["body"])
        
            candidate_comments[k][candidate] = list(comments)
    return candidate_comments

In [25]:
after_candidate_comments = filter_candidate_comments(after_comments)
before_candidate_comments = filter_candidate_comments(before_comments)

In [26]:
count_candidate_posts(after_candidate_comments)

{'nevada': {'trump': 88, 'harris': 101},
 'wyoming': {'trump': 27, 'harris': 13}}

In [27]:
count_candidate_posts(before_candidate_comments)

{'nevada': {'trump': 82, 'harris': 84}, 'wyoming': {'trump': 64, 'harris': 34}}

In [28]:
# DATA FORMAT
# <after/before>_political_posts contains posts that contain at least one of our defined keywords,
# after or before the election respectively.

# <after/before>_political_posts = {
#    'texas': [
#        {
#            'selftext': "__",
#            'title': "__"
#        },
#        {
#            'selftext': "__",
#            'title': "__"}
#        ...
#    ],
#    'california': [
#        {
#            'selftext': "__",
#            'title': "__"
#        },
#        ...
#    ]
# }

# <after/before>_political_comments contains comments that contain at least one of our defined keywords,
# after or before the election respectively.

# <after/before>_political_posts = {
#    'texas': [
#        {
#            'body': "__"
#        },
#        {
#            'body': "__"
#        },
#        ...
#    ],
#    'california': [
#        {
#            'body': "__"
#        },
#        ...
#    ]
# }

# <after/before>_candidate_<posts/comments> contains posts/comments that contain words about each candidate (as defined in candidate_keywords),
# after or before the election respectively.

# <after/before>_candidate_<posts/comments> = {
#    'texas': {
#        'trump': [__, __],
#        'harris': [__, __],
#    },
#    'california': {
#        'trump': [__, __],
#        'harris': [__, __],
#    },
#    ...
# }

## Sentiment Analysis

In [29]:
# sentiment analysis
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
import numpy as np
from scipy.special import softmax
import torch

In [30]:
MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)

In [31]:
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

2024-11-29 23:38:37.212903: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model 

In [32]:
## TEST - must use PyTorch version, the tensorflow one is not as accurate
text = "I love you!"
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)

scores = output[0][0].detach().numpy()
scores = softmax(scores)

ranking = np.argsort(scores)
ranking = ranking[::-1]
print(ranking)
print(config.id2label)

for i in range(scores.shape[0]):
    l = config.id2label[ranking[i]]
    s = scores[ranking[i]]
    print(f"{i+1}) {l} {np.round(float(s), 4)}")

[2 1 0]
{0: 'negative', 1: 'neutral', 2: 'positive'}
1) positive 0.9749
2) neutral 0.0208
3) negative 0.0043


In [33]:
# compute highest sentiment score and label for given text
def get_sentiment_label_score(text):
    # get output from model
    encoded_input = tokenizer(text, return_tensors='pt', max_length=512, truncation=True)
    output = model(**encoded_input)

    # compute softmax scores
    scores = output[0][0].detach().numpy()
    scores = softmax(scores)

    # ranking = list of labels in decreasing order of sentiment score
    ranking = np.argsort(scores)[::-1]

    # return dictionary of {ranking: score} for given text
    # sentiment_map = dict()
    # for i in range(scores.shape[0]):
    #     l = config.id2label[ranking[i]]
    #     s = scores[ranking[i]]
    #     sentiment_map[l] = s

    # return largest score and the associated sentiment
    l = config.id2label[ranking[0]]
    s = scores[ranking[0]]
    return l, s

## Take sentiment score for each candidate

Get sentiment score for each sentiment. Might look like:
```
'texas': {
    'trump': {
        'positive': {
           'scores': [0, 0, ...],
           'average': 0.0,
           'max': 0.0,
           'min': 0.0,
           'num_posts': 0
       },
       'neutral': {
           'scores': [0, 0, ...],
           'average': 0.0,
           'max': 0.0,
           'min': 0.0,
           'num_posts': 0
       }
       'negative': {
           'scores': [0, 0, ...],
           'average': 0.0,
           'max': 0.0,
           'min': 0.0,
           'num_posts': 0
       },
       'sentiment': 'positive',
       'avg_sentiment': 0.0
    },
    'harris': {
        'positive': {
            ...
       },
       'neutral': {
           ...
       }
       'negative': {
           ...
       },
       'sentiment': 'positive',
       'avg_sentiment': 0.0
    },
}

In [34]:
# helper for creating the json above
def record_sentiment(sent_data, score):
    if "scores" in sent_data.keys():
        sent_data["scores"].append(score)
    else:
        sent_data["scores"] = [score]

    if "num_posts" in sent_data.keys():
        sent_data["num_posts"] += 1
    else:
        sent_data["num_posts"] = 1

In [91]:
def get_sentiment_scores_grouped_posts(grouped_posts):
    all_sentiment = dict()
    
    for k in grouped_posts.keys():
        all_sentiment[k] = dict()
        
        for candidate in grouped_posts[k].keys():
            all_sentiment[k][candidate] = dict()
            candidate_data = {
                'positive': {},
                'neutral': {},
                'negative': {},
            }

            for post in grouped_posts[k][candidate]:
                label, score = get_sentiment_label_score(post)
                record_sentiment(candidate_data[label], score)
                
                all_sentiment[k][candidate] = candidate_data
    return all_sentiment

In [102]:
# after_cand_sentiments = get_sentiment_scores_grouped_posts(after_candidate_posts)

In [103]:
# before_cand_sentiments = get_sentiment_scores_grouped_posts(before_candidate_posts)

In [104]:
# after_cand_comment_sentiments = get_sentiment_scores_grouped_posts(after_candidate_comments)

In [105]:
# before_cand_comment_sentiments = get_sentiment_scores_grouped_posts(before_candidate_comments)

In [96]:
# class to make numpy types JSON serializable
class NumpyEncoder(json.JSONEncoder):
    """ Special json encoder for numpy types """
    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        elif isinstance(obj, np.floating):
            return float(obj)
        elif isinstance(obj, np.ndarray):
            return obj.tolist()
        return json.JSONEncoder.default(self, obj)

def save_data(filename, data):
    with open(filename, 'w') as f:
        json.dump(data, f, cls=NumpyEncoder, indent=4)

In [97]:
# save_data('after_cand_post_sentiments.json', after_cand_sentiments)
# save_data('before_cand_post_sentiments.json', before_cand_sentiments)
# save_data('after_cand_comment_sentiments.json', after_cand_comment_sentiments)
# save_data('before_cand_comment_sentiments.json', before_cand_comment_sentiments)

In [98]:
after_cand_sentiments = extract_data('after_cand_post_sentiments.json')
before_cand_sentiments = extract_data('before_cand_post_sentiments.json')
after_cand_comment_sentiments = extract_data('after_cand_comment_sentiments.json')
before_cand_comment_sentiments = extract_data('before_cand_comment_sentiments.json')

In [112]:
# get average, min, max, and dominant sentiment for each state
def get_sentiment_stats(all_sentiment):
    stats = all_sentiment.copy()
    
    for k in stats.keys():
        state_sent = stats[k].copy()
        
        for c in state_sent.keys():
            avgs = dict()
            candidate = state_sent[c].copy()

            for s in candidate.keys():
                sent = candidate[s].copy()
                if 'scores' not in sent.keys():
                    sent['scores'] = [0]
                    
                sent['min'] = min(sent['scores'])
                sent['max'] = max(sent['scores'])
                sent['average'] = np.mean(sent['scores'])
                avgs[sent['average']] = s
                
                if 'num_posts' not in sent.keys():
                    sent['num_posts'] = 0
                candidate[s] = sent
            
            if len(avgs.keys()) > 0:
                avg_sentiment = max(avgs.keys())
                sentiment = avgs[avg_sentiment]
                candidate['avg_sentiment'] = avg_sentiment
                candidate['sentiment'] = sentiment
                state_sent[c] = candidate
        stats[k] = state_sent
    return stats

In [115]:
after_cand_stats = get_sentiment_stats(after_cand_sentiments)
before_cand_stats = get_sentiment_stats(before_cand_sentiments)
after_cand_comment_stats = get_sentiment_stats(after_cand_comment_sentiments)
before_cand_comment_stats = get_sentiment_stats(before_cand_comment_sentiments)

In [116]:
save_data('data/candidate_sentiments/after_cand_post_sentiments.json', after_cand_stats)
save_data('data/candidate_sentiments/before_cand_post_sentiments.json', before_cand_stats)
save_data('data/candidate_sentiments/after_cand_comment_sentiments.json', after_cand_comment_stats)
save_data('data/candidate_sentiments/before_cand_comment_sentiments.json', before_cand_comment_stats)

## Political Direction Analysis
Get political direction for each topic. Might look like:
```
'texas': {
    'election': {
        'left': {
           'scores': [0, 0, ...],
           'average': 0.0,
           'max': 0.0,
           'min': 0.0,
           'num_posts': 0
       },
       'center': {
           'scores': [0, 0, ...],
           'average': 0.0,
           'max': 0.0,
           'min': 0.0,
           'num_posts': 0
       }
       'right': {
           'scores': [0, 0, ...],
           'average': 0.0,
           'max': 0.0,
           'min': 0.0,
           'num_posts': 0
       },
       'direction': 'left/center/right',
       'avg_score': 0.0
    },
    'abortion': {
        'left': {
            ...
       },
       'center': {
            ...
       }
       'right': {
            ...
       },
       'direction': 'left/center/right',
       'avg_score': 0.0
    ...
},
'california': {
    'election': {
        ...
    },
    ...
}

In [156]:
# we will group the keywords we came up with into broader topics
topics_dict = {
    'election': ['election', 'president', 'vote', 'voting'],
    'abortion': ['abortion', 'reproductive rights'],
    'immigration': ['immigration', 'immigrant', 'refugee'],
    'economy': ['economy', 'inflation', 'tax'],
    'war': ['war', 'ukraine', 'israel', 'palestine'],
    'democracy': ['democracy', 'freedom', 'stop the steal'],
    'climate': ['climate', 'climate change', 'global warming'],
    'healthcare': ['healthcare', 'health', 'medical']
}

In [167]:
# group filtered political posts 
def group_political_posts(posts):
    grouped_posts = dict()
        
    for k in posts.keys():
        grouped_posts[k] = dict()
        state_posts = posts[k]

        for topic in topics_dict.keys():
            post_set = set()
            
            for post in state_posts:
                for word in topics_dict[topic]:
                    if word in post["selftext"].lower() or word in post["title"].lower():
                        post_set.add(post["selftext"] + " " + post["title"])
                grouped_posts[k][topic] = list(post_set)
    return grouped_posts

In [168]:
grouped_posts_after = group_political_posts(after_political_posts)
grouped_posts_before = group_political_posts(before_political_posts)

In [170]:
# group filtered political comments 
def group_political_comments(comments):
    grouped_comments = dict()
        
    for k in comments.keys():
        grouped_comments[k] = dict()
        state_comments = comments[k]

        for topic in topics_dict.keys():
            comment_set = set()
                
            for comment in state_comments:
                for word in topics_dict[topic]:
                    if word in comment["body"].lower():
                        comment_set.add(comment["body"])
                grouped_comments[k][topic] = list(comment_set)
    return grouped_comments

In [171]:
grouped_comments_after = group_political_comments(after_political_comments)
grouped_comments_before = group_political_comments(before_political_comments)

In [172]:
# print out size of grouped posts per state - for testing
def count_grouped_posts(grouped_posts):
    group_posts_size = dict()
    for k in grouped_posts.keys():
        group_posts_size[k] = dict()
        
        for t in grouped_posts[k].keys(): 
            group_posts_size[k][t] = len(grouped_posts[k][t])
    return group_posts_size

In [176]:
#count_grouped_posts(grouped_posts_after)

In [174]:
count_grouped_posts(grouped_comments_after)

{'nevada': {'election': 358,
  'abortion': 5,
  'immigration': 13,
  'economy': 30,
  'war': 44,
  'democracy': 16,
  'climate': 6,
  'healthcare': 21},
 'wyoming': {'election': 43,
  'abortion': 90,
  'immigration': 5,
  'economy': 9,
  'war': 11,
  'democracy': 6,
  'climate': 10,
  'healthcare': 62}}

In [179]:
pol_dir_tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
pol_dir_model = AutoModelForSequenceClassification.from_pretrained("bucketresearch/politicalBiasBERT")
dir_map = {
    0: 'left',
    1: 'center',
    2: 'right'
}

In [187]:
# TEST
text = "I don't believe in climate change"
inputs = pol_dir_tokenizer(text, return_tensors="pt")
labels = torch.tensor([0])
outputs = pol_dir_model(**inputs, labels=labels)
loss, logits = outputs[:2]
print(logits.softmax(dim=-1)[0].tolist())

[0.24744857847690582, 0.17667832970619202, 0.5758731365203857]


In [207]:
# compute highest direction score and label for given text
def get_direction_label_score(text):
    inputs = pol_dir_tokenizer(text, return_tensors="pt", max_length=512, truncation=True)
    labels = torch.tensor([0])
    outputs = pol_dir_model(**inputs, labels=labels)
    loss, logits = outputs[:2]

    # ranking = list of labels in decreasing order of direction score
    scores = logits.softmax(dim=-1)[0].tolist()
    ranking = np.argsort(logits.softmax(dim=-1)[0].tolist())[::-1]

    # return largest score and the associated sentiment
    l = dir_map[ranking[0]]
    s = scores[ranking[0]]
    return l, s

In [208]:
#TEST
get_direction_label_score("trans lives matter")

('left', 0.4164985120296478)

In [209]:
def get_direction_scores_grouped_posts(grouped_posts):
    all_directions = dict()
    
    for k in grouped_posts.keys():
        all_directions[k] = dict()
        
        for topic in grouped_posts[k].keys():
            all_directions[k][topic] = dict()
            topic_data = {
                'left': {},
                'center': {},
                'right': {},
            }

            for post in grouped_posts[k][topic]:
                direction, score = get_direction_label_score(post)
                record_sentiment(topic_data[direction], score)
                
                all_directions[k][topic] = topic_data
    return all_directions

In [210]:
after_cand_post_dir = get_direction_scores_grouped_posts(grouped_posts_after)

In [216]:
# before_cand_post_dir = get_direction_scores_grouped_posts(grouped_posts_before)

In [213]:
after_cand_comment_dir = get_direction_scores_grouped_posts(grouped_comments_after)

In [217]:
# before_cand_comment_dir = get_direction_scores_grouped_posts(grouped_comments_before)

In [214]:
save_data('after_cand_post_directions.json', after_cand_post_dir)
save_data('after_cand_comment_directions.json', after_cand_comment_dir)

In [218]:
# get average, min, max, and dominant sentiment for each state
def get_direction_stats(all_directions):
    stats = all_directions.copy()
    
    for k in stats.keys():
        state_dir = stats[k].copy()
        
        for d in state_dir.keys():
            avgs = dict()
            topics = state_dir[d].copy()

            for t in topics.keys():
                topic = topics[t].copy()
                if 'scores' not in topic.keys():
                    topic['scores'] = [0]
                    
                topic['min'] = min(topic['scores'])
                topic['max'] = max(topic['scores'])
                topic['average'] = np.mean(topic['scores'])
                avgs[topic['average']] = t
                
                if 'num_posts' not in topic.keys():
                    topic['num_posts'] = 0
                topics[t] = topic
            
            if len(avgs.keys()) > 0:
                avg_score = max(avgs.keys())
                direction = avgs[avg_score]
                topics['avg_score'] = avg_score
                topics['direction'] = direction
                state_dir[d] = topics
        stats[k] = state_dir
    return stats

In [219]:
after_topic_post_stats = get_direction_stats(after_cand_post_dir)
after_topic_comment_stats = get_direction_stats(after_cand_comment_dir)

In [221]:
save_data('data/topic_pol_directions/after_topic_post_stats.json', after_topic_post_stats)
save_data('data/topic_pol_directions/after_topic_comment_stats.json', after_topic_comment_stats)

# ===== OLD STUFF ===== DO NOT RUN =====

## Option 1: take sentiment of entire post, give an average rating

Get the aggregated sentiment score for each post. Might want something like:
```all_sentiments = {
    'texas': {
        'positive': {
           'scores': [0, 0, ...],
           'average': 0.0,
           'max': 0.0,
           'min': 0.0,
           'num_posts': 0
       },
       'neutral': {
           'scores': [0, 0, ...],
           'average': 0.0,
           'max': 0.0,
           'min': 0.0,
           'num_posts': 0
       }
       'negative': {
           'scores': [0, 0, ...],
           'average': 0.0,
           'max': 0.0,
           'min': 0.0,
           'num_posts': 0
       },
       'sentiment': 'positive',
       'avg_sentiment': 0.0
    },
    ...
}

In [24]:
# helper for creating the json above
def record_sentiment(sent_data, score):
    if "scores" in sent_data.keys():
        sent_data["scores"].append(score)
    else:
        sent_data["scores"] = [score]

    if "num_posts" in sent_data.keys():
        sent_data["num_posts"] += 1
    else:
        sent_data["num_posts"] = 1

In [25]:
# iterate through each state and compute highest weight sentiment for each post
def get_sentiment_scores_posts(political_posts):
    all_sentiment = dict()
    
    for k in political_posts.keys():
        state_data = {
            'positive': {},
            'neutral': {},
            'negative': {},
        }
        
        for post in political_posts[k]:
            # as we are not working with semantic meaning, we will combine the text and titles of the posts
            content = post['title'] + " " + post['selftext']
            label, score = get_sentiment_label_score(content)
            record_sentiment(state_data[label], score)
            
            all_sentiment[k] = state_data
    return all_sentiment

In [26]:
# this is the comments version of the above
def get_sentiment_scores_comments(political_comments):
    all_sentiment = dict()
    
    for k in political_comments.keys():
        state_data = {
            'positive': {},
            'neutral': {},
            'negative': {},
        }
        
        for post in political_comments[k]:
            content = post['body']
            label, score = get_sentiment_label_score(content)
            record_sentiment(state_data[label], score)
            
            all_sentiment[k] = state_data
    return all_sentiment

In [27]:
#after_post_sentiments = get_sentiment_scores_posts(after_political_posts)

In [28]:
#before_post_sentiments = get_sentiment_scores_posts(before_political_posts)

In [32]:
#after_comment_sentiments = get_sentiment_scores_comments(after_political_comments)

In [33]:
#before_comment_sentiments = get_sentiment_scores_comments(before_political_comments)

In [None]:
# after_post_sentiments = extract_data('reddit-sentiment-analysis/sentiments/all_posts/after_post_sentiments.json')
# before_post_sentiments = extract_data('reddit-sentiment-analysis/sentiments/all_posts/before_post_sentiments.json')
# after_comment_sentiments = extract_data('reddit-sentiment-analysis/sentiments/all_posts/after_comments_sentiments.json')
# before_comment_sentiments = extract_data('reddit-sentiment-analysis/sentiments/all_posts/before_comments_sentiments.json')

In [42]:
# get average, min, max, and dominant sentiment for each state
def get_sentiment_stats(all_sentiment):
    stats = dict()
    for k in all_sentiment.keys():
        avgs = dict()
        stats[k] = dict()
        state_sent = all_sentiment[k]
        
        for s in state_sent.keys():
            sent = state_sent[s]
            if 'scores' not in sent.keys():
                sent['scores'] = [0]
                
            sent['min'] = min(sent['scores'])
            sent['max'] = max(sent['scores'])
            sent['average'] = np.mean(sent['scores'])
            avgs[sent['average']] = s
            
            if 'num_posts' not in sent.keys():
                sent['num_posts'] = 0
    
        avg_sentiment = max(avgs.keys())
        sentiment = avgs[avg_sentiment]
        stats[k]['avg_sentiment'] = avg_sentiment
        stats[k]['sentiment'] = sentiment
    return stats

In [43]:
after_post_sentiments_stats = get_sentiment_stats(after_post_sentiments)
before_post_sentiments_stats = get_sentiment_stats(before_post_sentiments)

In [45]:
after_comment_sentiments_stats = get_sentiment_stats(after_comment_sentiments)
before_comment_sentiments_stats = get_sentiment_stats(before_comment_sentiments)

In [51]:
save_data('after_post_sentiments.json', after_post_sentiments)
save_data('before_post_sentiments.json', before_post_sentiments)
save_data('after_comment_sentiments.json', after_comment_sentiments)
save_data('before_comment_sentiments.json', before_comment_sentiments)

In [52]:
save_data('after_post_stats.json', after_post_sentiments_stats)
save_data('before_post_stats.json', before_post_sentiments_stats)
save_data('after_comment_stats.json', after_comment_sentiments_stats)
save_data('before_comment_stats.json', before_comment_sentiments_stats)

## Option 2: Get sentiment for each topic per state, print the same statistics as above
Further subdivide the political posts by their main topics. Data will look something like this:
```all_sentiments = {
    'texas': {
        'election': {
            'positive': {
                   'scores': [0, 0, ...],
                   'average': 0.0,
                   'max': 0.0,
                   'min': 0.0,
                   'num_posts': 0
               },
               'neutral': {
                   'scores': [0, 0, ...],
                   'average': 0.0,
                   'max': 0.0,
                   'min': 0.0,
                   'num_posts': 0
               }
               'negative': {
                   'scores': [0, 0, ...],
                   'average': 0.0,
                   'max': 0.0,
                   'min': 0.0,
                   'num_posts': 0
               },
               'sentiment': 'positive',
               'avg_sentiment': 0.0
            },
        },
        'republican': {
            'positive': {
                ...
               },
               'neutral': {
                ...
               }
               'negative': {
                ...
               },
               'sentiment': 'positive',
               'avg_sentiment': 0.0
            },
        },
        ...
    },
    ...
}

In [91]:
# group filtered political posts 
def group_political_posts(posts):
    grouped_posts = dict()
        
    for k in posts.keys():
        grouped_posts[k] = dict()
        state_posts = posts[k]

        for post in state_posts:
            for topic in topics_dict.keys():
                post_set = set()
                
                for word in topics_dict[topic]:
                    if word in post["selftext"].lower() or word in post["title"].lower():
                        post_set.add(post["selftext"] + " " + post["title"])
                grouped_posts[k][topic] = list(post_set)
    return grouped_posts

In [94]:
# group filtered political comments 
def group_political_comments(comments):
    grouped_comments = dict()
        
    for k in comments.keys():
        grouped_comments[k] = dict()
        state_comments = comments[k]

        for comment in state_comments:
            for topic in topics_dict.keys():
                comment_set = set()
                
                for word in topics_dict[topic]:
                    if word in comment["body"].lower():
                        comment_set.add(comment["body"])
                grouped_comments[k][topic] = list(comment_set)
    return grouped_comments

In [95]:
grouped_posts_after = group_political_posts(after_political_posts)
grouped_posts_before = group_political_posts(before_political_posts)

In [96]:
grouped_posts_after['michigan']

{'election': [],
 'republican': [' ‘Bullish on Michigan’: As polls close, Republicans are optimistic on Trump’s chances'],
 'democrat': [],
 'abortion': [],
 'immigration': [],
 'economy': [],
 'war': [],
 'democracy': [],
 'climate': [],
 'healthcare': []}

In [97]:
grouped_comments_after = group_political_comments(after_political_comments)
grouped_comments_before = group_political_comments(before_political_comments)

In [98]:
grouped_comments_after['nevada']

{'election': ['So it sounds like me, you, and David Duke would be pretty close on Israel policy; if you nut jobs wanna blow each other up, be my guest, but not with my guns, not with my taxes, and sure as hell not with American lives. Would you agree with this?\n\nAnd yes, Cheney did shoot a guy in the face. I think it\'s maybe one of the nicer things Dick has ever done in his life. He also lied to the entire world about WMD\'s to push the United States into an unquestionably immoral war, killing thousands of Americans and hundreds of thousands of Iraqis. David Duke, on the other hand, is a loud racist idiot that hasn\'t done anything meaningful in his entire life. I think one of these people is a clear threat to freedom and democracy in this country. \n\nAlso, I find it a bit ironic that in your mind, "whoever the Nazis are voting for" has been determined to be Jill Stein, the farthest left and only Jewish candidate.'],
 'republican': [],
 'democrat': [],
 'abortion': [],
 'immigratio

In [99]:
# print out size of grouped posts per state - for testing
def count_grouped_posts(grouped_posts):
    group_posts_size = dict()
    for k in grouped_posts.keys():
        group_posts_size[k] = dict()
        
        for t in grouped_posts[k].keys(): 
            group_posts_size[k][t] = len(grouped_posts[k][t])
    return group_posts_size

In [101]:
#count_grouped_posts(grouped_posts_after)

In [67]:
#count_grouped_posts(grouped_posts_before)

In [103]:
#count_grouped_posts(grouped_comments_after)

In [69]:
#count_grouped_posts(grouped_comments_before)

In [110]:
def get_sentiment_scores_grouped_posts(grouped_posts):
    all_sentiment = dict()
    
    for k in grouped_posts.keys():
        all_sentiment[k] = dict()
        
        for topic in grouped_posts[k].keys():
            all_sentiment[k][topic] = dict()
            topic_data = {
                'positive': {},
                'neutral': {},
                'negative': {},
            }

            for post in grouped_posts[k][topic]:
                label, score = get_sentiment_label_score(post)
                record_sentiment(topic_data[label], score)
                
                all_sentiment[k][topic] = topic_data
    return all_sentiment

In [113]:
post_sentiment_per_topic_after = get_sentiment_scores_grouped_posts(grouped_posts_after)
post_sentiment_per_topic_before = get_sentiment_scores_grouped_posts(grouped_posts_before)

In [114]:
comment_sentiment_per_topic_after = get_sentiment_scores_grouped_posts(grouped_comments_after)
comment_sentiment_per_topic_before = get_sentiment_scores_grouped_posts(grouped_comments_before)

In [123]:
# get average, min, max, and dominant sentiment for each topic per state
def get_sentiment_stats(all_sentiment):
    stats = dict()
    for k in all_sentiment.keys():
        stats[k] = dict()
        
        for topic in all_sentiment[k].keys():
            stats[k][topic] = dict()
            topic_sent = all_sentiment[k][topic]
            avgs = dict()
        
            for s in topic_sent.keys():
                sent = topic_sent[s]
                if 'scores' not in sent.keys():
                    sent['scores'] = [0]
                    
                sent['min'] = min(sent['scores'])
                sent['max'] = max(sent['scores'])
                sent['average'] = np.mean(sent['scores'])
                avgs[sent['average']] = s
                
                if 'num_posts' not in sent.keys():
                    sent['num_posts'] = 0
        
            if len(avgs.keys()) > 0:
                avg_sentiment = max(avgs.keys())
                sentiment = avgs[avg_sentiment]
                stats[k][topic]['avg_sentiment'] = avg_sentiment
                stats[k][topic]['sentiment'] = sentiment
    return stats

In [126]:
post_stats_per_topic_after = get_sentiment_stats(post_sentiment_per_topic_after)
post_stats_per_topic_before = get_sentiment_stats(post_sentiment_per_topic_before)

In [127]:
comment_stats_per_topic_after = get_sentiment_stats(comment_sentiment_per_topic_after)
comment_stats_per_topic_before = get_sentiment_stats(comment_sentiment_per_topic_before)

In [130]:
save_data('after_post_sentiments_per_topic.json', post_sentiment_per_topic_after)
save_data('before_post_sentiments_per_topic.json', post_sentiment_per_topic_before)
save_data('after_comment_sentiments_per_topic.json', comment_sentiment_per_topic_after)
save_data('before_comment_sentiments_per_topic.json', comment_sentiment_per_topic_before)

In [131]:
save_data('after_post_stats_per_topic.json', post_stats_per_topic_after)
save_data('before_post_stats_per_topic.json', post_stats_per_topic_before)
save_data('after_comment_stats_per_topic.json', comment_stats_per_topic_after)
save_data('before_comment_stats_per_topic.json', comment_stats_per_topic_before)

## Testing Topic Modeling for Political Data Extraction
Probably don't run the code below.

In [92]:
from bertopic import BERTopic
topic_model = BERTopic.load("MaartenGr/BERTopic_Wikipedia")

model_topics = topic_model.get_topic_info()

In [93]:
model_topics

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,633881,-1_cast_films_film_movie,"[cast, films, film, movie, 2020, comedy, relea...",
1,0,18441,0_goalscorer_scored_goals_goal,"[goalscorer, scored, goals, goal, goalkeeper, ...",
2,1,8518,1_khan_actor_raj_shah,"[khan, actor, raj, shah, crore, hai, actress, ...",
3,2,7521,2_married_divorced_couple_remarried,"[married, divorced, couple, remarried, engaged...",
4,3,6765,3_cast_actress_starred_actor,"[cast, actress, starred, actor, actors, starri...",
...,...,...,...,...,...
2372,2371,30,2371_paintings_painting_paint_art,"[paintings, painting, paint, art, artist, gall...",
2373,2372,30,2372_tulips_tulip_economists_economic,"[tulips, tulip, economists, economic, bulbs, 1...",
2374,2373,30,2373_squads_squad_roster_players,"[squads, squad, roster, players, teams, tourna...",
2375,2374,30,2374_entrances_subterranean_tunnel_stairs,"[entrances, subterranean, tunnel, stairs, pyra...",


In [95]:
# Taking the numbers of topics that contain our list of keywords
political_topic_nums = []
for w in keywords:
    for i in range(len(model_topics["Representation"])):
        if w in model_topics["Representation"][i]:
            political_topic_nums.append(i)

In [96]:
political_topic_nums

[10,
 56,
 74,
 75,
 125,
 243,
 272,
 320,
 359,
 391,
 556,
 608,
 718,
 876,
 1010,
 1033,
 1068,
 1147,
 1217,
 1259,
 1296,
 1664,
 1879,
 1963,
 1996,
 1997,
 2232,
 2299,
 363,
 608,
 782,
 939,
 1147,
 1458,
 1594,
 1681,
 1729,
 1856,
 2123,
 2140,
 10,
 56,
 90,
 202,
 272,
 363,
 372,
 449,
 782,
 939,
 1033,
 1147,
 1458,
 1594,
 1681,
 1729,
 2123,
 2140,
 74,
 359,
 1217,
 10,
 1217,
 1879,
 286,
 503,
 1010,
 1681,
 1856,
 235,
 523,
 745,
 1639,
 290,
 601,
 1259,
 1318,
 1962,
 535,
 841,
 966,
 1404,
 1998,
 2352,
 2366,
 23,
 878,
 1179,
 543,
 590,
 714,
 957,
 261,
 455,
 674,
 1954,
 2028,
 2040,
 2344]

In [97]:
# Document needs to be a list to pass into topic modelling model
# Create a new json of { state1: [ "title1, text1", "title2, text2", ... ], state2: ["title1, text1", ...]
state_to_post = dict()

for k in political_posts.keys():
    posts = []
    for post in political_posts[k]:
        post_str = post['title'] + " " + post['selftext']
        posts.append(post_str)
    state_to_post[k] = posts

In [98]:
state_to_topic = dict()

for k in state_to_post.keys():
    if len(state_to_post[k]) != 0:
        post_topics, post_probs = topic_model.transform(state_to_post[k])
        state_to_topic[k] = {
            'topics': post_topics,
            'probabilities': post_probs
        }

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2024-11-24 19:16:11,380 - BERTopic - Predicting topic assignments through cosine similarity of topic and document embeddings.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2024-11-24 19:16:14,057 - BERTopic - Predicting topic assignments through cosine similarity of topic and document embeddings.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2024-11-24 19:16:15,479 - BERTopic - Predicting topic assignments through cosine similarity of topic and document embeddings.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2024-11-24 19:16:15,700 - BERTopic - Predicting topic assignments through cosine similarity of topic and document embeddings.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2024-11-24 19:16:15,789 - BERTopic - Predicting topic assignments through cosine similarity of topic and document embeddings.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2024-11-24 19:16:19,162 - BERTopic - Predicting topic assignments through cosine similarity of topic and document embeddings.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2024-11-24 19:16:20,597 - BERTopic - Predicting topic assignments through cosine similarity of topic and document embeddings.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2024-11-24 19:16:21,001 - BERTopic - Predicting topic assignments through cosine similarity of topic and document embeddings.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2024-11-24 19:16:21,072 - BERTopic - Predicting topic assignments through cosine similarity of topic and document embeddings.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2024-11-24 19:16:21,330 - BERTopic - Predicting topic assignments through cosine similarity of topic and document embeddings.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2024-11-24 19:16:22,135 - BERTopic - Predicting topic assignments through cosine similarity of topic and document embeddings.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2024-11-24 19:16:22,772 - BERTopic - Predicting topic assignments through cosine similarity of topic and document embeddings.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2024-11-24 19:16:23,085 - BERTopic - Predicting topic assignments through cosine similarity of topic and document embeddings.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2024-11-24 19:16:23,274 - BERTopic - Predicting topic assignments through cosine similarity of topic and document embeddings.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2024-11-24 19:16:26,794 - BERTopic - Predicting topic assignments through cosine similarity of topic and document embeddings.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2024-11-24 19:16:30,724 - BERTopic - Predicting topic assignments through cosine similarity of topic and document embeddings.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2024-11-24 19:16:30,820 - BERTopic - Predicting topic assignments through cosine similarity of topic and document embeddings.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2024-11-24 19:16:32,397 - BERTopic - Predicting topic assignments through cosine similarity of topic and document embeddings.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2024-11-24 19:16:33,001 - BERTopic - Predicting topic assignments through cosine similarity of topic and document embeddings.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2024-11-24 19:16:33,261 - BERTopic - Predicting topic assignments through cosine similarity of topic and document embeddings.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2024-11-24 19:16:34,864 - BERTopic - Predicting topic assignments through cosine similarity of topic and document embeddings.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2024-11-24 19:16:35,114 - BERTopic - Predicting topic assignments through cosine similarity of topic and document embeddings.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2024-11-24 19:16:35,296 - BERTopic - Predicting topic assignments through cosine similarity of topic and document embeddings.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2024-11-24 19:16:35,643 - BERTopic - Predicting topic assignments through cosine similarity of topic and document embeddings.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2024-11-24 19:16:40,180 - BERTopic - Predicting topic assignments through cosine similarity of topic and document embeddings.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2024-11-24 19:16:40,498 - BERTopic - Predicting topic assignments through cosine similarity of topic and document embeddings.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2024-11-24 19:16:41,101 - BERTopic - Predicting topic assignments through cosine similarity of topic and document embeddings.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2024-11-24 19:16:41,285 - BERTopic - Predicting topic assignments through cosine similarity of topic and document embeddings.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2024-11-24 19:16:43,414 - BERTopic - Predicting topic assignments through cosine similarity of topic and document embeddings.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2024-11-24 19:16:43,551 - BERTopic - Predicting topic assignments through cosine similarity of topic and document embeddings.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2024-11-24 19:16:43,996 - BERTopic - Predicting topic assignments through cosine similarity of topic and document embeddings.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2024-11-24 19:16:44,240 - BERTopic - Predicting topic assignments through cosine similarity of topic and document embeddings.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2024-11-24 19:16:44,580 - BERTopic - Predicting topic assignments through cosine similarity of topic and document embeddings.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2024-11-24 19:16:44,684 - BERTopic - Predicting topic assignments through cosine similarity of topic and document embeddings.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2024-11-24 19:16:45,394 - BERTopic - Predicting topic assignments through cosine similarity of topic and document embeddings.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2024-11-24 19:16:47,458 - BERTopic - Predicting topic assignments through cosine similarity of topic and document embeddings.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2024-11-24 19:16:48,154 - BERTopic - Predicting topic assignments through cosine similarity of topic and document embeddings.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2024-11-24 19:16:49,988 - BERTopic - Predicting topic assignments through cosine similarity of topic and document embeddings.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2024-11-24 19:16:50,334 - BERTopic - Predicting topic assignments through cosine similarity of topic and document embeddings.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2024-11-24 19:16:51,135 - BERTopic - Predicting topic assignments through cosine similarity of topic and document embeddings.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2024-11-24 19:16:51,796 - BERTopic - Predicting topic assignments through cosine similarity of topic and document embeddings.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2024-11-24 19:16:52,602 - BERTopic - Predicting topic assignments through cosine similarity of topic and document embeddings.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2024-11-24 19:16:53,217 - BERTopic - Predicting topic assignments through cosine similarity of topic and document embeddings.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2024-11-24 19:16:53,755 - BERTopic - Predicting topic assignments through cosine similarity of topic and document embeddings.


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

2024-11-24 19:16:57,285 - BERTopic - Predicting topic assignments through cosine similarity of topic and document embeddings.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2024-11-24 19:16:57,925 - BERTopic - Predicting topic assignments through cosine similarity of topic and document embeddings.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2024-11-24 19:16:59,724 - BERTopic - Predicting topic assignments through cosine similarity of topic and document embeddings.


In [99]:
state_to_topic

{'texas': {'topics': array([1889,   65, 1889, 1272, 1889,  234,  234,  234, 1501, 2297,  251]),
  'probabilities': array([0.3581864 , 0.3898025 , 0.39754048, 0.33664972, 0.3568266 ,
         0.5804087 , 0.5804087 , 0.43511474, 0.39582056, 0.37019277,
         0.38129365], dtype=float32)},
 'california': {'topics': array([ 308,  308,  308,  673,    9,    9,    9,  542,  542,   20,    9,
            9,    9,    9, 1239,  481]),
  'probabilities': array([0.37759215, 0.37759215, 0.37759215, 0.6793472 , 0.49634874,
         0.49634874, 0.49634874, 0.6666453 , 0.6666453 , 0.44082233,
         0.39934936, 0.39934936, 0.39934936, 0.39934936, 0.43308628,
         0.59780335], dtype=float32)},
 'michigan': {'topics': array([ 260,  260, 1213,  461, 1615,  542]),
  'probabilities': array([0.4505028 , 0.4505028 , 0.41623247, 0.45489872, 0.3821689 ,
         0.41819587], dtype=float32)},
 'minnesota': {'topics': array([313, 436,  22]),
  'probabilities': array([0.4170059 , 0.3053832 , 0.31694606], d

In [100]:
political_posts_modeled = dict()

for k in state_to_topic.keys():
    indices = []
    state = state_to_topic[k]
    state_topics = state['topics']
    state_probs = state['probabilities']

    for i in range(len(state_topics)):
        if state_topics[i] in political_topic_nums and state_probs[i] >= 0.5:
            indices.append(i)
    political_posts_modeled[k] = indices

In [101]:
political_posts_modeled

{'texas': [],
 'california': [],
 'michigan': [],
 'minnesota': [],
 'colorado': [],
 'wisconsin': [],
 'florida': [],
 'connecticut': [],
 'oregon': [],
 'ohio': [],
 'northcarolina': [],
 'oklahoma': [],
 'maryland': [],
 'arizona': [],
 'virginia': [],
 'maine': [],
 'indiana': [],
 'iowa': [],
 'washington': [],
 'newhampshire': [],
 'alaska': [],
 'louisiana': [],
 'massachusetts': [],
 'vermont': [],
 'newyork': [],
 'arkansas': [],
 'pennsylvania': [],
 'alabama': [],
 'kentucky': [],
 'southcarolina': [],
 'georgia': [],
 'delaware': [],
 'utah': [],
 'rhodeisland': [],
 'missouri': [],
 'tennessee': [],
 'nebraska': [],
 'illinois': [],
 'westvirginia': [],
 'newmexico': [],
 'mississippi': [],
 'kansas': [],
 'northdakota': [],
 'idaho': [],
 'southdakota': [],
 'wyoming': [],
 'nevada': []}