In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from itertools import combinations
import glob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from nltk import tokenize
import nltk
import datetime
import pickle
import pdb
import os
from utils import (process_coms,
                 filter_processed_coms_by_date,
                 get_sub_edges,
                 get_division_edges,
                 get_rivalry_graph,
                 filter_comments_by_subs,
                 get_division_edges)

In [None]:
# Load the subreddit names, flair labels, and team initials.
# Each text file holds one entry per line; rows are aligned by line position.

with open('nfl_subs.txt','r') as sub_file:
    subs = [line.strip() for line in sub_file]
with open('nfl_flairs.txt','r') as flair_file:
    flairs = [line.strip() for line in flair_file]
with open('nfl_inits.txt','r') as init_file:
    inits = [line.strip() for line in init_file]

# One row per line of the input files: subreddit, flair, and initials.
teams = pd.DataFrame({'subreddit': subs, 'flair': flairs, 'inits': inits})
df = teams

# Lookup tables: subreddit -> initials, and flair -> subreddit.
team2abbrev = dict(zip(teams['subreddit'], teams['inits']))
flair2team = dict(zip(teams['flair'], teams['subreddit']))

# Initial comment processing

In [None]:
### Process raw comment data and analyze sentiment
#
# Every raw CSV under data/nfl/comments/ is passed through `process_coms` (from utils).
# The processed frame is written to data/nfl_nonzero/processed/comments/<name>.csv and the
# per-file metadata dicts are accumulated key by key into a single metadata.csv.

# Make sure the output tree exists; DataFrame.to_csv does not create directories.
os.makedirs('data/nfl_nonzero/processed/comments', exist_ok=True)

rerun = []       # string results returned by process_coms for files it could not process
metadata = None  # running accumulation of the metadata of every successfully processed file
nltk.download('punkt')
for filename in glob.glob('data/nfl/comments/*.csv'):

    # basename() rather than split('/') so the name is also correct with Windows separators.
    subname = os.path.basename(filename).split('.')[0]

    d, meta = process_coms(filename, no_zero_sentiment=True)
    if isinstance(d, str):
        # A string in place of the frame marks a file to skip and revisit later.
        rerun.append(d)
        continue

    # Seed the accumulator with the first *successful* file. Keying this on the loop
    # index instead would leave `metadata` undefined whenever the first file is skipped.
    if metadata is None:
        metadata = meta
    else:
        for key in metadata:
            metadata[key] += meta[key]

    d.to_csv('data/nfl_nonzero/processed/comments/{}.csv'.format(subname))

if metadata is None:
    raise RuntimeError('No comment files under data/nfl/comments/ could be processed; '
                       'skipped: {}'.format(rerun))

df = pd.DataFrame(metadata)
df.to_csv('data/nfl_nonzero/processed/metadata.csv')

# Organizing comments into NFL Weeks and season phases

In [None]:
### Filter processed coms by NFL Season Week and compute new metadata
#
# Week 0 uses start=2021-03-01 and stop=2022-09-08; weeks 1-24 are consecutive 7-day
# windows beginning at 2022-09-08. For every week each processed comment file is filtered
# with `filter_processed_coms_by_date` (from utils), the filtered frame is written to
# data/nfl<tag>/weeks/<week>/comments/, and the per-file metadata is summed into
# data/nfl<tag>/weeks/<week>/metadata.csv.

tag = '_nonzero' # Only include comments with non-zero sentiment scores
dweek = datetime.timedelta(days= 7)

# NOTE(review): the regular-season cell mentions an offseason starting 2022-03-01;
# confirm that 2021 is the intended year for the week-0 start.
preseason_start = datetime.datetime.fromisoformat('2021-03-01')
season_start = datetime.datetime.fromisoformat('2022-09-08')

for week in range(0,25):

    # Derive the window directly from the week number instead of mutating start/stop
    # across iterations; this yields the same windows and does not depend on loop history.
    if week == 0:
        start, stop = preseason_start, season_start
    else:
        start = season_start + (week - 1) * dweek
        stop = start + dweek
    print('week:{}'.format(week),start,stop)

    # makedirs(exist_ok=True) creates missing parents and also the comments/ subfolder
    # when the week folder already exists from an earlier, partial run.
    week_dir = 'data/nfl'+tag+'/weeks/{}'.format(str(week))
    os.makedirs(week_dir + '/comments', exist_ok=True)

    # Reset per week so metadata from a previous week can never leak into this one.
    metadata = None
    for filename in glob.glob('data/nfl'+tag+'/processed/comments/*.csv'):

        # basename() rather than split('/') so the name is also correct with Windows separators.
        subname = os.path.basename(filename).split('.')[0]
        d, meta = filter_processed_coms_by_date(filename,start,stop)

        if isinstance(d, str):
            # A string in place of the frame marks a file that is skipped for this week.
            print('help', d)
            continue

        # Seed with the first successful file rather than with loop index 0, which
        # breaks when the first file is skipped.
        if metadata is None:
            metadata = meta
        else:
            for key in metadata:
                metadata[key] += meta[key]

        d.to_csv(week_dir + '/comments/{}.csv'.format(subname))

    if metadata is None:
        print('week:{} produced no filtered comments; metadata.csv not written'.format(week))
        continue

    md = pd.DataFrame(metadata)
    md.to_csv(week_dir + '/metadata.csv')

In [None]:
    
### Alternatively, collect all regular-season comments
#
# Filters every processed comment file to the 2022-09-08 .. 2023-02-13 window and writes
# the result to data/<which_nfl>/<version>/comments/, plus an accumulated metadata.csv.

version = 'regular' #Need to do offseason 2022-03-01 2022-09-07
# NOTE(review): this reads/writes under data/nfl/, while the edge-construction cell
# reads data/nfl_nonzero/<version>/comments/ -- confirm which tree is intended.
which_nfl = 'nfl'

# The date window does not depend on the file, so compute it once.
start = datetime.datetime.fromisoformat('2022-09-08')
stop = datetime.datetime.fromisoformat('2023-02-13')

# DataFrame.to_csv does not create directories.
out_dir = 'data/{}/{}'.format(which_nfl, version)
os.makedirs(out_dir + '/comments', exist_ok=True)

metadata = None  # running accumulation of the metadata of every successfully filtered file
for filename in glob.glob('data/{}/processed/comments/*.csv'.format(which_nfl)):
    # basename() rather than split('/') so the name is also correct with Windows separators.
    subname = os.path.basename(filename).split('.')[0]

    # `filter_date` is not imported anywhere in this notebook; the imported helper with
    # this (filename, start, stop) -> (frame, metadata) shape is filter_processed_coms_by_date.
    d, meta = filter_processed_coms_by_date(filename, start, stop)
    if isinstance(d, str):
        # A string in place of the frame marks a file that is skipped.
        print(d)
        continue

    for k, v in meta.items():
        print(k,v)

    # Seed with the first successful file rather than with loop index 0, which
    # breaks (or reuses stale metadata) when the first file is skipped.
    if metadata is None:
        metadata = meta
    else:
        for key in metadata:
            metadata[key] += meta[key]
    d.to_csv(out_dir + '/comments/{}.csv'.format(subname))

if metadata is None:
    raise RuntimeError('No processed comment files under data/{}/processed/comments/ '
                       'could be filtered'.format(which_nfl))

df = pd.DataFrame(metadata)
df.to_csv(out_dir + '/metadata.csv')

# Constructing graph edges

In [None]:
   
### Construct subreddit graph edges week by week ####
#
# For each week, collect the edges of every per-subreddit comment file into one list and
# pickle it to data/nfl_nonzero/weeks/<week>/all_edges.pkl.

for week in range(25):
    all_edges = []

    for filename in glob.glob('data/nfl_nonzero/weeks/{}/comments/*.csv'.format(week)):
        # get_sub_edges is unpacked as (edges, meta) in the season-level loop of this
        # cell; without unpacking here the tuple itself would be extended into all_edges.
        edges, meta = get_sub_edges(filename,teams.subreddit.values)

        if isinstance(edges, str):
            # A string in place of the edge list marks a file that is skipped.
            print(edges)
            continue

        # Report the edge count only for real edge lists, not for error strings.
        print(len(edges))
        all_edges.extend(edges)

    with open('data/nfl_nonzero/weeks/{}/all_edges.pkl'.format(week),'wb') as f:
        pickle.dump(all_edges, f)
    
### Construct subreddit graph edges in offseason, regular season, etc
#
# Collects the edges of every comment file of one season phase (`version`) into
# data/nfl_nonzero/<version>/all_edges.pkl and accumulates the per-file edge metadata
# into data/nfl_nonzero/<version>/edge_metadata.csv.

all_edges = []
metadata = None  # running accumulation of the metadata of every successfully processed file
version = 'regular'
for filename in glob.glob('data/nfl_nonzero/{}/comments/*.csv'.format(version)):
    edges, meta = get_sub_edges(filename,teams.subreddit.values)

    if isinstance(edges, str):
        # A string in place of the edge list marks a file that is skipped.
        print(edges)
        continue
    all_edges.extend(edges)

    for k, v in meta.items():
        print(k,v)

    # Seed with the first successful file rather than with loop index 0, which
    # breaks (or reuses stale metadata) when the first file is skipped.
    if metadata is None:
        metadata = meta
    else:
        for key in metadata:
            metadata[key] += meta[key]

if metadata is None:
    raise RuntimeError('No comment files under data/nfl_nonzero/{}/comments/ '
                       'produced edges'.format(version))

with open('data/nfl_nonzero/{}/all_edges.pkl'.format(version),'wb') as f:
    pickle.dump(all_edges, f)
dfm = pd.DataFrame(metadata)
dfm.to_csv('data/nfl_nonzero/{}/edge_metadata.csv'.format(version))

# Summary info from graph edges

In [None]:
# Per-team summary of incoming/outgoing edge statistics for each season phase.
#
# Each edge is unpacked as (source team, target team, weight, sent, cont, score). Edges
# between different teams add to the target's in_* totals and the source's out_* totals;
# a self-edge only records the team's 'self_sent'. Totals are then divided by TEAM_DIVISOR
# and saved as data/summary_stats_in_<version>.csv and data/summary_dict_in_<version>.pkl.
#
# The edge file of each version must already exist; the edge-construction cell writes
# one version per run.

# NOTE(review): 32 is presumably the number of NFL teams -- confirm.
TEAM_DIVISOR = 32.

for version in ['offseason','regular','playoffs']:
    print(version)
    edgefile = 'data/nfl_nonzero/{}/all_edges.pkl'.format(version)

    # One independent, zero-initialised stats dict per team.
    data = {
        team: dict(in_weight = 0, out_weight=0, in_sent=0, out_sent=0,
                   in_cont=0, out_cont=0, in_score = 0, out_score = 0)
        for team in teams.subreddit.values
    }

    # pickle.load can execute arbitrary code; only load edge files written by this notebook.
    with open(edgefile,'rb') as f:
        all_edges = pickle.load(f)

    for e in all_edges:
        steam, tteam, weight, sent, cont, score = e
        if steam == tteam:
            # Self-edges are kept apart and are not part of the in/out totals.
            data[steam]['self_sent'] = sent
            continue

        data[tteam]['in_weight'] += weight
        data[tteam]['in_sent'] += sent
        data[tteam]['in_cont'] += cont
        data[tteam]['in_score'] += score

        data[steam]['out_weight'] += weight
        data[steam]['out_sent'] += sent
        data[steam]['out_cont'] += cont
        data[steam]['out_score'] += score

    # Scale every accumulated total; 'self_sent' is a single edge value and stays as is.
    for stats in data.values():
        for name in stats:
            if name != 'self_sent':
                stats[name] /= TEAM_DIVISOR

    df = pd.DataFrame(data)
    df = df.transpose()
    df['team'] = df.index.values
    df.to_csv('data/summary_stats_in_{}.csv'.format(version), index=False)

    with open('data/summary_dict_in_{}.pkl'.format(version),'wb') as f:
        pickle.dump(data, f)