In [5]:
from pprint import pprint
import re
import pandas as pd 
import numpy as np
import seaborn as sns 
import matplotlib.pyplot as plt
import nltk 
from nltk.sentiment import SentimentIntensityAnalyzer as SIA
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [6]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/danchizik/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/danchizik/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [8]:
df = pd.read_csv('data/reddit_nba_master_data.csv')

In [9]:
df.head()

Unnamed: 0,Title,ID,Author,Name,Author Flair Text,# Comments,Time,# Upvotes,Link,Upvote Ratio
0,Daily Discussion Thread + Game Thread Index,1hybybh,NBA_MOD,t3_1hybybh,r/NBA,5,1736536000.0,9,https://www.reddit.com/r/nba/comments/1hybybh/...,0.92
1,Weekly Friday Self-Promotion and Fan Art Thread,1hy3vjo,NBA_MOD,t3_1hy3vjo,r/NBA,0,1736514000.0,6,https://www.reddit.com/r/nba/comments/1hy3vjo/...,0.88
2,Jimmy Butler on his Instagram story to his Big...,1hy64xk,YujiDomainExpansion,t3_1hy64xk,,1476,1736521000.0,11827,https://streamable.com/9zqmf3,0.93
3,"Jarrett Allen explains Ethical Basketball: ""Fa...",1hxwrre,2131andBeyond,t3_1hxwrre,:cle-5: Cavaliers,409,1736485000.0,9988,https://streamable.com/uo94x4,0.97
4,Joe Mazzulla goes in depth on what his typical...,1hxy23z,SliMShady55222,t3_1hxy23z,:sea-1: Supersonics,252,1736490000.0,6170,https://streamable.com/u9yy46,0.98


In [10]:
df.shape

(121766, 10)

In [11]:
#dropping duplicate rows
df = df.drop_duplicates()

In [12]:
df.shape

(111646, 10)

In [13]:
#making the title column a list to then assign sentiment labels
title_list = df['Title'].tolist()
type(title_list[0])

str

In [14]:
sia = SIA()
results = []

for title in title_list: 
    pol_score = sia.polarity_scores(title)
    pol_score['Title'] = title
    results.append(pol_score)

In [15]:
pprint(results[:3], width=100)

[{'Title': 'Daily Discussion Thread + Game Thread Index',
  'compound': 0.0,
  'neg': 0.0,
  'neu': 1.0,
  'pos': 0.0},
 {'Title': 'Weekly Friday Self-Promotion and Fan Art Thread',
  'compound': 0.3182,
  'neg': 0.0,
  'neu': 0.723,
  'pos': 0.277},
 {'Title': 'Jimmy Butler on his Instagram story to his Big Face Coffee employee: “Our best guy '
           'right here. You see that? See that? I gave you a compliment. That’s what bosses do. '
           'They build you up they don’t break you down.”',
  'compound': 0.8253,
  'neg': 0.0,
  'neu': 0.825,
  'pos': 0.175}]


In [16]:
titles_and_labels = pd.DataFrame.from_records(results)
titles_and_labels.head()

Unnamed: 0,neg,neu,pos,compound,Title
0,0.0,1.0,0.0,0.0,Daily Discussion Thread + Game Thread Index
1,0.0,0.723,0.277,0.3182,Weekly Friday Self-Promotion and Fan Art Thread
2,0.0,0.825,0.175,0.8253,Jimmy Butler on his Instagram story to his Big...
3,0.0,0.577,0.423,0.765,"Jarrett Allen explains Ethical Basketball: ""Fa..."
4,0.0,0.848,0.152,0.3612,Joe Mazzulla goes in depth on what his typical...


In [17]:
titles_and_labels.shape

(111646, 5)

In [18]:
titles_and_labels['label'] = 0
titles_and_labels.loc[titles_and_labels['compound'] > 0.075, 'label'] = 1
titles_and_labels.loc[titles_and_labels['compound'] < -0.075, 'label'] = -1
titles_and_labels.head()

Unnamed: 0,neg,neu,pos,compound,Title,label
0,0.0,1.0,0.0,0.0,Daily Discussion Thread + Game Thread Index,0
1,0.0,0.723,0.277,0.3182,Weekly Friday Self-Promotion and Fan Art Thread,1
2,0.0,0.825,0.175,0.8253,Jimmy Butler on his Instagram story to his Big...,1
3,0.0,0.577,0.423,0.765,"Jarrett Allen explains Ethical Basketball: ""Fa...",1
4,0.0,0.848,0.152,0.3612,Joe Mazzulla goes in depth on what his typical...,1


In [19]:
neg = titles_and_labels['neg'].tolist()
compound = titles_and_labels['compound'].tolist()
neu = titles_and_labels['neu'].tolist()
pos = titles_and_labels['pos'].tolist()
label = titles_and_labels['label'].tolist()

In [20]:
df['neg'] = neg
df['neu'] = neu
df['pos'] = pos
df['compound'] = compound
df['label'] = label

In [21]:
df.head()

Unnamed: 0,Title,ID,Author,Name,Author Flair Text,# Comments,Time,# Upvotes,Link,Upvote Ratio,neg,neu,pos,compound,label
0,Daily Discussion Thread + Game Thread Index,1hybybh,NBA_MOD,t3_1hybybh,r/NBA,5,1736536000.0,9,https://www.reddit.com/r/nba/comments/1hybybh/...,0.92,0.0,1.0,0.0,0.0,0
1,Weekly Friday Self-Promotion and Fan Art Thread,1hy3vjo,NBA_MOD,t3_1hy3vjo,r/NBA,0,1736514000.0,6,https://www.reddit.com/r/nba/comments/1hy3vjo/...,0.88,0.0,0.723,0.277,0.3182,1
2,Jimmy Butler on his Instagram story to his Big...,1hy64xk,YujiDomainExpansion,t3_1hy64xk,,1476,1736521000.0,11827,https://streamable.com/9zqmf3,0.93,0.0,0.825,0.175,0.8253,1
3,"Jarrett Allen explains Ethical Basketball: ""Fa...",1hxwrre,2131andBeyond,t3_1hxwrre,:cle-5: Cavaliers,409,1736485000.0,9988,https://streamable.com/uo94x4,0.97,0.0,0.577,0.423,0.765,1
4,Joe Mazzulla goes in depth on what his typical...,1hxy23z,SliMShady55222,t3_1hxy23z,:sea-1: Supersonics,252,1736490000.0,6170,https://streamable.com/u9yy46,0.98,0.0,0.848,0.152,0.3612,1


In [22]:
pd.crosstab(index=df["label"], columns="count")

col_0,count
label,Unnamed: 1_level_1
-1,20292
0,55914
1,35440


In [23]:
#Remove Irrelevant Characters
def remove_irrelevant_characters(text):
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)  # Remove URLs
    text = re.sub(r"<.*?>", "", text)  # Remove HTML tags
    text = re.sub(r"[^a-zA-Z\s]", "", text)  # Remove special characters
    return text

In [24]:
#Convert Text to Lowercase
def to_lowercase(text):
    return text.lower()

In [25]:
#Remove Stop Words
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    words = text.split()
    filtered_words = [word for word in words if word not in stop_words]
    return " ".join(filtered_words)

In [26]:
#Stemming 
stemmer = PorterStemmer()

def stem_words(text):
    words = text.split()
    stemmed_words = [stemmer.stem(word) for word in words]
    return " ".join(stemmed_words)

In [27]:
#Whitespace Normalization
def normalize_whitespace(text):
    return " ".join(text.split())

In [28]:
#all functions in a pipelineabs
def clean_text_pipeline(text):
    text = remove_irrelevant_characters(text)
    text = to_lowercase(text)
    text = remove_stopwords(text)
    text = normalize_whitespace(text)
    return text

In [29]:
df["Cleaned Titles"] = df["Title"].apply(clean_text_pipeline)

In [30]:
df

Unnamed: 0,Title,ID,Author,Name,Author Flair Text,# Comments,Time,# Upvotes,Link,Upvote Ratio,neg,neu,pos,compound,label,Cleaned Titles
0,Daily Discussion Thread + Game Thread Index,1hybybh,NBA_MOD,t3_1hybybh,r/NBA,5,1.736536e+09,9,https://www.reddit.com/r/nba/comments/1hybybh/...,0.92,0.000,1.000,0.000,0.0000,0,daily discussion thread game thread index
1,Weekly Friday Self-Promotion and Fan Art Thread,1hy3vjo,NBA_MOD,t3_1hy3vjo,r/NBA,0,1.736514e+09,6,https://www.reddit.com/r/nba/comments/1hy3vjo/...,0.88,0.000,0.723,0.277,0.3182,1,weekly friday selfpromotion fan art thread
2,Jimmy Butler on his Instagram story to his Big...,1hy64xk,YujiDomainExpansion,t3_1hy64xk,,1476,1.736521e+09,11827,https://streamable.com/9zqmf3,0.93,0.000,0.825,0.175,0.8253,1,jimmy butler instagram story big face coffee e...
3,"Jarrett Allen explains Ethical Basketball: ""Fa...",1hxwrre,2131andBeyond,t3_1hxwrre,:cle-5: Cavaliers,409,1.736485e+09,9988,https://streamable.com/uo94x4,0.97,0.000,0.577,0.423,0.7650,1,jarrett allen explains ethical basketball farm...
4,Joe Mazzulla goes in depth on what his typical...,1hxy23z,SliMShady55222,t3_1hxy23z,:sea-1: Supersonics,252,1.736490e+09,6170,https://streamable.com/u9yy46,0.98,0.000,0.848,0.152,0.3612,1,joe mazzulla goes depth typical conversations ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
121757,Oblique strains are far less common in the NBA...,1ggogh5,ILikeDillonBrooks,t3_1ggogh5,DPSF,5,1.730407e+09,23,https://x.com/instreetclothes/status/185205816...,0.96,0.199,0.801,0.000,-0.6908,-1,oblique strains far less common nba sports par...
121758,Bane and Smart out,1ggkx6z,Altruistic_Brief4444,t3_1ggkx6z,DB,46,1.730398e+09,36,https://i.redd.it/89eprmzlt4yd1.jpeg,0.97,0.000,0.526,0.474,0.4019,1,bane smart
121761,HAPPY JALLOWEEN!! Will it be Trick or Treat fo...,1ggeap3,nam67,t3_1ggeap3,HUFF DADDY,90,1.730381e+09,37,https://www.reddit.com/r/memphisgrizzlies/comm...,0.98,0.045,0.669,0.287,0.8185,1,happy jalloween trick treat memphis tonight gr...
121762,Dicks Sporting Goods has some 23/24 city editi...,1gghtyv,liltrikz,t3_1gghtyv,,7,1.730390e+09,18,https://i.redd.it/s50g97tv54yd1.jpeg,0.96,0.000,0.893,0.107,0.3400,1,dicks sporting goods city edition gear sale ha...
