# NBA vs NFL Subreddit Part 1 - Data Collection 
---

_Author: Matthew Hill_

### Imports
---

In [3]:
import requests
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
from nltk.stem import WordNetLemmatizer

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords, wordnet
from nltk.sentiment.vader import SentimentIntensityAnalyzer


# Import CountVectorizer and TFIDFVectorizer from feature_extraction.text.
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

### Data Gathering
---

#### Using Pushshift's API

In [4]:
# Pushshift submission-search endpoint; every request below targets this URL.
url = "https://api.pushshift.io/reddit/search/submission"

In [5]:
# Query for r/nba: size 500, posts before the cutoff timestamp
# (presumably Unix epoch seconds, like the posts' created_utc — confirm).
params = dict(subreddit='nba', size=500, before=1666302682)

In [6]:
# Query for r/nfl: size 500, posts before the cutoff timestamp
# (presumably Unix epoch seconds, like the posts' created_utc — confirm).
params2 = dict(subreddit='nfl', size=500, before=1666302682)

In [7]:
# Query for r/nba: size 500, posts after the cutoff timestamp
# (presumably Unix epoch seconds, like the posts' created_utc — confirm).
params3 = dict(subreddit='nba', size=500, after=1666302682)

In [8]:
# Query for r/nfl: size 500, posts after the cutoff timestamp
# (presumably Unix epoch seconds, like the posts' created_utc — confirm).
params4 = dict(subreddit='nfl', size=500, after=1666302682)

In [9]:
# Query for r/nba: size 500, posts before a second, later cutoff timestamp
# (presumably Unix epoch seconds, like the posts' created_utc — confirm).
params5 = dict(subreddit='nba', size=500, before=1666502682)

In [10]:
# Query for r/nfl: size 500, posts before a second, later cutoff timestamp
# (presumably Unix epoch seconds, like the posts' created_utc — confirm).
params6 = dict(subreddit='nfl', size=500, before=1666502682)

In [11]:
# Query for r/nba: size 500, posts before a third, latest cutoff timestamp
# (presumably Unix epoch seconds, like the posts' created_utc — confirm).
params7 = dict(subreddit='nba', size=500, before=1666644209)

In [12]:
# Query for r/nfl: size 500, posts before a third, latest cutoff timestamp
# (presumably Unix epoch seconds, like the posts' created_utc — confirm).
params8 = dict(subreddit='nfl', size=500, before=1666644209)

### Using the Requests Library

In [13]:
# Fetch r/nba submissions using the query in `params`.
# `timeout` keeps a stalled connection from hanging the kernel indefinitely.
res = requests.get(url, params=params, timeout=60)

In [14]:
# Fetch r/nfl submissions using the query in `params2`.
# `timeout` keeps a stalled connection from hanging the kernel indefinitely.
res2 = requests.get(url, params=params2, timeout=60)

In [15]:
# Fetch r/nba submissions using the query in `params3`.
# `timeout` keeps a stalled connection from hanging the kernel indefinitely.
res3 = requests.get(url, params=params3, timeout=60)

In [16]:
# Fetch r/nfl submissions using the query in `params4`.
# `timeout` keeps a stalled connection from hanging the kernel indefinitely.
res4 = requests.get(url, params=params4, timeout=60)

In [17]:
# Fetch r/nba submissions using the query in `params5`.
# `timeout` keeps a stalled connection from hanging the kernel indefinitely.
res5 = requests.get(url, params=params5, timeout=60)

In [18]:
# Fetch r/nfl submissions using the query in `params6`.
# `timeout` keeps a stalled connection from hanging the kernel indefinitely.
res6 = requests.get(url, params=params6, timeout=60)

In [19]:
# Fetch r/nba submissions using the query in `params7`.
# `timeout` keeps a stalled connection from hanging the kernel indefinitely.
res7 = requests.get(url, params=params7, timeout=60)

In [20]:
# Fetch r/nfl submissions using the query in `params8`.
# `timeout` keeps a stalled connection from hanging the kernel indefinitely.
res8 = requests.get(url, params=params8, timeout=60)

### Status Codes

In [21]:
# HTTP status of the `params` (r/nba) request; 200 means the request succeeded.
res.status_code

200

In [22]:
# HTTP status of the `params2` (r/nfl) request; 200 means the request succeeded.
res2.status_code

200

In [23]:
# HTTP status of the `params3` (r/nba) request; 200 means the request succeeded.
res3.status_code

200

In [24]:
# HTTP status of the `params4` (r/nfl) request; 200 means the request succeeded.
res4.status_code

200

In [25]:
# HTTP status of the `params5` (r/nba) request; 200 means the request succeeded.
res5.status_code

200

In [26]:
# HTTP status of the `params6` (r/nfl) request; 200 means the request succeeded.
res6.status_code

200

In [27]:
# HTTP status of the `params7` (r/nba) request; 200 means the request succeeded.
res7.status_code

200

In [28]:
# HTTP status of the `params8` (r/nfl) request; 200 means the request succeeded.
res8.status_code

200

### Gathering Data with JSON Format

In [29]:
# Decode the JSON body of the `params` (r/nba) response; the records sit under its 'data' key.
data = res.json()

In [30]:
# Decode the JSON body of the `params2` (r/nfl) response; the records sit under its 'data' key.
data2 = res2.json()

In [31]:
# Decode the JSON body of the `params3` (r/nba) response; the records sit under its 'data' key.
data3 = res3.json()

In [32]:
# Decode the JSON body of the `params4` (r/nfl) response; the records sit under its 'data' key.
data4 = res4.json()

In [33]:
# Decode the JSON body of the `params5` (r/nba) response; the records sit under its 'data' key.
data5 = res5.json()

In [34]:
# Decode the JSON body of the `params6` (r/nfl) response; the records sit under its 'data' key.
data6 = res6.json()

In [35]:
# Decode the JSON body of the `params7` (r/nba) response; the records sit under its 'data' key.
data7 = res7.json()

In [36]:
# Decode the JSON body of the `params8` (r/nfl) response; the records sit under its 'data' key.
data8 = res8.json()

In [37]:
# Pool the r/nba records from the four NBA responses into one list, in request order.
posts = [
    record
    for payload in (data, data3, data5, data7)
    for record in payload['data']
]

In [38]:
# Pool the r/nfl records from the four NFL responses into one list, in request order.
posts2 = [
    record
    for payload in (data2, data4, data6, data8)
    for record in payload['data']
]

In [39]:
# Number of r/nba records collected across the four NBA requests.
len(posts)

997

In [40]:
# Number of r/nfl records collected across the four NFL requests.
len(posts2)

1000

###  Sample NBA Reddit Text

In [41]:
# Inspect the first r/nba record to see which fields a submission carries.
posts[0]

{'all_awardings': [],
 'allow_live_comments': False,
 'author': 'SiX_Paths-Madara',
 'author_flair_background_color': '',
 'author_flair_css_class': 'Thunder2',
 'author_flair_richtext': [{'e': 'text', 't': '[OKC] Russell Westbrook'}],
 'author_flair_text': '[OKC] Russell Westbrook',
 'author_flair_text_color': 'dark',
 'author_flair_type': 'richtext',
 'author_fullname': 't2_eeagp8k',
 'author_is_blocked': False,
 'author_patreon_flair': False,
 'author_premium': False,
 'awarders': [],
 'can_mod_post': False,
 'contest_mode': False,
 'created_utc': 1666301949,
 'domain': 'youtu.be',
 'full_link': 'https://www.reddit.com/r/nba/comments/y9at70/202223_lakers_season_begins_narrated_by_lebron/',
 'gildings': {},
 'id': 'y9at70',
 'is_created_from_ads_ui': False,
 'is_crosspostable': True,
 'is_meta': False,
 'is_original_content': False,
 'is_reddit_media_domain': False,
 'is_robot_indexable': True,
 'is_self': False,
 'is_video': False,
 'link_flair_background_color': '',
 'link_flair_ri

###  Sample NFL Reddit Text

In [42]:
# Inspect the first r/nfl record to see which fields a submission carries.
posts2[0]

{'all_awardings': [],
 'allow_live_comments': False,
 'author': 'ArizonaMadeDank',
 'author_flair_css_class': None,
 'author_flair_richtext': [],
 'author_flair_text': None,
 'author_flair_type': 'text',
 'author_fullname': 't2_2t1kgdc0',
 'author_is_blocked': False,
 'author_patreon_flair': False,
 'author_premium': False,
 'awarders': [],
 'can_mod_post': False,
 'contest_mode': False,
 'created_utc': 1666301726,
 'domain': 'i.redd.it',
 'full_link': 'https://www.reddit.com/r/nfl/comments/y9apwj/the_true_reason_were_all_excited_for_tnf/',
 'gildings': {},
 'id': 'y9apwj',
 'is_created_from_ads_ui': False,
 'is_crosspostable': False,
 'is_meta': False,
 'is_original_content': False,
 'is_reddit_media_domain': True,
 'is_robot_indexable': False,
 'is_self': False,
 'is_video': False,
 'link_flair_background_color': '',
 'link_flair_richtext': [],
 'link_flair_text_color': 'dark',
 'link_flair_type': 'text',
 'locked': False,
 'media_only': False,
 'no_follow': True,
 'num_comments': 0,

### Creating NBA DataFrame

In [43]:
# One row per r/nba record; the record keys become the columns.
df1 = pd.DataFrame(posts)


In [44]:
# Preview the first rows of the full-width NBA frame.
df1.head()

Unnamed: 0,all_awardings,allow_live_comments,author,author_flair_background_color,author_flair_css_class,author_flair_richtext,author_flair_text,author_flair_text_color,author_flair_type,author_fullname,...,removed_by_category,author_flair_template_id,link_flair_css_class,link_flair_template_id,link_flair_text,distinguished,suggested_sort,author_cakeday,edited,banned_by
0,[],False,SiX_Paths-Madara,,Thunder2,"[{'e': 'text', 't': '[OKC] Russell Westbrook'}]",[OKC] Russell Westbrook,dark,richtext,t2_eeagp8k,...,,,,,,,,,,
1,[],False,[deleted],,,,,dark,,,...,deleted,,,,,,,,,
2,[],False,DRAZZILB1424,#edeff1,Magic1,"[{'a': ':orl-1:', 'e': 'emoji', 'u': 'https://...",:orl-1: Magic,dark,richtext,t2_13fwme,...,,d85b1fba-3feb-11e8-b7d0-0e993ebc6d5c,news,dbc12ec6-5c53-11e4-93ee-12313b0b3108,News,,,,,
3,[],False,Tomheza,#edeff1,Lakers1,"[{'e': 'text', 't': 'Lakers'}]",Lakers,dark,richtext,t2_1rsbmfvb,...,,cc0d49ae-3feb-11e8-8327-0e3e867879aa,news,dbc12ec6-5c53-11e4-93ee-12313b0b3108,News,,,,,
4,[],False,Tomheza,#edeff1,Lakers1,"[{'e': 'text', 't': 'Lakers'}]",Lakers,dark,richtext,t2_1rsbmfvb,...,,cc0d49ae-3feb-11e8-8327-0e3e867879aa,news,dbc12ec6-5c53-11e4-93ee-12313b0b3108,News,,,,,


In [45]:
# Keep only the label column and the two text fields.
df1 = df1.loc[:, ['subreddit', 'selftext', 'title']]

In [46]:
# Preview the trimmed NBA frame (subreddit, selftext, title).
df1.head()

Unnamed: 0,subreddit,selftext,title
0,nba,,2022-23 Lakers Season Begins — Narrated by LeB...
1,nba,[deleted],[C.J. Holmes] Klay Thompson said he has no int...
2,nba,,[Wojnarowski] Fanatics CEO Michael Rubin compl...
3,nba,,[Wojnarowski] Fanatics CEO Michael Rubin compl...
4,nba,,[Wojnarowski] ESPN Sources: Fanatics CEO Micha...


In [47]:
# (rows, columns) of the trimmed NBA frame.
df1.shape

(997, 3)

In [57]:
# Persist the trimmed NBA frame as CSV. The row index is written too (to_csv default),
# and the path has no file extension.
df1.to_csv('Data/nba')

### Creating NFL DataFrame

In [50]:
# One row per r/nfl record; the record keys become the columns.
df2 = pd.DataFrame(posts2)

In [51]:
# Keep only the label column and the two text fields.
df2 = df2.loc[:, ['subreddit', 'selftext', 'title']]

In [52]:
# Preview the trimmed NFL frame (subreddit, selftext, title).
df2.head()

Unnamed: 0,subreddit,selftext,title
0,nfl,,The true reason we're all excited for TNF
1,nfl,Longest active streaks not above .500 in NFL \...,Interesting statistic
2,nfl,,"[RapSheet] Sources: #Jets WR Elijah Moore, fru..."
3,nfl,,"[Rapoport] Sources: #Jets WR Elijah Moore, fru..."
4,nfl,,"[Rapoport] Sources: #Jets WR Elijah Moore, fru..."


In [53]:
# (rows, columns) of the trimmed NFL frame.
df2.shape

(1000, 3)

In [56]:
# Persist the trimmed NFL frame as CSV. The row index is written too (to_csv default),
# and the path has no file extension.
df2.to_csv('Data/nfl')

In [55]:
# Stack the NBA rows on top of the NFL rows. Each frame keeps its own row
# labels, so index values repeat in the combined frame.
df = pd.concat((df1, df2))

### Combining NBA & NFL Data

In [55]:
# Preview the combined frame; the NBA rows come first.
df.head()

Unnamed: 0,subreddit,selftext,title
0,nba,,2022-23 Lakers Season Begins — Narrated by LeB...
1,nba,[deleted],[C.J. Holmes] Klay Thompson said he has no int...
2,nba,,[Wojnarowski] Fanatics CEO Michael Rubin compl...
3,nba,,[Wojnarowski] Fanatics CEO Michael Rubin compl...
4,nba,,[Wojnarowski] ESPN Sources: Fanatics CEO Micha...


In [58]:
# (rows, columns) of the combined NBA + NFL frame.
df.shape

(1997, 3)

### Convert NBA/NFL into binary labels

In [59]:
# Encode the target: r/nba -> 1, r/nfl -> 0.
# The result is assigned back rather than calling replace(..., inplace=True) on
# the selected column: the in-place form mutates an intermediate Series and,
# under pandas Copy-on-Write, leaves `df` unchanged.
# The explicit cast keeps the label column integer-typed across pandas versions.
# Re-running the cell is a no-op, since the values are already 0/1.
df['subreddit'] = df['subreddit'].replace({'nba': 1, 'nfl': 0}).astype('int64')

In [60]:
# Confirm that the subreddit column now holds the numeric labels.
df.head()

Unnamed: 0,subreddit,selftext,title
0,1,,2022-23 Lakers Season Begins — Narrated by LeB...
1,1,[deleted],[C.J. Holmes] Klay Thompson said he has no int...
2,1,,[Wojnarowski] Fanatics CEO Michael Rubin compl...
3,1,,[Wojnarowski] Fanatics CEO Michael Rubin compl...
4,1,,[Wojnarowski] ESPN Sources: Fanatics CEO Micha...


In [59]:
# Persist the combined frame as CSV. The (repeating) row index is written too
# (to_csv default), and the path has no file extension.
df.to_csv('Data/nbavsnfl')