In [1]:
import pandas as pd
import nltk
import json
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from bs4 import BeautifulSoup             
from nltk.corpus import stopwords
import re

lemmatizer = WordNetLemmatizer()
tokenizer = RegexpTokenizer(r'\w+')

# Load in raw json data

In [2]:
json_data=open('../API-data/ps_news').read()
news_comments = json.loads(json_data)


In [3]:
json_data=open('../API-data/ps_news_posts').read()
news_posts = json.loads(json_data)


In [4]:
json_data=open('../API-data/ps_upnews').read()
upnews_comments = json.loads(json_data)


In [5]:
json_data=open('../API-data/ps_upnews_posts').read()
upnews_posts = json.loads(json_data)

# Reduce filesize 

For the sake of saving some memory and time, I am artificially reducing the list lengths.

Note that this is just a "fast and dirty" check through the data to see what it looks like. This will guide how I will properly clean and evaluate the data.

In [6]:
news_comments = news_comments[:13000]

In [7]:
news_posts = news_posts[:5500]

In [8]:
upnews_comments = upnews_comments[:13000]

In [9]:
upnews_posts = upnews_posts[:5000]

#### Check a random post to verify the pulled data

In [10]:
news_posts[765]

{'author': 'CaydenCabello9935',
 'author_flair_css_class': None,
 'author_flair_richtext': [],
 'author_flair_text': None,
 'author_flair_type': 'text',
 'author_fullname': 't2_182vnjuf',
 'can_mod_post': False,
 'contest_mode': False,
 'created_utc': 1535961613,
 'domain': 'ciudad.com.ar',
 'full_link': 'https://www.reddit.com/r/news/comments/9cjn3e/gwyneth_paltrow_sus_truquitos_de_belleza_para/',
 'id': '9cjn3e',
 'is_crosspostable': False,
 'is_meta': False,
 'is_original_content': False,
 'is_reddit_media_domain': False,
 'is_self': False,
 'is_video': False,
 'link_flair_background_color': '',
 'link_flair_richtext': [],
 'link_flair_text_color': 'dark',
 'link_flair_type': 'text',
 'locked': False,
 'media_only': False,
 'no_follow': True,
 'num_comments': 0,
 'num_crossposts': 0,
 'over_18': False,
 'parent_whitelist_status': 'all_ads',
 'permalink': '/r/news/comments/9cjn3e/gwyneth_paltrow_sus_truquitos_de_belleza_para/',
 'pinned': False,
 'pwls': 6,
 'retrieved_on': 153596161

#### And doing the same for posts.

In [11]:
news_comments[7030]['body']

'Those who do not learn from history, are doomed to reheat it.'

# Converting News headlines to dataframe

Saving to CSV

In [12]:
news_headlines = [li['title'] for li in news_posts]

In [13]:
df_news_head = pd.DataFrame(news_headlines, columns=['headlines'])

In [14]:
df_news_head.drop_duplicates(inplace=True)
df_news_head.shape

(5172, 1)

In this initial search, I designated r/news as 1 and r/upliftingnews as 0. 

In [15]:
df_news_head['subreddit'] = 1

In [16]:
df_news_head.head()

Unnamed: 0,headlines,subreddit
0,Steve Bannon disinvited from New Yorker festiv...,1
1,Brazil's National Museum Fire: What It Means f...,1
2,"Democrats, Eyeing a Majority, Prepare an Inves...",1
3,None of them were Redditors.,1
4,Seahawks Owner Gives $100k To Help Republicans...,1


In [17]:
df_news_head.to_csv('news_headlines.csv')

# Converting Uplifting News headlines to dataframe

Saving to CSV

In [18]:
upnews_headlines = [li['title'] for li in upnews_posts]

In [19]:
df_upnews_head = pd.DataFrame(upnews_headlines)

In [20]:
df_upnews_head.shape

(5000, 1)

In [21]:
df_upnews_head.drop_duplicates(inplace=True)

df_upnews_head.shape

(4761, 1)

In [22]:
df_upnews_head.to_csv('upnews_headlines.csv')

# Converting News comments to dataframe

In [23]:
news_comms = [li['body'] for li in news_comments]

In [24]:
news_comms_auth = [li['author'] for li in news_comments]

In [25]:
df_news_comms = pd.DataFrame(news_comms)
df_news_comms_auth = pd.DataFrame(news_comms_auth)

In [26]:
df_news_comms = pd.concat((df_news_comms, df_news_comms_auth), axis=1)

In [27]:
df_news_comms.shape

(13000, 2)

In [28]:
df_news_comms.drop_duplicates(inplace=True)

df_news_comms.shape

(11698, 2)

In [29]:
df_news_comms.to_csv('news_comments.csv')

# Converting Uplifting News comments to dataframe

In [30]:
upnews_comms = [li['body'] for li in upnews_comments]

In [31]:
upnews_comms_auth = [li['author'] for li in news_comments]

In [32]:
df_upnews_comms = pd.DataFrame(upnews_comms, columns=["comments"])
df_upnews_comms_auth = pd.DataFrame(upnews_comms_auth, columns=['user'])

In [33]:
df_upnews_comms = pd.concat((df_upnews_comms, df_upnews_comms_auth), axis=1)

In [34]:
df_upnews_comms.shape

(13000, 2)

In [35]:
df_upnews_comms.drop_duplicates(inplace=True)

df_upnews_comms.shape

(12968, 2)

In [36]:
df_upnews_comms.to_csv('upnews_comments.csv')

In [37]:
df_upnews_comms['subreddit'] = 1

# BeautifulSoup and text cleanup

Again, this is just for the sake of practice and to see what this will produce.

In [38]:
cvec = CountVectorizer()

In [39]:
example1 = BeautifulSoup(df_news_head['headlines'][0], 'lxml')
example1.get_text()

'Steve Bannon disinvited from New Yorker festival after Jimmy Fallon, Jim Carrey pull out'

In [40]:
letters_only = re.sub("[^a-zA-Z]",           # The pattern to search for
                      " ",                   # The pattern to replace it with
                      example1.get_text() )  # The text to search

In [41]:
def review_to_words(raw_post):
    # Function to convert a raw review to a string of words
    #
    # 1. Remove HTML
    post_text = BeautifulSoup(raw_post).get_text()
    #
    # 2. Remove non-letters        
    letters_only = re.sub("[^a-zA-Z]", " ", post_text)
    #
    # 3. Convert to lower case, split into individual words
    words = letters_only.lower().split()
    #
    # 4. Sets are easier to search through than lists and also remove duplicates
    stops = set(stopwords.words('english'))
    # 
    # 5. Remove stop words
    cleaned_words = [w for w in words if not w in stops]
    #
    # 6. Return the result.
    return (" ".join(cleaned_words))


In [42]:
review_to_words(letters_only)



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


'steve bannon disinvited new yorker festival jimmy fallon jim carrey pull'

#### Note - I did not end up using this as I had iniitially wanted to, but I do intend on revisiting the cleanup to do so.