In [1]:
import pandas as pd
import nltk
import json
import re
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
from pprint import pprint



### Same process as 2b - skip to next notebook for consolidation and cleanup.

In [2]:
# Read the raw posts dump and parse it as JSON.
# A context manager closes the file handle deterministically (the bare
# open(...).read() form left that to the garbage collector), and the encoding is
# pinned to UTF-8 so non-ASCII titles decode the same way on every platform.
with open('../API-data/ps_upnews_posts', encoding='utf-8') as posts_file:
    json_data = posts_file.read()
upnews_posts = json.loads(json_data)

In [3]:
# Pull the title out of every post, wrap the titles in a one-column frame, and
# keep only the first occurrence of each distinct headline.
upnews_headlines = [post['title'] for post in upnews_posts]

df_upnews = pd.DataFrame(upnews_headlines, columns=['headlines'])
df_upnews = df_upnews.drop_duplicates()

In [4]:
# Add a constant `news` column set to 1 on every row.
# NOTE(review): presumably a source marker for later consolidation -- confirm.
df_upnews = df_upnews.assign(news=1)

In [5]:
# Number of raw headlines in the list (duplicates are still included here;
# only df_upnews had them dropped).
len(upnews_headlines)

5024

In [6]:
# Score every raw headline with VADER and store the headline text next to its
# polarity scores, one dict per headline.
# NOTE(review): this walks upnews_headlines (the un-deduplicated list), not the
# deduplicated df_upnews, so a repeated headline is scored once per occurrence.
sia = SIA()
results = [
    {**sia.polarity_scores(headline), 'headline': headline}
    for headline in upnews_headlines
]

In [7]:
# Peek at the first five scored records to sanity-check the structure.
results[:5]

[{'neg': 0.386,
  'neu': 0.614,
  'pos': 0.0,
  'compound': -0.5267,
  'headline': '10 Nigerian Celebrities Who Have Been Sentenced To Prison'},
 {'neg': 0.0,
  'neu': 0.58,
  'pos': 0.42,
  'compound': 0.7003,
  'headline': 'Philadelphia’s Homeless Are Finding New Hope Thanks to This Organization'},
 {'neg': 0.0,
  'neu': 0.5,
  'pos': 0.5,
  'compound': 0.4588,
  'headline': 'Kindness can change a life'},
 {'neg': 0.0,
  'neu': 1.0,
  'pos': 0.0,
  'compound': 0.0,
  'headline': 'Deaf toddler hears for the first time'},
 {'neg': 0.0,
  'neu': 0.653,
  'pos': 0.347,
  'compound': 0.1511,
  'headline': 'It’s not empty anymore.'}]

In [8]:
# Turn the list of score dicts into a frame (one row per scored headline) and
# show its dimensions.
df = pd.DataFrame(results)
df.shape

(5024, 5)

In [9]:
# Dimensions of the de-duplicated headline frame, for comparison with df,
# which was built from the full (non-deduplicated) results list.
df_upnews.shape

(4784, 2)

In [10]:
# Add a constant `news` column set to 1 on every scored row.
# NOTE(review): presumably a source marker for later consolidation -- confirm.
df = df.assign(news=1)

In [11]:
# Preview the first rows of the scored frame, including the new `news` column.
df.head()

Unnamed: 0,compound,headline,neg,neu,pos,news
0,-0.5267,10 Nigerian Celebrities Who Have Been Sentence...,0.386,0.614,0.0,1
1,0.7003,Philadelphia’s Homeless Are Finding New Hope T...,0.0,0.58,0.42,1
2,0.4588,Kindness can change a life,0.0,0.5,0.5,1
3,0.0,Deaf toddler hears for the first time,0.0,1.0,0.0,1
4,0.1511,It’s not empty anymore.,0.0,0.653,0.347,1


In [12]:
# Clean the headline text.
#
# Order matters: URLs and subreddit mentions must be removed while their
# punctuation (':', '/', '.') is still in the string. Running the punctuation
# filter first strips the slashes, after which '/r/News' and
# '/r/Upliftingnews' can never match.
#
# The patterns are raw strings so that '\s' reaches the regex engine as a
# character class instead of being an invalid Python string escape.
# NOTE(review): the subreddit patterns are matched case-sensitively, exactly
# as written; confirm whether other capitalisations should be stripped too.
_HEADLINE_CLEANUP = (
    (r'http[^\s]*', ' '),         # URL-like tokens, up to the next whitespace
    (r'/r/News', ' '),            # subreddit mention
    (r'/r/Upliftingnews', ' '),   # subreddit mention
    (r'[^a-zA-Z0-9\s]', ''),      # finally drop everything but alphanumerics/whitespace
)


def _clean_headline(text):
    """Apply each cleanup substitution to one headline, in order."""
    for pattern, replacement in _HEADLINE_CLEANUP:
        text = re.sub(pattern, replacement, text)
    return text


df['headline'] = df['headline'].map(_clean_headline)

In [13]:
# Bucket each headline by its compound score: above 0.2 -> 1, below -0.2 -> -1,
# everything in between (bounds included) stays 0.
is_positive = df['compound'] > 0.2
is_negative = df['compound'] < -0.2

df['label'] = 0
df.loc[is_positive, 'label'] = 1
df.loc[is_negative, 'label'] = -1
df.head()

Unnamed: 0,compound,headline,neg,neu,pos,news,label
0,-0.5267,10 Nigerian Celebrities Who Have Been Sentence...,0.386,0.614,0.0,1,-1
1,0.7003,Philadelphias Homeless Are Finding New Hope Th...,0.0,0.58,0.42,1,1
2,0.4588,Kindness can change a life,0.0,0.5,0.5,1,1
3,0.0,Deaf toddler hears for the first time,0.0,1.0,0.0,1,0
4,0.1511,Its not empty anymore,0.0,0.653,0.347,1,0


In [14]:
# Tally how many headlines ended up in each label bucket and print the tally.
counts = df['label'].value_counts()
print(counts)

 0    2080
 1    1884
-1    1060
Name: label, dtype: int64


In [15]:
# Keep only the rows whose label is non-zero, then show the remaining dimensions.
has_polarity = df['label'].ne(0)
df = df[has_polarity]
df.shape

(2944, 7)

In [16]:
# Write the labelled, non-neutral headlines to disk as CSV in the current
# working directory. The target name has no file extension, and the frame's
# index is written as the first column (pandas' default for to_csv).
df.to_csv('upnews_posts_SA')