In [1]:
import pandas as pd
import nltk
import math
import numpy as np
import json
import re
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.grid_search import GridSearchCV
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
from pprint import pprint

%matplotlib inline

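# Label encoding used for the `news` column in the cells below:
#   NEWS   = 0  (headlines taken from news_posts)
#   UPNEWS = 1  (headlines taken from upnews_posts)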



In [2]:
# Read the JSON dump of news posts (path is relative to the notebook's working
# directory). The context manager closes the file handle as soon as the text
# has been read instead of leaving it open for the life of the kernel.
with open('./API-data/ps_news_posts') as news_file:
    json_data = news_file.read()

# Parsed posts; later cells read the 'title' field of each entry.
news_posts = json.loads(json_data)

FileNotFoundError: [Errno 2] No such file or directory: './API-data/ps_news_posts'

In [None]:
# Read the JSON dump of upnews posts (path is relative to the notebook's
# working directory). The context manager closes the file handle as soon as
# the text has been read instead of leaving it open for the life of the kernel.
with open('./API-data/ps_upnews_posts') as upnews_file:
    json_data = upnews_file.read()

# Parsed posts; later cells read the 'title' field of each entry.
upnews_posts = json.loads(json_data)

In [None]:
# Collect the 'title' of every upnews post, in file order.
upnews_headlines = [post['title'] for post in upnews_posts]

# Single-column frame of headlines with repeated titles removed
# (the first occurrence of each title is the one that is kept).
df_upnews = pd.DataFrame(upnews_headlines, columns=['headlines']).drop_duplicates()

In [None]:
# Collect the 'title' of every news post, in file order.
news_headlines = [post['title'] for post in news_posts]

# Single-column frame of headlines with repeated titles removed
# (the first occurrence of each title is the one that is kept).
df_news = pd.DataFrame(news_headlines, columns=['headlines']).drop_duplicates()

In [None]:
# Tag every row with its source frame: 1 for upnews headlines, 0 for news headlines.
for frame, source_flag in ((df_upnews, 1), (df_news, 0)):
    frame['news'] = source_flag

In [None]:
# Stack the upnews rows on top of the news rows and renumber the index 0..n-1.
# NOTE: `df` is rebound further down by `pd.DataFrame.from_records(results)`,
# so this combined frame is only reachable until that cell runs.
df = pd.concat([df_upnews, df_news], ignore_index=True)

In [None]:
# Preview the first rows of the frame currently bound to `df`.
df.head()

In [None]:
sia = SIA()

# One record per news headline: the analyzer's polarity scores plus the
# headline text itself under the 'headline' key.
results = [
    {**sia.polarity_scores(headline), 'headline': headline}
    for headline in news_headlines
]

In [None]:
#results

In [None]:
# Build one row per scored news headline from the list of score records.
# NOTE: this rebinds `df`, replacing the frame that the earlier pd.concat
# cell assigned to the same name.
df = pd.DataFrame.from_records(results)
df.head()

In [None]:
# Clean the headline text.
#
# Order matters: the subreddit references and URLs are removed FIRST, while
# their '/' and ':' characters are still present. Stripping punctuation first
# (as this cell originally did) deletes the slashes, so '/r/News' and
# '/r/Upliftingnews' could never match afterwards and URLs were only matched
# in mangled form.
#
# Patterns are raw strings so '\s' is a regex escape rather than an invalid
# Python string escape.
# NOTE(review): the subreddit patterns are case-sensitive - confirm they match
# the casing that actually appears in the data.
df.headline = df.headline.map(lambda x: re.sub(r'/r/News', ' ', x))
df.headline = df.headline.map(lambda x: re.sub(r'/r/Upliftingnews', ' ', x))
df.headline = df.headline.map(lambda x: re.sub(r'http[^\s]*', ' ', x))
# Finally drop everything that is not an ASCII letter, digit or whitespace.
df.headline = df.headline.map(lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', x))


In [None]:
# Bucket the compound score into three classes:
#   compound >  0.2 ->  1 (positive)
#   compound < -0.2 -> -1 (negative)
#   otherwise       ->  0 (neutral)
is_positive = df['compound'] > 0.2
is_negative = df['compound'] < -0.2

df['label'] = 0
df.loc[is_positive, 'label'] = 1
df.loc[is_negative, 'label'] = -1

df.head()

In [None]:
# Number of headlines in each sentiment class.
counts = df['label'].value_counts()
print(counts)

In [None]:
# Show up to five example headlines for the positive and the negative class.
for heading, label_value in (("Positive headlines:\n", 1), ("\nNegative headlines:\n", -1)):
    print(heading)
    sample = df.loc[df['label'] == label_value, 'headline'].head(5).tolist()
    pprint(sample, width=200)

In [None]:
# Bar chart of the percentage of headlines in each sentiment class.
#
# The class order is pinned explicitly and absent classes are filled with 0,
# so there are always exactly three bars and each tick label is guaranteed to
# sit under the bar it describes (the hard-coded labels previously relied on
# all three classes being present in the data).
label_order = [-1, 0, 1]
label_names = ['Negative', 'Neutral', 'Positive']

fig, ax = plt.subplots(figsize=(8, 8))

counts = (df.label.value_counts(normalize=True) * 100).reindex(label_order, fill_value=0)

# y is passed as a plain array of bar heights, already in label_order.
sns.barplot(x=counts.index, y=counts.values, order=label_order, ax=ax)

ax.set_xticklabels(label_names)
ax.set_ylabel("Percentage")

plt.show()

In [None]:
# Tokenizer that keeps runs of word characters and drops everything else.
tokenizer = RegexpTokenizer(pattern=r'\w+')

# NLTK's English stop-word list, used by process_text to filter tokens.
stop_words = stopwords.words('english')

In [None]:
def process_text(headlines):
    """Tokenize headlines into one flat list of lower-cased, non-stop-word tokens.

    Relies on the module-level ``tokenizer`` (an object with a ``tokenize``
    method) and ``stop_words`` (a collection of lower-case words to drop).

    Args:
        headlines: iterable of headline strings.

    Returns:
        list of str: every surviving token, lower-cased, in input order;
        repeated tokens are kept so the list can be fed to a frequency count.
    """
    # Build the lookup once per call: set membership is O(1) per token,
    # whereas testing against the stop-word list scans it for every token.
    stop_set = set(stop_words)

    tokens = []
    for line in headlines:
        # Lower-case each token exactly once, then filter on the lowered form.
        lowered = (tok.lower() for tok in tokenizer.tokenize(line))
        tokens.extend(tok for tok in lowered if tok not in stop_set)

    return tokens

In [None]:
# Headlines labelled positive (label == 1).
pos_lines = df.loc[df.label == 1, 'headline'].tolist()

# Token frequencies across all positive headlines.
pos_tokens = process_text(pos_lines)
pos_freq = nltk.FreqDist(pos_tokens)

# Ten most frequent tokens as a small table.
pd.DataFrame(pos_freq.most_common(10), columns=['word', 'count'])

In [None]:
# Headlines labelled negative (label == -1).
neg_lines = df.loc[df.label == -1, 'headline'].tolist()

# Token frequencies across all negative headlines.
neg_tokens = process_text(neg_lines)
neg_freq = nltk.FreqDist(neg_tokens)

# Ten most frequent tokens as a small table.
pd.DataFrame(neg_freq.most_common(10), columns=['word', 'count'])

In [None]:
# Token counts for the positive headlines, most frequent first.
# `y_val` is reused by the log-log cell that follows.
y_val = [count for _token, count in pos_freq.most_common()]

fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(y_val)

ax.set_xlabel("Words")
ax.set_ylabel("Frequency")
ax.set_title("Word Frequency Distribution (Positive)")
plt.show()

In [None]:
# Sum the counts in consecutive groups of four ranks (a trailing incomplete
# group is ignored) and take the natural log of each group total.
group_size = 4
n_groups = len(y_val) // group_size
y_final = [
    math.log(sum(y_val[g * group_size:(g + 1) * group_size]))
    for g in range(n_groups)
]

# Log of the 1-based group position, so the curve is drawn on log-log axes.
x_val = [math.log(position) for position in range(1, len(y_final) + 1)]

fig = plt.figure(figsize=(10, 5))

plt.title("Word Frequency Distribution (Positive)")
plt.xlabel("Words (Log)")
plt.ylabel("Frequency (Log)")
plt.plot(x_val, y_final)
plt.show()

In [None]:
# Token counts for the negative headlines, most frequent first.
# This rebinds `y_val`; the log-log cell that follows reads it.
y_val = [count for _token, count in neg_freq.most_common()]

fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(y_val)

ax.set_xlabel("Words")
ax.set_ylabel("Frequency")
ax.set_title("Word Frequency Distribution (Negative)")
plt.show()

In [None]:
# Sum the counts in consecutive groups of three ranks (a trailing incomplete
# group is ignored) and take the natural log of each group total. Stop at the
# first group whose total is zero, since log(0) is undefined.
group_size = 3
y_final = []
for group_index in range(len(y_val) // group_size):
    group_start = group_index * group_size
    group_total = sum(y_val[group_start:group_start + group_size])
    if group_total == 0:
        break
    y_final.append(math.log(group_total))

# Log of the 1-based group position, so the curve is drawn on log-log axes.
x_val = [math.log(position) for position in range(1, len(y_final) + 1)]

fig = plt.figure(figsize=(10, 5))

plt.title("Word Frequency Distribution (Negative)")
plt.xlabel("Words (Log)")
plt.ylabel("Frequency (Log)")
plt.plot(x_val, y_final)
plt.show()

In [None]:
#df.to_csv('reddit_headlines_sentiment.csv', index=False)

In [None]:
sia = SIA()

# One record per upnews headline: the analyzer's polarity scores plus the
# headline text itself under the 'headline' key.
results2 = [
    {**sia.polarity_scores(headline), 'headline': headline}
    for headline in upnews_headlines
]

In [None]:
#results2

In [None]:
# Build one row per scored upnews headline from the list of score records.
df2 = pd.DataFrame.from_records(results2)
df2.head()

In [None]:
# Bucket the compound score into three classes:
#   compound >  0.2 ->  1 (positive)
#   compound < -0.2 -> -1 (negative)
#   otherwise       ->  0 (neutral)
is_positive2 = df2['compound'] > 0.2
is_negative2 = df2['compound'] < -0.2

df2['label'] = 0
df2.loc[is_positive2, 'label'] = 1
df2.loc[is_negative2, 'label'] = -1

df2.head()

In [None]:
# Clean the headline text, mirroring the cleaning applied to `df.headline`.
#
# Fixes relative to the previous version of this cell:
#   * The punctuation pattern was '^a-zA-Z0-9' with no square brackets, which
#     only matches the literal text "a-zA-Z0-9" at the start of a string, so
#     nothing was ever stripped. It is now the character class
#     [^a-zA-Z0-9\s], the same one used for `df.headline`.
#   * Subreddit references and URLs are removed first, while their '/' and ':'
#     characters are still present; punctuation is stripped last.
#   * Patterns are raw strings so '\s' is a regex escape rather than an
#     invalid Python string escape.
# NOTE(review): the subreddit patterns are case-sensitive - confirm they match
# the casing that actually appears in the data.
df2.headline = df2.headline.map(lambda x: re.sub(r'/r/News', ' ', x))
df2.headline = df2.headline.map(lambda x: re.sub(r'/r/Upliftingnews', ' ', x))
df2.headline = df2.headline.map(lambda x: re.sub(r'http[^\s]*', ' ', x))
# Finally drop everything that is not an ASCII letter, digit or whitespace.
df2.headline = df2.headline.map(lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', x))


In [None]:
# Number of upnews headlines in each sentiment class.
counts2 = df2['label'].value_counts()
print(counts2)

In [None]:
# Show up to five example upnews headlines for the positive and the negative class.
for heading, label_value in (("Positive headlines:\n", 1), ("\nNegative headlines:\n", -1)):
    print(heading)
    sample = df2.loc[df2['label'] == label_value, 'headline'].head(5).tolist()
    pprint(sample, width=200)

In [None]:
# Bar chart of the percentage of upnews headlines in each sentiment class.
#
# The class order is pinned explicitly and absent classes are filled with 0,
# so there are always exactly three bars and each tick label is guaranteed to
# sit under the bar it describes (the hard-coded labels previously relied on
# all three classes being present in the data).
label_order = [-1, 0, 1]
label_names = ['Negative', 'Neutral', 'Positive']

fig, ax = plt.subplots(figsize=(8, 8))

counts2 = (df2.label.value_counts(normalize=True) * 100).reindex(label_order, fill_value=0)

# y is passed as a plain array of bar heights, already in label_order.
sns.barplot(x=counts2.index, y=counts2.values, order=label_order, ax=ax)

ax.set_xticklabels(label_names)
ax.set_ylabel("Percentage")

plt.show()