# Training a model to predict if a post is good or bad

In [1]:
import json
import re
from pprint import pprint

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, RegexpTokenizer
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# GridSearchCV lives in sklearn.model_selection; the old sklearn.grid_search
# module was deprecated in scikit-learn 0.18 and removed in 0.20.
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline


%matplotlib inline



## Load in data

In [2]:
# Load the news posts and keep only the first 3000 rows, artificially limiting
# them so their count more closely matches the upliftingnews posts.
df1 = pd.read_csv('../API-data/news_posts_SA').head(3000)

Note: simply truncating the larger dataset is not the optimal way to deal with unbalanced classes, but it keeps the two sources at roughly the same size.

In [3]:
# Sanity check: after the truncation the frame should hold exactly 3000 rows.
df1.shape

(3000, 8)

In [4]:
# Load the upliftingnews posts (not truncated) and preview the first rows.
df2 = pd.read_csv('../API-data/upnews_posts_SA')
df2.head()

Unnamed: 0.1,Unnamed: 0,compound,headline,neg,neu,pos,news,label
0,0,-0.5267,10 Nigerian Celebrities Who Have Been Sentence...,0.386,0.614,0.0,1,-1
1,1,0.7003,Philadelphias Homeless Are Finding New Hope Th...,0.0,0.58,0.42,1,1
2,2,0.4588,Kindness can change a life,0.0,0.5,0.5,1,1
3,6,0.2003,Shows Up Culinary Therapy A Different Kind o...,0.057,0.867,0.077,1,1
4,7,0.2382,Up to 50 of the worlds coral has died but scie...,0.083,0.799,0.118,1,1


In [5]:
# Row/column count of the upliftingnews frame, for comparison with df1.
df2.shape

(2944, 8)

In [6]:
# Stack both sources row-wise into one frame with a fresh 0..n-1 index.
df = pd.concat([df1, df2], ignore_index=True)
df.shape

(5944, 8)

In [7]:
# Keep only rows whose label is non-zero, then show how many rows each
# remaining label value has.
df = df.loc[df['label'].ne(0)]
df['label'].value_counts()

 1    3279
-1    2665
Name: label, dtype: int64

## Count Vectorizer to assign the strings to boolean arrays

In [8]:
# Binary bag-of-words limited to 1000 features: every headline becomes a 0/1
# vector marking which vocabulary terms it contains.
vect = CountVectorizer(binary=True, max_features=1000)
X = vect.fit_transform(df.headline.values.astype(str))

# Densify only to display the matrix.
X.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [9]:
# Features are the raw headline strings (cast to str); the target is the
# label column.
X = df.headline.values.astype('str')
y = df.label

# Hold out 30% of the rows for testing. random_state pins the shuffle so the
# split -- and every score computed from it -- is reproducible on a re-run.
# (train_test_split is already imported in the first cell.)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [10]:
# Learn the vocabulary from the training headlines only, then encode both
# splits with that same vocabulary.
X_train_vect = vect.fit(X_train).transform(X_train)
X_test_vect = vect.transform(X_test)

In [11]:
# vect.vocabulary_
# commenting this out due to excessive length

## Classification

Using MultinomialNB to fit and score the model.

In [12]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial naive Bayes classifier with default settings.
nb = MultinomialNB()

In [13]:
# Fit on the vectorized training headlines and score on that same training
# data (fit() returns the estimator, so the two calls can be chained).
nb.fit(X_train_vect, y_train).score(X_train_vect, y_train)

0.8663461538461539

In [14]:
# Score on the held-out test split, for comparison with the training score.
nb.score(X_test_vect, y_test)

0.8026905829596412

In [15]:
# Predicted labels for every test headline; reused below for the confusion
# matrix and classification report.
y_pred = nb.predict(X_test_vect)

y_pred

array([ 1,  1,  1, ..., -1,  1,  1])

#### Comparing the predicted y-values to the actual in the form of a confusion matrix

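Also included is `classification_report`, which lists, for each class, the precision (the share of predictions for that class that were correct), the recall (the share of true members of that class that were found), the F1 score (the harmonic mean of the two) and the support (the number of true samples of that class).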

In [16]:
from sklearn.metrics import confusion_matrix, classification_report

# Confusion matrix: rows are the true labels and columns the predicted labels
# (each row sums to that class's support in the report below).
print(confusion_matrix(y_test, y_pred))
print('\n')
# Per-class precision, recall, F1 score and support.
print(classification_report(y_test, y_pred))

[[622 190]
 [162 810]]


             precision    recall  f1-score   support

         -1       0.79      0.77      0.78       812
          1       0.81      0.83      0.82       972

avg / total       0.80      0.80      0.80      1784



## Test the prediction of the NB trained model

Testing to see if a post can be correctly identified as positive or negative

In [17]:
# Pick the headline stored under index label 1 as a sample to classify.
news_h = df.loc[1, 'headline']
news_h

'Brazils National Museum Fire What It Means for Science'

In [18]:
# Encode the single headline with the fitted vectorizer (transform expects an
# iterable of documents, hence the one-element list) and take the one
# predicted label.
news_review_transformed = vect.transform([news_h])
nb.predict(news_review_transformed)[0]

-1

In [19]:
# Pick the headline stored under index label 3250 as a second sample.
upnews_h = df.loc[3250, 'headline']
upnews_h

'600 homes repaired 26 million meals served and thousands helped through JJ Watts relief fund'

In [20]:
# Same check as before: vectorize the single headline and predict its label.
upnews_review_transformed = vect.transform([upnews_h])
nb.predict(upnews_review_transformed)[0]

1

In [21]:
# Pick the headline stored under index label 3703 as a third sample.
upnews_h2 = df.loc[3703, 'headline']
upnews_h2

'Free tuition for medical students Its about time Hopefully more schools will follow'

In [22]:
# Vectorize the third sample headline and predict its label.
positive_review_transformed2 = vect.transform([upnews_h2])
nb.predict(positive_review_transformed2)[0]

1

Judged solely against the labels assigned by VADER, the model's predictions are relatively accurate. However, that does not mean the underlying VADER sentiment labels are themselves accurate.

## Try some other models

Using tf-idf vectorizer instead of count vectorizer.

In [23]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [24]:
# tf-idf over unigrams through trigrams, with English stop words removed;
# min_df=10 drops terms found in fewer than 10 headlines and max_df=.95 drops
# terms found in more than 95% of them.
tfidf = TfidfVectorizer(stop_words='english', min_df=10, max_df=.95, ngram_range=(1, 3))

In [25]:
# Fit on every headline in df (not just the training split); this matrix
# feeds the SVD and cosine-similarity exploration below.
term_mat = tfidf.fit_transform(df['headline'].values.astype('str'))

In [26]:
# Number of terms kept by the vectorizer. vocabulary_ maps each term to its
# column index, so its length equals the feature count; unlike
# get_feature_names() (removed in scikit-learn 1.2) it is available in every
# release.
len(tfidf.vocabulary_)

1176

In [27]:
# pd.SparseDataFrame was deprecated in pandas 0.25 and removed in 1.0, so build
# an ordinary DataFrame from the densified tf-idf matrix instead (roughly
# 6k x 1.2k floats here, which fits comfortably in memory). Absent terms are
# stored as 0.0 rather than NaN.
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# when it exists and fall back to the old name on older releases.
feature_names = (
    tfidf.get_feature_names_out()
    if hasattr(tfidf, "get_feature_names_out")
    else tfidf.get_feature_names()
)
term_df = pd.DataFrame(term_mat.toarray(), columns=feature_names)

In [28]:
# Replace any missing entries with 0 so the matrix is fully numeric before
# the decomposition; rebinding the name avoids the in-place mutation.
term_df = term_df.fillna(0)

In [29]:
# Reduce the tf-idf term matrix to 100 latent components. TruncatedSVD's
# default solver is randomized, so random_state is fixed to make the
# components (and the loadings inspected below) reproducible across runs.
SVD = TruncatedSVD(n_components=100, random_state=42)
svd_matrix = SVD.fit_transform(term_df)
svd_matrix.shape

(5944, 100)

In [30]:
# Name the columns component_1 ... component_N, deriving N from the SVD output
# itself so the names stay in sync if n_components is ever changed.
component_names = ["component_" + str(i + 1) for i in range(svd_matrix.shape[1])]
svd_df = pd.DataFrame(svd_matrix,
                      columns=component_names)

In [31]:
# Preview the per-headline component scores.
svd_df.head()

Unnamed: 0,component_1,component_2,component_3,component_4,component_5,component_6,component_7,component_8,component_9,component_10,...,component_91,component_92,component_93,component_94,component_95,component_96,component_97,component_98,component_99,component_100
0,0.000831,0.047526,0.166058,-0.053914,0.303857,-0.230784,-0.212336,-0.059711,-0.041123,-0.040243,...,0.041332,-0.049308,-0.00016,-0.080288,0.103986,0.003051,0.00455,0.031368,-0.004892,-0.036757
1,0.000165,0.014917,0.037146,-0.000212,-0.008844,0.000187,0.002786,-0.000619,0.070699,0.021199,...,0.005693,-0.017511,-0.008939,-0.007358,0.014048,0.006628,-0.009145,0.011235,-0.021072,-0.005543
2,0.001376,0.047268,0.142053,-0.134252,0.158789,0.271589,0.096239,0.003967,-0.036412,-0.009832,...,-0.043518,-0.058715,0.022017,0.011769,-0.055975,-0.034812,-0.009152,-0.030072,-0.062124,0.026225
3,0.00048,0.011787,0.042641,0.004642,-0.001823,-0.022365,-0.012399,0.007786,0.020372,0.030803,...,0.032363,-0.00749,-0.01384,0.037017,-0.023748,0.05477,-0.023642,0.062422,0.069843,-0.084336
4,0.000451,0.010929,0.035688,-0.009113,-0.025787,-0.007924,-0.013723,-0.024696,0.003196,0.004363,...,-0.015059,0.004037,0.048332,-0.043391,0.019181,-0.043005,-0.026261,0.010088,0.009656,-0.023244


In [32]:
# Term loadings: one row per tf-idf term, one column per SVD component.
loadings = pd.DataFrame(
    SVD.components_.T,
    index=term_df.columns,
    columns=component_names,
)

In [33]:
# Absolute loadings, so terms can be ranked by strength regardless of sign.
loadings['abs_component_1'] = loadings['component_1'].abs()
loadings['abs_component_2'] = loadings['component_2'].abs()

In [34]:
# Ten terms with the largest absolute loading on component 1, shown with
# their signed loading.
(
    loadings
    .sort_values(by='abs_component_1', ascending=False)
    [['component_1']]
    .head(10)
)

Unnamed: 0,component_1
worth,0.410904
biography,0.408917
net worth,0.408482
biography net,0.407499
biography net worth,0.407499
net,0.404583
king,0.010694
neutrality,0.009383
net neutrality,0.009383
bio biography,0.009179


In [35]:
# Ten terms with the largest absolute loading on component 2, shown with
# their signed loading.
(
    loadings
    .sort_values(by='abs_component_2', ascending=False)
    [['component_2']]
    .head(10)
)

Unnamed: 0,component_2
best,0.916224
news,0.156035
best news,0.110157
help,0.08139
new,0.080433
man,0.076859
trump,0.067857
free,0.054312
service,0.054185
services,0.052638


In [36]:
# Pairwise cosine similarity between all headlines in the reduced SVD space
# (with a single argument the matrix is compared against itself).
cos_sim_mat = cosine_similarity(svd_matrix)
cos_sim_mat.shape

(5944, 5944)

In [37]:
# Row positions of the 10 headlines most similar to headline 1, most similar
# first: argsort is ascending, and [:-11:-1] walks backwards from the end,
# taking the last ten entries. The headline itself is included.
np.argsort(cos_sim_mat[1])[:-11:-1]

array([  1, 535, 479, 453, 435,  58, 517, 353, 511, 464])

In [38]:
# The matching similarity scores for those 10 headlines, in descending order.
np.sort(cos_sim_mat[1])[:-11:-1]

array([1.        , 0.99747722, 0.99685205, 0.99614323, 0.99614323,
       0.9949985 , 0.98575963, 0.96819086, 0.96819086, 0.9445423 ])

In [39]:
# The argsort result holds row *positions* in cos_sim_mat (which follow the
# row order of df), so select with .iloc. Label-based .loc only agrees with
# this while df's index is exactly 0..n-1, and would pick the wrong rows (or
# raise KeyError) once filtering leaves gaps in the index.
df.iloc[np.argsort(cos_sim_mat[50])[:-11:-1], :]

Unnamed: 0.1,Unnamed: 0,compound,headline,neg,neu,pos,news,label
50,92,0.34,Oil prices could surge over 90 and create stic...,0.12,0.602,0.278,0,1
2495,5312,-0.2057,Scott Morrison says Paris targets still on des...,0.168,0.832,0.0,0,-1
2976,6327,-0.25,Pompeos NK visit cancelled because of demands ...,0.143,0.857,0.0,0,-1
3675,1150,0.5719,Alexa Bliss Says Evolution Is What The Womens ...,0.0,0.709,0.291,1,1
616,1303,-0.3182,AntiSemitism row Corbyn has been misinterprete...,0.223,0.777,0.0,0,-1
121,232,-0.4767,Cocaine smuggling is corrupting Antwerp politi...,0.307,0.693,0.0,0,-1
645,1360,0.2023,Former Advisor Of US Treasury Says Blockchain ...,0.0,0.833,0.167,0,1
2214,4724,0.4939,Leak detected aboard space station but NASA sa...,0.116,0.548,0.336,0,1
5234,3826,0.5267,Kudlow says economic growth could top 4 for a ...,0.0,0.694,0.306,1,1
557,1181,0.5106,Theresa May says a 2nd Brexit referendum would...,0.0,0.798,0.202,0,1


In [41]:
# tf-idf features feeding a random forest. random_state is fixed to remove
# the forest's own run-to-run randomness from the grid-search results.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Hyper-parameter grid for the forest; the "classifier__" prefix routes each
# entry to the pipeline step of that name.
params = {
    "classifier__n_estimators": [50, 100, 150],
    "classifier__max_depth": [3, None],
    "classifier__max_features": [10, 25, 50],
    "classifier__min_samples_leaf": [1, 3, 10],
    "classifier__criterion": ["gini", "entropy"],
}

In [42]:
# Search the whole parameter grid with cross-validation on the training
# split, then show the winning combination.
gs = GridSearchCV(pipeline, params)
gs_fit = gs.fit(X_train, y_train)
gs.best_params_

{'classifier__criterion': 'entropy',
 'classifier__max_depth': None,
 'classifier__max_features': 10,
 'classifier__min_samples_leaf': 1,
 'classifier__n_estimators': 150}

In [43]:
# Score the fitted grid search on the held-out test split.
gs.score(X_test, y_test)

0.8133408071748879

This scored slightly better than with MultinomialNB. I will test some new headlines (not in the pulled dataset) against this model.

In [54]:
# Hand-picked headlines that are not part of the pulled dataset, used as a
# quick sanity check of the fitted model.
new_news = [
    "OxyContin creator being sued for 'significant role in causing opioid epidemic'",
    'Dallas Officer Arrested In Shooting Death Of Botham Shem Jean',
    # A stray leading "G" (previously "G14-year-old ...") has been removed.
    '14-year-old charged in rape, murder of 83-year-old neighbor',
]

new_upnews = [
    "The Sniping Scientists Whose Work Saved Millions of Lives",
    "NICU volunteer donates a million dollars to local baby unit",
    "He spent 27 years wrongly convicted of murder. He wants to spend the rest encouraging inmates to read",
]

In [55]:
# Predict labels for the three uplifting headlines; the raw strings go in
# directly because the pipeline's tf-idf step vectorizes them.
gs.predict(new_upnews)

array([ 1,  1, -1])

In [56]:
# Predict labels for the three regular news headlines.
gs.predict(new_news)

array([ 1, -1, -1])

# Interpretation/Conclusion

Not the best results, but passable. Of the six new headlines, the model got two wrong: it predicted that the third upliftingnews post is negative and that the first news post is positive.