# NLP Analysis Script
## What it does
- Takes in a set of articles in a csv file
- Using a previous database of articles, finds keywords of each new article
- Finds similarities and sentiments of each article
- Outputs CSV of all of this data

In [34]:
# import the necessary libraries

import pandas as pd
import os, os.path
import re
import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.sentiment import SentimentIntensityAnalyzer
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfTransformer
from scipy.sparse import coo_matrix
from numpy import mean

# load the kaggle news, expose the article body under the name 'text',
# and keep only the first 100 rows
news = (
    pd.read_csv("kaggle_archive/articles1.csv")
    .rename(columns={"content": 'text'})
    .head(100)
)

# word count per article: number of pieces obtained by splitting on single spaces
news['word_count'] = [len(article.split(" ")) for article in news['text']]

# English stop words from nltk, extended with any custom words listed in new_words
new_words = []
stop_words = set(stopwords.words("english")).union(new_words)

# cleaned article texts, one space-joined string per row of news
corpus = []

# neither helper depends on the article being processed, so build them once
ps = PorterStemmer()  # NOTE(review): created but never applied to the tokens below
lem = WordNetLemmatizer()

for i in range(len(news)):

    # replace every character that is not an ASCII letter with a space, then lowercase
    cleaned = re.sub('[^a-zA-Z]', ' ', news['text'][i]).lower()

    # tag removal
    # NOTE(review): only letters and spaces are left at this point, so this
    # pattern (which needs '&' and ';') cannot match here
    cleaned = re.sub("&lt;/?.*?&gt;", " &lt;&gt; ", cleaned)

    # collapse runs of digits / non-word characters into a single space
    cleaned = re.sub("(\\d|\\W)+", " ", cleaned)

    # tokenize on whitespace, drop stop words, lemmatise what remains
    tokens = [lem.lemmatize(word) for word in cleaned.split() if word not in stop_words]

    corpus.append(" ".join(tokens))

# build the 1- to 3-gram vocabulary over the cleaned corpus
# (terms appearing in more than 80% of documents are ignored, at most 10000 kept)
# stop_words is handed over as a list: recent scikit-learn releases validate
# constructor parameters and reject a set for this argument
cv = CountVectorizer(
    max_df=.8,
    stop_words=sorted(stop_words),
    max_features=10000,
    ngram_range=(1, 3),
)
X = cv.fit_transform(corpus)

# learn the idf weights from the corpus-wide term counts
tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
tfidf_transformer.fit(X)

# column index -> term lookup for the fitted vocabulary
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); fall back to the old name on older installs
if hasattr(cv, "get_feature_names_out"):
    feature_names = list(cv.get_feature_names_out())
else:
    feature_names = cv.get_feature_names()

def checkForRepeats(keywords):
    """Drop every keyword that is contained in another entry of the list.

    An entry is removed when ``entry in other`` is true for any entry at a
    different position (substring containment for strings). Identical
    entries contain each other, so all copies of a duplicate are removed.

    Args:
        keywords: list of keyword strings.

    Returns:
        A new list with the surviving keywords in their original order.
    """
    survivors = []
    for pos, candidate in enumerate(keywords):
        contained_elsewhere = any(
            candidate in other
            for other_pos, other in enumerate(keywords)
            if other_pos != pos
        )
        if not contained_elsewhere:
            survivors.append(candidate)
    return survivors

# number of articles (taken from the start of the cleaned corpus) that get
# similarity, keyword and sentiment results
n_articles = 10

# number of top tf-idf terms requested per article; the final keyword list can
# be shorter because checkForRepeats drops terms contained in another term
n_keywords = 5

# cosine similarity of the articles we want to look at
def get_cosine_sim(*strs):
    """Return the pairwise cosine-similarity matrix of the given texts.

    Each text is turned into a count vector by ``get_vectors``; entry
    ``[a][b]`` of the result is the cosine similarity between text ``a`` and
    text ``b``.
    """
    count_rows = list(get_vectors(*strs))
    return cosine_similarity(count_rows)
    
def get_vectors(*strs):
    """Return one dense token-count vector per input text.

    The vocabulary is learned from exactly the texts passed in, so the
    columns of the result are only comparable within a single call.

    Args:
        *strs: the texts to vectorize.

    Returns:
        2-D array with one row per text and one column per vocabulary term.
    """
    text = list(strs)
    # The documents must not be passed to the constructor: its first parameter
    # is ``input`` (how to read the data, default 'content'), and newer
    # scikit-learn versions make constructor arguments keyword-only, so
    # CountVectorizer(text) raises a TypeError there.
    vectorizer = CountVectorizer()
    return vectorizer.fit_transform(text).toarray()

# pairwise cosine similarity over the first n_articles cleaned articles
sims = get_cosine_sim(*corpus[:n_articles])

# for each article, the mean similarity to all the other articles
# (its own entry on the diagonal is left out of the mean)
sim = [np.mean(np.delete(sims[i], i)) for i in range(n_articles)]

# sentiment analyzer used for the per-article polarity scores
sia = SentimentIntensityAnalyzer()

# function for sorting tf_idf in descending order
def sort_coo(coo_matrix):
    """Return ``(column, value)`` pairs of a COO matrix, largest value first.

    Pairs are built from the matrix's ``col`` and ``data`` attributes and
    ordered by value, then by column index, both descending.
    """
    pairs = list(zip(coo_matrix.col, coo_matrix.data))
    pairs.sort(key=lambda pair: (pair[1], pair[0]), reverse=True)
    return pairs

# get the feature names and tf-idf score of top n items
def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """Map the first ``topn`` entries of ``sorted_items`` to named scores.

    Args:
        feature_names: indexable collection; position ``idx`` holds the term
            for column ``idx``.
        sorted_items: sequence of ``(idx, score)`` pairs, already in the
            order they should be taken.
        topn: number of leading pairs to keep.

    Returns:
        dict from term to its score rounded to 3 decimals, in the order the
        pairs were given.
    """
    return {
        feature_names[idx]: round(score, 3)
        for idx, score in sorted_items[:topn]
    }

# accumulators for the per-article results
words = []
sentiments = []
conf = []

# keywords and sentiment for each of the first n_articles articles
for i in range(n_articles):

    # cleaned text of the current article
    doc = corpus[i]

    # tf-idf weights of this article over the fitted vocabulary
    tf_idf_vector = tfidf_transformer.transform(cv.transform([doc]))

    # highest-weighted terms first, then keep the best n_keywords of them
    sorted_items = sort_coo(tf_idf_vector.tocoo())
    keywords = extract_topn_from_vector(feature_names, sorted_items, n_keywords)

    # scores of the selected terms; overwritten on every pass and not stored per article
    conf = list(keywords.values())

    # selected terms, minus any term that is contained in another selected term
    keys = checkForRepeats(list(keywords.keys()))

    # compound polarity score of the cleaned text
    sent = sia.polarity_scores(doc).get('compound')

    # add to lists
    words.append(keys)
    sentiments.append(sent)

# put the results next to the article rows; news rows without a matching
# result row are presumably left as NaN by the index-aligned concat
outputs = pd.DataFrame({'sentiment': sentiments, 'words': words, 'similarity': sim})
new = pd.concat([news, outputs], axis=1)




In [35]:
# preview the first five rows of the combined articles + results frame
new.head(5)

Unnamed: 0.1,Unnamed: 0,id,title,publication,author,date,year,month,url,text,word_count,sentiment,words,similarity
0,0,17283,House Republicans Fret About Winning Their Hea...,New York Times,Carl Hulse,2016-12-31,2016.0,12.0,,WASHINGTON — Congressional Republicans have...,920,0.6497,"[republican, house, administration, health, su...",0.102034
1,1,17284,Rift Between Officers and Residents as Killing...,New York Times,Benjamin Mueller and Al Baker,2017-06-19,2017.0,6.0,,"After the bullet shells get counted, the blood...",4905,-0.9999,"[detective, th precinct, mr fernandez]",0.194279
2,2,17285,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...",New York Times,Margalit Fox,2017-01-06,2017.0,1.0,,"When Walt Disney’s “Bambi” opened in 1942, cri...",2521,0.9888,"[mr wong, disney, artist, tyrus]",0.188907
3,3,17286,"Among Deaths in 2016, a Heavy Toll in Pop Musi...",New York Times,William McDonald,2017-04-10,2017.0,4.0,,"Death may be the great equalizer, but it isn’t...",2212,-0.869,"[death, died, music, star, palmer]",0.16436
4,4,17287,Kim Jong-un Says North Korea Is Preparing to T...,New York Times,Choe Sang-Hun,2017-01-02,2017.0,1.0,,"SEOUL, South Korea — North Korea’s leader, ...",741,0.9789,"[korea, north, ballistic missile]",0.16177


In [33]:
# spot check: print the similarity matrix and recompute by hand the mean
# similarity of article 1 to all the other articles
i = 1
print(sims)
row = sims[i]
np.mean(row[np.arange(len(row)) != i])

[[1.         0.13272679 0.09715447 0.11813731 0.12119686 0.03841132
  0.13783291 0.09507318 0.07241627 0.1053561 ]
 [0.13272679 1.         0.23883316 0.22695824 0.19275134 0.14636894
  0.13953671 0.24279347 0.20426468 0.22428047]
 [0.09715447 0.23883316 1.         0.21020827 0.23083512 0.08411987
  0.15465706 0.17110828 0.29497721 0.21826521]
 [0.11813731 0.22695824 0.21020827 1.         0.12965419 0.15764651
  0.09823744 0.19914592 0.17053465 0.16872139]
 [0.12119686 0.19275134 0.23083512 0.12965419 1.         0.1151847
  0.17511537 0.155576   0.20699371 0.1286221 ]
 [0.03841132 0.14636894 0.08411987 0.15764651 0.1151847  1.
  0.06369836 0.1079453  0.07994332 0.08660914]
 [0.13783291 0.13953671 0.15465706 0.09823744 0.17511537 0.06369836
  1.         0.11948089 0.10980687 0.06416979]
 [0.09507318 0.24279347 0.17110828 0.19914592 0.155576   0.1079453
  0.11948089 1.         0.16040469 0.14891559]
 [0.07241627 0.20426468 0.29497721 0.17053465 0.20699371 0.07994332
  0.10980687 0.1604046

0.19427931283207445