# NLP 


## TF-IDF and Sentiment Analysis

The goal of this notebook is to perform Tf-Idf and Sentiment analysis on the given articles.

In [1]:
import seaborn as sns
import csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
sns.set()
sns.set_context("talk")
import re
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score

In [2]:
#uploading a small data to work with
short = pd.read_csv('short')
short.head()

Unnamed: 0.1,Unnamed: 0,author,contents,description,publisher,source_url,timeStamp,title
0,0,Phillip Molnar,A British real estate company Monday launched ...,A British real estate company Monday launched ...,Sandiegouniontribune.com,http://www.sandiegouniontribune.com/business/r...,2018-01-09 00:00:00,"Purplebricks, flat fee real estate listers, la..."
1,1,,2018-01-09 — bbc.com\t\n\t\nsource article |...,``In parts of the continent - especially comme...,,,2018-01-09 00:00:00,Why African millennials can't get enough of Bi...
2,2,"Associated Press, By Associated Press","NEW YORK — Kodak, which traces its roots to th...","NEW YORK — Kodak, which traces its roots to th...",Bostonherald.com,http://www.bostonherald.com/news/national/2018...,2018-01-09 00:00:00,Kodak surges at it becomes latest 'cryptocurre...
3,3,,2018-01-09 — thehill.com\t\n\t\nsource artic...,``JPMorgan Chase CEO Jamie Dimon has walked ba...,,,2018-01-09 00:00:00,"Dimon: ""I regret calling bitcoin a fraud"""
4,4,,2018-01-09 — reuters.com\t\n\t\nsource artic...,"``Staff at the regulatory agency ""expressed co...",,,2018-01-09 00:00:00,Fund managers say bitcoin ETF proposals withdr...


In [3]:
#dropping unnamed column
short = short.drop(["Unnamed: 0"], axis=1)

In [4]:
#this code removes all punctuation and alphanumeric words from the content
removed_punct=[]
for text in short["contents"]:
    text=re.sub(r'[^\w\s]','',text)
    text=re.sub(r'\b([a-zA-Z]+[0-9]+[a-zA-Z0-9]*|[0-9]+[a-zA-Z]+[a-zA-Z0-9]*)\b', "", text)
    removed_punct.append(text)

In [5]:
#performing Tf-idf
all_documents= list(removed_punct)
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(norm='l2',min_df=0, use_idf=True, smooth_idf=False, sublinear_tf=True)
tfidf_feat = tfidf.fit_transform(all_documents)


In [6]:
tfidf_feat.toarray()

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

# Sentiment Analysis

Uses vader_lexicon for performing lexicon alaysis.

In [8]:
#using lexicon-sentiment dictionary (not financial though)
!head vader_lexicon.txt

$:	-1.5	0.80623	[-1, -1, -1, -1, -3, -1, -3, -1, -2, -1]
%)	-0.4	1.0198	[-1, 0, -1, 0, 0, -2, -1, 2, -1, 0]
%-)	-1.5	1.43178	[-2, 0, -2, -2, -1, 2, -2, -3, -2, -3]
&-:	-0.4	1.42829	[-3, -1, 0, 0, -1, -1, -1, 2, -1, 2]
&:	-0.7	0.64031	[0, -1, -1, -1, 1, -1, -1, -1, -1, -1]
( '}{' )	1.6	0.66332	[1, 2, 2, 1, 1, 2, 2, 1, 3, 1]
(%	-0.9	0.9434	[0, 0, 1, -1, -1, -1, -2, -2, -1, -2]
('-:	2.2	1.16619	[4, 1, 4, 3, 1, 2, 3, 1, 2, 1]
(':	2.3	0.9	[1, 3, 3, 2, 2, 4, 2, 3, 1, 2]
((-:	2.1	0.53852	[2, 2, 2, 1, 2, 3, 2, 2, 3, 2]


In [9]:
#reading the sentiments
sentiments = pd.read_table('vader_lexicon.txt', names=['token', 'polarity','SD','human ratings'])
sentiments.tail(30)

Unnamed: 0,token,polarity,SD,human ratings
7487,xd,2.8,0.87178,"[3, 3, 4, 2, 3, 3, 1, 2, 4, 3]"
7488,xp,1.6,0.4899,"[2, 2, 2, 1, 1, 1, 2, 2, 1, 2]"
7489,yay,2.4,1.0198,"[1, 3, 3, 2, 2, 1, 4, 4, 2, 2]"
7490,yeah,1.2,0.6,"[1, 1, 1, 2, 1, 1, 0, 2, 1, 2]"
7491,yearning,0.5,1.0247,"[0, 1, 0, 1, 0, 3, 0, 1, -1, 0]"
7492,yeees,1.7,1.00499,"[1, 3, 1, 2, 1, 1, 4, 2, 1, 1]"
7493,yep,1.2,0.4,"[1, 1, 1, 1, 1, 1, 2, 2, 1, 1]"
7494,yes,1.7,0.78102,"[1, 2, 2, 1, 1, 1, 3, 3, 1, 2]"
7495,youthful,1.3,0.45826,"[1, 2, 1, 2, 1, 1, 1, 1, 2, 1]"
7496,yucky,-1.8,0.6,"[-2, -1, -1, -2, -2, -1, -2, -2, -3, -2]"


In [10]:
sent = sentiments.drop(['SD','human ratings'], axis=1).set_index('token')


In [11]:
# lm_sent = pd.read_csv('lm.csv')
# lm_sent.iloc[45]

In [12]:
# Save your regex in punct_re- getting rid of punctuation 
punct_re = r'[^\w\s]'
short['no_punc'] = short['contents'].str.lower().apply(lambda x: re.sub(punct_re,' ', x))
short.head()

Unnamed: 0,author,contents,description,publisher,source_url,timeStamp,title,no_punc
0,Phillip Molnar,A British real estate company Monday launched ...,A British real estate company Monday launched ...,Sandiegouniontribune.com,http://www.sandiegouniontribune.com/business/r...,2018-01-09 00:00:00,"Purplebricks, flat fee real estate listers, la...",a british real estate company monday launched ...
1,,2018-01-09 — bbc.com\t\n\t\nsource article |...,``In parts of the continent - especially comme...,,,2018-01-09 00:00:00,Why African millennials can't get enough of Bi...,2018 01 09 bbc com\t\n\t\nsource article ...
2,"Associated Press, By Associated Press","NEW YORK — Kodak, which traces its roots to th...","NEW YORK — Kodak, which traces its roots to th...",Bostonherald.com,http://www.bostonherald.com/news/national/2018...,2018-01-09 00:00:00,Kodak surges at it becomes latest 'cryptocurre...,new york kodak which traces its roots to th...
3,,2018-01-09 — thehill.com\t\n\t\nsource artic...,``JPMorgan Chase CEO Jamie Dimon has walked ba...,,,2018-01-09 00:00:00,"Dimon: ""I regret calling bitcoin a fraud""",2018 01 09 thehill com\t\n\t\nsource artic...
4,,2018-01-09 — reuters.com\t\n\t\nsource artic...,"``Staff at the regulatory agency ""expressed co...",,,2018-01-09 00:00:00,Fund managers say bitcoin ETF proposals withdr...,2018 01 09 reuters com\t\n\t\nsource artic...


In [13]:
#splitting into separate words for sentiment analysis
short['split']= short.no_punc.str.split()
tidy_short = (pd.DataFrame(short.split.apply(pd.Series).stack())
               .reset_index(level=1)
               .rename(columns={'level_1':'num', 0: 'word'}))
short=short.drop('split', axis=1)

In [14]:
tidy_short.head()


Unnamed: 0,num,word
0,0,a
0,1,british
0,2,real
0,3,estate
0,4,company


In [15]:
tidy_short['id'] = tidy_short.index
tidy_short.head()

Unnamed: 0,num,word,id
0,0,a,0
0,1,british,0
0,2,real,0
0,3,estate,0
0,4,company,0


In [89]:
tidy_sort = tidy_short.merge(sent, how='left', left_on='word', right_index=True)

In [16]:
#merging polarities with the words we have
polarities = (tidy_short
              .merge(sent, how='left', left_on='word', right_index=True)
              .fillna(value=0)
              .groupby('id')
              .sum()
              .drop('num', axis=1))
polarities.tail()

Unnamed: 0_level_0,polarity
id,Unnamed: 1_level_1
95,218.2
96,413.4
97,393.4
98,-54.9
99,93.5


In [17]:
#putting back the polarities with the dataframe we have 
senti = short.merge(polarities, how='left', left_index=True, right_index=True)
senti

Unnamed: 0,author,contents,description,publisher,source_url,timeStamp,title,no_punc,polarity
0,Phillip Molnar,A British real estate company Monday launched ...,A British real estate company Monday launched ...,Sandiegouniontribune.com,http://www.sandiegouniontribune.com/business/r...,2018-01-09 00:00:00,"Purplebricks, flat fee real estate listers, la...",a british real estate company monday launched ...,6.692000e+02
1,,2018-01-09 — bbc.com\t\n\t\nsource article |...,``In parts of the continent - especially comme...,,,2018-01-09 00:00:00,Why African millennials can't get enough of Bi...,2018 01 09 bbc com\t\n\t\nsource article ...,0.000000e+00
2,"Associated Press, By Associated Press","NEW YORK — Kodak, which traces its roots to th...","NEW YORK — Kodak, which traces its roots to th...",Bostonherald.com,http://www.bostonherald.com/news/national/2018...,2018-01-09 00:00:00,Kodak surges at it becomes latest 'cryptocurre...,new york kodak which traces its roots to th...,1.073000e+02
3,,2018-01-09 — thehill.com\t\n\t\nsource artic...,``JPMorgan Chase CEO Jamie Dimon has walked ba...,,,2018-01-09 00:00:00,"Dimon: ""I regret calling bitcoin a fraud""",2018 01 09 thehill com\t\n\t\nsource artic...,0.000000e+00
4,,2018-01-09 — reuters.com\t\n\t\nsource artic...,"``Staff at the regulatory agency ""expressed co...",,,2018-01-09 00:00:00,Fund managers say bitcoin ETF proposals withdr...,2018 01 09 reuters com\t\n\t\nsource artic...,0.000000e+00
5,,2018-01-09 — cnbc.com\t\n\t\nsource article ...,``People are pouring money into bitcoin and ot...,,,2018-01-09 00:00:00,Bitcoin ‘fascinating' because perception of ri...,2018 01 09 cnbc com\t\n\t\nsource article ...,0.000000e+00
6,itokoichi,\tPermalink\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\...,https://smhn.info/201801-microsoft-stop-pay-wi...,Hatena.ne.jp,http://d.hatena.ne.jp/itokoichi/20180109/p1,2018-01-09 00:00:00,マイクロソフト、Bitcoin決済をやめる – すまほん!!,\tpermalink\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\...,0.000000e+00
7,Yahoo.com,Link your broker and track your portfolio on Y...,Jim Cramer gauged the speculative potential of...,Yahoo.com,https://finance.yahoo.com/video/cramer-marijua...,2018-01-08 23:52:00,"Cramer: Marijuana, bitcoin and Micron are all ...",link your broker and track your portfolio on y...,3.600000e+00
8,The Star Online,"ADVERTISEMENT\t\r\n Tuesday, 9 ...",Check out these exclusives in your newspaper t...,Thestar.com.my,http://www.thestar.com.my/news/nation/2018/01/...,2018-01-08 23:47:27,What's in your copy of The Star today? (Nov 9),advertisement\t\r\n tuesday 9 ...,4.950000e+01
9,kodiak1120,If you print this Thing and display it in publ...,This is a simple box that is designed to hold ...,Thingiverse.com,https://www.thingiverse.com/thing:2750839,2018-01-08 23:40:46,Crypto Seed Word Storage,if you print this thing and display it in publ...,4.030000e+02


# K-Means

In [19]:
true_k = 5
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
model.fit(tfidf_feat)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
    n_clusters=5, n_init=1, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [21]:
print("Top terms per cluster:")
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = tfidf.get_feature_names()
for i in range(true_k):
    print("Cluster %d:" % i),
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind]),
    print

Top terms per cluster:
Cluster 0:
 his
 nissan
 braintovehicle
 ride
 media
 epaper
 sealed
 waves
 skullcap
 electrodes
Cluster 1:
 the
 to
 you
 of
 and
 on
 in
 be
 permalink
 by
Cluster 2:
 year
 companies
 county
 partnership
 saine
 stein
 1000
 pay
 breach
 the
Cluster 3:
 the
 of
 and
 for
 to
 market
 in
 on
 your
 markets
Cluster 4:
 forbidden
 403
 envisioned
 epalettetoyota
 epalette
 epa
 eonloadoendecatchntryfemitinternalerrorncatchrforvar
 eobjecttfunctionntypeof
 envisions
 日本語
