In [283]:
import pandas as pd
import nltk
import os
from nltk.sentiment import SentimentAnalyzer
import re
import string
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
os.chdir('/Users/dawnstaana/Downloads/Project 2')

In [284]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('stopwords')
print("Corpora download complete.")

Corpora download complete.


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/dawnstaana/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/dawnstaana/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/dawnstaana/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/dawnstaana/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/dawnstaana/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [285]:
train = pd.read_csv('news_headlines_train.csv')

In [286]:
train.head()

Unnamed: 0,text,sentiment
0,"In addition , a further 29 employees can be la...",-1
1,The authorisation is in force until the end of...,0
2,The value of the deal was not disclosed .,0
3,You need to be ready when the window opens up ...,0
4,Major Order in India Comptel Corporation has r...,1


In [287]:
train.isnull().sum()

text         0
sentiment    0
dtype: int64

In [288]:
# Lower case the text column
train['text'] = train['text'].str.lower()
train.head()

Unnamed: 0,text,sentiment
0,"in addition , a further 29 employees can be la...",-1
1,the authorisation is in force until the end of...,0
2,the value of the deal was not disclosed .,0
3,you need to be ready when the window opens up ...,0
4,major order in india comptel corporation has r...,1


In [289]:
train['text'] = train['text'].str.replace('[{}]'.format(string.punctuation), '')
train.head()

Unnamed: 0,text,sentiment
0,in addition a further 29 employees can be lai...,-1
1,the authorisation is in force until the end of...,0
2,the value of the deal was not disclosed,0
3,you need to be ready when the window opens up ...,0
4,major order in india comptel corporation has r...,1


In [290]:
xx = train['text'].apply(nltk.word_tokenize)

In [291]:
xx

0       [in, addition, a, further, 29, employees, can,...
1       [the, authorisation, is, in, force, until, the...
2        [the, value, of, the, deal, was, not, disclosed]
3       [you, need, to, be, ready, when, the, window, ...
4       [major, order, in, india, comptel, corporation...
                              ...                        
3188    [the, insolvency, act, regulates, the, amount,...
3189    [we, have, also, cut, our, price, projections,...
3190    [tyrvaan, sanomat, published, twice, a, week, ...
3191                             [pct, lower, at, 444210]
3192    [of, course, you, ll, have, direct, access, to...
Name: text, Length: 3193, dtype: object

In [292]:
stemmer = nltk.PorterStemmer()
train['token'] = train['text'].apply(nltk.word_tokenize)

In [293]:
#sentence must be a dataframe column first then tokenized
def stem_sentences(sentence):
    stemmed_tokens = [stemmer.stem(token) for token in sentence]
    return ' '.join(stemmed_tokens)

In [294]:
train['text'] = train['token'].apply(stem_sentences)

In [295]:
train['token'] = train['text'].apply(nltk.word_tokenize)

In [296]:
train.head()

Unnamed: 0,text,sentiment,token
0,in addit a further 29 employe can be laid off ...,-1,"[in, addit, a, further, 29, employe, can, be, ..."
1,the authoris is in forc until the end of the n...,0,"[the, authoris, is, in, forc, until, the, end,..."
2,the valu of the deal wa not disclos,0,"[the, valu, of, the, deal, wa, not, disclos]"
3,you need to be readi when the window open up r...,0,"[you, need, to, be, readi, when, the, window, ..."
4,major order in india comptel corpor ha receiv ...,1,"[major, order, in, india, comptel, corpor, ha,..."


In [297]:
def rem_stop(sentence):
    stemmed_tokens = [word for word in sentence if word not in nltk.corpus.stopwords.words('english')]
    return ' '.join(stemmed_tokens)

In [298]:
train['text'] = train['token'].apply(rem_stop)
train['token'] = train['text'].apply(nltk.word_tokenize)

In [299]:
train['tag'] = train['token'].apply(nltk.pos_tag)

In [300]:
train.head()

Unnamed: 0,text,sentiment,token,tag
0,addit 29 employe laid notic whole workforc lai...,-1,"[addit, 29, employe, laid, notic, whole, workf...","[(addit, NN), (29, CD), (employe, NN), (laid, ..."
1,authoris forc end next annual gener meet repea...,0,"[authoris, forc, end, next, annual, gener, mee...","[(authoris, NN), (forc, JJ), (end, NN), (next,..."
2,valu deal wa disclos,0,"[valu, deal, wa, disclos]","[(valu, NN), (deal, NN), (wa, NN), (disclos, NN)]"
3,need readi window open rosberg said,0,"[need, readi, window, open, rosberg, said]","[(need, NN), (readi, NN), (window, NN), (open,..."
4,major order india comptel corpor ha receiv sig...,1,"[major, order, india, comptel, corpor, ha, rec...","[(major, JJ), (order, NN), (india, NN), (compt..."


In [301]:
sid = SIA()

In [302]:
xx = train['text'].apply(sid.polarity_scores)

In [303]:
xx = [num['compound'] for num in xx]
xx

[0.0,
 0.296,
 0.0,
 0.0,
 0.34,
 0.0,
 -0.2732,
 0.0,
 0.0,
 0.296,
 0.0,
 0.0,
 0.4404,
 0.0,
 0.4939,
 0.8442,
 0.3818,
 0.0,
 0.0,
 0.3182,
 0.0,
 0.1531,
 -0.5574,
 0.0,
 0.0,
 0.4939,
 0.3818,
 0.0,
 0.1027,
 0.0,
 0.0,
 0.0,
 0.4939,
 0.6808,
 0.0,
 0.296,
 0.8126,
 0.2732,
 0.0,
 0.0,
 0.34,
 0.4588,
 0.5267,
 0.0772,
 0.0516,
 0.4404,
 0.4404,
 0.0,
 0.2732,
 0.2732,
 0.5106,
 0.0,
 -0.1779,
 0.0,
 0.0,
 0.0,
 0.0,
 0.1531,
 0.0,
 -0.1779,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.1779,
 0.0,
 0.4019,
 0.0,
 -0.2732,
 0.0,
 0.0,
 0.5423,
 0.0,
 0.4404,
 0.3182,
 0.0772,
 0.4939,
 0.0,
 0.0772,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.5106,
 0.0,
 0.8074,
 0.34,
 0.0,
 0.5574,
 0.1531,
 0.34,
 0.0,
 0.0,
 0.4767,
 0.1531,
 -0.128,
 0.0,
 0.3182,
 0.765,
 0.3182,
 0.34,
 0.4404,
 0.34,
 0.3612,
 0.7184,
 0.4404,
 0.4939,
 0.6369,
 0.34,
 -0.0772,
 0.2023,
 0.0,
 0.0,
 0.34,
 0.0,
 -0.0258,
 -0.296,
 -0.1027,
 0.0,
 0.1531,
 0.4019,
 0.0,
 0.0,
 0.1531,
 0.0,
 -0.296,
 0.0,
 0.0

In [304]:
xx = [1 if number > 0.4 else -1 if number < -0.4 else 0 for number in xx]
xx

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 -1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 -1,
 -1,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 

In [305]:
train['pop_score'] = xx

In [306]:
(train['sentiment'] == train['pop_score']).sum()/train.shape[0] * 100

60.977137488255565

In [307]:
train['pop_score'].value_counts()

 0    2381
 1     761
-1      51
Name: pop_score, dtype: int64

In [308]:
tp = 0
for i in range(0,len(xx)):
    if (train['sentiment'][i]==1 and train['pop_score'][i]==1):
        tp +=1
print(f'the true positive rate {tp}')

the true positive rate 353


In [191]:
tn = 0
for i in range(0,len(xx)):
    if (train['sentiment'][i]==-1 and train['pop_score'][i]==-1):
        tn +=1
print(f'the true negative rate {tn}')

the true negative rate 31


In [192]:
tneu = 0
for i in range(0,len(xx)):
    if (train['sentiment'][i]==0 and train['pop_score'][i]==0):
        tneu += 1
print(f'the true neutral rate {tneu}')

the true neutral rate 1563


In [193]:
fp = 0
for i in range(0,len(xx)):
    if (train['sentiment'][i]==0 or train['sentiment'][i]==-1 and train['pop_score'][i]==1):
        fp +=1
print(f'the false positive rate {fp}')

the false positive rate 1985


In [194]:
fn = 0
for i in range(0,len(xx)):
    if (train['sentiment'][i]==1 or train['sentiment'][i]==-1 and train['pop_score'][i]==-1):
        fn +=1
print(f'the false negative rate {fn}')

the false negative rate 939


In [309]:
count_vectorizer = CountVectorizer()
ft = count_vectorizer.fit_transform(train['text'])
ft

<3193x6961 sparse matrix of type '<class 'numpy.int64'>'
	with 39685 stored elements in Compressed Sparse Row format>

In [313]:
for i in train['tag']:
    for j in i:
        print(j)

('addit', 'NN')
('29', 'CD')
('employe', 'NN')
('laid', 'VBN')
('notic', 'JJ')
('whole', 'JJ')
('workforc', 'NN')
('laid', 'VBN')
('short', 'JJ')
('period', 'NN')
('need', 'NN')
('authoris', 'NN')
('forc', 'JJ')
('end', 'NN')
('next', 'JJ')
('annual', 'JJ')
('gener', 'NN')
('meet', 'NN')
('repeal', 'JJ')
('authoris', 'NN')
('acquir', 'NN')
('share', 'NN')
('given', 'VBN')
('gener', 'RB')
('meet', 'JJ')
('held', 'VBD')
('april', 'RB')
('4', 'CD')
('2007', 'CD')
('valu', 'NN')
('deal', 'NN')
('wa', 'NN')
('disclos', 'NN')
('need', 'NN')
('readi', 'NN')
('window', 'NN')
('open', 'JJ')
('rosberg', 'NN')
('said', 'VBD')
('major', 'JJ')
('order', 'NN')
('india', 'NN')
('comptel', 'NN')
('corpor', 'NN')
('ha', 'NN')
('receiv', 'NN')
('signific', 'JJ')
('longterm', 'JJ')
('order', 'NN')
('mediat', 'NN')
('provis', 'NN')
('solut', 'NN')
('use', 'NN')
('lead', 'JJ')
('oper', 'IN')
('india', 'NN')
('stockmann', 'NN')
('wa', 'WRB')
('establish', 'VB')
('1862', 'CD')
('finland', 'NN')
('becam', 'VB

('raw', 'JJ')
('materi', 'NN')
('continu', 'NN')
('recycl', 'VBP')
('paper', 'NN')
('board', 'NN')
('onli', 'NN')
('driver', 'NN')
('wa', 'NN')
('left', 'VBD')
('car', 'NN')
('suspect', 'NN')
('wa', 'NN')
('kidnap', 'NN')
('forc', 'JJ')
('gunpoint', 'NN')
('drive', 'NN')
('durham', 'NN')
('februari', 'JJ')
('elcoteq', 'NN')
('group', 'NN')
('sold', 'VBD')
('st', 'RB')
('petersburg', 'JJ')
('facil', 'NN')
('accord', 'NN')
('unconfirm', 'JJ')
('inform', 'NN')
('reason', 'NN')
('could', 'MD')
('suppli', 'VB')
('problem', 'NN')
('becaus', 'NN')
('russian', 'JJ')
('custom', 'NN')
('servic', 'NN')
('sale', 'NN')
('unit', 'NN')
('slump', 'NN')
('last', 'JJ')
('year', 'NN')
('industri', 'NN')
('wa', 'NN')
('hit', 'VBD')
('poor', 'JJ')
('snowfal', 'JJ')
('major', 'JJ')
('resort', 'NN')
('winter', 'NN')
('200607', 'CD')
('adp', 'JJ')
('news', 'NN')
('feb', 'NN')
('12', 'CD')
('2009', 'CD')
('finnish', 'JJ')
('construct', 'NN')
('compani', 'NN')
('lemminkainen', 'NN')
('oyj', 'NN')
('hel', 'NN')


('veri', 'NN')
('differ', 'VBP')
('model', 'NN')
('india', 'NN')
('follow', 'VBP')
('total', 'JJ')
('amount', 'NN')
('subscript', 'JJ')
('price', 'NN')
('wa', 'NN')
('record', 'NN')
('fund', 'NN')
('invest', 'JJS')
('nonrestrict', 'NN')
('equiti', 'NN')
('total', 'JJ')
('two', 'CD')
('offer', 'NN')
('receiv', 'NN')
('contract', 'NN')
('author', 'NN')
('particip', 'NN')
('thi', 'NN')
('contract', 'NN')
('howev', 'NN')
('contract', 'NN')
('wa', 'IN')
('allot', 'NN')
('affecto', 'NN')
('finland', 'NN')
('oy', 'NN')
('cut', 'NN')
('equival', 'NN')
('cost', 'NN')
('3545', 'CD')
('employe', 'NN')
('target', 'NN')
('compani', 'NN')
('said', 'VBD')
('annual', 'JJ')
('net', 'JJ')
('sale', 'NN')
('unit', 'NN')
('eur', 'VBZ')
('5', 'CD')
('million', 'CD')
('current', 'JJ')
('employ', 'NN')
('55', 'CD')
('peopl', 'NN')
('upm', 'NN')
('said', 'VBD')
('move', 'NN')
('lower', 'RBR')
('net', 'JJ')
('profit', 'NN')
('x20ac', '$')
('385', 'CD')
('million', 'CD')
('us', 'PRP')
('520', 'CD')
('million', '

('2008', 'CD')
('finnish', 'JJ')
('powersuppli', 'FW')
('solut', 'NN')
('provid', 'NN')
('efor', 'FW')
('oyj', 'NN')
('omx', 'NN')
('efo1v', 'NN')
('announc', 'NN')
('today', 'NN')
('launch', 'JJ')
('opu', 'NN')
('dc', 'NN')
('latest', 'JJS')
('power', 'NN')
('system', 'NN')
('opu', 'JJ')
('product', 'NN')
('line', 'NN')
('layoff', 'NN')
('also', 'RB')
('take', 'VBP')
('place', 'NN')
('suomussalmi', 'NN')
('kuhmo', 'NN')
('plant', 'NN')
('thi', 'NN')
('autumn', 'NN')
('result', 'NN')
('codetermin', 'NN')
('negoti', 'NNS')
('held', 'VBD')
('oper', 'JJ')
('profit', 'NN')
('improv', 'NN')
('eur', 'VBZ')
('203', 'CD')
('mn', 'NN')
('eur', 'NN')
('114', 'CD')
('mn', 'NN')
('consider', 'VB')
('weaker', 'JJR')
('us', 'PRP')
('dollar', 'NN')
('british', 'JJ')
('pound', 'NN')
('impact', 'NN')
('sale', 'NN')
('averag', 'JJ')
('price', 'NN')
('euro', 'NN')
('upm', 'NN')
('said', 'VBD')
('allow', 'JJ')
('young', 'JJ')
('child', 'NN')
('move', 'VB')
('forward', 'RB')
('hi', 'JJ')
('life', 'NN')
('f

('intellig', 'NN')
('project', 'NN')
('skat', 'NN')
('atria', 'NN')
('also', 'RB')
('buy', 'VBP')
('share', 'NN')
('kauhajoen', 'JJ')
('teurastamokiinteistot', 'NN')
('oy', 'NN')
('kauhajoki', 'VBP')
('slaughterhous', 'JJ')
('propertyfrom', 'NN')
('itikka', 'NN')
('cooper', 'NN')
('deliveri', 'NN')
('expect', 'VBP')
('take', 'VB')
('place', 'NN')
('later', 'RB')
('thi', 'JJ')
('month', 'NN')
('svyturysuteno', 'JJ')
('alu', 'NNS')
('control', 'NN')
('nordic', 'JJ')
('group', 'NN')
('baltic', 'JJ')
('beverag', 'NN')
('hold', 'VBP')
('bbh', 'NN')
('post', 'NN')
('47percent', 'CD')
('growth', 'NN')
('beer', 'NN')
('sale', 'NN')
('januarymay', 'NN')
('4622', 'CD')
('million', 'CD')
('litr', 'NN')
('fiskar', 'NN')
('ha', 'NN')
('strong', 'JJ')
('portfolio', 'NN')
('intern', 'JJ')
('brand', 'NN')
('includ', 'NN')
('fiskar', 'NN')
('iittala', 'JJ')
('gerber', 'NN')
('silva', 'NN')
('buster', 'NN')
('fortum', 'NN')
('hold', 'VBP')
('902', 'CD')
('pct', 'JJ')
('share', 'NN')
('capit', 'VBZ')
('9

('eur', 'RB')
('85', 'CD')
('mn', 'JJ')
('third', 'JJ')
('quarter', 'NN')
('2007', 'CD')
('negoti', 'JJ')
('concern', 'NN')
('246', 'CD')
('salari', 'NN')
('senior', 'JJ')
('salari', 'NN')
('employe', 'NN')
('schedul', 'JJ')
('complet', 'NN')
('six', 'CD')
('week', 'NN')
('dure', 'NN')
('strike', 'NN')
('finnair', 'NN')
('estim', 'JJ')
('incur', 'JJ')
('net', 'JJ')
('loss', 'NN')
('eur2m', 'NN')
('eur2', 'VBZ')
('5', 'CD')
('per', 'IN')
('day', 'NN')
('carrier', 'NN')
('said', 'VBD')
('area', 'NN')
('travel', 'NN')
('agenc', 'NN')
('sell', 'VBP')
('outlet', 'NN')
('sister', 'NN')
('chain', 'NN')
('suomen', 'NNS')
('matkatoimisto', 'VBP')
('smtand', 'NN')
('use', 'NN')
('remain', 'VBP')
('bricksandmortar', 'JJ')
('presenc', 'NN')
('primarili', 'NN')
('target', 'NN')
('busi', 'NN')
('custom', 'NN')
('go', 'VBP')
('forward', 'RB')
('patrick', 'NN')
('jeambar', 'NN')
('also', 'RB')
('continu', 'JJ')
('respons', 'NNS')
('innov', 'VBP')
('health', 'NN')
('safeti', 'NN')
('environ', 'NN')
('h

('addit', 'NN')
('normal', 'JJ')
('season', 'NN')
('fluctuat', 'NN')
('market', 'NN')
('situat', 'NN')
('ha', 'NN')
('weaken', 'JJ')
('dure', 'NN')
('autumn', 'NN')
('2008', 'CD')
('finnish', 'JJ')
('outokumpu', 'MD')
('technolog', 'VB')
('ha', 'JJ')
('award', 'NN')
('sever', 'VBD')
('new', 'JJ')
('grind', 'NN')
('technolog', 'NN')
('contract', 'NN')
('compani', 'NNS')
('oper', 'IN')
('four', 'CD')
('princip', 'NN')
('divis', 'NN')
('consum', 'NN')
('packag', 'NN')
('offic', 'JJ')
('paper', 'NN')
('special', 'JJ')
('paper', 'NN')
('well', 'NN')
('market', 'NN')
('pulp', 'NN')
('energi', 'NN')
('result', 'NN')
('merger', 'NN')
('largest', 'JJS')
('profession', 'NN')
('electron', 'NN')
('telecommun', 'NN')
('technolog', 'NN')
('contract', 'NN')
('manufactur', 'NN')
('center', 'NN')
('finland', 'VBP')
('form', 'NN')
('sievi', 'NN')
('intern', 'JJ')
('electron', 'NN')
('industri', 'NN')
('compani', 'NN')
('elcoteq', 'NN')
('ha', 'NN')
('laid', 'VBD')
('ten', 'JJ')
('employe', 'JJ')
('talli

('carri', 'JJ')
('right', 'JJ')
('dividend', 'NN')
('sharehold', 'VBD')
('right', 'JJ')
('registr', 'NN')
('finnish', 'JJ')
('trade', 'NN')
('regist', 'NN')
('recent', 'JJ')
('interview', 'NN')
('financi', 'NN')
('time', 'NN')
('ft', 'JJ')
('sampo', 'RB')
('board', 'NN')
('chairman', 'NN')
('bjorn', 'VBD')
('wahlroo', 'NN')
('said', 'VBD')
('pc', 'NN')
('wa', 'NN')
('certainli', 'JJ')
('sale', 'NN')
('price', 'NN')
('set', 'VBD')
('sek', 'JJ')
('85', 'CD')
('billion', 'CD')
('confirm', 'NNS')
('earlier', 'RBR')
('statement', 'NN')
('growth', 'NN')
('wa', 'WRB')
('strongest', 'JJS')
('fsecur', 'NN')
('oper', 'IN')
('isp', 'NN')
('mobil', 'NN')
('oper', 'IN')
('cabl', 'NN')
('oper', 'IN')
('busi', 'NN')
('metso', 'NN')
('foundri', 'NN')
('jyvaskyla', 'NN')
('oy', 'NN')
('discontinu', 'JJ')
('product', 'NN')
('thi', 'NN')
('line', 'NN')
('30', 'CD')
('septemb', 'NN')
('2008', 'CD')
('group', 'NN')
('net', 'JJ')
('sale', 'NN')
('eur', 'NN')
('235', 'CD')
('million', 'CD')
('2009', 'CD')
('

('inform', 'NN')
('technolog', 'NN')
('servic', 'NN')
('giant', 'JJ')
('wipro', 'JJ')
('technolog', 'NN')
('transfer', 'NN')
('sell', 'NN')
('radio', 'NN')
('access', 'NN')
('relat', 'JJ')
('rd', 'NN')
('activ', 'NN')
('berlin', 'NN')
('mobil', 'NN')
('phone', 'NN')
('sale', 'NN')
('rose', 'VBD')
('25', 'CD')
('587', 'CD')
('billion', 'CD')
('euro', 'NNS')
('enterpris', 'VBP')
('solut', 'JJ')
('sale', 'NN')
('drop', 'NN')
('39', 'CD')
('186', 'CD')
('million', 'CD')
('euro', 'NN')
('tikkurila', 'NN')
('divis', 'NN')
('kemira', 'NNP')
('group', 'NN')
('control', 'NN')
('23', 'CD')
('russian', 'JJ')
('market', 'NN')
('field', 'NN')
('st', 'VBD')
('petersburg', 'JJ')
('paint', 'NN')
('produc', 'NN')
('tex', 'NN')
('compani', 'NN')
('ha', 'NN')
('export', 'NN')
('twenti', 'JJ')
('european', 'JJ')
('countri', 'NN')
('well', 'RB')
('africa', 'RB')
('compani', 'NN')
('ha', 'NN')
('commit', 'NN')
('expand', 'VBP')
('apollo', 'NN')
('portfolio', 'NN')
('vacon', 'NN')
('control', 'NN')
('5', 'CD

('season', 'NN')
('contract', 'NN')
('sign', 'NN')
('acquir', 'NN')
('uranium', 'JJ')
('produc', 'NN')
('sotkamo', 'NN')
('nickelzinc', 'RB')
('mine', 'JJ')
('eastern', 'JJ')
('finland', 'NN')
('talvivaara', 'NN')
('adp', 'JJ')
('news', 'NN')
('feb', 'NN')
('12', 'CD')
('2009', 'CD')
('finnish', 'JJ')
('solut', 'NN')
('provid', 'NN')
('affecto', 'NN')
('oyj', 'NN')
('hel', 'NN')
('afe1v', 'NN')
('said', 'VBD')
('today', 'NN')
('net', 'JJ')
('profit', 'NN')
('rose', 'VBD')
('eur', '$')
('85', 'CD')
('million', 'CD')
('usd', 'JJ')
('11m', 'CD')
('2008', 'CD')
('eur', 'NN')
('7', 'CD')
('million', 'CD')
('2007', 'CD')
('oper', 'JJ')
('profit', 'NN')
('fell', 'VBD')
('eur', 'RB')
('354', 'CD')
('mn', 'NNS')
('eur', 'RB')
('688', 'CD')
('mn', 'NN')
('2007', 'CD')
('includ', 'NN')
('vessel', 'NN')
('sale', 'NN')
('gain', 'NN')
('eur', 'NN')
('123', 'CD')
('mn', 'NN')
('reach', 'NN')
('agreement', 'NN')
('union', 'NN')
('flight', 'NN')
('continu', 'NN')
('normal', 'JJ')
('finnair', 'NN')
('sp

('box', 'NN')
('need', 'NN')
('made', 'VBD')
('recycl', 'NNS')
('materi', 'VB')
('compar', 'JJ')
('net', 'JJ')
('sale', 'NN')
('expect', 'VBP')
('increas', 'VBZ')
('10', 'CD')
('2008', 'CD')
('line', 'NN')
('group', 'NN')
('target', 'NN')
('combin', 'NN')
('activ', 'VBZ')
('creat', 'NN')
('valu', 'NN')
('sharehold', 'VBD')
('good', 'JJ')
('employe', 'NN')
('custom', 'NN')
('goal', 'NN')
('signific', 'NN')
('expans', 'NNS')
('finland', 'VBP')
('northern', 'JJ')
('baltic', 'JJ')
('region', 'NN')
('dnb', 'NN')
('nord', 'DT')
('norway', 'RB')
('like', 'IN')
('nordic', 'JJ')
('buyer', 'NN')
('citadel', 'NN')
('nordea', 'NN')
('would', 'MD')
('good', 'VB')
('strateg', 'JJ')
('fit', 'NN')
('accord', 'NN')
('document', 'NN')
('publish', 'JJ')
('pietiek', 'NN')
('outotec', 'JJ')
('headquart', 'NN')
('espoo', 'NN')
('finland', 'VBP')
('lead', 'NN')
('provid', 'NN')
('process', 'NN')
('solut', 'NN')
('technolog', 'NN')
('servic', 'JJ')
('mine', 'NN')
('metallurg', 'NN')
('industri', 'NN')
('outso

('technopoli', 'RB')
('europ', 'JJ')
('lead', 'NN')
('oper', 'IN')
('technopark', 'NN')
('order', 'NN')
('relat', 'NN')
('renew', 'NN')
('network', 'NN')
('telecommun', 'NN')
('oper', 'NN')
('finnish', 'JJ')
('electron', 'NN')
('contract', 'NN')
('manufactur', 'JJ')
('scanfil', 'JJ')
('report', 'NN')
('net', 'JJ')
('sale', 'NN')
('eur', 'NN')
('589', 'CD')
('mn', 'JJ')
('second', 'JJ')
('quarter', 'NN')
('2007', 'CD')
('eur', 'NN')
('624', 'CD')
('mn', 'NN')
('year', 'NN')
('earlier', 'RBR')
('finland', 'NN')
('hamina', 'NN')
('cargo', 'NN')
('termin', 'NN')
('finnish', 'JJ')
('nurminen', 'JJ')
('logist', 'NN')
('ha', 'NN')
('purchas', 'VBP')
('warehous', 'JJ')
('ground', 'NN')
('leas', 'RB')
('right', 'JJ')
('port', 'NN')
('hamina', 'NN')
('outotec', 'JJ')
('current', 'JJ')
('deliv', 'NN')
('new', 'JJ')
('oil', 'NN')
('shale', 'NN')
('process', 'NN')
('plant', 'NN')
('eesti', 'NN')
('energia', 'VBP')
('estonia', 'JJ')
('base', 'NN')
('enefit', 'NN')
('technolog', 'NN')
('ha', 'NN')
('

('august', 'JJ')
('2008', 'CD')
('glaston', 'NN')
('north', 'JJ')
('asian', 'JJ')
('sale', 'NN')
('servic', 'JJ')
('region', 'NN')
('upgrad', 'JJ')
('new', 'JJ')
('market', 'NN')
('area', 'NN')
('north', 'JJ')
('asia', 'NN')
('technic', 'JJ')
('indic', 'JJ')
('stock', 'NN')
('bullish', 'JJ')
('sp', 'NN')
('give', 'VBP')
('nok', 'JJ')
('posit', 'NN')
('4', 'CD')
('star', 'NN')
('5', 'CD')
('buy', 'NN')
('rank', 'NN')
('south', 'NN')
('america', 'JJ')
('asia', 'NN')
('promis', 'NN')
('market', 'NN')
('thi', 'NN')
('subscript', 'NN')
('fortum', 'NN')
('ownership', 'NN')
('tgk10', 'NN')
('ha', 'NN')
('increas', 'NNS')
('slightli', 'VBP')
('76', 'CD')
('share', 'NN')
('vote', 'NN')
('right', 'RB')
('finnish', 'JJ')
('stainless', 'NN')
('steel', 'NN')
('manufactur', 'NNS')
('outokumpu', 'VBP')
('degefor', 'JJ')
('plan', 'NN')
('recruit', 'VBP')
('new', 'JJ')
('engin', 'NN')
('order', 'NN')
('meet', 'NN')
('retir', 'NN')
('wave', 'VBP')
('next', 'JJ')
('year', 'NN')
('compani', 'NNS')
('give'

('profit', 'NN')
('lead', 'NN')
('effici', 'JJ')
('product', 'NN')
('said', 'VBD')
('bo', 'JJ')
('annvik', 'JJ')
('head', 'NN')
('specialti', 'NN')
('stainless', 'NN')
('growth', 'NN')
('net', 'JJ')
('sale', 'NN')
('ha', 'NN')
('continu', 'VB')
('favour', 'JJ')
('middl', 'NN')
('east', 'JJ')
('africaand', 'NN')
('asia', 'NN')
('pacif', 'NN')
('ongo', 'NN')
('project', 'NN')
('tekla', 'NN')
('structur', 'NN')
('use', 'NN')
('vashi', 'JJ')
('exhibit', 'NN')
('centr', 'NN')
('develop', 'VB')
('insteel', 'JJ')
('engin', 'NN')
('pvt', 'NN')
('ltdiivrcl', 'NN')
('infrastructur', 'NN')
('project', 'NN')
('ltd', 'VBZ')
('cidco', 'NN')
('thank', 'JJ')
('internet', 'NN')
('consum', 'NN')
('compar', 'NN')
('product', 'NN')
('previous', 'JJ')
('finnish', 'JJ')
('compani', 'NN')
('competit', 'NN')
('net', 'JJ')
('sale', 'NN')
('eaten', 'NN')
('weak', 'JJ')
('us', 'PRP')
('dollar', 'NN')
('industri', 'JJ')
('asia', 'NN')
('\x88', 'NNP')
('pakistan', 'NN')
('malaysia', 'NN')
('taiwan', 'NN')
('philip

('invest', 'VBP')
('shop', 'NN')
('center', 'NN')
('redevelop', 'NN')
('project', 'NN')
('refinanc', 'NN')
('matur', 'JJ')
('debt', 'NN')
('adjust', 'JJ')
('present', 'NN')
('situat', 'NN')
('cut', 'NN')
('capac', 'NN')
('cost', 'NN')
('without', 'IN')
('howev', 'NN')
('jeopardis', 'NN')
('asia', 'NN')
('strategi', 'NN')
('longer', 'JJR')
('term', 'NN')
('valu', 'JJ')
('firm', 'NN')
('forestri', 'NN')
('hold', 'VBP')
('increas', 'JJ')
('sek', 'VBP')
('36', 'CD')
('bn', 'NN')
('roshan', 'JJ')
('net', 'JJ')
('sale', 'NN')
('2006', 'CD')
('191', 'CD')
('million', 'CD')
('ebitda', 'NN')
('wa', '$')
('665', 'CD')
('million', 'CD')
('rapala', 'NN')
('vmc', 'NN')
('corpor', 'NN')
('rapala', 'JJ')
('lead', 'JJ')
('fish', 'NN')
('tackl', 'NN')
('sport', 'NN')
('good', 'JJ')
('manufactur', 'NN')
('distributor', 'NN')
('main', 'JJ')
('owner', 'NN')
('peltonen', 'VBD')
('80', 'CD')
('sharehold', 'NN')
('veidekk', 'JJ')
('headquart', 'NN')
('oslo', 'VBZ')
('norway', 'RB')
('scandinavian', 'JJ')
('c

('sign', 'NN')
('longterm', 'JJ')
('servic', 'JJ')
('deal', 'NN')
('elisa', 'VBZ')
('deliv', 'NN')
('necessari', 'JJ')
('voic', 'NN')
('data', 'NNS')
('servic', 'NN')
('aker', 'NN')
('yard', 'NN')
('finland', 'NN')
('portion', 'NN')
('125', 'CD')
('million', 'CD')
('record', 'NN')
('part', 'NN')
('win', 'VBP')
('prior', 'JJ')
('patent', 'NN')
('disput', 'NN')
('finnish', 'JJ')
('phone', 'NN')
('maker', 'NN')
('nokia', 'NN')
('oyj', 'NN')
('port', 'NN')
('facil', 'NN')
('throughput', 'VBD')
('250000', 'CD')
('teu', 'NN')
('75', 'CD')
('mln', 'NN')
('ton', 'NN')
('gener', 'NN')
('cargo', 'NN')
('group', 'NN')
('ebit', 'VBD')
('first', 'JJ')
('half', 'NN')
('wa', 'NN')
('eur13', 'VBZ')
('6', 'CD')
('us', 'PRP')
('178', 'CD')
('fall', 'NN')
('short', 'JJ')
('eur22', 'NN')
('5', 'CD')
('post', 'NN')
('period', 'NN')
('2009', 'CD')
('veneer', 'NN')
('plywood', 'NN')
('industri', 'NN')
('sweden', 'JJ')
('quit', 'NN')
('small', 'JJ')
('despit', 'NN')
('raw', 'JJ')
('materi', 'NN')
('resourc', 

('servic', 'NN')
('mobil', 'NN')
('fix', 'NN')
('network', 'NN')
('finnish', 'JJ')
('invest', 'JJS')
('group', 'NN')
('panostaja', 'NN')
('oyj', 'NN')
('said', 'VBD')
('net', 'JJ')
('profit', 'NN')
('went', 'VBD')
('86', 'CD')
('mln', 'NN')
('euro', 'NN')
('114', 'CD')
('mln', 'JJ')
('fiscal', 'JJ')
('200506', 'CD')
('end', 'NN')
('octob', 'NN')
('31', 'CD')
('2006', 'CD')
('28', 'CD')
('mln', 'NN')
('euro', 'NN')
('37', 'CD')
('mln', 'NN')
('period', 'NN')
('fiscal', 'JJ')
('200405', 'CD')
('earn', 'NN')
('per', 'IN')
('share', 'NN')
('ep', 'NN')
('2005', 'CD')
('amount', 'NN')
('loss', 'NN')
('eur1', 'VBZ')
('26', 'CD')
('pulkovo', 'NN')
('park', 'NN')
('readi', 'NN')
('2016', 'CD')
('first', 'JJ')
('stage', 'NN')
('23000', 'CD')
('sq', 'NN')
('finish', 'NN')
('first', 'RB')
('quarter', 'NN')
('2010', 'CD')
('tamper', 'NN')
('scienc', 'NN')
('park', 'NN')
('finnish', 'JJ')
('compani', 'NN')
('leas', 'NNS')
('build', 'VBP')
('offic', 'JJ')
('properti', 'NN')
('specialis', 'NN')
('faci

('spring', 'NN')
('2006', 'CD')
('total', 'NN')
('386530', 'CD')
('stock', 'NN')
('option', 'NN')
('2002', 'CD')
('b', 'NN')
('annul', 'NN')
('talk', 'NN')
('aim', 'NN')
('restructur', 'NN')
('oper', 'IN')
('cut', 'NN')
('cost', 'NN')
('estonia', 'RB')
('beer', 'NN')
('market', 'NN')
('overal', 'JJ')
('grew', 'VBD')
('three', 'CD')
('percent', 'NN')
('last', 'JJ')
('year', 'NN')
('130', 'CD')
('million', 'CD')
('liter', 'NN')
('quarterli', 'NN')
('dilut', 'NN')
('ep', 'NN')
('continu', 'NN')
('oper', 'NN')
('came', 'VBD')
('021', 'CD')
('eur', 'NN')
('compar', 'NN')
('last', 'JJ')
('year', 'NN')
('012', 'CD')
('eur', 'NN')
('swedish', 'JJ')
('maritim', 'NN')
('administr', 'NN')
('ha', 'NN')
('close', 'RB')
('furusund', 'JJ')
('channel', 'NNS')
('vike', 'IN')
('line', 'NN')
('normal', 'JJ')
('charter', 'NN')
('en', 'FW')
('rout', 'NN')
('stockholm', 'NN')
('section', 'NN')
('put', 'VBD')
('place', 'NN')
('form', 'NN')
('loadbear', 'VBP')
('steel', 'NN')
('structur', 'NN')
('bridg', 'NNS

('control', 'NN')
('busi', 'NN')
('unit', 'NN')
('sinc', 'NN')
('januari', 'NN')
('2007', 'CD')
('elit', 'NN')
('resid', 'NN')
('tower', 'VBD')
('new', 'JJ')
('develop', 'VB')
('tameer', 'JJ')
('locat', 'JJ')
('heart', 'NN')
('dubai', 'NN')
('marina', 'NN')
('adp', 'JJ')
('news', 'NN')
('oct', 'IN')
('1', 'CD')
('2008', 'CD')
('finnish', 'JJ')
('consult', 'NN')
('engin', 'NN')
('compani', 'NN')
('poyri', 'NN')
('oyj', 'NN')
('omx', 'NN')
('poy1v', 'NN')
('said', 'VBD')
('today', 'NN')
('wa', 'VBP')
('award', 'RB')
('eur', '$')
('52', 'CD')
('million', 'CD')
('usd', 'JJ')
('74', 'CD')
('extens', 'NNS')
('exist', 'VBP')
('consult', 'NN')
('engin', 'NN')
('contract', 'NN')
('venezuel', 'NN')
('compani', 'NN')
('also', 'RB')
('said', 'VBD')
('deploy', 'JJ')
('danish', 'JJ')
('4g', 'CD')
('network', 'NN')
('continu', 'NN')
('expect', 'VBP')
('cover', 'NN')
('75', 'CD')
('danish', 'JJ')
('popul', 'NN')
('2011', 'CD')
('thi', 'JJ')
('new', 'JJ')
('version', 'NN')
('veri', 'NN')
('import', 'NN

In [225]:
count_vectorizer.vocabulary_
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(ft)
X_train_tfidf.shape

(3193, 6961)

In [227]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_tfidf, train['sentiment'])

In [228]:
from sklearn.pipeline import Pipeline
text_clf = Pipeline([('vect', CountVectorizer()),
                      ('tfidf', TfidfTransformer()),
                      ('clf', MultinomialNB()),])

In [229]:
text_clf = text_clf.fit(train['text'], train['sentiment'])

In [259]:
test = pd.read_csv('news_headlines_test_sample_submission.csv')
test['text'] = test['text'].str.lower()
test['text'] = test['text'].str.replace('[{}]'.format(string.punctuation), '')
stemmer = nltk.PorterStemmer()
test['token'] = test['text'].apply(nltk.word_tokenize)
test['token'] = test['token'].apply(stem_sentences)
test.head()

Unnamed: 0,text,sentiment,token
0,proline plus is available in both adjustable s...,0,prolin plu is avail in both adjust singl and m...
1,digia said its consolidated net sales for janu...,0,digia said it consolid net sale for januaryjun...
2,cash flow from operating activities is estimat...,0,cash flow from oper activ is estim to be posit
3,11 august 2010 finnish measuring equipment ma...,0,11 august 2010 finnish measur equip maker vais...
4,metso foundries jyvaskyla oy will discontinue ...,0,metso foundri jyvaskyla oy will discontinu pro...


In [262]:
test['token'] = test['text'].apply(nltk.word_tokenize).apply(rem_stop)

In [None]:
sub = pd.read_csv('news_headlines_test_sample_submission.csv')

In [267]:
prednb = text_clf.predict(test['token'])
sub['sentiment'] = prednb
sub.to_csv('nb.csv', index=False)

In [271]:
from sklearn.linear_model import SGDClassifier
text_clf_svm = Pipeline([('vect', CountVectorizer()),
                      ('tfidf', TfidfTransformer()),
                      ('clf-svm', SGDClassifier(loss='hinge', penalty='l2',
                                            alpha=1e-3, random_state=42)),])
text_clf = text_clf_svm.fit(train['text'], train['sentiment'])

In [274]:
predsvm = text_clf.predict(test['token'])
sub['sentiment'] = predsvm
sub.to_csv('svm.csv', index=False)