# Stock Sentiment Analysis using News Headlines

In [1]:
import numpy as np
import pandas as pd

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

### Dataset Features - 
1. Top 25 Headlines for specific companies
2. Labels 0 - Stock price had a negative impact
3. Labels 1 - Stock price had a positive impact

In [2]:
news_data = pd.read_csv("data/news-headlines.csv", encoding="ISO-8859-1")
news_data.head()

Unnamed: 0,Date,Label,Top1,Top2,Top3,Top4,Top5,Top6,Top7,Top8,...,Top16,Top17,Top18,Top19,Top20,Top21,Top22,Top23,Top24,Top25
0,2000-01-03,0,A 'hindrance to operations': extracts from the...,Scorecard,Hughes' instant hit buoys Blues,Jack gets his skates on at ice-cold Alex,Chaos as Maracana builds up for United,Depleted Leicester prevail as Elliott spoils E...,Hungry Spurs sense rich pickings,Gunners so wide of an easy target,...,Flintoff injury piles on woe for England,Hunters threaten Jospin with new battle of the...,Kohl's successor drawn into scandal,The difference between men and women,"Sara Denver, nurse turned solicitor",Diana's landmine crusade put Tories in a panic,Yeltsin's resignation caught opposition flat-f...,Russian roulette,Sold out,Recovering a title
1,2000-01-04,0,Scorecard,The best lake scene,Leader: German sleaze inquiry,"Cheerio, boyo",The main recommendations,Has Cubie killed fees?,Has Cubie killed fees?,Has Cubie killed fees?,...,On the critical list,The timing of their lives,Dear doctor,Irish court halts IRA man's extradition to Nor...,Burundi peace initiative fades after rebels re...,PE points the way forward to the ECB,Campaigners keep up pressure on Nazi war crime...,Jane Ratcliffe,Yet more things you wouldn't know without the ...,Millennium bug fails to bite
2,2000-01-05,0,Coventry caught on counter by Flo,United's rivals on the road to Rio,Thatcher issues defence before trial by video,Police help Smith lay down the law at Everton,Tale of Trautmann bears two more retellings,England on the rack,Pakistan retaliate with call for video of Walsh,Cullinan continues his Cape monopoly,...,South Melbourne (Australia),Necaxa (Mexico),Real Madrid (Spain),Raja Casablanca (Morocco),Corinthians (Brazil),Tony's pet project,Al Nassr (Saudi Arabia),Ideal Holmes show,Pinochet leaves hospital after tests,Useful links
3,2000-01-06,1,Pilgrim knows how to progress,Thatcher facing ban,McIlroy calls for Irish fighting spirit,Leicester bin stadium blueprint,United braced for Mexican wave,"Auntie back in fashion, even if the dress look...",Shoaib appeal goes to the top,Hussain hurt by 'shambles' but lays blame on e...,...,Putin admits Yeltsin quit to give him a head s...,BBC worst hit as digital TV begins to bite,How much can you pay for...,Christmas glitches,"Upending a table, Chopping a line and Scoring ...","Scientific evidence 'unreliable', defence claims",Fusco wins judicial review in extradition case,Rebels thwart Russian advance,Blair orders shake-up of failing NHS,Lessons of law's hard heart
4,2000-01-07,1,Hitches and Horlocks,Beckham off but United survive,Breast cancer screening,Alan Parker,Guardian readers: are you all whingers?,Hollywood Beyond,Ashes and diamonds,Whingers - a formidable minority,...,Most everywhere: UDIs,Most wanted: Chloe lunettes,Return of the cane 'completely off the agenda',From Sleepy Hollow to Greeneland,Blunkett outlines vision for over 11s,"Embattled Dobson attacks 'play now, pay later'...",Doom and the Dome,What is the north-south divide?,Aitken released from jail,Gone aloft


In [3]:
news_data['Date'].agg(['min', 'max'])

min    2000-01-03
max    2016-07-01
Name: Date, dtype: object

In [4]:
train_data = news_data[news_data["Date"] < "2015-01-01"]
test_data = news_data[news_data["Date"] > "2014-12-31"]
train_label= news_data["Label"][news_data["Date"] < "2015-01-01"]
test_label = news_data["Label"][news_data["Date"] > "2014-12-31"]

train_data.shape, test_data.shape, train_label.shape, test_label.shape

((3723, 27), (378, 27), (3723,), (378,))

In [5]:
train_data['Date'].agg(['min', 'max']), test_data['Date'].agg(['min', 'max'])

(min    2000-01-03
 max    2014-12-31
 Name: Date, dtype: object,
 min    2015-01-02
 max    2016-07-01
 Name: Date, dtype: object)

In [6]:
train_data = train_data.iloc[:, 2:]
test_data = test_data.iloc[:, 2:]

train_data.replace("[^a-zA-Z]", " ", 
                   regex = True, 
                   inplace = True)

In [7]:
train_data.head(1)

Unnamed: 0,Top1,Top2,Top3,Top4,Top5,Top6,Top7,Top8,Top9,Top10,...,Top16,Top17,Top18,Top19,Top20,Top21,Top22,Top23,Top24,Top25
0,A hindrance to operations extracts from the...,Scorecard,Hughes instant hit buoys Blues,Jack gets his skates on at ice cold Alex,Chaos as Maracana builds up for United,Depleted Leicester prevail as Elliott spoils E...,Hungry Spurs sense rich pickings,Gunners so wide of an easy target,Derby raise a glass to Strupar s debut double,Southgate strikes Leeds pay the penalty,...,Flintoff injury piles on woe for England,Hunters threaten Jospin with new battle of the...,Kohl s successor drawn into scandal,The difference between men and women,Sara Denver nurse turned solicitor,Diana s landmine crusade put Tories in a panic,Yeltsin s resignation caught opposition flat f...,Russian roulette,Sold out,Recovering a title


In [8]:
# Converting the training data to lower-case
for cols in train_data.columns:
    train_data[cols] = train_data[cols].str.lower()

train_data.head(1)

Unnamed: 0,Top1,Top2,Top3,Top4,Top5,Top6,Top7,Top8,Top9,Top10,...,Top16,Top17,Top18,Top19,Top20,Top21,Top22,Top23,Top24,Top25
0,a hindrance to operations extracts from the...,scorecard,hughes instant hit buoys blues,jack gets his skates on at ice cold alex,chaos as maracana builds up for united,depleted leicester prevail as elliott spoils e...,hungry spurs sense rich pickings,gunners so wide of an easy target,derby raise a glass to strupar s debut double,southgate strikes leeds pay the penalty,...,flintoff injury piles on woe for england,hunters threaten jospin with new battle of the...,kohl s successor drawn into scandal,the difference between men and women,sara denver nurse turned solicitor,diana s landmine crusade put tories in a panic,yeltsin s resignation caught opposition flat f...,russian roulette,sold out,recovering a title


In [9]:
# Merging all 25 headlines
headlines = []

for row in range(len(train_data.index)):
    headlines.append(' '.join(str(x) for x in train_data.iloc[row, 0:25]))

In [10]:
headlines[0]

'a  hindrance to operations   extracts from the leaked reports scorecard hughes  instant hit buoys blues jack gets his skates on at ice cold alex chaos as maracana builds up for united depleted leicester prevail as elliott spoils everton s party hungry spurs sense rich pickings gunners so wide of an easy target derby raise a glass to strupar s debut double southgate strikes  leeds pay the penalty hammers hand robson a youthful lesson saints party like it s      wear wolves have turned into lambs stump mike catches testy gough s taunt langer escapes to hit     flintoff injury piles on woe for england hunters threaten jospin with new battle of the somme kohl s successor drawn into scandal the difference between men and women sara denver  nurse turned solicitor diana s landmine crusade put tories in a panic yeltsin s resignation caught opposition flat footed russian roulette sold out recovering a title'

In [11]:
lemmatizer = WordNetLemmatizer()
corpus = []

for i in range(len(headlines)):
    print("At sentence: {}".format(i))
    
    review = headlines[i].split()
    review = [lemmatizer.lemmatize(word) for word in review if not word in set(stopwords.words("english"))]
    review = ' '.join(review)
    corpus.append(review)

At sentence: 0
At sentence: 1
At sentence: 2
At sentence: 3
At sentence: 4
At sentence: 5
At sentence: 6
At sentence: 7
At sentence: 8
At sentence: 9
At sentence: 10
At sentence: 11
At sentence: 12
At sentence: 13
At sentence: 14
At sentence: 15
At sentence: 16
At sentence: 17
At sentence: 18
At sentence: 19
At sentence: 20
At sentence: 21
At sentence: 22
At sentence: 23
At sentence: 24
At sentence: 25
At sentence: 26
At sentence: 27
At sentence: 28
At sentence: 29
At sentence: 30
At sentence: 31
At sentence: 32
At sentence: 33
At sentence: 34
At sentence: 35
At sentence: 36
At sentence: 37
At sentence: 38
At sentence: 39
At sentence: 40
At sentence: 41
At sentence: 42
At sentence: 43
At sentence: 44
At sentence: 45
At sentence: 46
At sentence: 47
At sentence: 48
At sentence: 49
At sentence: 50
At sentence: 51
At sentence: 52
At sentence: 53
At sentence: 54
At sentence: 55
At sentence: 56
At sentence: 57
At sentence: 58
At sentence: 59
At sentence: 60
At sentence: 61
At sentence: 62
At

At sentence: 491
At sentence: 492
At sentence: 493
At sentence: 494
At sentence: 495
At sentence: 496
At sentence: 497
At sentence: 498
At sentence: 499
At sentence: 500
At sentence: 501
At sentence: 502
At sentence: 503
At sentence: 504
At sentence: 505
At sentence: 506
At sentence: 507
At sentence: 508
At sentence: 509
At sentence: 510
At sentence: 511
At sentence: 512
At sentence: 513
At sentence: 514
At sentence: 515
At sentence: 516
At sentence: 517
At sentence: 518
At sentence: 519
At sentence: 520
At sentence: 521
At sentence: 522
At sentence: 523
At sentence: 524
At sentence: 525
At sentence: 526
At sentence: 527
At sentence: 528
At sentence: 529
At sentence: 530
At sentence: 531
At sentence: 532
At sentence: 533
At sentence: 534
At sentence: 535
At sentence: 536
At sentence: 537
At sentence: 538
At sentence: 539
At sentence: 540
At sentence: 541
At sentence: 542
At sentence: 543
At sentence: 544
At sentence: 545
At sentence: 546
At sentence: 547
At sentence: 548
At sentence: 5

At sentence: 974
At sentence: 975
At sentence: 976
At sentence: 977
At sentence: 978
At sentence: 979
At sentence: 980
At sentence: 981
At sentence: 982
At sentence: 983
At sentence: 984
At sentence: 985
At sentence: 986
At sentence: 987
At sentence: 988
At sentence: 989
At sentence: 990
At sentence: 991
At sentence: 992
At sentence: 993
At sentence: 994
At sentence: 995
At sentence: 996
At sentence: 997
At sentence: 998
At sentence: 999
At sentence: 1000
At sentence: 1001
At sentence: 1002
At sentence: 1003
At sentence: 1004
At sentence: 1005
At sentence: 1006
At sentence: 1007
At sentence: 1008
At sentence: 1009
At sentence: 1010
At sentence: 1011
At sentence: 1012
At sentence: 1013
At sentence: 1014
At sentence: 1015
At sentence: 1016
At sentence: 1017
At sentence: 1018
At sentence: 1019
At sentence: 1020
At sentence: 1021
At sentence: 1022
At sentence: 1023
At sentence: 1024
At sentence: 1025
At sentence: 1026
At sentence: 1027
At sentence: 1028
At sentence: 1029
At sentence: 1030


At sentence: 1434
At sentence: 1435
At sentence: 1436
At sentence: 1437
At sentence: 1438
At sentence: 1439
At sentence: 1440
At sentence: 1441
At sentence: 1442
At sentence: 1443
At sentence: 1444
At sentence: 1445
At sentence: 1446
At sentence: 1447
At sentence: 1448
At sentence: 1449
At sentence: 1450
At sentence: 1451
At sentence: 1452
At sentence: 1453
At sentence: 1454
At sentence: 1455
At sentence: 1456
At sentence: 1457
At sentence: 1458
At sentence: 1459
At sentence: 1460
At sentence: 1461
At sentence: 1462
At sentence: 1463
At sentence: 1464
At sentence: 1465
At sentence: 1466
At sentence: 1467
At sentence: 1468
At sentence: 1469
At sentence: 1470
At sentence: 1471
At sentence: 1472
At sentence: 1473
At sentence: 1474
At sentence: 1475
At sentence: 1476
At sentence: 1477
At sentence: 1478
At sentence: 1479
At sentence: 1480
At sentence: 1481
At sentence: 1482
At sentence: 1483
At sentence: 1484
At sentence: 1485
At sentence: 1486
At sentence: 1487
At sentence: 1488
At sentenc

At sentence: 1892
At sentence: 1893
At sentence: 1894
At sentence: 1895
At sentence: 1896
At sentence: 1897
At sentence: 1898
At sentence: 1899
At sentence: 1900
At sentence: 1901
At sentence: 1902
At sentence: 1903
At sentence: 1904
At sentence: 1905
At sentence: 1906
At sentence: 1907
At sentence: 1908
At sentence: 1909
At sentence: 1910
At sentence: 1911
At sentence: 1912
At sentence: 1913
At sentence: 1914
At sentence: 1915
At sentence: 1916
At sentence: 1917
At sentence: 1918
At sentence: 1919
At sentence: 1920
At sentence: 1921
At sentence: 1922
At sentence: 1923
At sentence: 1924
At sentence: 1925
At sentence: 1926
At sentence: 1927
At sentence: 1928
At sentence: 1929
At sentence: 1930
At sentence: 1931
At sentence: 1932
At sentence: 1933
At sentence: 1934
At sentence: 1935
At sentence: 1936
At sentence: 1937
At sentence: 1938
At sentence: 1939
At sentence: 1940
At sentence: 1941
At sentence: 1942
At sentence: 1943
At sentence: 1944
At sentence: 1945
At sentence: 1946
At sentenc

At sentence: 2349
At sentence: 2350
At sentence: 2351
At sentence: 2352
At sentence: 2353
At sentence: 2354
At sentence: 2355
At sentence: 2356
At sentence: 2357
At sentence: 2358
At sentence: 2359
At sentence: 2360
At sentence: 2361
At sentence: 2362
At sentence: 2363
At sentence: 2364
At sentence: 2365
At sentence: 2366
At sentence: 2367
At sentence: 2368
At sentence: 2369
At sentence: 2370
At sentence: 2371
At sentence: 2372
At sentence: 2373
At sentence: 2374
At sentence: 2375
At sentence: 2376
At sentence: 2377
At sentence: 2378
At sentence: 2379
At sentence: 2380
At sentence: 2381
At sentence: 2382
At sentence: 2383
At sentence: 2384
At sentence: 2385
At sentence: 2386
At sentence: 2387
At sentence: 2388
At sentence: 2389
At sentence: 2390
At sentence: 2391
At sentence: 2392
At sentence: 2393
At sentence: 2394
At sentence: 2395
At sentence: 2396
At sentence: 2397
At sentence: 2398
At sentence: 2399
At sentence: 2400
At sentence: 2401
At sentence: 2402
At sentence: 2403
At sentenc

At sentence: 2805
At sentence: 2806
At sentence: 2807
At sentence: 2808
At sentence: 2809
At sentence: 2810
At sentence: 2811
At sentence: 2812
At sentence: 2813
At sentence: 2814
At sentence: 2815
At sentence: 2816
At sentence: 2817
At sentence: 2818
At sentence: 2819
At sentence: 2820
At sentence: 2821
At sentence: 2822
At sentence: 2823
At sentence: 2824
At sentence: 2825
At sentence: 2826
At sentence: 2827
At sentence: 2828
At sentence: 2829
At sentence: 2830
At sentence: 2831
At sentence: 2832
At sentence: 2833
At sentence: 2834
At sentence: 2835
At sentence: 2836
At sentence: 2837
At sentence: 2838
At sentence: 2839
At sentence: 2840
At sentence: 2841
At sentence: 2842
At sentence: 2843
At sentence: 2844
At sentence: 2845
At sentence: 2846
At sentence: 2847
At sentence: 2848
At sentence: 2849
At sentence: 2850
At sentence: 2851
At sentence: 2852
At sentence: 2853
At sentence: 2854
At sentence: 2855
At sentence: 2856
At sentence: 2857
At sentence: 2858
At sentence: 2859
At sentenc

At sentence: 3262
At sentence: 3263
At sentence: 3264
At sentence: 3265
At sentence: 3266
At sentence: 3267
At sentence: 3268
At sentence: 3269
At sentence: 3270
At sentence: 3271
At sentence: 3272
At sentence: 3273
At sentence: 3274
At sentence: 3275
At sentence: 3276
At sentence: 3277
At sentence: 3278
At sentence: 3279
At sentence: 3280
At sentence: 3281
At sentence: 3282
At sentence: 3283
At sentence: 3284
At sentence: 3285
At sentence: 3286
At sentence: 3287
At sentence: 3288
At sentence: 3289
At sentence: 3290
At sentence: 3291
At sentence: 3292
At sentence: 3293
At sentence: 3294
At sentence: 3295
At sentence: 3296
At sentence: 3297
At sentence: 3298
At sentence: 3299
At sentence: 3300
At sentence: 3301
At sentence: 3302
At sentence: 3303
At sentence: 3304
At sentence: 3305
At sentence: 3306
At sentence: 3307
At sentence: 3308
At sentence: 3309
At sentence: 3310
At sentence: 3311
At sentence: 3312
At sentence: 3313
At sentence: 3314
At sentence: 3315
At sentence: 3316
At sentenc

At sentence: 3719
At sentence: 3720
At sentence: 3721
At sentence: 3722


In [12]:
corpus[0]

'hindrance operation extract leaked report scorecard hughes instant hit buoy blue jack get skate ice cold alex chaos maracana build united depleted leicester prevail elliott spoil everton party hungry spur sense rich picking gunner wide easy target derby raise glass strupar debut double southgate strike leeds pay penalty hammer hand robson youthful lesson saint party like wear wolf turned lamb stump mike catch testy gough taunt langer escape hit flintoff injury pile woe england hunter threaten jospin new battle somme kohl successor drawn scandal difference men woman sara denver nurse turned solicitor diana landmine crusade put tory panic yeltsin resignation caught opposition flat footed russian roulette sold recovering title'

In [13]:
# Vectorizing the corpus  -Bag of Words
vectorizer = CountVectorizer(ngram_range = (2, 2), 
                             max_features = 50000)

# Fit the data
train_data = vectorizer.fit_transform(corpus).toarray()

In [14]:
train_data.shape

(3723, 50000)

In [15]:
test_data_combined = []

for row in range(len(test_data.index)):
    test_data_combined.append(' '.join(str(x) for x in test_data.iloc[row, :]))

test_data = vectorizer.transform(test_data_combined).toarray()

In [16]:
test_data.shape

(378, 50000)

In [17]:
# Building models
rf_classifier = RandomForestClassifier(n_estimators = 200, 
                                       criterion='entropy')

nb_classifier = GaussianNB()

In [18]:
rf_classifier.fit(train_data, train_label)
nb_classifier.fit(train_data, train_label)

In [19]:
rf_predictions = rf_classifier.predict(test_data)
rf_predictions

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [20]:
nb_predictions = nb_classifier.predict(test_data)
nb_predictions

array([0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1,
       0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1,
       1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0,
       0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0,
       0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1,
       1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1,
       0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1,

In [23]:
print("ACCURACY SCORES FOR TEST DATA\n")
print("Random Forest Algorithm: {}".format(accuracy_score(test_label, rf_predictions)))
print("Naive Bayes Algorithm: {}".format(accuracy_score(test_label, nb_predictions)))

ACCURACY SCORES FOR TEST DATA

Random Forest Algorithm: 0.5026455026455027
Naive Bayes Algorithm: 0.4894179894179894
