## Import libraries and read data

In [1]:
import os
import re
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize,sent_tokenize,RegexpTokenizer
from nltk import pos_tag
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import pandas as pd
import random
import numpy as np

nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\genec\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\genec\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
articles = []

# reading 2013 (only took first 5 days of each month due to computational/memory purposes)
for file in os.listdir('data/2013test/'):
    articles.append(open(os.path.join('data/2013test',file),'rt',errors='ignore').read())
    
# split into sentences
sentences = []

for i in range(len(articles)):
    sentences.append(sent_tokenize(articles[i]))
    
newSen= []
for i in range(len(sentences)):
    for j in range(len(sentences[i])):
        newSen.append(sentences[i][j])

In [3]:
ceoPos = pd.read_csv('data/training/ceo.csv',header = None,encoding = 'ISO-8859-1')
ceoPOs = ceoPos.fillna("",inplace=True)
ceoPos['Names'] = ceoPos[0]+' '+ceoPos[1]
ceoPos = ceoPos.drop(columns=[0,1])
ceoNeg = pd.read_csv('data/famous-people.csv')

#perPos = pd.read_csv('data/training/percentage.csv',header = None,encoding = 'ISO-8859-1')
#perNeg = pd.read_csv('data/perNegative.csv',header = None)

compPos = pd.read_csv('data/training/companies.csv',header = None,encoding = 'ISO-8859-1')
compNeg = pd.read_csv('data/words.csv', sep = ' ')
compPos.columns = ['Name']
compNeg.columns = ['Name']

## Extracting sentences and adding features

#### CEO

In [4]:
ceoPosNeg = []

for x in ceoPos['Names']:
    for y in newSen:
        if x in y:
            ceoPosNeg.append([x,y,'1'])

for x in ceoNeg['Name']:
    for y in newSen:
        if x in y:
            ceoPosNeg.append([x,y,'0'])

In [5]:
# Feature 1: if sentence contains 'CEO','Chief' or 'Executive'
keyWords = ['CEO','Chief','chief','Executive','executive','Found','founder']
for x in ceoPosNeg:
    if any(word in x[1] for word in keyWords):
        x.append(1)
    else:
        x.append(0)

In [6]:
# Feature 2: if sentence contains financial keywords
keyWords = ['%', '$','percent','money','dollars','stock','IPO','share','price','market','price','earnings']
for x in ceoPosNeg:
    if any(word in x[1] for word in keyWords):
        x.append(1)
    else:
        x.append(0)

In [7]:
# Feature 3: if sentence contains ceo responsibility keywords
keyWords = ['firm','office','performance','efficiency','company','jobs']
for x in ceoPosNeg:
    if any(word in x[1] for word in keyWords):
        x.append(1)
    else:
        x.append(0)

In [8]:
# Feature 4: sentence doesnt have recreational words
keyWords = ['compete','play','perform','celebrity','professional','actor','artist','sports','player','tv','blog','athlete']
for x in ceoPosNeg:
    if any(word in x[1] for word in keyWords):
        x.append(0)
    else:
        x.append(1)

#### Company names

In [11]:
newSen

['Earlier today we had a strong South Korean PMI report.',
 'The latest?',
 'Taiwan.',
 'It just saw a rise in December PMI from 47.4 to 50.6.',
 "From the report: \nWith the House prepared to vote on the Senate fiscal cliff bill orchestrated by the White House and Senate Minority Leader Mitch McConnell, conservatives are railing against GOP House Speaker John Boehner for caving on the deal.Â\xa0 Here's the banner leading Drudge Report right now:Â\xa0 Drudge Report\nGood news for the global economy.",
 'South Korea -- whose heavy reliance on global trade -- is seen by some economists as the "canary in the coalmine" just came in with a strong PMI report.',
 'From the report: The HSBC South Korea Purchasing Managersâ€™ IndexTM (PMIÂ®) â€“ a composite indicator designed to provide a single-figure snapshot of the health of the manufacturing sector â€“ registered 50.1 in December.',
 'That was an improvement on Novemberâ€™s 48.2 and the highest reading since May.',
 'However, being barely a

In [9]:
compNeg = compNeg.sample(4111)

In [None]:
compPosNeg = []

for x in compPos['Name']:
    for y in newSen:
        if x in y:
            compPosNeg.append([x,y,'1'])

for x in compNeg['Name']:
    for y in newSen:
        if x in y:
            compPosNeg.append([x,y,'0'])

In [None]:
# Feature 1: if sentence contains organization suffixes
keyWords = ['inc','Inc','INC','comp','corp','Corp','CORP','llc','Llc','LLC','ltd','Ltd','LTD','group','Group','holdings','Holdings','bank','business','firm','office','Co.','co.']
for x in compPosNeg:
    if any(word in x[1] for word in keyWords):
        x.append(1)
    else:
        x.append(0)

In [None]:
# Feature 2: if sentence contains financial keywords
keyWords = ['%', '$','percent','money','dollar','stock','IPO','share','price','market','price','earnings','GDP','finance']
for x in compPosNeg:
    if any(word in x[1] for word in keyWords):
        x.append(1)
    else:
        x.append(0)

In [None]:
# Feature 3: if sentence contains performance keywords
keyWords = ['performance','efficiency','customer','satisfaction','growth','potential','opportunity','service','consumer','task','cloud','payroll','analyst']
for x in compPosNeg:
    if any(word in x[1] for word in keyWords):
        x.append(1)
    else:
        x.append(0)

## Logit analysis

#### CEO

In [None]:
ceoCols = ['y','f1','f2','f3','f4']
ceoDf = pd.DataFrame(columns = ceoCols)

for x in ceoPosNeg:
    ceoDf = ceoDf.append(pd.DataFrame(data=([[x[2],x[3],x[4],x[5],x[6]]]),columns = ceoCols))

In [None]:
ceoXCols = ['f1','f2','f3','f4'] 

X = ceoDf.loc[:, ceoDf.columns != 'y']
y = ceoDf.loc[:, ceoDf.columns == 'y']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Oversampling neg samples using SMOTE
os = SMOTE(random_state=0)
os_data_X,os_data_y=os.fit_sample(X_train, y_train)
os_data_X = pd.DataFrame(data=os_data_X,columns=ceoXCols )
os_data_y= pd.DataFrame(data=os_data_y,columns=['y'])

X=os_data_X[ceoXCols]
y=os_data_y['y']

In [None]:
import statsmodels.api as sm
logit_model=sm.Logit(y.astype(float),X.astype(float))
result=logit_model.fit()
print(result.summary2())

In [None]:
# Remove duty kwords columns (high p value)
ceoXColsNew = ['f1','f2','f3','f4'] 

ceoX=os_data_X[ceoXColsNew]
ceoy=os_data_y['y']

logit_model=sm.Logit(ceoy.astype(float),ceoX.astype(float))
result=logit_model.fit()
print(result.summary2())

In [None]:
X_testnew = X_test

ceologreg = LogisticRegression()
ceologreg.fit(ceoX.astype(float), ceoy.astype(float))


y_pred = ceologreg.predict(X_testnew)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(ceologreg.score(X_testnew.astype(float), y_test.astype(float))))

#### Company name

In [None]:
compCols = ['y','f1','f2','f3']
compDf = pd.DataFrame(columns = compCols)

for x in compPosNeg:
    compDf = compDf.append(pd.DataFrame(data=([[x[2],x[3],x[4],x[5]]]),columns = compCols))

In [None]:
compXCols = ['f1','f2','f3'] 

X = compDf.loc[:, compDf.columns != 'y']
y = compDf.loc[:, compDf.columns == 'y']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Oversampling neg samples using SMOTE
os = SMOTE(random_state=0)
os_data_X,os_data_y=os.fit_sample(X_train, y_train)
os_data_X = pd.DataFrame(data=os_data_X,columns=compXCols )
os_data_y= pd.DataFrame(data=os_data_y,columns=['y'])

compX=os_data_X[compXCols]
compy=os_data_y['y']

In [None]:
logit_model=sm.Logit(compy.astype(float),compX.astype(float))
result=logit_model.fit()
print(result.summary2())

In [None]:
complogreg = LogisticRegression()
complogreg.fit(compX.astype(float), compy.astype(float))

y_pred = complogreg.predict(X_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(complogreg.score(X_test.astype(float), y_test.astype(float))))

## Find and classify

#### CEO

In [None]:
# find CEO names
names = []
names2 = []
names3 = []
count = -1

for i in range(len(newSen)):
    names.append(re.findall(r'([A-Z][-a-z]+)',newSen[i]))

for i in range(len(names)):
    names2.append(pos_tag(names[i]))

for list in names2:
    count+=1
    if len(list) ==0:
        continue        
    for x in list:
        if 'NNP' in x:
            names3.append([x[0],newSen[count]])

In [None]:
# Data Cleaning
names4 = []

# Remove whitespace
for x in names3:
    x[0] = x[0].strip()
    x[0] = x[0].strip('-')
    print(x[0])

# Remove instances when word is at beginning of sentence and not in dictionary synset
for x in names3:
    if (x[1].find(x[0]) != 0) & (not wordnet.synsets(x[0])):
        names4.append(x)

In [None]:
for x in names4:
    if x[0] == 'S-':
        print(x[0])

In [None]:
# Add features again

keyWords = ['CEO','Chief','chief','Executive','executive','founder']
for x in names4:
    if any(word in x[1] for word in keyWords):
        x.append(1)
    else:
        x.append(0)

keyWords2 = ['%', '$','percent','money','dollars','stock','IPO','share','price','market','price','earnings']
for x in names4:
    if any(word in x[1] for word in keyWords2):
        x.append(1)
    else:
        x.append(0)
        
keyWords3 = ['compete','play','perform','celebrity','professional','actor','artist','sports','player','tv','blog','athlete']
for x in names4:
    if any(word in x[1] for word in keyWords3):
        x.append(0)
    else:
        x.append(1)

In [None]:
ceoCols = ['Name','f1','f2','f4']
ceoPred = pd.DataFrame(columns = ceoCols)

for x in names4:
    ceoPred = ceoPred.append(pd.DataFrame(data=([[x[0],x[2],x[3],x[4]]]),columns = ceoCols))
    
ceoX = ceoPred.loc[:, ceoPred.columns != 'Name']
ceoY = ceologreg.predict(ceoX.astype(float))
ceoPred['prediction'] = ceoY

In [None]:
finalceo = ceoPred.groupby('Name')['prediction'].agg(np.mean)
ceos = []
finalceolist = finalceo.index.tolist()

count = 0
for x in finalceo:
    if x > .5:
        ceos.append(finalceolist[count])
    count+=1

In [None]:
ceoCSV = pd.DataFrame(data = ceos, columns = ['CEO Name'])
ceoCSV.to_csv('outputLists/ceos.csv')

#### Percent

In [None]:
# find percentages
pc = []
percents = []
count = -1

for i in range(len(newSen)):
    pc.append(re.findall(r'([0-9]+.?\/?[0-9]+%|[a-z]+%|[0-9]+.?\/?[0-9]+ percent|[a-z]+ [a-z]+? percent|[0-9]+.?\/?[0-9]+-[0-9]+.?\/?[0-9]+%|[a-z]+ [a-z]+?%)',newSen[i]))

for x in pc:
    count+=1
    if len(x)==0:
        continue
    percents.append([x,newSen[count]])

In [None]:
# data cleaning: make sure extracted words are alphanumerical numbers

keyWords = ['0','1','2','3','4','5','6','7','8','9','one','two','three','four','five','six','seven','eight','nine','ten','eleven','twelve','teen','ty']

perFinal= []
for x in percents:
    if any(word in x[0][0] for word in keyWords):
        perFinal.append(x[0][0])

In [None]:
perCSV = pd.DataFrame(data = perFinal, columns = ['percents'])
perCSV.to_csv('outputLists/percents.csv')

#### Company names

In [None]:
# find company names
comp = []
comp1 = []
comp2 = []
count = -1

for i in range(len(newSen)):
    comp.append(re.findall(r'(?:[A-Z][a-zA-Z&.]+ ){0,4}[A-Z][a-zA-Z&]+ ?',newSen[i]))
        
for i in range(len(cNames)):
    comp1.append(pos_tag(comp[i]))

for list in comp1:
    count+=1
    if len(list) ==0:
        continue      
    for x in list:
        if 'NNP' in x:
            comp2.append([x[0],newSen[count]])

In [None]:
# Data Cleaning
comp3 = []

# Remove whitespace
for x in comp2:
    x[0] = x[0].strip()

# Remove instances when word is at beginning of sentence
for x in comp2:
    if (x[1].find(x[0]) != 0) & (not wordnet.synsets(x[0])):
        comp3.append(x)

In [None]:
# add features again

keyWords = ['inc','Inc','INC','comp','corp','Corp','CORP','llc','Llc','LLC','ltd','Ltd','LTD','group','Group','holdings','Holdings','bank','business','firm','office','Co.','co.']
for x in comp3:
    if any(word in x[1] for word in keyWords):
        x.append(1)
    else:
        x.append(0)
        
keyWords = ['%', '$','percent','money','dollar','stock','IPO','share','price','market','price','earnings','GDP','finance']
for x in comp3:
    if any(word in x[1] for word in keyWords):
        x.append(1)
    else:
        x.append(0)
        
keyWords = ['performance','efficiency','customer','satisfaction','growth','potential','opportunity','service','consumer','task','cloud','payroll','analyst']
for x in comp3:
    if any(word in x[1] for word in keyWords):
        x.append(1)
    else:
        x.append(0)

In [None]:
compCols = ['Name','f1','f2','f3']
compPred = pd.DataFrame(columns = compCols)

for x in comp3:
    compPred = compPred.append(pd.DataFrame(data=([[x[0],x[2],x[3],x[4]]]),columns = compCols))
    
compX = compPred.loc[:, compPred.columns != 'Name']
compY = complogreg.predict(compX.astype(float))
compPred['prediction'] = compY

In [None]:
compFinal = compPred.groupby('Name').mean()
companies = []
compFinalList = compFinal.index.tolist()

count = 0
for x in compFinal['prediction']:
    if x > .5:
        companies.append(compFinalList[count])
    count+=1

In [None]:
compCSV = pd.DataFrame(data = companies, columns = ['Company Name'])
compCSV.to_csv('outputLists/companies.csv')