# Text Analytics

#### Charlie Marshall
#### Prof. Klabjan
#### IEMS 308
#### 2 March 2020

In [2]:
import pandas as pd
import numpy as np
import scipy
import re
import glob
import os
from nltk.tokenize import word_tokenize,sent_tokenize,RegexpTokenizer
from nltk import pos_tag
import spacy

from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report,confusion_matrix

from nltk.corpus import stopwords
stop_words=set(stopwords.words("english"))

from imblearn.over_sampling import SMOTE
import statsmodels.api as sm

### Load in Data

In [3]:
percent = pd.read_csv("/Users/charlesmarshall/Desktop/IEMS 308/Project 3/all/percentage.csv", engine = "python", names = ['perc'])

In [4]:
percent.head()

Unnamed: 0,perc
0,66%
1,40%
2,90%
3,49%
4,100%


In [5]:
ceo = pd.read_csv("/Users/charlesmarshall/Desktop/IEMS 308/Project 3/all/ceo.csv", engine = "python", names = ['first', 'last'])

In [6]:
def ceo_name(df):
    for i in range(len(ceo)):
        if pd.isnull(ceo.loc[i,'last']):
            ceo.loc[i,'ceo_full'] = ceo.loc[i,'first']
        elif pd.isnull(ceo.loc[i,'first']):
            ceo.loc[i,'ceo_full'] = ceo.loc[i,'last']
        else:
            ceo.loc[i,'ceo_full'] = ceo.loc[i,'first'] + ' ' + ceo.loc[i,'last']
            
    return df;

In [7]:
ceo = ceo_name(ceo)

In [8]:
ceo = ceo.drop(['first','last'], axis=1)

In [9]:
ceo.head()

Unnamed: 0,ceo_full
0,Tom Horton
1,Patti Hart
2,Jamie Dimon
3,Steve Cohen
4,Tim Cook


In [10]:
company = pd.read_csv("/Users/charlesmarshall/Desktop/IEMS 308/Project 3/all/companies.csv", engine = "python", names = ['company'])

In [11]:
company.head()

Unnamed: 0,company
0,Abaxis Inc
1,ACA Financial
2,Alibaba Group Holding Ltd
3,American Bell Telephone Co
4,American Express Co


In [12]:
file_list = glob.glob("/Users/charlesmarshall/Desktop/IEMS 308/Project 3/*/*.txt")

corpus = []

for file_path in file_list:
    with open(file_path,encoding='ISO-8859-1') as f_input:
        corpus.append(f_input.read())

In [13]:
len(corpus)

730

## Clean data

Remove all unicode and *.

In [14]:
print(corpus[0])

ReutersChina's seven day repo rose to a record high of 10.77% in Shanghai, the highest since March 2003, according to Bloomberg*. Meanwhile, the one-day rate hit a record 12.85%. And Zerohedge reported that overnight repo hit 25%. The liquidity squeeze in China first began ahead of the Dragon Boat festival earlier this month. Spikes in interbank rates are common right before holidays.Â  But Diana Choyleva at Lombard Street Research said this is symptomatic of a bigger problem. She said capital flows had "become a more important driver of domestic liquidity conditions in China's managed exchange rate system." In a new note to clients Bank of America's Ting Lu wrote: "There are many factors behind the interbank liquidity squeeze that might be cited, but we believe that the ultimate reason is the central bankâs tough stance as the PBOC can practically provide unlimited liquidity to ease every squeeze if it wishes to."Â  Banks have been clamoring for a reserve requirement ratio cut. So w

In [15]:
for text in range(len(corpus)):
    corpus[text] = re.sub(r'[^\x00-\x7f]|[*]',r'', corpus[text])

In [16]:
print(corpus[0])

ReutersChina's seven day repo rose to a record high of 10.77% in Shanghai, the highest since March 2003, according to Bloomberg. Meanwhile, the one-day rate hit a record 12.85%. And Zerohedge reported that overnight repo hit 25%. The liquidity squeeze in China first began ahead of the Dragon Boat festival earlier this month. Spikes in interbank rates are common right before holidays. But Diana Choyleva at Lombard Street Research said this is symptomatic of a bigger problem. She said capital flows had "become a more important driver of domestic liquidity conditions in China's managed exchange rate system." In a new note to clients Bank of America's Ting Lu wrote: "There are many factors behind the interbank liquidity squeeze that might be cited, but we believe that the ultimate reason is the central banks tough stance as the PBOC can practically provide unlimited liquidity to ease every squeeze if it wishes to." Banks have been clamoring for a reserve requirement ratio cut. So why isn't

## Tokenizing the Sentences

In [17]:
sentences = []

for text in range(len(corpus)):
    s = sent_tokenize(corpus[text])
    sentences.append(s)

In [18]:
len(sentences)

730

In [19]:
sentences = [item for sublist in sentences for item in sublist]

In [20]:
len(sentences)

695841

In [21]:
sentences[6940]

'Crablike, Mr Hollande is trying to do just enough on Europe, without aggravating nationalism at home.'

### Removing stop words in sentences:

None of the categories we are looking for (CEOs, percentages, or Companies) should include stop words,
so  removing them will not eliminate any candidates which simulataneously eliminating candidates which do not deserve to be picked

In [22]:
stop_words=sorted(set(stopwords.words("english")))

In [23]:
def drop_stop_words(ls):
    for i in range(len(ls)):
        tokenized_sent = word_tokenize(ls[i])
        ls[i] = ' '.join([word for word in tokenized_sent if word.lower() not in stop_words])
        
    return ls;

In [24]:
sentences = drop_stop_words(sentences)

In [25]:
sentences[6940]

'Crablike , Mr Hollande trying enough Europe , without aggravating nationalism home .'

## CEO's

1) Find all the names of people included in the corpus (potential CEO's). This is done by searching for any value that has two uppercase words in a row or just one uppercase word. It is not the most exact way to do this (for instance, there are lots of words at the beginning of sentences which are included, but many of these words should be eliminated in feature selection.

2) Blocks of text (paragraphs, windows, sentences, etc) will be inspected to come up with features.

- Potential Features:
1) CEO is in the same sentence (should correctly identify people who are obviously CEOs)
2) Word/ word phrase is longer than 3 characters (many of the stop words which are included in the potential ceo list are just words which start sentences, but can be eliminated because they have only a few characters)
3) I'm not sure - this might be good

3) A df will then be created with the row name being the name of each person and each column being a feature. 

4) Train a logistic regression model on half of the data

5) Test the model on the other half of the data. 

### Creating df for classification

In [26]:
def cap_letters(message):
    caps = sum(1 for c in message if c.isupper())
    return caps;

In [27]:
def cap_in_sent(ls):
    sent_caps = sum(1 for c in ls if c.isupper())
    return sent_caps;

In [28]:
def sentence_words(ls):
    ceos = 0
    sens = 0
    pres = 0
    inv = 0
    aut = 0
    represent = 0
    ambass = 0
    secr = 0
    exp = 0
    spok = 0
    gov = 0
    part = 0
    found = 0
    who=0
    
    if re.findall(r'CEO|ceo', ls) != []: 
        ceos = 1
    if re.findall(r'Senator|Sen.', ls) != []: 
        sens = 1
    if re.findall(r'President', ls) != []: 
        pres = 1
    if re.findall(r'investor|Investor', ls) != []: 
        inv = 1
    if re.findall(r'author|Author', ls) != []: 
        aut = 1
    if re.findall(r'Representative|Rep.', ls) != []: 
        represent = 1
    if re.findall(r'Ambassador|ambassador', ls) != []: 
        ambass = 1
    if re.findall(r'Secretary|secretary', ls) != []: 
        secr = 1
    if re.findall(r'Expert|expert', ls) != []: 
        exp = 1
    if re.findall(r'spokesman|spokeswoman|Spokesman|Spokeswoman', ls) != []: 
        spok = 1
    if re.findall(r'Governor|Gov.', ls) != []: 
        gov = 1
    if re.findall(r'partner|Partner', ls) != []: 
        part = 1
    if re.findall(r'founder|Founder', ls) != []:
        found = 1
    if re.findall(r'who|Who', ls) != []:
        who = 1
        
    return ceos, sens, pres, inv, aut, represent, ambass, secr, exp, spok, gov, part, found,who;

In [29]:
def person_two_before(sent,phrase_in_sent):
    try:
        who_two_before = 0
        ceo_two_before = 0
        sen_two_before = 0
        pres_two_before = 0
        inv_two_before = 0
        aut_two_before = 0
        rep_two_before = 0
        amb_two_before = 0
        sec_two_before = 0
        exp_two_before = 0
        spoke_two_before = 0
        gov_two_before = 0
        part_two_before = 0
        found_two_before = 0
        
        sec_word = ''

        sent_split = re.split(r'[ |,|.]', sent)
        last_word = re.split(r'[ ]', phrase_in_sent)[0]

        if last_word in sent_split:
            word_index = sent_split.index(last_word)
            sec_word = sent_split[word_index-2].lower()
            if word_index-2 >= 0:
                if sec_word == 'who':
                    who_two_before = 1;
                if sec_word == 'ceo':
                    ceo_two_before = 1;
                if sec_word == 'senator' or sec_word == 'sen':
                    sen_two_before = 1;
                if sec_word == 'president':
                    pres_two_before = 1;
                if sec_word == 'investor':
                    inv_two_before = 1;
                if sec_word == 'author':
                    aut_two_before = 1;
                if sec_word == 'representative' or sec_word == 'rep':
                    rep_two_before = 1;
                if sec_word == 'ambassador':
                    amb_two_before = 1;
                if sec_word == 'secretary':
                    sec_two_before = 1;
                if sec_word == 'expert':
                    exp_two_before = 1;
                if sec_word == 'spokesman' or sec_word == 'spokeswoman':
                    spoke_two_before = 1;
                if sec_word == 'governor':
                    gov_two_before = 1;
                if sec_word == 'partner':
                    part_two_before = 1;
                if sec_word == 'founder':
                    found_two_before = 1;
                return who_two_before,ceo_two_before,sen_two_before,pres_two_before,inv_two_before,aut_two_before,rep_two_before,amb_two_before,sec_two_before,exp_two_before,spoke_two_before,gov_two_before,part_two_before,found_two_before;
            else:
                return who_two_before,ceo_two_before,sen_two_before,pres_two_before,inv_two_before,aut_two_before,rep_two_before,amb_two_before,sec_two_before,exp_two_before,spoke_two_before,gov_two_before,part_two_before,found_two_before;
    except IndexError:  
        return who_two_before,ceo_two_before,sen_two_before,pres_two_before,inv_two_before,aut_two_before,rep_two_before,amb_two_before,sec_two_before,exp_two_before,spoke_two_before,gov_two_before,part_two_before,found_two_before;

In [30]:
def person_one_before(sent,phrase_in_sent):
    try:
        who_one_before = 0
        ceo_one_before = 0
        sen_one_before = 0
        pres_one_before = 0
        inv_one_before = 0
        aut_one_before = 0
        rep_one_before = 0
        amb_one_before = 0
        sec_one_before = 0
        exp_one_before = 0
        spoke_one_before = 0
        gov_one_before = 0
        part_one_before = 0
        found_one_before = 0
        
        sec_word = ''

        sent_split = re.split(r'[ |,|.]', sent)
        last_word = re.split(r'[ ]', phrase_in_sent)[0]

        if last_word in sent_split:
            word_index = sent_split.index(last_word)
            sec_word = sent_split[word_index - 1].lower()
            if word_index - 1 >= 0:
                if sec_word == 'who':
                    who_one_before = 1;
                if sec_word == 'ceo':
                    ceo_one_before = 1;
                if sec_word == 'senator' or sec_word == 'sen':
                    sen_one_before = 1;
                if sec_word == 'president':
                    pres_one_before = 1;
                if sec_word == 'investor':
                    inv_one_before = 1;
                if sec_word == 'author':
                    aut_one_before = 1;
                if sec_word == 'representative' or sec_word == 'rep':
                    rep_one_before = 1;
                if sec_word == 'ambassador':
                    amb_one_before = 1;
                if sec_word == 'secretary':
                    sec_one_before = 1;
                if sec_word == 'expert':
                    exp_one_before = 1;
                if sec_word == 'spokesman' or sec_word == 'spokeswoman':
                    spoke_one_before = 1;
                if sec_word == 'governor':
                    gov_one_before = 1;
                if sec_word == 'partner':
                    part_one_before = 1;
                if sec_word == 'founder':
                    found_one_before = 1;
                return who_one_before,ceo_one_before,sen_one_before,pres_one_before,inv_one_before,aut_one_before,rep_one_before,amb_one_before,sec_one_before,exp_one_before,spoke_one_before,gov_one_before,part_one_before,found_one_before;
            else:
                return who_one_before,ceo_one_before,sen_one_before,pres_one_before,inv_one_before,aut_one_before,rep_one_before,amb_one_before,sec_one_before,exp_one_before,spoke_one_before,gov_one_before,part_one_before,found_one_before;
    except IndexError:  
        return who_one_before,ceo_one_before,sen_one_before,pres_one_before,inv_one_before,aut_one_before,rep_one_before,amb_one_before,sec_one_before,exp_one_before,spoke_one_before,gov_one_before,part_one_before,found_one_before;

In [31]:
def person_one_after(sent,phrase_in_sent):
    try:
        who_one_after = 0
        ceo_one_after = 0
        sen_one_after = 0
        pres_one_after = 0
        inv_one_after = 0
        aut_one_after = 0
        rep_one_after = 0
        amb_one_after = 0
        sec_one_after = 0
        exp_one_after = 0
        spoke_one_after = 0
        gov_one_after = 0
        part_one_after = 0
        found_one_after = 0
        
        fst_word = ''

        sent_split = re.split(r'[ |,|.]', sent)
        last_word = re.split(r'[ ]', phrase_in_sent)[1]

        if last_word in sent_split:
            word_index = sent_split.index(last_word)
            fst_word = sent_split[word_index+1].lower()

            if fst_word == 'who':
                who_one_after = 1;
            if fst_word == 'ceo':
                ceo_one_after = 1;
            if fst_word == 'senator' or fst_word == 'sen':
                sen_one_after = 1;
            if fst_word == 'president':
                pres_one_after = 1;
            if fst_word == 'investor':
                inv_one_after = 1;
            if fst_word == 'author':
                aut_one_after = 1;
            if fst_word == 'representative'or fst_word == 'rep':
                rep_one_after = 1;
            if fst_word == 'ambassador':
                amb_one_after = 1;
            if fst_word == 'secretary':
                sec_one_after = 1;
            if fst_word == 'expert':
                exp_one_after = 1;
            if fst_word == 'spokesman' or fst_word == 'spokeswoman':
                spoke_one_after = 1;
            if fst_word == 'governor':
                gov_one_after = 1;
            if fst_word == 'partner':
                part_one_after = 1;
            if fst_word == 'founder':
                found_one_after = 1;
        return who_one_after,ceo_one_after,sen_one_after,pres_one_after,inv_one_after,aut_one_after,rep_one_after,amb_one_after,sec_one_after,exp_one_after,spoke_one_after,gov_one_after,part_one_after,found_one_after;
    except IndexError:  
        return who_one_after,ceo_one_after,sen_one_after,pres_one_after,inv_one_after,aut_one_after,rep_one_after,amb_one_after,sec_one_after,exp_one_after,spoke_one_after,gov_one_after,part_one_after,found_one_after;

In [32]:
def person_two_after(sent,phrase_in_sent):
    try:
        who_two_after = 0
        ceo_two_after = 0
        sen_two_after = 0
        pres_two_after = 0
        inv_two_after = 0
        aut_two_after = 0
        rep_two_after = 0
        amb_two_after = 0
        sec_two_after = 0
        exp_two_after = 0
        spoke_two_after = 0
        gov_two_after = 0
        part_two_after = 0
        found_two_after = 0
        
        sec_word = ''

        sent_split = re.split(r'[ |,|.]', sent)
        last_word = re.split(r'[ ]', phrase_in_sent)[1]

        if last_word in sent_split:
            word_index = sent_split.index(last_word)
            sec_word = sent_split[word_index+2].lower()

            if sec_word == 'who':
                who_two_after = 1;
            if sec_word == 'ceo':
                ceo_two_after = 1;
            if sec_word == 'senator' or sec_word == 'sen':
                sen_two_after = 1;
            if sec_word == 'president':
                pres_two_after = 1;
            if sec_word == 'investor':
                inv_two_after = 1;
            if sec_word == 'author':
                aut_two_after = 1;
            if sec_word == 'representative'or sec_word == 'rep':
                rep_two_after = 1;
            if sec_word == 'ambassador':
                amb_two_after = 1;
            if sec_word == 'secretary':
                sec_two_after = 1;
            if sec_word == 'expert':
                exp_two_after = 1;
            if sec_word == 'spokesman' or sec_word == 'spokeswoman':
                spoke_two_after = 1;
            if sec_word == 'governor':
                gov_two_after = 1;
            if sec_word == 'partner':
                part_two_after = 1;
            if sec_word == 'founder':
                found_two_after = 1;
        return who_two_after,ceo_two_after,sen_two_after,pres_two_after,inv_two_after,aut_two_after,rep_two_after,amb_two_after,sec_two_after,exp_two_after,spoke_two_after,gov_two_after,part_two_after,found_two_after;
    except IndexError:  
        return who_two_after,ceo_two_after,sen_two_after,pres_two_after,inv_two_after,aut_two_after,rep_two_after,amb_two_after,sec_two_after,exp_two_after,spoke_two_after,gov_two_after,part_two_after,found_two_after;    

In [33]:
def ceo_word_within_two(sent,phrase):
    try:
        two_before = person_two_before(sent,phrase)
        one_before = person_one_before(sent,phrase)
        one_after = person_one_after(sent,phrase)
        two_after = person_two_after(sent,phrase)

        who = two_before[0] + one_before[0] + one_after[0] + two_after[0]
        ceo_in_sent = two_before[1] + one_before[1] + one_after[1] + two_after[1]
        senator = two_before[2] + one_before[2] + one_after[2] + two_after[2]
        president = two_before[3] + one_before[3] + one_after[3] + two_after[3]
        investor = two_before[4] + one_before[4] + one_after[4] + two_after[4]
        author = two_before[5] + one_before[5] + one_after[5] + two_after[5]
        rep = two_before[6] + one_before[6] + one_after[6] + two_after[6]
        ambassador = two_before[7] + one_before[7] + one_after[7] + two_after[7]
        secretary = two_before[8] + one_before[8] + one_after[8] + two_after[8]
        expert = two_before[9] + one_before[9] + one_after[9] + two_after[9]
        spokesman = two_before[10] + one_before[10] + one_after[10] + two_after[10]
        governor = two_before[11] + one_before[11] + one_after[11] + two_after[11]
        partner = two_before[12] + one_before[12] + one_after[12] + two_after[12]
        founder = two_before[13] + one_before[13] + one_after[13] + two_after[13]

        return who,ceo_in_sent,senator, president, investor, author, rep, ambassador, secretary, expert, spokesman, governor, partner,founder;
    except TypeError:
        return np.zeros(14)

In [34]:
def potential_ceo_df(ls):
    ceo_df = []
    sentences = []
    for i in range(len(ls)):
        p = re.findall(r'[A-Z]\w+ [A-Z]\w+', ls[i])
        if p != []:
            
            sent_caps = cap_in_sent(ls[i])
            sent_len = len(ls[i])
            
            for j in p:
                ceo_word = ceo_word_within_two(ls[i],j)
                who = ceo_word[0]
                ceos = ceo_word[1]
                sen_two = ceo_word[2]
                pres_two = ceo_word[3]
                inv_two = ceo_word[4]
                aut_two = ceo_word[5]
                rep_two = ceo_word[6]
                amb_two = ceo_word[7]
                sec_two = ceo_word[8]
                exp_two = ceo_word[9]
                spoke_two = ceo_word[10]
                gov_two = ceo_word[11]
                part_two = ceo_word[12]
                found_two = ceo_word[13]
                
                in_sent = sentence_words(ls[i])
                ceo_in_sent = in_sent[0]
                sens = in_sent[1]
                pres = in_sent[2]
                inv = in_sent[3]
                aut = in_sent[4]
                represent = in_sent[5]
                ambass = in_sent[6]
                secr = in_sent[7]
                exp = in_sent[8]
                spok = in_sent[9]
                gov = in_sent[10]
                part = in_sent[11]
                found = in_sent[12]
                who_in_sent = in_sent[13]
                            
                length = len(j)
                caps = cap_letters(j)
                ceo_df.append([j,length,sent_len,caps,sent_caps,who,ceos,sen_two,pres_two,inv_two,aut_two,rep_two,amb_two,sec_two,exp_two,spoke_two,gov_two,part_two,found_two,ceo_in_sent,sens,pres,inv,aut,represent,ambass,secr,exp,spok,gov,part,found,who_in_sent,ls[i],i])
                
    return ceo_df;

In [35]:
ceo_df = pd.DataFrame(potential_ceo_df(sentences), columns = ['Candidate','length','sent_len','caps','sent_caps','who','ceos_two','sen_two','pres_two','inv_two','aut_two','rep_two','amb_two','sec_two','exp_two','spoke_two','gov_two','part_two','found_two','ceo_in_sent','sens','pres','inv','aut','represent','ambass','secr','exp','spok','gov','part','found','who_in_sent','Sentence','index'])

In [36]:
ceo_df

Unnamed: 0,Candidate,length,sent_len,caps,sent_caps,who,ceos_two,sen_two,pres_two,inv_two,...,ambass,secr,exp,spok,gov,part,found,who_in_sent,Sentence,index
0,Dragon Boat,11,78,2,3,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,liquidity squeeze China first began ahead Drag...,3
1,Diana Choyleva,14,72,2,5,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,Diana Choyleva Lombard Street Research said sy...,5
2,Lombard Street,14,72,2,5,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,Diana Choyleva Lombard Street Research said sy...,5
3,Bank America,12,240,2,8,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,new note clients Bank America 's Ting Lu wrote...,7
4,Ting Lu,7,240,2,8,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,new note clients Bank America 's Ting Lu wrote...,7
5,Bank China,10,96,2,3,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,previously explained People 's Bank China seem...,10
6,China Banking,13,137,2,8,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,also comes time banks required meet loan-to-de...,12
7,Regulatory Commission,21,137,2,8,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,also comes time banks required meet loan-to-de...,12
8,Charlene Chu,12,97,2,5,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,Earlier week Fitch 's Charlene Chu warned Chin...,13
9,Lehman China,12,176,2,11,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,SHIBOR 25 % basically means functioning interb...,16


## CEO Logistic Regression

In [37]:
labels = []
values = ceo['ceo_full'].values

for i in range(len(ceo_df)):
    if ceo_df.loc[i,'Candidate'] in values:
        labels.append(1)
    else: 
        labels.append(0) 
ceo_df['label'] = labels

In [38]:
ceo_df_final = ceo_df.drop(['Sentence','index','Candidate'], axis=1)

In [39]:
ceo_df_final.sum(axis=0)

length          6095021.0
sent_len       84930658.0
caps            1130461.0
sent_caps       7142547.0
who                   0.0
ceos_two           2314.0
sen_two             445.0
pres_two           3351.0
inv_two            1159.0
aut_two             378.0
rep_two             434.0
amb_two              39.0
sec_two             662.0
exp_two             207.0
spoke_two           742.0
gov_two             616.0
part_two            393.0
found_two           824.0
ceo_in_sent       14741.0
sens               9398.0
pres              15564.0
inv               18252.0
aut                6236.0
represent         23753.0
ambass              302.0
secr               4284.0
exp                1918.0
spok               1941.0
gov                5625.0
part               6710.0
found              6066.0
who_in_sent        4830.0
label             15503.0
dtype: float64

In [40]:
yceo = ceo_df_final.loc[:, ceo_df_final.columns == 'label']
Xceo = ceo_df_final.loc[:, ceo_df_final.columns != 'label']

### Two Model Types:
    1) Over-Sampling model
        a. With all features from RFE
        b. With select features from RFE
    2) Regular model

### Model 1a: Over-Sampling (All Features)

In [41]:
os = SMOTE(random_state=0)
Xceo_train, Xceo_test, yceo_train, yceo_test = train_test_split(Xceo, yceo, test_size=0.5, random_state=0)
columns = Xceo_train.columns

os_ceo_X,os_ceo_y=os.fit_sample(Xceo_train, yceo_train)
os_ceo_X = pd.DataFrame(data=os_ceo_X,columns=columns )
os_ceo_y= pd.DataFrame(data=os_ceo_y,columns=['label'])

  y = column_or_1d(y, warn=True)


In [42]:
print("length of oversampled ceos is ",len(os_ceo_X))
print("Number of non-CEOs in oversampled ceos",len(os_ceo_y[os_ceo_y['label']==0]))
print("Number of CEOs",len(os_ceo_y[os_ceo_y['label']==1]))
print("Proportion of non-ceos in oversampled ceos is ",len(os_ceo_y[os_ceo_y['label']==0])/len(os_ceo_X))
print("Proportion of ceos in oversampled ceos is ",len(os_ceo_y[os_ceo_y['label']==1])/len(os_ceo_X))

length of oversampled ceos is  434878
Number of non-CEOs in oversampled ceos 217439
Number of CEOs 217439
Proportion of non-ceos in oversampled ceos is  0.5
Proportion of ceos in oversampled ceos is  0.5


In [43]:
logceo = LogisticRegression()
rfe = RFE(logceo)
rfe = rfe.fit(os_ceo_X, os_ceo_y.values.ravel())
print(rfe.support_)
print(rfe.ranking_)



[False False  True False False  True  True  True  True  True  True False
  True  True  True False False  True  True False False False False  True
  True False False  True False False  True False]
[12 16  1 14 17  1  1  1  1  1  1  8  1  1  1  2 13  1  1 15  7  5  9  1
  1 11  3  1  6  4  1 10]


In [44]:
predictors=['caps','ceos_two','sen_two','pres_two','inv_two','aut_two','rep_two','sec_two','exp_two','spoke_two','found_two','ceo_in_sent','represent','ambass','spok','found'] 
X=os_ceo_X[predictors]
y=os_ceo_y['label']

In [45]:
logit_model=sm.Logit(y,X)
result=logit_model.fit(method='bfgs')
print(result.summary2())

         Current function value: 0.638986
         Iterations: 35
         Function evaluations: 36
         Gradient evaluations: 36
                          Results: Logit
Model:              Logit            Pseudo R-squared: 0.078      
Dependent Variable: label            AIC:              555793.5278
Date:               2020-03-09 13:44 BIC:              555969.2529
No. Observations:   434878           Log-Likelihood:   -2.7788e+05
Df Model:           15               LL-Null:          -3.0143e+05
Df Residuals:       434862           LLR p-value:      0.0000     
Converged:          0.0000           Scale:            1.0000     
-------------------------------------------------------------------
              Coef.   Std.Err.      z      P>|z|    [0.025   0.975]
-------------------------------------------------------------------
caps         -0.1568    0.0014  -110.9222  0.0000  -0.1595  -0.1540
ceos_two      2.4973    0.0527    47.4108  0.0000   2.3941   2.6006
sen_two      -0.



In [46]:
predictors=['caps','ceos_two','pres_two','inv_two','sec_two','found_two','ceo_in_sent','represent','spok','found'] 
Xceo_os=os_ceo_X[predictors]
yceo_os=os_ceo_y['label']

In [47]:
logit_model=sm.Logit(yceo_os,Xceo_os)
result=logit_model.fit(method='bfgs')
print(result.summary2())

         Current function value: 0.639072
         Iterations: 35
         Function evaluations: 36
         Gradient evaluations: 36
                          Results: Logit
Model:              Logit            Pseudo R-squared: 0.078      
Dependent Variable: label            AIC:              555856.7887
Date:               2020-03-09 13:44 BIC:              555966.6169
No. Observations:   434878           Log-Likelihood:   -2.7792e+05
Df Model:           9                LL-Null:          -3.0143e+05
Df Residuals:       434868           LLR p-value:      0.0000     
Converged:          0.0000           Scale:            1.0000     
-------------------------------------------------------------------
              Coef.   Std.Err.      z      P>|z|    [0.025   0.975]
-------------------------------------------------------------------
caps         -0.1569    0.0014  -111.0667  0.0000  -0.1596  -0.1541
ceos_two      2.4907    0.0526    47.3355  0.0000   2.3876   2.5939
pres_two      1.



In [48]:
Xceo_os_train, Xceo_os_test, yceo_os_train, yceo_os_test = train_test_split(Xceo_os, yceo_os, test_size=0.5, random_state=0)
log_ceo0S = LogisticRegression()
log_ceo0S.fit(Xceo_os_train, yceo_os_train)




LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [49]:
yceoOS_pred = log_ceo0S.predict(Xceo_os_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(log_ceo0S.score(Xceo_os_test, yceo_os_test)))

Accuracy of logistic regression classifier on test set: 0.60


In [50]:
from sklearn.metrics import confusion_matrix

confusion_matrix = confusion_matrix(yceo_os_test.tolist(), yceoOS_pred.tolist())
print(confusion_matrix)

[[ 26054  82924]
 [  3107 105354]]


In [51]:
print(classification_report(yceo_os_test, yceoOS_pred))

              precision    recall  f1-score   support

           0       0.89      0.24      0.38    108978
           1       0.56      0.97      0.71    108461

   micro avg       0.60      0.60      0.60    217439
   macro avg       0.73      0.61      0.54    217439
weighted avg       0.73      0.60      0.54    217439



### Model 1b: Over-Sampling (Select Features)

In [52]:
predictors1b=['ceos_two','pres_two','inv_two','sec_two','found_two'] 
Xceo1b_os=os_ceo_X[predictors1b]
yceo_os=os_ceo_y['label']

In [53]:
logit_model=sm.Logit(yceo_os,Xceo1b_os)
result=logit_model.fit(method='bfgs')
print(result.summary2())

         Current function value: 0.666609
         Iterations: 35
         Function evaluations: 39
         Gradient evaluations: 39
                          Results: Logit
Model:              Logit            Pseudo R-squared: 0.038      
Dependent Variable: label            AIC:              579796.9241
Date:               2020-03-09 13:44 BIC:              579851.8382
No. Observations:   434878           Log-Likelihood:   -2.8989e+05
Df Model:           4                LL-Null:          -3.0143e+05
Df Residuals:       434873           LLR p-value:      0.0000     
Converged:          0.0000           Scale:            1.0000     
-------------------------------------------------------------------
                Coef.   Std.Err.     z     P>|z|    [0.025   0.975]
-------------------------------------------------------------------
ceos_two        4.5101    0.0608  74.2053  0.0000   4.3910   4.6292
pres_two        1.3387    0.0349  38.3158  0.0000   1.2702   1.4071
inv_two         



In [54]:
Xceo1b_os_train, Xceo1b_os_test, yceo_os_train, yceo_os_test = train_test_split(Xceo1b_os, yceo_os, test_size=0.5, random_state=0)
log_1bceo0S = LogisticRegression()
log_1bceo0S.fit(Xceo1b_os_train, yceo_os_train)




LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [55]:
yceoOS_pred1b = log_1bceo0S.predict(Xceo1b_os_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(log_1bceo0S.score(Xceo1b_os_test, yceo_os_test)))

Accuracy of logistic regression classifier on test set: 0.57


In [56]:
from sklearn.metrics import confusion_matrix

confusion_matrix = confusion_matrix(yceo_os_test.tolist(), yceoOS_pred1b.tolist())
print(confusion_matrix)

[[107546   1432]
 [ 91380  17081]]


In [57]:
print(classification_report(yceo_os_test, yceoOS_pred1b))

              precision    recall  f1-score   support

           0       0.54      0.99      0.70    108978
           1       0.92      0.16      0.27    108461

   micro avg       0.57      0.57      0.57    217439
   macro avg       0.73      0.57      0.48    217439
weighted avg       0.73      0.57      0.48    217439



### Model 2: Non-OS

In [58]:
Xceo_train, Xceo_test, yceo_train, yceo_test = train_test_split(Xceo, yceo, test_size=0.5, random_state=0)

In [59]:
ceo_log = LogisticRegression()
ceo_log.fit(Xceo_train[predictors], yceo_train)

  y = column_or_1d(y, warn=True)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [60]:
ceo_log.coef_

array([[-3.18701281,  1.65352576,  1.15458466,  1.977172  , -1.37935403,
         1.13521655,  1.705737  , -0.87980497, -0.76998029,  0.83704402]])

In [61]:
ceo_pred = ceo_log.predict(Xceo_test[predictors])
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(ceo_log.score(Xceo_test[predictors], yceo_test)))

Accuracy of logistic regression classifier on test set: 0.97


In [62]:
sum(ceo_pred)

945

In [63]:
from sklearn.metrics import confusion_matrix

confusion_matrix = confusion_matrix(yceo_test['label'].tolist(), ceo_pred.tolist())
print(confusion_matrix)

[[217028    435]
 [  7230    510]]


In [64]:
print(classification_report(yceo_test, ceo_pred))

              precision    recall  f1-score   support

           0       0.97      1.00      0.98    217463
           1       0.54      0.07      0.12      7740

   micro avg       0.97      0.97      0.97    225203
   macro avg       0.75      0.53      0.55    225203
weighted avg       0.95      0.97      0.95    225203



### Testing of Models on Entire Dataset:

### Model 1a

In [65]:
yfinal_pred = log_ceo0S.predict(Xceo[predictors])
print('Accuracy of logistic regression classifier on entire data: {:.2f}'.format(log_ceo0S.score(Xceo[predictors], yceo)))

Accuracy of logistic regression classifier on entire data: 0.27


In [66]:
from sklearn.metrics import confusion_matrix
confusion_matrix = confusion_matrix(yceo['label'].tolist(), yfinal_pred.tolist())
print(confusion_matrix)

[[105440 329462]
 [   305  15198]]


In [67]:
print(classification_report(yceo, yfinal_pred))

              precision    recall  f1-score   support

           0       1.00      0.24      0.39    434902
           1       0.04      0.98      0.08     15503

   micro avg       0.27      0.27      0.27    450405
   macro avg       0.52      0.61      0.24    450405
weighted avg       0.96      0.27      0.38    450405



### Model 1b

In [68]:
yfinal_pred1b = log_1bceo0S.predict(Xceo[predictors1b])
print('Accuracy of logistic regression classifier on entire data: {:.2f}'.format(log_1bceo0S.score(Xceo[predictors1b], yceo)))

Accuracy of logistic regression classifier on entire data: 0.96


In [69]:
from sklearn.metrics import confusion_matrix
confusion_matrix = confusion_matrix(yceo['label'].tolist(), yfinal_pred1b.tolist())
print(confusion_matrix)

[[429171   5731]
 [ 13742   1761]]


In [70]:
print(classification_report(yceo, yfinal_pred1b))

              precision    recall  f1-score   support

           0       0.97      0.99      0.98    434902
           1       0.24      0.11      0.15     15503

   micro avg       0.96      0.96      0.96    450405
   macro avg       0.60      0.55      0.57    450405
weighted avg       0.94      0.96      0.95    450405



### Model 2

In [71]:
ceo_pred_full = ceo_log.predict(Xceo[predictors])
sum(ceo_pred_full)

1887

In [72]:
print('Accuracy of logistic regression classifier on entire data: {:.2f}'.format(ceo_log.score(Xceo[predictors], yceo)))

Accuracy of logistic regression classifier on entire data: 0.97


In [73]:
print(classification_report(yceo, ceo_pred_full))

              precision    recall  f1-score   support

           0       0.97      1.00      0.98    434902
           1       0.53      0.06      0.12     15503

   micro avg       0.97      0.97      0.97    450405
   macro avg       0.75      0.53      0.55    450405
weighted avg       0.95      0.97      0.95    450405



In [74]:
from sklearn.metrics import confusion_matrix
cmf = confusion_matrix(yceo['label'].tolist(), ceo_pred_full.tolist())
print(cmf)

[[434022    880]
 [ 14496   1007]]


### Model blank is best by inspection

In [75]:
# Model 1a
ceo_df['pred'] = yfinal_pred
ceo_final = ceo_df[ceo_df['pred']==1]
ceo_final = ceo_final.reset_index(drop=True)
CEOs = list(ceo_final['Candidate'])
set(CEOs)

{'Jurassic Park',
 'Give Fed',
 'School Advancement',
 'Ben Milne',
 'Facial Fuel',
 'Donna Chapman',
 'Messages Photos',
 'Haters Guide',
 'Good Criminal',
 'Sudanese President',
 'Sales Domino',
 'Tokyos Mothers',
 'Europe Struggle',
 'Charles Wyly',
 'In Corners',
 'The Fly',
 'Illegal Street',
 'Mr Davis',
 'Jose Ugaz',
 'Since Washington',
 'Maybe Mayer',
 'Sharon Fay',
 'Julien Grout',
 'Steve Sisney',
 'University Tennessee',
 'Stepping Aside',
 'Berlin Denhams',
 'Wimbledon Tennis',
 'Ukraine Europe',
 'Consumer Product',
 'Vernon Louis',
 'Lie Zeckendorf',
 'Altcoins Ive',
 'Derby Amateur',
 'Chilean Andes',
 'Robert Rodriguez',
 'Getting Clobbered',
 'Dow Vintage',
 'Fredo Corleone',
 'Tobias Three',
 'Monza Brianaza',
 'Training Center',
 'Mega Projects',
 'Follow Curt',
 'Elvira Nabiullina',
 'Implications Monetary',
 'Bakersfield California',
 'Property Auctions',
 'Caroline Garcia',
 'Central Parkwere',
 'Best Manage',
 'Office Max',
 'Apples Attempt',
 'Chanos Exactly',


In [76]:
# Model 1b
ceo_df['pred'] = yfinal_pred1b
ceo_final = ceo_df[ceo_df['pred']==1]
ceo_final = ceo_final.reset_index(drop=True)
CEOs = list(ceo_final['Candidate'])
set(CEOs)

{'Ben Milne',
 'Michael Bloomberg',
 'Americans Tax',
 'Peter Cressy',
 'Amado Boudou',
 'Consumer Financial',
 'Blackstone Group',
 'Pierre Omidyar',
 'Tom Werner',
 'Rand Paul',
 'Strategic Development',
 'Markets BlackRock',
 'CEO Michael',
 'Advanced Portfolio',
 'Dilma Rousseff',
 'Michael Strianese',
 'Courtesy DeBraggaDeBragga',
 'Sir Martin',
 'Frenchman Lvy',
 'Global Index',
 'Michael Bloombergthe',
 'CEO TIAA',
 'Hu Jintao',
 'Former Autonomy',
 'Petro Poroshenko',
 'Blankfein COO',
 'Wells Fargo',
 'Carlos Brito',
 'Bernanke Speaks',
 'Opel Chief',
 'PLEPLER ANNOUNCES',
 'Oregon Northwest',
 'Carson Block',
 'Prodigy Network',
 'Advisor Vice',
 'Enrique Salem',
 'Chris Suh',
 'GlobalStrategistof Euro',
 'Barry Norris',
 'Sanda Pianalto',
 'Lone Ranger',
 'Bloomberg Television',
 'Time Inc',
 'Howard Taft',
 'December Vice',
 'Dave Lewis',
 'The Sentix',
 'Blankfeinand Sting',
 'Waffle House',
 'Hammerstone Group',
 'Telecom Italia',
 'Like Activist',
 'Peter Vosertold',
 'L

In [77]:
# Model 2
ceo_df['pred'] = ceo_pred_full
ceo_final = ceo_df[ceo_df['pred']==1]
ceo_final = ceo_final.reset_index(drop=True)
CEOs = list(ceo_final['Candidate'])
set(CEOs)

{'Aaron Levie',
 'Aaron Regent',
 'Abigail Johnson',
 'According Australian',
 'According Curalate',
 'According Nanex',
 'Ackman Valeant',
 'Advisor Solutions',
 'Advisory Group',
 'Aer Lingus',
 'Afghan United',
 'Airbus Group',
 'Alan Breed',
 'Alan Joyce',
 'Alan Mulally',
 'Alan Mulallyis',
 'Alan Mullaly',
 'Aleksey Miller',
 'Alex Algard',
 'Allen Questrom',
 'Altegris President',
 'America Founding',
 'American Apparel',
 'American Eagle',
 'American European',
 'American Express',
 'Analyst Earnings',
 'Anand Nallathambi',
 'Anders Gustafsson',
 'Andersen Tax',
 'Anderson Real',
 'Andrei Bugrov',
 'Andrei Cherny',
 'Andrei Kostin',
 'Andy Grove',
 'Angela Ahrendts',
 'Angelo Mozilo',
 'Anglo Irish',
 'Antonio Horta',
 'Antony Jenkins',
 'Apple Pay',
 'Ari Reichental',
 'Armstrong Fired',
 'Art Levinson',
 'Asia Pacific',
 'Asset Management',
 'Australian Institute',
 'Authority Chairman',
 'Auto Nation',
 'Automated Insights',
 'Avishai Abrahami',
 'Bank America',
 'Bank Ameri

### Model 2 preferred because of precision

In [78]:
finalCEO = set(CEOs)
finalCEO = pd.DataFrame(finalCEO)
finalCEO.to_csv("ExtractedCEOs.csv",header=False,index=False)

## Companies

In [79]:
def company_in_sentence(sentence):
    ret = 0
    if re.search(r'company', sentence.lower()) != None:
        ret = 1
    return ret

In [80]:
def stock_in_sentence(sentence):
    ret = 0
    if re.search(r'stock', sentence.lower()) != None:
        ret = 1
    return ret

In [81]:
def shares_in_sentence(sentence):
    ret = 0
    if re.search(r'share', sentence.lower()) != None:
        ret = 1
    return ret

In [82]:
def trade_in_sentence(sentence):
    ret = 0
    if re.search(r'trad', sentence.lower()) != None:
        ret = 1
    return ret

### Company Specific 

In [83]:
def length_of_company(item):
    return len(item)

In [84]:
def plural_word(item):
    plural = 0
    if item[len(item) - 1] == 's':
        plural = 1
    return plural

In [85]:
def number_of_words(words):
    return len(words)

In [86]:
def location_at_start(sentence, item):
    start = 0
    if re.search(re.compile(item), sentence).start() == 0:
        start = 1;
    else:
        start = 0;
    return start;

In [87]:
def company_words(word_phrase):
    corp = 0
    corporation = 0
    group = 0
    holding = 0
    inc = 0
    company = 0
    association = 0
    foundation = 0

    for word in word_phrase:
        if word == "Corp" or word == 'Corp.' or word == 'Corporation':
            corp = 1;
        if word == "Group":
            group = 1;
        if word == "Holding":
            holding = 1;
        if word == "Inc" or word == "Inc.":
            inc = 1;
        if word == "Company":
            company = 1;
        if word == "Association":
            association = 1;
        if word == "Foundation":
            foundation = 1;

    return corp, group, holding, inc, company, association, foundation

In [88]:
def feature_creator_companies(sentences):
    candidates = []
    for i in range(len(sentences)):
        x = re.findall(r'(([A-Z][A-Za-z0-9]+[ -]?)+)', sentences[i])
        extract = [i[0] for i in x]
        if extract != []:
            comp_in_sent = company_in_sentence(sentences[i])
            stock = stock_in_sentence(sentences[i])
            shares = shares_in_sentence(sentences[i])
            trade = trade_in_sentence(sentences[i]) 
            for j in extract:
                
                new_j = j
                if new_j[-1] == ' ':
                    new_len = len(new_j)-1
                    new_j = new_j[0:new_len]
                
                words = re.split(r'[ ]', new_j)
                length = length_of_company(new_j)
                plural = plural_word(new_j)
                number_words = number_of_words(words)
                location = location_at_start(sentences[i], new_j)
                comp = company_words(words)
                corp = comp[0]
                group = comp[1]
                holding = comp[2]
                inc = comp[3]
                company = comp[4]
                association = comp[5]
                foundation = comp[6]
                candidates.append([new_j,comp_in_sent,stock,shares,trade,length,plural,number_words,location,corp,group,holding,inc,company,association,foundation,sentences[i],i])
    return candidates

In [89]:
comp_df = pd.DataFrame(feature_creator_companies(sentences), columns = ['Candidate','comp_in_sent','stock','shares','trade','length','plural', 'number_words','location' , 'corp', 'group', 'holding', 'inc', 'company', 'association','foundation','sentence','index'])

In [90]:
comp_df

Unnamed: 0,Candidate,comp_in_sent,stock,shares,trade,length,plural,number_words,location,corp,group,holding,inc,company,association,foundation,sentence,index
0,ReutersChina,0,0,0,0,12,0,1,1,0,0,0,0,0,0,0,ReutersChina 's seven day repo rose record hig...,0
1,Shanghai,0,0,0,0,8,0,1,0,0,0,0,0,0,0,0,ReutersChina 's seven day repo rose record hig...,0
2,March,0,0,0,0,5,0,1,0,0,0,0,0,0,0,0,ReutersChina 's seven day repo rose record hig...,0
3,Bloomberg,0,0,0,0,9,0,1,0,0,0,0,0,0,0,0,ReutersChina 's seven day repo rose record hig...,0
4,Meanwhile,0,0,0,0,9,0,1,1,0,0,0,0,0,0,0,"Meanwhile , one-day rate hit record 12.85 % .",1
5,Zerohedge,0,0,0,0,9,0,1,1,0,0,0,0,0,0,0,Zerohedge reported overnight repo hit 25 % .,2
6,China,0,0,0,0,5,0,1,0,0,0,0,0,0,0,0,liquidity squeeze China first began ahead Drag...,3
7,Dragon Boat,0,0,0,0,11,0,2,0,0,0,0,0,0,0,0,liquidity squeeze China first began ahead Drag...,3
8,Spikes,0,0,0,0,6,1,1,1,0,0,0,0,0,0,0,Spikes interbank rates common right holidays .,4
9,Diana Choyleva Lombard Street Research,0,0,0,0,38,0,5,1,0,0,0,0,0,0,0,Diana Choyleva Lombard Street Research said sy...,5


### Logistic Regression for Companies

In [91]:
comp_labels = []
values = set(company['company'].values)
candidates = comp_df['Candidate'].tolist()

for i in range(len(comp_df)):
    if candidates[i] in values:
        comp_labels.append(1)
    else: 
        comp_labels.append(0)
comp_df['label'] = comp_labels

In [92]:
comp_df_final = comp_df.drop(['sentence','index','Candidate'], axis=1)

In [93]:
comp_df_final.sum(axis=0)

comp_in_sent       50327
stock              48089
shares             38788
trade              48762
length          11634569
plural            151365
number_words     1771426
location          311224
corp                1680
group               2650
holding              223
inc                 2792
company             1306
association          781
foundation           471
label             103225
dtype: int64

In [94]:
ycomp = comp_df_final.loc[:, comp_df_final.columns == 'label']
Xcomp = comp_df_final.loc[:, comp_df_final.columns != 'label']

In [95]:
Xcomp_train, Xcomp_test, ycomp_train, ycomp_test = train_test_split(Xcomp, ycomp, test_size=0.5, random_state=0)

### Three Models:
    1) Features based on p-values
    2) Features based on RFE
    3) Over-sampling model using RFE samples

### Model 1: P-values

In [96]:
logit_model=sm.Logit(ycomp_train,Xcomp_train)
result=logit_model.fit(method='bfgs')
print(result.summary2())

         Current function value: 0.306407
         Iterations: 35
         Function evaluations: 37
         Gradient evaluations: 37




                          Results: Logit
Model:              Logit            Pseudo R-squared: -0.014     
Dependent Variable: label            AIC:              352090.6038
Date:               2020-03-09 13:45 BIC:              352259.5225
No. Observations:   574498           Log-Likelihood:   -1.7603e+05
Df Model:           14               LL-Null:          -1.7359e+05
Df Residuals:       574483           LLR p-value:      1.0000     
Converged:          0.0000           Scale:            1.0000     
------------------------------------------------------------------
                  Coef.  Std.Err.     z     P>|z|   [0.025  0.975]
------------------------------------------------------------------
comp_in_sent      0.5852   0.0181   32.3502 0.0000  0.5497  0.6206
stock             0.3112   0.0201   15.4841 0.0000  0.2718  0.3506
shares            0.6558   0.0197   33.2636 0.0000  0.6172  0.6945
trade             0.0164   0.0215    0.7588 0.4479 -0.0259  0.0586
length            0.0

#### Choosing factors with p-values under 0.05

In [97]:
predictors1=['comp_in_sent','stock','shares','length','plural','number_words','location','corp','group','inc'] 
Xcomp_train1=Xcomp_train[predictors1]

In [98]:
logit_model=sm.Logit(ycomp_train,Xcomp_train1)
result=logit_model.fit(method='bfgs')
print(result.summary2())

         Current function value: 0.306417
         Iterations: 35
         Function evaluations: 37
         Gradient evaluations: 37




                          Results: Logit
Model:              Logit            Pseudo R-squared: -0.014     
Dependent Variable: label            AIC:              352092.0574
Date:               2020-03-09 13:45 BIC:              352204.6699
No. Observations:   574498           Log-Likelihood:   -1.7604e+05
Df Model:           9                LL-Null:          -1.7359e+05
Df Residuals:       574488           LLR p-value:      1.0000     
Converged:          0.0000           Scale:            1.0000     
------------------------------------------------------------------
                  Coef.  Std.Err.     z     P>|z|   [0.025  0.975]
------------------------------------------------------------------
comp_in_sent      0.5845   0.0179   32.6366 0.0000  0.5494  0.6196
stock             0.3114   0.0200   15.5381 0.0000  0.2721  0.3507
shares            0.6557   0.0197   33.3204 0.0000  0.6171  0.6942
length            0.0682   0.0016   43.4091 0.0000  0.0651  0.0713
plural           -0.0

In [99]:
log_comp1 = LogisticRegression()
log_comp1.fit(Xcomp_train1, ycomp_train)

  y = column_or_1d(y, warn=True)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [100]:
ycomp_pred1 = log_comp1.predict(Xcomp_test[predictors1])
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(log_comp1.score(Xcomp_test[predictors1], ycomp_test)))

Accuracy of logistic regression classifier on test set: 0.91


In [101]:
from sklearn.metrics import confusion_matrix

confusion_matrix = confusion_matrix(ycomp_test.iloc[:,0].tolist(), ycomp_pred1.tolist())
print(confusion_matrix)

[[522252    631]
 [ 51019    596]]


In [102]:
print(classification_report(ycomp_test, ycomp_pred1))

              precision    recall  f1-score   support

           0       0.91      1.00      0.95    522883
           1       0.49      0.01      0.02     51615

   micro avg       0.91      0.91      0.91    574498
   macro avg       0.70      0.51      0.49    574498
weighted avg       0.87      0.91      0.87    574498



### Model 2: RFE

In [103]:
logcomp = LogisticRegression()
rfe = RFE(logcomp)
rfe = rfe.fit(Xcomp_train, ycomp_train)
print(rfe.support_)
print(rfe.ranking_)

  y = column_or_1d(y, warn=True)


[False False  True False False False False False  True  True False  True
  True  True  True]
[2 6 1 8 9 7 3 5 1 1 4 1 1 1 1]


In [104]:
predictors2=['shares','corp','group','inc','company','association','foundation'] 
Xcomp_train2=Xcomp_train[predictors2]

In [105]:
logit_model=sm.Logit(ycomp_train,Xcomp_train2)
result=logit_model.fit(method='bfgs')
print(result.summary2())

         Current function value: 0.684804
         Iterations: 35
         Function evaluations: 39
         Gradient evaluations: 39




                          Results: Logit
Model:              Logit            Pseudo R-squared: -1.266     
Dependent Variable: label            AIC:              786850.9341
Date:               2020-03-09 13:45 BIC:              786929.7629
No. Observations:   574498           Log-Likelihood:   -3.9342e+05
Df Model:           6                LL-Null:          -1.7359e+05
Df Residuals:       574491           LLR p-value:      1.0000     
Converged:          0.0000           Scale:            1.0000     
-------------------------------------------------------------------
               Coef.   Std.Err.     z      P>|z|    [0.025   0.975]
-------------------------------------------------------------------
shares        -1.4599    0.0185  -79.0546  0.0000  -1.4961  -1.4237
corp           0.5126    0.0727    7.0459  0.0000   0.3700   0.6551
group         -0.2601    0.0566   -4.5954  0.0000  -0.3710  -0.1491
inc            0.1400    0.0549    2.5502  0.0108   0.0324   0.2476
company       

In [106]:
log_comp2 = LogisticRegression()
log_comp2.fit(Xcomp_train2, ycomp_train)

  y = column_or_1d(y, warn=True)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [107]:
ycomp_pred2 = log_comp2.predict(Xcomp_test[predictors2])
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(log_comp2.score(Xcomp_test[predictors2], ycomp_test)))

Accuracy of logistic regression classifier on test set: 0.91


In [108]:
from sklearn.metrics import confusion_matrix

confusion_matrix = confusion_matrix(ycomp_test.iloc[:,0].tolist(), ycomp_pred2.tolist())
print(confusion_matrix)

[[522341    542]
 [ 50985    630]]


In [109]:
print(classification_report(ycomp_test, ycomp_pred2))

              precision    recall  f1-score   support

           0       0.91      1.00      0.95    522883
           1       0.54      0.01      0.02     51615

   micro avg       0.91      0.91      0.91    574498
   macro avg       0.72      0.51      0.49    574498
weighted avg       0.88      0.91      0.87    574498



### Model 3: Over-Sampling

In [110]:
os = SMOTE(random_state=0)
columns = Xcomp_train.columns

os_comp_X,os_comp_y=os.fit_sample(Xcomp_train, ycomp_train)
os_comp_X = pd.DataFrame(data=os_comp_X,columns=columns )
os_comp_y= pd.DataFrame(data=os_comp_y,columns=['label'])

  y = column_or_1d(y, warn=True)


In [111]:
print("length of oversampled comps is ",len(os_comp_X))
print("Number of non-companies in oversampled comps",len(os_comp_y[os_comp_y['label']==0]))
print("Number of companies",len(os_comp_y[os_comp_y['label']==1]))
print("Proportion of non-companies in oversampled comps is ",len(os_comp_y[os_comp_y['label']==0])/len(os_comp_X))
print("Proportion of companies in oversampled comps is ",len(os_comp_y[os_comp_y['label']==1])/len(os_comp_X))

length of oversampled comps is  1045776
Number of non-companies in oversampled comps 522888
Number of companies 522888
Proportion of non-companies in oversampled comps is  0.5
Proportion of companies in oversampled comps is  0.5


In [112]:
rfe = rfe.fit(os_comp_X, os_comp_y.values.ravel())
print(rfe.support_)
print(rfe.ranking_)



[False False  True False False False False False  True  True False  True
  True  True  True]
[2 4 1 8 9 7 3 5 1 1 6 1 1 1 1]


In [113]:
os_comp_X.columns

Index(['comp_in_sent', 'stock', 'shares', 'trade', 'length', 'plural',
       'number_words', 'location', 'corp', 'group', 'holding', 'inc',
       'company', 'association', 'foundation'],
      dtype='object')

In [114]:
predictors3=['corp','group','inc','company','association','foundation'] 
Xcomp_os=os_comp_X[predictors3]
ycomp_os=os_comp_y['label']

In [115]:
logit_model=sm.Logit(ycomp_os,Xcomp_os)
result=logit_model.fit(method='bfgs')
print(result.summary2())

         Current function value: 0.687099
         Iterations: 35
         Function evaluations: 39
         Gradient evaluations: 39
                          Results: Logit
Model:              Logit            Pseudo R-squared: 0.009       
Dependent Variable: label            AIC:              1437114.5569
Date:               2020-03-09 13:45 BIC:              1437185.7185
No. Observations:   1045776          Log-Likelihood:   -7.1855e+05 
Df Model:           5                LL-Null:          -7.2488e+05 
Df Residuals:       1045770          LLR p-value:      0.0000      
Converged:          0.0000           Scale:            1.0000      
--------------------------------------------------------------------
                Coef.   Std.Err.     z      P>|z|    [0.025   0.975]
--------------------------------------------------------------------
corp            2.4920    0.0574   43.4118  0.0000   2.3795   2.6045
group           1.7712    0.0391   45.2438  0.0000   1.6945   1.8480
inc 



In [116]:
Xcomp_os_train, Xcomp_os_test, ycomp_os_train, ycomp_os_test = train_test_split(Xcomp_os, ycomp_os, test_size=0.5, random_state=0)
log_compOS = LogisticRegression()
log_compOS.fit(Xcomp_os_train, ycomp_os_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [117]:
ycomp_os_pred = log_compOS.predict(Xcomp_os_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(log_compOS.score(Xcomp_os_test, ycomp_os_test)))

Accuracy of logistic regression classifier on test set: 0.51


In [118]:
from sklearn.metrics import confusion_matrix

confusion_matrix = confusion_matrix(ycomp_os_test.tolist(), ycomp_os_pred.tolist())
print(confusion_matrix)

[[260649    903]
 [253798   7538]]


In [119]:
print(classification_report(ycomp_os_test, ycomp_os_pred))

              precision    recall  f1-score   support

           0       0.51      1.00      0.67    261552
           1       0.89      0.03      0.06    261336

   micro avg       0.51      0.51      0.51    522888
   macro avg       0.70      0.51      0.36    522888
weighted avg       0.70      0.51      0.36    522888



### Comparison of 3 models on full dataset:

### Model 1:

In [120]:
comp1_pred = log_comp1.predict(Xcomp[predictors1])
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(log_comp1.score(Xcomp[predictors1], ycomp)))

Accuracy of logistic regression classifier on test set: 0.91


In [121]:
sum(comp1_pred)

2512

In [122]:
from sklearn.metrics import confusion_matrix

confusion_matrix = confusion_matrix(ycomp.iloc[:,0].tolist(), comp1_pred.tolist())
print(confusion_matrix)

[[1044475    1296]
 [ 102009    1216]]


In [123]:
print(classification_report(ycomp, comp1_pred))

              precision    recall  f1-score   support

           0       0.91      1.00      0.95   1045771
           1       0.48      0.01      0.02    103225

   micro avg       0.91      0.91      0.91   1148996
   macro avg       0.70      0.51      0.49   1148996
weighted avg       0.87      0.91      0.87   1148996



### Model 2

In [124]:
comp2_pred = log_comp2.predict(Xcomp[predictors2])
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(log_comp2.score(Xcomp[predictors2], ycomp)))

Accuracy of logistic regression classifier on test set: 0.91


In [125]:
sum(comp2_pred)

2311

In [126]:
from sklearn.metrics import confusion_matrix

confusion_matrix = confusion_matrix(ycomp.iloc[:,0].tolist(), comp2_pred.tolist())
print(confusion_matrix)

[[1044697    1074]
 [ 101988    1237]]


In [127]:
print(classification_report(ycomp, comp2_pred))

              precision    recall  f1-score   support

           0       0.91      1.00      0.95   1045771
           1       0.54      0.01      0.02    103225

   micro avg       0.91      0.91      0.91   1148996
   macro avg       0.72      0.51      0.49   1148996
weighted avg       0.88      0.91      0.87   1148996



### Model 3

In [128]:
comp3_pred = log_compOS.predict(Xcomp[predictors3])
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(log_compOS.score(Xcomp[predictors3], ycomp)))

Accuracy of logistic regression classifier on test set: 0.91


In [129]:
sum(comp3_pred)

6866

In [130]:
from sklearn.metrics import confusion_matrix

confusion_matrix = confusion_matrix(ycomp.iloc[:,0].tolist(), comp3_pred.tolist())
print(confusion_matrix)

[[1042125    3646]
 [ 100005    3220]]


In [131]:
print(classification_report(ycomp, comp3_pred))

              precision    recall  f1-score   support

           0       0.91      1.00      0.95   1045771
           1       0.47      0.03      0.06    103225

   micro avg       0.91      0.91      0.91   1148996
   macro avg       0.69      0.51      0.51   1148996
weighted avg       0.87      0.91      0.87   1148996



### Either the second or third model is the best based on confusion matrix and precision/recall on full dataset

In [132]:
predictors2

['shares', 'corp', 'group', 'inc', 'company', 'association', 'foundation']

In [133]:
comp_df['pred'] = comp2_pred
comp_final = comp_df[comp_df['pred']==1]
comp_final = comp_final.reset_index(drop=True)
comps2 = list(comp_final['Candidate'])
comp_final

Unnamed: 0,Candidate,comp_in_sent,stock,shares,trade,length,plural,number_words,location,corp,group,holding,inc,company,association,foundation,sentence,index,label,pred
0,News Corp,0,0,0,0,9,0,2,0,1,0,0,0,0,0,0,assets include stakes top Western firms Citigr...,328,1,1
1,Thorium Storage Industrial Products Corporation,0,0,0,0,47,0,5,0,1,0,0,0,0,0,0,Kennedy proposed creation Thorium Storage Indu...,1345,0,1
2,Murphy Oil Corp,0,0,0,0,15,0,3,0,1,0,0,0,0,0,0,"Inc , Murphy Oil Corp , Delphi Corp American I...",1574,1,1
3,Delphi Corp American International Group Inc,0,0,0,0,44,0,6,0,1,1,0,1,0,0,0,"Inc , Murphy Oil Corp , Delphi Corp American I...",1574,0,1
4,Overseas Shipholding Group Inc,0,0,0,0,30,0,4,0,0,1,0,1,0,0,0,"top fiver losers Apple , Inc. , Gold , Short ,...",1582,1,1
5,Los Angeles TCW Group Inc,0,0,0,0,25,0,5,0,0,1,0,1,0,0,0,particularly gratified see friend David Loevin...,1627,0,1
6,Bank America Corp,0,0,0,0,17,0,3,0,1,0,0,0,0,0,0,Lenders including Bank America Corp. Citigroup...,1973,0,1
7,Goldman Sachs Group Inc,0,0,1,0,23,0,4,0,0,1,0,1,0,0,0,Buffetts firm investments least four seven big...,1975,1,1
8,Corp,0,0,0,0,4,0,1,0,1,0,0,0,0,0,0,scandal erupted official China Central Televis...,2072,0,1
9,Goldman Sachs Group Inc,0,0,0,0,23,0,4,0,0,1,0,1,0,0,0,proved wrong last year investors would done be...,2489,1,1


In [134]:
comp_df['pred'] = comp3_pred
comp_final = comp_df[comp_df['pred']==1]
comp_final = comp_final.reset_index(drop=True)
comps1 = list(comp_final['Candidate'])
set(comps1)
comp_final

Unnamed: 0,Candidate,comp_in_sent,stock,shares,trade,length,plural,number_words,location,corp,group,holding,inc,company,association,foundation,sentence,index,label,pred
0,News Corp,0,0,0,0,9,0,2,0,1,0,0,0,0,0,0,assets include stakes top Western firms Citigr...,328,1,1
1,Credit Suisse Group AG Julius BaerGroup Ltd,0,0,0,0,43,0,7,0,0,1,0,0,0,0,0,law wouldnt affected least 12 financial instit...,351,0,1
2,Tamir Shemesh Corcoran Group,0,0,0,0,28,0,4,1,0,1,0,0,0,0,0,Tamir Shemesh Corcoran Group represented buyer...,1014,0,1
3,Eurasia Group,0,0,0,0,13,0,2,0,0,1,0,0,0,0,0,European Separatism One 2013 's Biggest Red He...,1149,1,1
4,Thorium Storage Industrial Products Corporation,0,0,0,0,47,0,5,0,1,0,0,0,0,0,0,Kennedy proposed creation Thorium Storage Indu...,1345,0,1
5,Apple Inc,0,0,0,0,9,0,2,0,0,0,0,1,0,0,0,WSJ : Mr. Loeb taken short position J.C. Penne...,1386,1,1
6,Inc,0,0,0,0,3,0,1,1,0,0,0,1,0,0,0,"Inc , Murphy Oil Corp , Delphi Corp American I...",1574,0,1
7,Murphy Oil Corp,0,0,0,0,15,0,3,0,1,0,0,0,0,0,0,"Inc , Murphy Oil Corp , Delphi Corp American I...",1574,1,1
8,Delphi Corp American International Group Inc,0,0,0,0,44,0,6,0,1,1,0,1,0,0,0,"Inc , Murphy Oil Corp , Delphi Corp American I...",1574,0,1
9,Inc,0,0,0,0,3,0,1,0,0,0,0,1,0,0,0,"top fiver losers Apple , Inc. , Gold , Short ,...",1582,0,1


### Based on inspection of extracted values, the third model presents a better list

In [135]:
finalCompany = set(comps1)
finalCompany = pd.DataFrame(finalCompany)
finalCompany.to_csv("ExtractedCompanies.csv",header=False,index=False)

## Percentages

In [136]:
def percent_after(sent,num):
    try:
        perc = 0
        nxt = ''        
        split = re.split(r'[ ]', sent)
        if num in split:
            num_index = split.index(num)
            nxt = split[num_index+1].lower()
            if nxt == 'percentage' or nxt == "percent":
                perc = 1;
                return perc;
        char_index = re.search(num, sent.lower()).start() + len(num)
        if sent[char_index] == '%' or sent[char_index+1] == '%':
            perc = 1;
            return perc;
        else: perc = 0;
            
    except IndexError:
        perc = 0;
    return perc;

In [137]:
def greater_than_1800(num):
    try:
        year = 0
        num = int(num)
        
        if num > 1800: year = 1;
        else: year = 0;
    except ValueError: pass
    return year;

In [138]:
def feature_creator_percent(ls):
    numbers = []
    for i in range(len(ls)):
        re1 = re.findall(r'\d*\.?\d+', ls[i])
        re2 = re.findall(r'one[\s|-]?hundred|fourteen|fifteen|sixteen|seventeen|eighteen|nineteen|zero|one|two|three|four|five|six|seven|eight|nine|ten|eleven|twelve|thirteen', ls[i].lower())
        re3 = re.findall(r'((twenty|thirty|fourty|fifty|sixty|seventy|eighty|ninety)(\s|-)?(one|two|three|four|five|six|seven|eight|nine)?)', ls[i].lower())
        re3 = [i[0] for i in re3]
        extract = re1 + re2 + re3
        if extract != []:
            for item in extract:
                year = greater_than_1800(item)
                perc = percent_after(ls[i],item)
                numbers.append([item, year, perc,i])
    return numbers

In [139]:
numbers = pd.DataFrame(feature_creator_percent(sentences), columns = ['numbers','year','perc','sentence'])

In [140]:
numbers

Unnamed: 0,numbers,year,perc,sentence
0,10.77,0,1,0
1,2003,1,0,0
2,seven,0,0,0
3,12.85,0,1,1
4,one,0,0,1
5,25,0,1,2
6,zero,0,0,2
7,25,0,1,16
8,one,0,0,16
9,one,0,0,17


## Logistic Regression for Percentages

In [141]:
def add_percentages(sent,num):
    percentage = num
    nxt = ''
    split = re.split(r'[ ]', sent)
    try:
        char_index = re.search(num, sent.lower()).start() + len(num)
        if sent[char_index] == '%':
            percentage = num + '%'
            return percentage;
        if sent[char_index+1] == '%':
            percentage = num + '%'
            return percentage; 

        if num in split:
            num_index = split.index(num)
            nxt = split[num_index+1].lower()
            if nxt == 'percentage':
                percentage = num + ' ' + nxt
                return percentage;
            if nxt == 'percent':
                percentage = num + ' ' + nxt
                return percentage;
            else:
                return percentage;
        else:
            return percentage;
    except IndexError:
        return percentage;

In [142]:
percentages = []
for i in range(len(numbers)):
    sent = sentences[numbers.iloc[i,3]]
    num = numbers.iloc[i,0]
    percentages.append(add_percentages(sent,num))
    
numbers['numbers'] = percentages

In [143]:
labels=[]
candidates = numbers['numbers'].tolist()
values = percent['perc'].values

for i in range(len(candidates)):
    if candidates[i] in values:
        labels.append(1)
    else: 
        labels.append(0) 
numbers['label'] = labels

In [144]:
numbers.head()

Unnamed: 0,numbers,year,perc,sentence,label
0,10.77%,0,1,0,1
1,2003,1,0,0,0
2,seven,0,0,0,0
3,12.85%,0,1,1,1
4,one,0,0,1,0


In [145]:
perc_df = numbers.drop(['numbers','sentence'], axis=1)

In [146]:
perc_df.sum(axis=0)

year      73185
perc      75544
label    251092
dtype: int64

In [147]:
yperc = perc_df.loc[:, perc_df.columns == 'label']
Xperc = perc_df.loc[:, perc_df.columns != 'label']

In [148]:
Xperc_train, Xperc_test, yperc_train, yperc_test = train_test_split(Xperc, yperc, test_size=0.5, random_state=0)

In [149]:
logit_model=sm.Logit(yperc_train,Xperc_train)
result=logit_model.fit(method='bfgs')
print(result.summary2())

Optimization terminated successfully.
         Current function value: 0.598782
         Iterations: 19
         Function evaluations: 21
         Gradient evaluations: 21
                          Results: Logit
Model:              Logit            Pseudo R-squared: 0.122      
Dependent Variable: label            AIC:              353987.1984
Date:               2020-03-09 13:47 BIC:              354008.3918
No. Observations:   295586           Log-Likelihood:   -1.7699e+05
Df Model:           1                LL-Null:          -2.0156e+05
Df Residuals:       295584           LLR p-value:      0.0000     
Converged:          1.0000           Scale:            1.0000     
--------------------------------------------------------------------
           Coef.    Std.Err.      z       P>|z|     [0.025    0.975]
--------------------------------------------------------------------
year      -7.3707     0.2085   -35.3426   0.0000   -7.7794   -6.9619
perc       0.7930     0.0111    71.3905   

In [150]:
log_perc = LogisticRegression()
log_perc.fit(Xperc_train, yperc_train)

  y = column_or_1d(y, warn=True)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [151]:
log_perc.coef_

array([[-6.90494752,  0.99247244]])

In [152]:
yperc_pred = log_perc.predict(Xperc_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(log_perc.score(Xperc_test, yperc_test)))


Accuracy of logistic regression classifier on test set: 0.62


In [153]:
from sklearn.metrics import confusion_matrix

confusion_matrix = confusion_matrix(yperc_test.iloc[:,0].tolist(), yperc_pred.tolist())
print(confusion_matrix)

[[158487  11678]
 [ 99369  26053]]


In [154]:
print(classification_report(yperc_test, yperc_pred))

              precision    recall  f1-score   support

           0       0.61      0.93      0.74    170165
           1       0.69      0.21      0.32    125422

   micro avg       0.62      0.62      0.62    295587
   macro avg       0.65      0.57      0.53    295587
weighted avg       0.65      0.62      0.56    295587



In [155]:
full_perc_pred = log_perc.predict(Xperc)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(log_perc.score(Xperc, yperc)))

Accuracy of logistic regression classifier on test set: 0.62


In [156]:
sum(full_perc_pred)

75515

In [157]:
from sklearn.metrics import confusion_matrix

confusion_matrix = confusion_matrix(yperc.iloc[:,0].tolist(), full_perc_pred.tolist())
print(confusion_matrix)

[[316632  23449]
 [199026  52066]]


In [158]:
print(classification_report(yperc, full_perc_pred))

              precision    recall  f1-score   support

           0       0.61      0.93      0.74    340081
           1       0.69      0.21      0.32    251092

   micro avg       0.62      0.62      0.62    591173
   macro avg       0.65      0.57      0.53    591173
weighted avg       0.65      0.62      0.56    591173



In [159]:
numbers['pred'] = full_perc_pred
perc_df_extract = numbers[numbers['pred']==1]
perc_df_extract = perc_df_extract.reset_index(drop=True)

In [160]:
percentages = perc_df_extract['numbers']

In [161]:
finalPercentage = pd.DataFrame(percentages)
finalPercentage.to_csv("ExtractedPercantages.csv",header=False,index=False)