In [17]:
import pandas as pd
from textblob import TextBlob
import os
import datetime as dt

In [18]:
# Parse newsdata.txt into parallel lists; each record is one line of the form
# "<symbol>||<headline text>||<date>".
symbols = []
text = []
date = []
# Text mode (not 'rb'): the lines must be str for strip("\n") / split("||") to
# work on Python 3; on Python 2 the behaviour is unchanged.
# NOTE(review): the file's encoding is not visible here -- confirm it matches
# the platform default before running on Python 3.
with open("newsdata.txt", 'r') as myfile:
    # Iterate the handle directly instead of materialising readlines().
    for line in myfile:
        if not line.strip():
            # A blank (e.g. trailing) line has no fields; skip it rather than
            # fail with IndexError below.
            continue
        fields = line.strip("\n").split("||")
        symbols.append(fields[0])
        text.append(fields[1])
        date.append(fields[2])
news_data = pd.DataFrame({"company": symbols, "text": text, "date": date})
# Convert the raw date strings to datetime64 so the .dt accessor can be used later.
news_data['date'] = pd.to_datetime(news_data['date'])
#news_data.to_csv("new_data.csv", index =False)

In [19]:
# Preview the first rows to sanity-check the parsed company/text/date columns.
news_data.head()

Unnamed: 0,company,date,text
0,MAC,2017-10-16,Breaking: Macerich (NYSE:MAC) Buy Rating Maint...
1,MAC,2017-10-14,Chilton Capital Management Has Trimmed Maceric...
2,MAC,2017-10-14,Chilton Capital Management Lowered Its Stake i...
3,MAC,2017-10-16,Indus Capital Partners Has Lifted By $912520 I...
4,MAC,2017-10-16,Analysts See $0.98 EPS for Macerich Co (MAC); ...


#### Develop individual-event level features

In [20]:
# Direct text features: one 0/1 column per keyword, set to 1 when the
# lower-cased headline contains the keyword as a substring (so 'buy' also
# matches e.g. 'buyback').
# NOTE: a 'CFO' flag was tried earlier and dropped because it never fired, but
# that probe tested upper-case 'CFO' against lower-cased text, which can never
# match. Use 'cfo' if the feature is revisited.
headlines_lower = news_data['text'].str.lower()  # lower-case once, reuse per keyword
for keyword in ('scandal', 'buy', 'sell', 'decline'):
    # Bind the keyword as a default argument so each lambda keeps its own value.
    news_data[keyword] = headlines_lower.map(
        lambda headline, keyword=keyword: 1 if keyword in headline else 0
    )
# Show how many headlines mention 'decline'.
news_data['decline'].value_counts()
# TODO: sentiment feature
# TODO: bag-of-words

0    18016
1      148
Name: decline, dtype: int64

#### Sentiment

In [21]:
def get_polarity(text):
    """Return the polarity component of TextBlob's sentiment for ``text``."""
    return TextBlob(text).sentiment.polarity
def get_subjectivity(text):
    """Return the subjectivity component of TextBlob's sentiment for ``text``."""
    return TextBlob(text).sentiment.subjectivity

In [51]:
# TextBlob sentiment scores for every headline; the helpers are passed directly
# to map() rather than through a pass-through lambda.
news_data["positivity"] = news_data['text'].map(get_polarity)
news_data["subjectivity"] = news_data['text'].map(get_subjectivity)

# financial_report_quarter: calendar quarter of the article date for headlines
# containing "REPORTS Q" (case-insensitive), 0 for all other headlines.
is_quarterly_report = news_data['text'].str.upper().str.contains('REPORTS Q', regex=False)
news_data["financial_report_quarter"] = 0
# .loc replaces the DataFrame.ix indexer, which was deprecated and then removed
# from pandas. The right-hand side is aligned on the index, so only the masked
# rows receive a quarter. 'date' was already parsed to datetime when the frame
# was built, so .dt can be used on it directly.
news_data.loc[is_quarterly_report, "financial_report_quarter"] = news_data['date'].dt.quarter
news_data.head()

Unnamed: 0,company,date,text,scandal,buy,sell,decline,positivity,subjectivity,financial_report_quarter
0,MAC,2017-10-16,Breaking: Macerich (NYSE:MAC) Buy Rating Maint...,0,1,0,0,0.0,0.0,0
1,MAC,2017-10-14,Chilton Capital Management Has Trimmed Maceric...,0,0,0,0,0.0,0.0,0
2,MAC,2017-10-14,Chilton Capital Management Lowered Its Stake i...,0,0,0,0,0.0,0.0,0
3,MAC,2017-10-16,Indus Capital Partners Has Lifted By $912520 I...,0,0,0,0,0.0,0.0,0
4,MAC,2017-10-16,Analysts See $0.98 EPS for Macerich Co (MAC); ...,0,0,0,0,0.0,0.0,0


In [54]:
# Collapse the per-article features to one row per (company, date).
feature_columns = ['scandal', 'buy', 'sell', 'decline', 'positivity', 'subjectivity', 'financial_report_quarter']
# Keyword flags and sentiment scores are summed over the day's articles.
aggregations = {column: 'sum' for column in feature_columns}
# financial_report_quarter holds a quarter number (or 0), not a count. Every
# article in a group shares the same date and therefore the same quarter, so
# summing would give quarter * number of report headlines; max keeps the
# quarter itself when at least one report headline exists.
aggregations['financial_report_quarter'] = 'max'
for_analysis = (
    news_data[['company', 'date'] + feature_columns]
    .groupby(['company', 'date'], as_index=False)
    .agg(aggregations)
)
# Pin the column order explicitly; dict ordering is not guaranteed on older Pythons.
for_analysis = for_analysis[['company', 'date'] + feature_columns]
for_analysis.head()

Unnamed: 0,company,date,scandal,buy,sell,decline,positivity,subjectivity,financial_report_quarter
0,COL,2015-10-30,0,0,0,0,0.0,0.0,0
1,COL,2015-11-02,0,0,0,0,0.0,0.076923,0
2,COL,2015-11-11,0,0,0,0,0.0,0.0,0
3,COL,2016-01-26,0,0,0,0,0.0,0.0,0
4,COL,2016-02-01,0,0,0,0,0.0,0.076923,0


In [55]:
# Write one CSV of daily features per company into `folder`.
folder = "nlp_by_company_fr"
# Create the folder only when it is missing, so re-running the cell does not
# fail the way a bare os.mkdir() does on an existing directory.
if not os.path.isdir(folder):
    os.makedirs(folder)
# groupby visits each company exactly once; iterating over the 'company' column
# itself repeats the filter and rewrites the same file once per row.
for company, data in for_analysis.groupby('company'):
    data = data[['date', 'scandal', 'buy', 'sell', 'decline', 'positivity', 'subjectivity', 'financial_report_quarter']]
    data.to_csv(os.path.join(folder, company + ".csv"), index=False)

In [56]:
# Save the combined per-(company, date) feature table for all companies.
# No index argument is passed, so to_csv also writes the row index as the
# first (unnamed) column, unlike the per-company files written with index=False.
for_analysis.to_csv("preliminary_features.csv")

#### Develop Company-Level Features

In [57]:
# Per-company coverage: first and last article date, and number of articles.
# TODO: add average sentiment and sentiment standard deviation.
functs = {"date": ['min', 'max'], "text": 'count'}
# Aggregate on the grouped frame, not on the single 'date' column: the dict is
# keyed by column name, and applying it to one selected column relies on the
# removed "nested renamer" behaviour, which mislabels the count and raises
# SpecificationError on current pandas.
news_data.groupby('company')[['date', 'text']].agg(functs)

Unnamed: 0_level_0,date,date,text
Unnamed: 0_level_1,min,max,date
company,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
COL,2015-10-30,2017-10-16,121
CRM,2015-11-18,2017-10-16,122
DGX,2015-10-22,2017-10-16,115
FOX,2016-01-18,2017-10-16,153
FOXA,2016-01-18,2017-10-16,153
FTI,2017-10-03,2017-10-15,15
JWN,2015-11-12,2017-10-16,125
KORS,2016-06-01,2017-10-16,115
LUV,2015-11-04,2017-10-16,161
M,2015-11-11,2017-10-16,138
