In [52]:
import pandas as pd
import numpy as np
from glob import glob
from nltk.corpus import stopwords
stop = stopwords.words('english')
import re
import nltk
pd.set_option('display.max_colwidth', -1)
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
from datetime import datetime

companies_mapping = {'AZN_stocks.csv':"AstraZeneca", 'RHHBY_stocks.csv':"Roche", 'PFE_stocks.csv':"Pfizer", 
                     'NVS_stocks.csv':"Novartis",'BAYRY_stocks.csv':"BayerPharma", 'MRK_stocks.csv':"Merck", 
                     'GSK_stocks.csv':"GSK", 'SNY_stocks.csv':"Sanofi"}

In [53]:
twitter = glob('input/tweets/*.csv')
df_tweets = pd.concat(pd.read_csv(file).assign(filename = file) for file in twitter)

stock = glob('input/stock/*.csv')
df_stock = pd.concat(pd.read_csv(file).assign(filename = file) for file in stock)

df_stock.filename = df_stock.filename.str.split(pat ="\\", expand = True)[1]
df_stock['company'] = df_stock.filename.map(companies_mapping)

df_covid = pd.read_csv('input/covid_data.csv')
df_tweets.reset_index(inplace = True)




del df_tweets['filename']
del df_tweets['index']

del df_stock['filename']

In [54]:
def return_hashes(col):

    return list(set([re.sub('[^\w\s]','', word) for word in col.split() if word[0] == '#']))

def return_ats(col):
    return list(set(  [re.sub('[^\w\s]','', word) for word in col.split() if word[0] == '@']  ))

def remove_stopwords(col):
    return ' '.join([word for word in col.split() if word not in stop])

def remove_https(col):
    return ' '.join([word for word in col.split() if word[0:6] != 'https:'])

def text_tokenized(col):
    return word_tokenize(col)

def pos_tagged(col):
    return pos_tag(col)

In [56]:
# lower column names
df_tweets.columns = map(str.lower, df_tweets.columns)
df_stock.columns = map(str.lower, df_stock.columns)

df_tweets.rename(columns = {'text':'text_original'}, inplace = True)

df_tweets['text_modified'] = df_tweets['text_original'].str.lower() # małe znaki
df_tweets['hash'] = df_tweets['text_modified'].apply(return_hashes) # wybieranie unikalnych hashtagów (bez punktuacji)
df_tweets['at'] = df_tweets['text_modified'].apply(return_ats) # wybieranie unikalnych odnośników (bez punktuacji)
df_tweets['text_modified'] = df_tweets['text_modified'].apply(remove_https)
df_tweets['text_modified'] = df_tweets['text_modified'].str.replace('[^\w\s]','') # usuwanie punktuacji; można usuwać # ze zdań jeśli usunie się ten znak z regular expression
df_tweets['text_modified'] = df_tweets['text_modified'].apply(remove_stopwords)
df_tweets['text_tokenized'] = df_tweets['text_modified'].apply(text_tokenized)
df_tweets['text_pos_tagged'] = df_tweets['text_tokenized'].apply(pos_tagged)

df_tweets.created_at = pd.to_datetime(df_tweets.created_at)
df_tweets['date'] = df_tweets.created_at.dt.date

df_stock.date = pd.to_datetime(df_stock.date)
df_stock['date'] = df_stock.date.dt.date


## Data preview

In [59]:
companies = df_tweets.company.unique()

for company in companies:
    dat1 = min(df_tweets.loc[df_tweets['company'] == company].created_at)
    dat2 = max(df_tweets.loc[df_tweets['company'] == company].created_at)
    
    print(f'Zakres datowy dla tweetów   {dat1}   -   {dat2}   :  {company}')

Zakres datowy dla tweetów   2020-02-03 11:30:17   -   2020-05-06 13:13:41   :  AstraZeneca
Zakres datowy dla tweetów   2020-02-03 16:39:56   -   2020-05-06 16:40:37   :  BayerPharma
Zakres datowy dla tweetów   2020-02-05 12:26:23   -   2020-05-07 15:41:11   :  GSK
Zakres datowy dla tweetów   2020-02-04 14:59:51   -   2020-05-07 18:10:43   :  Merck
Zakres datowy dla tweetów   2020-02-03 13:22:07   -   2020-05-07 19:55:40   :  Novartis
Zakres datowy dla tweetów   2020-02-17 13:55:00   -   2020-05-07 20:02:01   :  Pfizer
Zakres datowy dla tweetów   2020-02-03 09:01:21   -   2020-05-07 14:24:05   :  Roche
Zakres datowy dla tweetów   2020-02-03 10:03:21   -   2020-05-05 12:59:16   :  Sanofi


Ze względy na ograniczenie API Twittera dla każdej firmy pobranych zostało 200 tweetów. Zakres czasowy ich występowania różni się dla danych firm. Dlatego ustalony zostaje wspólny okres badania: od 1 lutego do 7 maja. Początego tego okresu można uznać za początek epidemii koronawirusa w Europie i Ameryce Północnej.

In [58]:
lower_date = datetime.strptime('2020-02-02','%Y-%m-%d').date()

df_tweets = df_tweets.loc[df_tweets['date'] >= lower_date]
df_stock = df_stock.loc[df_stock['date'] >= lower_date]

Ponowne sprawdzenie zakresu dat, tym razem z ilością tweetów pozostałą po ograniczeniu zbioru danych dla każdej z firm.

In [61]:
companies = df_tweets.company.unique()

for company in companies:
    dat1 = min(df_tweets.loc[df_tweets['company'] == company].created_at)
    dat2 = max(df_tweets.loc[df_tweets['company'] == company].created_at)
    ilosc = len(df_tweets.loc[df_tweets['company'] == company])
    print(f'Zakres datowy dla tweetów   {dat1}   -   {dat2}   :  {company}; il. tweetów: {ilosc}')

Zakres datowy dla tweetów   2020-02-03 11:30:17   -   2020-05-06 13:13:41   :  AstraZeneca; il. tweetów: 107
Zakres datowy dla tweetów   2020-02-03 16:39:56   -   2020-05-06 16:40:37   :  BayerPharma; il. tweetów: 65
Zakres datowy dla tweetów   2020-02-05 12:26:23   -   2020-05-07 15:41:11   :  GSK; il. tweetów: 200
Zakres datowy dla tweetów   2020-02-04 14:59:51   -   2020-05-07 18:10:43   :  Merck; il. tweetów: 200
Zakres datowy dla tweetów   2020-02-03 13:22:07   -   2020-05-07 19:55:40   :  Novartis; il. tweetów: 106
Zakres datowy dla tweetów   2020-02-17 13:55:00   -   2020-05-07 20:02:01   :  Pfizer; il. tweetów: 200
Zakres datowy dla tweetów   2020-02-03 09:01:21   -   2020-05-07 14:24:05   :  Roche; il. tweetów: 195
Zakres datowy dla tweetów   2020-02-03 10:03:21   -   2020-05-05 12:59:16   :  Sanofi; il. tweetów: 193


Jak widać dla BayerPharmy ta ilość jest mniejsza niż poprzednio połowa. Ewentualnością będzie wykluczenie tej firmy z badania.

In [41]:
#https://www.digitalocean.com/community/tutorials/how-to-perform-sentiment-analysis-in-python-3-using-the-natural-language-toolkit-nltk

In [64]:
df_tweets.head(3)

Unnamed: 0,company,text_original,created_at,favourite_count,retweet_count,text_modified,hash,at,text_tokenized,text_pos_tagged,date
0,AstraZeneca,"Together with partners across industry, academia and government, we are taking a multipronged approach to helping patients around the world facing #COVID19. https://t.co/uQuHj6BkBN",2020-05-06 13:13:41,44,8,together partners across industry academia government taking multipronged approach helping patients around world facing covid19,[covid19],[],"[together, partners, across, industry, academia, government, taking, multipronged, approach, helping, patients, around, world, facing, covid19]","[(together, RB), (partners, NNS), (across, IN), (industry, NN), (academia, NN), (government, NN), (taking, VBG), (multipronged, VBD), (approach, NN), (helping, VBG), (patients, NNS), (around, IN), (world, NN), (facing, NN), (covid19, NN)]",2020-05-06
1,AstraZeneca,"On #GivingTuesdayNow we stand with our partners @Plan_UK @Unicef_UK @ProjectHopeorg @NCDAlliance in their efforts responding to the unique health needs of groups vulnerable to #COVID19, such as those living with NCDs and young people. Get involved: https://t.co/YGRHLGqct6 https://t.co/vePEeAne49",2020-05-05 16:27:03,32,8,givingtuesdaynow stand partners plan_uk unicef_uk projecthopeorg ncdalliance efforts responding unique health needs groups vulnerable covid19 living ncds young people get involved,"[covid19, givingtuesdaynow]","[projecthopeorg, unicef_uk, plan_uk, ncdalliance]","[givingtuesdaynow, stand, partners, plan_uk, unicef_uk, projecthopeorg, ncdalliance, efforts, responding, unique, health, needs, groups, vulnerable, covid19, living, ncds, young, people, get, involved]","[(givingtuesdaynow, JJ), (stand, VBP), (partners, NNS), (plan_uk, VBP), (unicef_uk, JJ), (projecthopeorg, JJ), (ncdalliance, NN), (efforts, NNS), (responding, VBG), (unique, JJ), (health, NN), (needs, NNS), (groups, NNS), (vulnerable, JJ), (covid19, JJ), (living, NN), (ncds, JJ), (young, JJ), (people, NNS), (get, VBP), (involved, VBN)]",2020-05-05
2,AstraZeneca,We’re #standingtogether4asthma with patients and the respiratory community during these times of uncertainty. Visit @WEF to learn more about what we’re doing to play our part in the fight against #COVID19: #WorldAsthmaDay \r\nhttps://t.co/fWE7ik8rNs https://t.co/Z54pyHBENq,2020-05-05 12:30:15,19,7,standingtogether4asthma patients respiratory community times uncertainty visit wef learn play part fight covid19 worldasthmaday,"[covid19, standingtogether4asthma, worldasthmaday]",[wef],"[standingtogether4asthma, patients, respiratory, community, times, uncertainty, visit, wef, learn, play, part, fight, covid19, worldasthmaday]","[(standingtogether4asthma, NN), (patients, NNS), (respiratory, VBP), (community, NN), (times, NNS), (uncertainty, NN), (visit, NN), (wef, NN), (learn, VBP), (play, VB), (part, NN), (fight, NN), (covid19, NN), (worldasthmaday, NN)]",2020-05-05


In [65]:
df_stock.head(3)

Unnamed: 0,date,open,high,low,close,adj close,volume,company
63,2020-02-03,48.619999,48.93,48.450001,48.509998,47.541225,1990900,AstraZeneca
64,2020-02-04,48.759998,49.09,48.720001,48.759998,47.786232,1698800,AstraZeneca
65,2020-02-05,49.459999,49.849998,49.23,49.73,48.736862,2303000,AstraZeneca


In [40]:
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Darciu\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Darciu\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
