In [17]:
import pandas as pd
import numpy as np
from glob import glob
from nltk.corpus import stopwords
stop = stopwords.words('english')
import re

pd.set_option('display.max_colwidth', -1)


In [29]:
csvs = glob('input/tweets/*.csv')
df_tweets = pd.concat(pd.read_csv(file).assign(filename = file) for file in csvs)
df_covid = pd.read_csv('input/covid_data.csv')
df_tweets.reset_index(inplace = True)
del df_tweets['filename']
del df_tweets['index']

In [30]:
def return_hashes(col):

    return list(set([re.sub('[^\w\s]','', word) for word in col.split() if word[0] == '#']))

def return_ats(col):
    return list(set(  [re.sub('[^\w\s]','', word) for word in col.split() if word[0] == '@']  ))

def remove_stopwords(col):
    return ' '.join([word for word in col.split() if word not in stop])

def remove_https(col):
    return ' '.join([word for word in col.split() if word[0:7] != 'https:'])

In [31]:
df_tweets.rename(columns = {'text':'text_original'}, inplace = True)

df_tweets['text_modified'] = df_tweets['text_original'].str.lower() # małe znaki
df_tweets['hash'] = df_tweets['text_modified'].apply(return_hashes) # wybieranie unikalnych hashtagów (bez punktuacji)
df_tweets['at'] = df_tweets['text_modified'].apply(return_ats) # wybieranie unikalnych odnośników (bez punktuacji)
df_tweets['text_modified'] = df_tweets['text_modified'].apply(remove_https)
df_tweets['text_modified'] = df_tweets['text_modified'].str.replace('[^\w\s#@]','') # usuwanie punktuacji; można usuwać # ze zdań jeśli usunie się ten znak z regular expression
df_tweets['text_modified'] = df_tweets['text_modified'].apply(remove_stopwords)

df_tweets.created_at = pd.to_datetime(df_tweets.created_at)
df_tweets['date'] = df_tweets.created_at.dt.date


## Data preview

In [15]:
companies = df_tweets.company.unique()

for company in companies:
    dat1 = min(df_tweets.loc[df_tweets['company'] == company].created_at)
    dat2 = min(df_tweets.loc[df_tweets['company'] == company].created_at)
    
    print(f'Zakres datowy dla tweetów   {dat1}   -   {dat2}   :  {company}')

Zakres datowy dla tweetów   2019-12-11 16:58:23   -   2019-12-11 16:58:23   :  AstraZeneca
Zakres datowy dla tweetów   2019-09-26 07:00:01   -   2019-09-26 07:00:01   :  BayerPharma
Zakres datowy dla tweetów   2020-02-05 12:26:23   -   2020-02-05 12:26:23   :  GSK
Zakres datowy dla tweetów   2020-02-04 14:59:51   -   2020-02-04 14:59:51   :  Merck
Zakres datowy dla tweetów   2019-10-17 21:23:19   -   2019-10-17 21:23:19   :  Novartis
Zakres datowy dla tweetów   2020-02-17 13:55:00   -   2020-02-17 13:55:00   :  Pfizer
Zakres datowy dla tweetów   2020-01-30 14:19:25   -   2020-01-30 14:19:25   :  Roche
Zakres datowy dla tweetów   2020-01-29 14:29:27   -   2020-01-29 14:29:27   :  Sanofi


In [32]:
df_tweets.head(3)

Unnamed: 0,company,text_original,created_at,favourite_count,retweet_count,text_modified,hash,at,date
0,AstraZeneca,"Together with partners across industry, academia and government, we are taking a multipronged approach to helping patients around the world facing #COVID19. https://t.co/uQuHj6BkBN",2020-05-06 13:13:41,44,8,together partners across industry academia government taking multipronged approach helping patients around world facing #covid19 httpstcouquhj6bkbn,[covid19],[],2020-05-06
1,AstraZeneca,"On #GivingTuesdayNow we stand with our partners @Plan_UK @Unicef_UK @ProjectHopeorg @NCDAlliance in their efforts responding to the unique health needs of groups vulnerable to #COVID19, such as those living with NCDs and young people. Get involved: https://t.co/YGRHLGqct6 https://t.co/vePEeAne49",2020-05-05 16:27:03,32,8,#givingtuesdaynow stand partners @plan_uk @unicef_uk @projecthopeorg @ncdalliance efforts responding unique health needs groups vulnerable #covid19 living ncds young people get involved httpstcoygrhlgqct6 httpstcovepeeane49,"[covid19, givingtuesdaynow]","[ncdalliance, projecthopeorg, plan_uk, unicef_uk]",2020-05-05
2,AstraZeneca,We’re #standingtogether4asthma with patients and the respiratory community during these times of uncertainty. Visit @WEF to learn more about what we’re doing to play our part in the fight against #COVID19: #WorldAsthmaDay \r\nhttps://t.co/fWE7ik8rNs https://t.co/Z54pyHBENq,2020-05-05 12:30:15,19,7,#standingtogether4asthma patients respiratory community times uncertainty visit @wef learn play part fight #covid19 #worldasthmaday httpstcofwe7ik8rns httpstcoz54pyhbenq,"[worldasthmaday, covid19, standingtogether4asthma]",[wef],2020-05-05
