In [1]:
import pandas as pd
import spacy

from iua.lexical import determine_keywords_count, fix_apostrophes

# Load the data

In [14]:
# Tweet export to analyse; the path is relative to the notebook's working directory.
tweets_path = "./../data/KyivIndependent_tweets.csv"
# Column 0 of the CSV becomes the index of the frame.
tweets = pd.read_csv(filepath_or_buffer=tweets_path, index_col=0)
tweets.head(n=5)

Unnamed: 0_level_0,conversation_id,created_at,date,timezone,place,tweet,language,hashtags,cashtags,user_id,...,geo,source,user_rt_id,user_rt,retweet_id,reply_to,retweet_date,translate,trans_src,trans_dest
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1488303588229230594,1488303588229230594,1643674000000.0,2022-02-01 01:10:01,200,,The UN Security Council meeting on threats to ...,en,[],[],1462548977367359490,...,,,,,,[],,,,
1488540215018938372,1488540215018938372,1643731000000.0,2022-02-01 16:50:17,200,,"According to Foreign Minister Dmytro Kuleba, t...",en,[],[],1462548977367359490,...,,,,,,[],,,,
1488551955668611076,1488551955668611076,1643733000000.0,2022-02-01 17:36:56,200,,"The document, signed to “bolster the nation’s ...",en,[],[],1462548977367359490,...,,,,,,[],,,,
1488579472165163014,1488579472165163014,1643740000000.0,2022-02-01 19:26:17,200,,For the cargo pallets on the runway contained ...,en,[],[],1462548977367359490,...,,,,,,[],,,,
1488603295593766918,1488603295593766918,1643746000000.0,2022-02-01 21:00:57,200,,"According to the Bureau, the searches were par...",en,[],[],1462548977367359490,...,,,,,,[],,,,


In [15]:
# Run the project helper over the whole frame and rebind `tweets` to its result.
# NOTE(review): presumably this normalises typographic apostrophes in the tweet
# text before tokenisation — confirm against iua.lexical.fix_apostrophes.
tweets = fix_apostrophes(tweets)

In [68]:
# Keyword groups that are later looked up in the lemmatised tweets.
# NOTE(review): the lookup runs on spaCy lemmas, so inflected entries such as
# "killed", "deaths" or "killings" may never match on their own — confirm
# against determine_keywords_count before relying on them.
death_keywords = [
    "kill",
    "death",
    "killed",
    "deaths",
    "toll",
    "killings",
]
injured_keywords = [
    "wound",
    "wounded",
    "injure",
]
# Casualties = deaths followed by injuries, built as a new list.
casualties_keywords = [*death_keywords, *injured_keywords]
kidnapping_keywords = [
    "kidnap",
    "kidnapping",
    "kidnapped",
]
rape_keywords = [
    "rape",
    "rapist",
]

In [69]:
def collect_lemmas(text: str, nlp):
    """Return the lemma of every token that ``nlp`` produces for ``text``.

    Args:
        text: Raw text to analyse.
        nlp: Callable that turns ``text`` into an iterable of tokens, each
            exposing a ``lemma_`` attribute (a loaded spaCy pipeline in this
            notebook).

    Returns:
        A list with one lemma per token, in token order.
    """
    return [tok.lemma_ for tok in nlp(text)]

In [70]:
# Load the spaCy pipeline once so every tweet is processed by the same object.
nlp = spacy.load("en_core_web_sm")
# One list of lemmas per tweet; the resulting Series keeps the index of `tweets`.
tweet_lemmas = tweets["tweet"].apply(lambda text: collect_lemmas(text, nlp))

In [71]:
# Display the per-tweet lemma lists as a quick sanity check.
tweet_lemmas

id
1488303588229230594    [the, UN, Security, Council, meeting, on, thre...
1488540215018938372    [accord, to, Foreign, Minister, Dmytro, Kuleba...
1488551955668611076    [the, document, ,, sign, to, ", bolster, the, ...
1488579472165163014    [for, the, cargo, pallet, on, the, runway, con...
1488603295593766918    [accord, to, the, Bureau, ,, the, search, be, ...
                                             ...                        
1509965911192711172    [⚡, ️, state, budget, collect, $, 3.5, billion...
1509969228350296069    [⚡, ️, Metsola, :, Ukraine, 's, candidacy, can...
1509979949494583314    [European, Parliament, President, Roberta, Met...
1509990736011350019    [⚡, ️, Ukraine, 's, Air, Force, :, russian, ai...
1509995123534450689    [⚡, ️, government, increase, army, 's, reserve...
Name: tweet, Length: 2340, dtype: object

## Check tweet count

In [73]:
# determine_keywords_count returns two values, kept here as a count and a mask
# for the kidnapping keywords.
kidnapping_count, kidnapping_mask = determine_keywords_count(
    tweet_lemmas,
    kidnapping_keywords,
)
kidnapping_count

11

In [74]:
# determine_keywords_count returns two values, kept here as a count and a mask
# for the combined death + injury keywords; the mask is used later to select
# the rows of `tweets` to export.
casualties_count, casualties_mask = determine_keywords_count(
    tweet_lemmas,
    casualties_keywords,
)
casualties_count

257

In [75]:
# determine_keywords_count returns two values, kept here as a count and a mask
# for the death keywords only.
death_mentions_count, death_mentions_mask = determine_keywords_count(
    tweet_lemmas,
    death_keywords,
)
death_mentions_count

223

In [76]:
# determine_keywords_count returns two values, kept here as a count and a mask
# for the injury keywords only.
injured_mentions_count, injured_mentions_mask = determine_keywords_count(
    tweet_lemmas,
    injured_keywords,
)
injured_mentions_count

113

In [24]:
# determine_keywords_count returns two values, kept here as a count and a mask
# for the rape keywords.
# NOTE(review): this cell's saved execution count is lower than that of the cell
# defining rape_keywords — re-run to confirm the displayed result is current.
rape_count, rape_mask = determine_keywords_count(
    tweet_lemmas,
    rape_keywords,
)
rape_count

1

#### Save only selected posts

Casualties: export the tweets that matched the casualty keywords, with blank columns for manual annotation.

In [77]:
# Columns of `tweets` that are carried over into the exported spreadsheet.
necessary_columns = ["date", "link"]

In [78]:
# Keep only the rows flagged by casualties_mask and the columns needed for the export.
# .copy() makes the result an independent frame, so the placeholder columns
# assigned to it afterwards do not trigger pandas' SettingWithCopyWarning.
excel_casualties = tweets.loc[casualties_mask, necessary_columns].copy()

In [79]:
# Display the rows selected for export.
excel_casualties

Unnamed: 0_level_0,date,link
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1494283213644181511,2022-02-17 13:10:55,https://twitter.com/KyivIndependent/status/149...
1495364937019019266,2022-02-20 12:49:18,https://twitter.com/KyivIndependent/status/149...
1495450672606629889,2022-02-20 18:29:59,https://twitter.com/KyivIndependent/status/149...
1496771173291642883,2022-02-24 09:57:11,https://twitter.com/KyivIndependent/status/149...
1496785738045829122,2022-02-24 10:55:03,https://twitter.com/KyivIndependent/status/149...
...,...,...
1509486350684368897,2022-03-31 13:02:45,https://twitter.com/KyivIndependent/status/150...
1509510190512709632,2022-03-31 14:37:29,https://twitter.com/KyivIndependent/status/150...
1509664637037555714,2022-04-01 00:51:12,https://twitter.com/KyivIndependent/status/150...
1509773512927547396,2022-04-01 08:03:50,https://twitter.com/KyivIndependent/status/150...


In [80]:
# Empty placeholder columns appended to the export, in this order
# (presumably filled in by hand once the spreadsheet is shared).
annotation_columns = [
    "your id",
    "correct tweet (yes/no)",
    "which side? (UA/RU)",
    "number of dead",
    "dead: in total/new",
    "dead-who?: (civilians/soldiers/children)",
    "number of injured/wounded",
    "injured: in total/new",
    "injured-who?: civilians/soldiers/children",
    "place",
    "deaths date",
    "additional info",
]
for column_name in annotation_columns:
    excel_casualties[column_name] = ""

In [81]:
# Destination workbook for the casualties export, relative to the notebook's working directory.
save_path_casualties = "./../data/KyivIndependentTweetsCasualties.xlsx"

In [82]:
# Write the selected tweets and their empty placeholder columns to the Excel file.
excel_casualties.to_excel(save_path_casualties)

## Other statistics

In [12]:
# Largest value in the `nlikes` column across all loaded tweets.
tweets["nlikes"].max()

287701

In [13]:
# Largest value in the `nretweets` column across all loaded tweets.
tweets["nretweets"].max()

40889