In [1]:
import json
import pandas as pd
from tqdm import tqdm
from pathlib import Path 
from ast import literal_eval

## Annotations

Collected tweets are linked to DBpedia and Wikidata with `spacy` and two libraries built on top of `spacy`:
* [DBpedia Spotlight for SpaCy](https://spacy.io/universe/project/spacy-dbpedia-spotlight)
* [spaCyOpenTapioca](https://spacy.io/universe/project/spacyopentapioca)

In [19]:
db_tweets = pd.read_csv('../data/db_annotated_tweets.csv', index_col=False)
db_tweets = db_tweets.fillna('')
db_tweets['UserName'] = db_tweets['UserName'].str.replace('@', '')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [7]:
db_tweets.shape[0]

231711

In [14]:
db_tweets.columns

Index(['Unnamed: 0', 'index', 'Unnamed: 0.1', 'UserName', 'TweetText',
       'Timestamp', 'Tweet URL', 'collector', 'TweetID', 'Entities', 'UserID',
       'retweet', 'mentions', 'db_ents', 'db_ents_list'],
      dtype='object')

In [12]:
db_tweets['UserName'].str.replace('@', '').nunique()

141862

In [20]:
db_tweets.groupby(['UserName'])['TweetText'].count().reset_index().describe()

Unnamed: 0,TweetText
count,141862.0
mean,1.633355
std,3.067153
min,1.0
25%,1.0
50%,1.0
75%,2.0
max,578.0


In [None]:
db_tweets.groupby(['UserName'])['TweetText'].count().reset_index().describe()

In [4]:
# wd_tweets = pd.read_csv('../data/wd_annotated_tweets.csv', index_col=False)
# wd_tweets = wd_tweets.fillna('')

In [5]:
db_tweets['db_ents_list'] = db_tweets['db_ents'].apply(lambda x: [item.split(',') for item in x.replace('"', '').split(';')])
# wd_tweets['wd_ents_list'] = wd_tweets['wd_ents'].apply(lambda x: [item.split(',') for item in x.replace('"', '').split(';')])

## Entity and Entity Type Counts

#### DBpedia

In [45]:
db_ent_token_counts = db_tweets['db_ents_list'].apply(lambda x: [i[0] for i in x]).explode().value_counts().reset_index()
db_ent_token_counts

Unnamed: 0,index,db_ents_list
0,Kyle,118142
1,Rittenhouse,35649
2,,29847
3,Kenosha,6948
4,vax,6671
...,...,...
35276,Marxist4Life1956.com,1
35277,Rittenhouse trial6,1
35278,filthy animals,1
35279,Ghislainne,1


In [46]:
db_type_counts = db_tweets['db_ents_list'].apply(lambda x: [i[1] if len(i) >= 2 else '' for i in x]).explode().value_counts().reset_index()
db_type_counts

Unnamed: 0,index,db_ents_list
0,DBPEDIA_ENT,419892
1,ORG,53196
2,,29886
3,PERSON,21503
4,DATE,6967
...,...,...
155,wrong time,1
156,jury,1
157,Maryland,1
158,November,1


In [47]:
db_ent_counts = db_tweets['db_ents_list'].apply(lambda x: [i[2] if len(i) >= 3 else '' for i in x]).explode().value_counts().reset_index()
db_ent_counts

Unnamed: 0,index,db_ents_list
0,,125318
1,http://dbpedia.org/resource/Kyle_Broflovski,117400
2,http://dbpedia.org/resource/White_supremacy,8774
3,http://dbpedia.org/resource/Wisconsin,8156
4,http://dbpedia.org/resource/VAX,6709
...,...,...
18599,http://dbpedia.org/resource/Rumor,1
18600,http://dbpedia.org/resource/Pushback,1
18601,http://dbpedia.org/resource/Psychological_testing,1
18602,http://dbpedia.org/resource/The_finger,1


#### Wikidata

In [48]:
wd_tweets['wd_ents_list'].apply(lambda x: [(i[0], i[2]) if len(i) >= 3 else '' for i in x]).explode().value_counts().reset_index()[:50]

Unnamed: 0,index,wd_ents_list
0,"(Kyle Rittenhouse, Q98663135)",68147
1,"(Rittenhouse, )",39861
2,,25173
3,"(Kyle Rittenhouse, )",15439
4,"(video, Q9022)",5687
5,"(A, Q48)",5479
6,"(can, Q16)",5119
7,"(Kyle Rittenhouses, )",5074
8,"(American, )",4613
9,"(Kyle Rittenhouse's, Q98663135)",4598


In [49]:
wd_ent_token_counts = wd_tweets['wd_ents_list'].apply(lambda x: [i[0] for i in x]).explode().value_counts().reset_index()
wd_ent_token_counts

Unnamed: 0,index,wd_ents_list
0,Kyle Rittenhouse,83588
1,Rittenhouse,41472
2,,24210
3,video,5687
4,A,5479
...,...,...
24778,Jussie Rittenhouse,1
24779,Recordindependentsentinel.comKamala Harris,1
24780,someoneBellaBliss420 Oct Replying,1
24781,thousands every year,1


In [50]:
wd_ent_type_counts = wd_tweets['wd_ents_list'].apply(lambda x: [i[1] if len(i) >= 2 else '' for i in x]).explode().value_counts().reset_index()
wd_ent_type_counts[:20]

Unnamed: 0,index,wd_ents_list
0,PERSON,187755
1,ORG,99627
2,LOC,77971
3,,24354
4,DATE,12271
5,GPE,9626
6,NORP,7083
7,ORGLOC,2725
8,ORDINAL,2473
9,WORK_OF_ART,1509


In [51]:
# Mentions
# wd_tweets['MentionedUser'] = wd_tweets['mentions'].apply(lambda x: [i.split(',')[0] for i in x.split(';')])
# mentions = wd_tweets.explode('MentionedUser')[['UserName', 'MentionedUser']]

# mentions['Source'] = mentions['UserName']
# mentions['Target'] = mentions['MentionedUser']

# mentions[['Source', 'Target']].to_csv('../data/mention.csv')

In [52]:
db_entities = db_tweets.explode('db_ents_list')[['UserName', 'TweetText', 'db_ents_list']]

In [53]:
db_entities['ent_type'] = db_entities['db_ents_list'].apply(lambda x: x[1] if len(x) > 1 else '')
db_entities

Unnamed: 0,UserName,TweetText,db_ents_list,ent_type
0,@JoyceGarbaciak,Judge sets final evidence rules for before Kyl...,"[Kyle, DBPEDIA_ENT, http://dbpedia.org/resourc...",DBPEDIA_ENT
0,@JoyceGarbaciak,Judge sets final evidence rules for before Kyl...,"[Kyle, DBPEDIA_ENT, http://dbpedia.org/resourc...",DBPEDIA_ENT
0,@JoyceGarbaciak,Judge sets final evidence rules for before Kyl...,"[Kenosha County, DBPEDIA_ENT, http://dbpedia.o...",DBPEDIA_ENT
0,@JoyceGarbaciak,Judge sets final evidence rules for before Kyl...,"[Kyle, DBPEDIA_ENT, http://dbpedia.org/resourc...",DBPEDIA_ENT
1,@atdavidhoffman,"Guns don’t kill people,\nKyle Rittenhouse kill...","[Kyle, DBPEDIA_ENT, http://dbpedia.org/resourc...",DBPEDIA_ENT
...,...,...,...,...
30100,PhillySportsTk,RT @GeorgePapa19: The average savings account ...,"[savings account, DBPEDIA_ENT, http://dbpedia....",DBPEDIA_ENT
30100,PhillySportsTk,RT @GeorgePapa19: The average savings account ...,"[inflation, DBPEDIA_ENT, http://dbpedia.org/re...",DBPEDIA_ENT
30101,PhillySportsTk,@Yellowstone Go woke go broke! Done with y’all!,"[Go, DBPEDIA_ENT, http://dbpedia.org/resource/...",DBPEDIA_ENT
30102,PhillySportsTk,RT @ScotsFyre: @DianaNTaylor @elonmusk @SenWar...,[],


In [54]:
db_entities['ent_type'].value_counts().reset_index()[:10]

Unnamed: 0,index,ent_type
0,DBPEDIA_ENT,419892
1,ORG,53196
2,,29886
3,PERSON,21503
4,DATE,6967
5,GPE,6149
6,NORP,2210
7,ORDINAL,1963
8,TIME,1171
9,WORK_OF_ART,767


In [55]:
db_ents_with_id = db_entities[db_entities['ent_type'] == 'DBPEDIA_ENT']
db_ents_with_id['wikidata'] = db_ents_with_id.apply(lambda x: [item for item in x['db_ents_list'][3:] if 'Wikidata' in item], axis=1)
db_ents_with_id['dbpedia'] = db_ents_with_id.apply(lambda x: [item for item in x['db_ents_list'][3:] if 'DBpedia' in item], axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [56]:
db_ents_with_id_dbp = db_ents_with_id.explode('dbpedia')
db_ents_with_id['dbpedia']

0              [DBpedia:Agent, DBpedia:FictionalCharacter]
0              [DBpedia:Agent, DBpedia:FictionalCharacter]
0        [DBpedia:Region, DBpedia:PopulatedPlace, DBped...
0              [DBpedia:Agent, DBpedia:FictionalCharacter]
1              [DBpedia:Agent, DBpedia:FictionalCharacter]
                               ...                        
30098                                                   []
30098                      [DBpedia:Agent, DBpedia:Person]
30100                                                   []
30100                                                   []
30101                     [DBpedia:Activity, DBpedia:Game]
Name: dbpedia, Length: 419892, dtype: object

In [57]:
dbr_counts = db_ents_with_id[db_ents_with_id['dbpedia'].apply(len) == 0]['db_ents_list'].apply(lambda x: x[2] if len(x) > 2 else '').value_counts().reset_index()
dbr_counts

Unnamed: 0,index,db_ents_list
0,http://dbpedia.org/resource/White_supremacy,8774
1,http://dbpedia.org/resource/VAX,6709
2,http://dbpedia.org/resource/Hurricane_Floyd,5324
3,http://dbpedia.org/resource/Self-defense,3600
4,http://dbpedia.org/resource/Kenosha_Comets,3399
...,...,...
8778,http://dbpedia.org/resource/Ferguson_unrest,1
8779,http://dbpedia.org/resource/Helter_skelter_(ride),1
8780,http://dbpedia.org/resource/Jean_Marie_Bosser,1
8781,http://dbpedia.org/resource/Liberal_conservatism,1


In [58]:
dbr_counts = dbr_counts.rename(columns={'index':'uri','db_ents_list':'count'})
dbr_counts.to_csv('../data/dbpedia_resources.csv', index=False)

### Entity Cooccurrences

In [59]:
from itertools import product
entity_counts = db_ents_with_id['dbpedia'].apply(lambda x: list(product(x, x))).explode().value_counts().reset_index()

In [60]:
entity_counts

Unnamed: 0,index,dbpedia
0,"(DBpedia:Agent, DBpedia:Agent)",197425
1,"(DBpedia:Agent, DBpedia:FictionalCharacter)",121298
2,"(DBpedia:FictionalCharacter, DBpedia:Fictional...",121298
3,"(DBpedia:FictionalCharacter, DBpedia:Agent)",121298
4,"(DBpedia:Person, DBpedia:Agent)",47391
...,...,...
1772,"(DBpedia:Poem, DBpedia:WrittenWork)",1
1773,"(DBpedia:EducationalInstitution, DBpedia:Library)",1
1774,"(DBpedia:SnookerChamp, DBpedia:SnookerPlayer)",1
1775,"(DBpedia:Place, DBpedia:LaunchPad)",1


In [61]:
unique_counts = entity_counts[entity_counts['index'].apply(lambda x: x[0] == x[1])]
unique_counts

Unnamed: 0,index,dbpedia
0,"(DBpedia:Agent, DBpedia:Agent)",197425
2,"(DBpedia:FictionalCharacter, DBpedia:Fictional...",121298
5,"(DBpedia:Person, DBpedia:Person)",47391
7,"(DBpedia:Place, DBpedia:Place)",35178
8,"(DBpedia:Location, DBpedia:Location)",35178
...,...,...
1747,"(DBpedia:Fungus, DBpedia:Fungus)",1
1751,"(DBpedia:HorseRider, DBpedia:HorseRider)",1
1752,"(DBpedia:SnookerPlayer, DBpedia:SnookerPlayer)",1
1756,"(DBpedia:SiteOfSpecialScientificInterest, DBpe...",1


In [62]:
db_ents_with_id['dbpedia_top'] = db_ents_with_id['dbpedia'].apply(lambda x: x[0] if len(x) > 0 else '')
db_ents_with_id['dbpedia_top'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


                                 153374
DBpedia:Agent                    140061
DBpedia:Person                    29847
DBpedia:Organisation              21597
DBpedia:Work                      17188
DBpedia:Region                    14910
DBpedia:PopulatedPlace            13179
DBpedia:Settlement                 4447
DBpedia:Disease                    2616
DBpedia:WrittenWork                2540
DBpedia:EthnicGroup                2265
DBpedia:FictionalCharacter         2130
DBpedia:TimeInterval               1931
DBpedia:Place                      1920
DBpedia:Language                   1670
DBpedia:Activity                   1627
DBpedia:SportsLeague               1027
DBpedia:SportsClub                 1007
DBpedia:Food                        873
DBpedia:SportsTeam                  849
DBpedia:RacingDriver                678
DBpedia:Stream                      607
DBpedia:TopicalConcept              550
DBpedia:AnatomicalStructure         486
DBpedia:Device                      451


In [63]:
db_ents_with_id_dbp['dbpedia'].value_counts()[:40]

DBpedia:Agent                     197425
DBpedia:FictionalCharacter        121298
DBpedia:Person                     47391
DBpedia:Location                   35178
DBpedia:Place                      35178
DBpedia:PopulatedPlace             32536
DBpedia:Organisation               28736
DBpedia:Work                       19728
DBpedia:Politician                 15690
DBpedia:AdministrativeRegion       14910
DBpedia:Region                     14910
DBpedia:Country                     9411
DBpedia:Broadcaster                 8666
DBpedia:Athlete                     8633
DBpedia:Settlement                  7727
DBpedia:BasketballPlayer            6380
DBpedia:TelevisionStation           6062
DBpedia:Website                     5797
DBpedia:MusicalWork                 5194
DBpedia:PoliticalParty              4172
DBpedia:Company                     3676
DBpedia:Song                        2740
DBpedia:Disease                     2616
DBpedia:WrittenWork                 2547
DBpedia:Group   

In [64]:
db_entities[db_entities['ent_type'] == 'DBPEDIA_ENT'].iloc[39]['db_ents_list']

['Donald Trump Jr.',
 'DBPEDIA_ENT',
 'http://dbpedia.org/resource/Donald_Trump_Jr.',
 '1.0',
 'Http://xmlns.com/foaf/0.1/Person',
 'Wikidata:Q5',
 'Wikidata:Q24229398',
 'Wikidata:Q215627',
 'DUL:NaturalPerson',
 'DUL:Agent',
 'Schema:Person',
 'DBpedia:Agent',
 'DBpedia:Person']