In [1]:
import pandas as pd
import numpy as np

In [2]:
with open('data/database/tweets.csv', 'r') as file:
    tweets_data = pd.read_csv(file)
    
with open('data/database/entities.csv', 'r') as file:
    concepts_data = pd.read_csv(file)
    
with open('data/database/users.csv', 'r') as file:
    users_data = pd.read_csv(file)

In [3]:
#only tweets in english are kept 
tweets_data = tweets_data[tweets_data.lang == 'en']

In [4]:
'''
First option: join between tweets_data and concepts_data
-tweets possibly repeated in the table 
-unique row for each concept (now: just hashtags)
-keys: {index}
-in general, concepts can refer to different tweets and viceversa
'''
#join
joined_data = tweets_data.join(concepts_data.set_index('tweet_id_str'), on='id_str', how = 'left',lsuffix='_left', rsuffix='_right')
#only useful columns are kept
joined_data = joined_data.loc[:, ('id_str','created_at', 'text_left', 'text_right', 'truncated', 'retweet_count', 'type', 'in_reply_to_status_id_str', 'in_reply_to_user_id_str')]
joined_data = joined_data.rename(columns= {'text_left' : 'text_tweet', 'text_right' : 'text_concept'}).reset_index().iloc[:,1:]
joined_data

Unnamed: 0,id_str,created_at,text_tweet,text_concept,truncated,retweet_count,type,in_reply_to_status_id_str,in_reply_to_user_id_str
0,1207806578650468352,Thu Dec 19 23:35:02 +0000 2019,These events around the world prompted importa...,ThisHappened,True,53,hashtag,,
1,1207779681627447296,Thu Dec 19 21:48:10 +0000 2019,RT @wef: Women's pay equality has slipped back...,,False,0,,,
2,1207778191173201920,Thu Dec 19 21:42:14 +0000 2019,Happy birthday @Alyssa_Milano!🎈 Thank you for ...,,False,191,,,
3,1207761285045260294,Thu Dec 19 20:35:04 +0000 2019,“This is my charge to everyone:\n\nWe have to ...,WomenInSport,True,88,hashtag,,
4,1207732772451889152,Thu Dec 19 18:41:46 +0000 2019,8 Posters At The CAA Protests In Delhi That Sc...,,False,7,,,
5,1207731206835576833,Thu Dec 19 18:35:32 +0000 2019,Victim Blaming In The Garb of Women Empowermen...,,False,6,,,
6,1207715986075467778,Thu Dec 19 17:35:04 +0000 2019,🧥\n👖\n💃\n👙\n👷‍♀️\n👗\n👩‍🎤\n🩱\n👩‍🎓\n🩲\n🤸‍♀️\n👚\n...,,True,2447,,,
7,1207714713326510081,Thu Dec 19 17:30:00 +0000 2019,Politics and law making institutions have alwa...,LokSabha,True,3,hashtag,,
8,1207714713326510081,Thu Dec 19 17:30:00 +0000 2019,Politics and law making institutions have alwa...,WomenInPolitics,True,3,hashtag,,
9,1207699612359749632,Thu Dec 19 16:30:00 +0000 2019,"""Every domain of life, be it society, polity, ...",IndiaRejectsCAA,True,7,hashtag,,


In [5]:
'''
Second option: concatenation of tweets_data and concepts_data
-unique row for each tweet/concept
-keys: {index, text}
-in general, hashtags refer to different tweets: id_str could contain more then one id per concept
'''
#concatenation
concat_data = pd.concat([tweets_data, concepts_data], sort = False)
#identify tweets' rows (in the type attribute)
concat_data.type.fillna('tweet', inplace=True)
#id_str and tweet_id_str columns are merged
concat_data.loc[concat_data.id_str.isna(),'id_str'] = concat_data.loc[concat_data.tweet_id_str.notna(),'tweet_id_str']
#only useful columns are kept
concat_data = concat_data.loc[:, ('id_str','created_at', 'text', 'truncated', 'retweet_count', 'type', 'in_reply_to_status_id_str', 
                          'in_reply_to_user_id_str')].reset_index().iloc[:,1:]
concat_data.head()

Unnamed: 0,id_str,created_at,text,truncated,retweet_count,type,in_reply_to_status_id_str,in_reply_to_user_id_str
0,1.207807e+18,Thu Dec 19 23:35:02 +0000 2019,These events around the world prompted importa...,True,53.0,tweet,,
1,1.20778e+18,Thu Dec 19 21:48:10 +0000 2019,RT @wef: Women's pay equality has slipped back...,False,0.0,tweet,,
2,1.207778e+18,Thu Dec 19 21:42:14 +0000 2019,Happy birthday @Alyssa_Milano!🎈 Thank you for ...,False,191.0,tweet,,
3,1.207761e+18,Thu Dec 19 20:35:04 +0000 2019,“This is my charge to everyone:\n\nWe have to ...,True,88.0,tweet,,
4,1.207733e+18,Thu Dec 19 18:41:46 +0000 2019,8 Posters At The CAA Protests In Delhi That Sc...,False,7.0,tweet,,


In [6]:
joined_data.to_csv('data/database/joined_data.csv')
concat_data.to_csv('data/database/concat_data.csv')