In [1]:
import pandas as pd
import numpy as np

In [2]:
with open('data/tweets.csv', 'r') as file:
    tweets_data = pd.read_csv(file,index_col=0)
    
with open('data/entities.csv', 'r') as file:
    concepts_data = pd.read_csv(file,index_col=0)
    
with open('data/users.csv', 'r') as file:
    users_data = pd.read_csv(file,index_col=0)

In [3]:
#only tweets in english are kept 
tweets_data = tweets_data[tweets_data.lang == 'en']

In [4]:
'''
First option: join between tweets_data and concepts_data
-tweets possibly repeated in the table 
-unique row for each concept (now: just hashtags)
-keys: {index}
-in general, concepts can refer to different tweets and viceversa
'''
#join
joined_data = tweets_data.join(concepts_data.set_index('tweet_id_str'), on='id_str', how = 'left',lsuffix='_left', rsuffix='_right')
#only useful columns are kept
joined_data = joined_data.loc[:, ('id_str','created_at', 'text_left', 'text_right', 'truncated', 'retweet_count', 'type',
                                  'in_reply_to_status_id_str', 'in_reply_to_user_id_str')]
joined_data = joined_data.rename(columns= {'text_left' : 'text_tweet', 'text_right' : 'text_concept'}).reset_index()
joined_data.head()

Unnamed: 0,index,id_str,created_at,text_tweet,text_concept,truncated,retweet_count,type,in_reply_to_status_id_str,in_reply_to_user_id_str
0,0,1209764168741474304,Wed Dec 25 09:13:48 +0000 2019,When Santa hits you with that bestseller every...,,False,2,,,
1,2,1209753143518433281,Wed Dec 25 08:30:00 +0000 2019,Why is the #media quiet about #sexualviolence ...,media,False,1,hashtag,,
2,2,1209753143518433281,Wed Dec 25 08:30:00 +0000 2019,Why is the #media quiet about #sexualviolence ...,sexualviolence,False,1,hashtag,,
3,2,1209753143518433281,Wed Dec 25 08:30:00 +0000 2019,Why is the #media quiet about #sexualviolence ...,LGBTQIA,False,1,hashtag,,
4,3,1209738044246351873,Wed Dec 25 07:30:00 +0000 2019,"The future is #queer , and so was this year!\n...",queer,False,2,hashtag,,


In [5]:
'''
Second option: concatenation of tweets_data and concepts_data
-unique row for each tweet/concept
-keys: {index, text}
-in general, hashtags refer to different tweets: id_str could contain more then one id per concept
'''
#concatenation
concat_data = pd.concat([tweets_data, concepts_data], sort = False)
#identify tweets' rows (in the type attribute)
concat_data.type.fillna('tweet', inplace=True)
#id_str and tweet_id_str columns are merged
concat_data.loc[concat_data.id_str.isna(),'id_str'] = concat_data.loc[concat_data.tweet_id_str.notna(),'tweet_id_str']
#only useful columns are kept
concat_data = concat_data.loc[:, ('id_str','created_at', 'text', 'truncated', 'retweet_count', 'type', 'in_reply_to_status_id_str', 
                          'in_reply_to_user_id_str')].reset_index()
concat_data.head()

Unnamed: 0,index,id_str,created_at,text,truncated,retweet_count,type,in_reply_to_status_id_str,in_reply_to_user_id_str
0,0,1.209764e+18,Wed Dec 25 09:13:48 +0000 2019,When Santa hits you with that bestseller every...,False,2.0,tweet,,
1,2,1.209753e+18,Wed Dec 25 08:30:00 +0000 2019,Why is the #media quiet about #sexualviolence ...,False,1.0,tweet,,
2,3,1.209738e+18,Wed Dec 25 07:30:00 +0000 2019,"The future is #queer , and so was this year!\n...",False,2.0,tweet,,
3,4,1.209723e+18,Wed Dec 25 06:30:00 +0000 2019,#Manusmriti was unjust towards “Untouchables” ...,False,5.0,tweet,,
4,5,1.209693e+18,Wed Dec 25 04:30:00 +0000 2019,""" #Section144 is a piece of colonial-era legis...",False,5.0,tweet,,


In [6]:
joined_data.to_csv('data/database/joined_data.csv')
concat_data.to_csv('data/database/concat_data.csv')