In [1]:
import pandas as pd
import numpy as np
from nltk.stem import SnowballStemmer

In [2]:
with open('data/database/tweets.csv', 'r') as file:
    tweets_data = pd.read_csv(file)
    
with open('data/database/entities.csv', 'r') as file:
    hashtags_data = pd.read_csv(file)
    
with open('data/database/users.csv', 'r') as file:
    users_data = pd.read_csv(file)
    
with open('data/database/words.csv', 'r') as file:
    words_data = pd.read_csv(file)
del words_data['xpos']

In [3]:
#tweets and hashtags data cleaning

#only tweets in english are kept 
tweets_data = tweets_data[tweets_data.lang == 'en']
#lowercase hashtags
hashtags_data.loc[:,'text'] = hashtags_data.text.str.lower()

In [4]:
#words_data cleaning

#relevant words selection
mask_w = ['ADJ', 'ADV', 'NOUN', 'VERB']
words_data = words_data.loc[words_data.upos.isin(mask_w),:]

#words stemming 
stemmer = SnowballStemmer("english")
words_data.loc[:,'text'] = words_data.text.apply(stemmer.stem)

#lowercase words
words_data.loc[:,'text'] = words_data.text.str.lower()

#deletion of concepts already stored as hashtags
hashtags = set(hashtags_data.text.values)
words_data = words_data.loc[~words_data.text.isin(hashtags),:]

In [None]:
'''
First option: join between tweets_data and hashtags_data
-tweets possibly repeated in the table 
-unique row for each concept (now: just hashtags)
-keys: {index}
-in general, concepts can refer to different tweets and viceversa
-WORDS MISSING: sentence field must be fixed to allow a join (no key to build a relationship)
'''
#join
joined_data = tweets_data.join(hashtags_data.set_index('tweet_id_str'), on='id_str', how = 'left',lsuffix='_left', rsuffix='_right')
#only useful columns are kept
joined_data = joined_data.loc[:, ('id_str','created_at', 'text_left', 'text_right', 'truncated', 'retweet_count', 'type', 'in_reply_to_status_id_str', 'in_reply_to_user_id_str')]
joined_data = joined_data.rename(columns= {'text_left' : 'text_tweet', 'text_right' : 'text_concept'}).reset_index().iloc[:,1:]
joined_data

In [13]:
'''
Second option: concatenation of tweets_data and concepts_data
-unique row for each tweet/concept
-keys: {index, text}
-in general, hashtags refer to different tweets: id_str could contain more then one id per concept
-
'''
#concatenation of tweets and hashtags
concat_data = pd.concat([tweets_data, hashtags_data], sort = False)
#identify tweets' rows (in the type attribute)
concat_data.type.fillna('tweet', inplace=True)
#id_str and tweet_id_str columns are merged
concat_data.loc[concat_data.id_str.isna(),'id_str'] = concat_data.loc[concat_data.tweet_id_str.notna(),'tweet_id_str']

#concatenation of concat and words
concat_data = pd.concat([concat_data, words_data], sort = False)
#identify words' rows (in the type attribute)
concat_data.type.fillna('words', inplace=True)
#WARNING: id_str field still has to be filled (no key to build a relationship)

#only useful columns are kept
concat_data = concat_data.loc[:, ('id_str','created_at', 'text', 'truncated', 'retweet_count', 'type', 'in_reply_to_status_id_str', 
                          'in_reply_to_user_id_str')].reset_index().iloc[:,1:]
concat_data.head()

Unnamed: 0,id_str,created_at,text,truncated,retweet_count,type,in_reply_to_status_id_str,in_reply_to_user_id_str
0,1.207807e+18,Thu Dec 19 23:35:02 +0000 2019,These events around the world prompted importa...,True,53.0,tweet,,
1,1.207780e+18,Thu Dec 19 21:48:10 +0000 2019,RT @wef: Women's pay equality has slipped back...,False,0.0,tweet,,
2,1.207778e+18,Thu Dec 19 21:42:14 +0000 2019,Happy birthday @Alyssa_Milano!🎈 Thank you for ...,False,191.0,tweet,,
3,1.207761e+18,Thu Dec 19 20:35:04 +0000 2019,“This is my charge to everyone:\n\nWe have to ...,True,88.0,tweet,,
4,1.207733e+18,Thu Dec 19 18:41:46 +0000 2019,8 Posters At The CAA Protests In Delhi That Sc...,False,7.0,tweet,,
5,1.207731e+18,Thu Dec 19 18:35:32 +0000 2019,Victim Blaming In The Garb of Women Empowermen...,False,6.0,tweet,,
6,1.207716e+18,Thu Dec 19 17:35:04 +0000 2019,🧥\n👖\n💃\n👙\n👷‍♀️\n👗\n👩‍🎤\n🩱\n👩‍🎓\n🩲\n🤸‍♀️\n👚\n...,True,2447.0,tweet,,
7,1.207715e+18,Thu Dec 19 17:30:00 +0000 2019,Politics and law making institutions have alwa...,True,3.0,tweet,,
8,1.207700e+18,Thu Dec 19 16:30:00 +0000 2019,"""Every domain of life, be it society, polity, ...",True,7.0,tweet,,
9,1.207685e+18,Thu Dec 19 15:30:00 +0000 2019,Workplaces are notorious grounds for explicit ...,True,0.0,tweet,,


In [None]:
joined_data.to_csv('data/database/joined_data.csv')
concat_data.to_csv('data/database/concat_data.csv')