In [12]:
import pandas as pd
import re
import string

# progress bar
from tqdm import tqdm
tqdm.pandas()

TOPIC_MODELLING = 0
SENTIMENT_ANALYSIS = 1

# file paths
TWEET_CORPUS_DATA_IN = "../datain/clean/largest_community_tweets.jsonl"
MAY_1_DATA_OUT = "../dataout/general/may_1.csv"
MAY_5_DATA_OUT = "../dataout/general/may_5.csv"
MAY_22_DATA_OUT = "../dataout/general/may_22.csv"
MAY_25_DATA_OUT = "../dataout/general/may_25.csv"
MAY_31_DATA_OUT = "../dataout/general/may_31.csv"


In [3]:
def load_data():
    '''
        Import corpus data in json format.
        Filter to have only english tweets and remove retweets.

        Returns:
            imported english, non-retweeted data
    '''
    #import the data
    filename = TWEET_CORPUS_DATA_IN
    print("\tLoading json data...")
    print("\t\tThis can take a while (about ~10 minutes on current largest community data)")
    print("\t\tGo make yourself a cup of hot thing ;)")
    data = pd.read_json(filename, lines=True)

    # clean data: remove retweets and select only english tweets
    print("\tRemoving reweets and non-english tweets...")
    data = data[~data["text"].progress_apply(lambda x: x.startswith("RT"))]
    data = data[data["lang"].progress_apply(lambda x: x == "en")]
    data = data.rename(columns={'text': 'corpus'})
    print()

    return data



In [4]:
def clean_tweet(tweet, remove_stop):
    '''
        Cleans tweet from hashtags, mentions, special characters, html entities, numbers,
        links, and stop words. Converts text to lower case.

        Args:
            tweet: a single tweet (String)
            remove_stop: True if stopwords should be removed and False if they should not be removed.
        Returns:
            tweet: cleaned tweet (String)
    '''
    tweet = str.lower(tweet)
    tweet = ' '.join(re.sub("(@[A-Za-z0-9_]+)|(#[A-Za-z0-9_]+)", " ", tweet).split()) # remove mentions and hashtags
    tweet = re.sub("(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})([\/\w \.-]*)", "", tweet, flags=re.MULTILINE) # remove links
    tweet = re.sub("0x([\da-z\.-]+)", "", tweet, flags=re.MULTILINE) # remove addresses/pointers
    tweet = re.sub('\&\w+', "", tweet) # remove html entities (example &amp)

    return tweet

In [5]:
df = load_data()
df

	Loading json data...
		This can take a while (about ~10 minutes on current largest community data)
		Go make yourself a cup of hot thing ;)
	Removing reweets and non-english tweets...


100%|██████████| 459479/459479 [00:00<00:00, 534160.39it/s]
100%|██████████| 459370/459370 [00:00<00:00, 622738.93it/s]





Unnamed: 0,lang,reply_settings,public_metrics,corpus,possibly_sensitive,entities,created_at,author_id,conversation_id,referenced_tweets,id,context_annotations,source,in_reply_to_user_id,attachments,geo,withheld
0,en,everyone,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",Good project \n#BSC @DinoSour #NFT\n@yamin_rah...,False,"{'hashtags': [{'start': 14, 'end': 18, 'tag': ...",2021-05-31 23:59:42+00:00,1397168952908779520,1399515966774530048,"[{'type': 'quoted', 'id': '1398339274953564163'}]",1399515966774530048,"[{'domain': {'id': '45', 'name': 'Brand Vertic...",Twitter Web App,,,,
1,en,everyone,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",@AromaFinance Great project!\n\n$reset #bsc #B...,False,"{'cashtags': [{'start': 30, 'end': 36, 'tag': ...",2021-05-31 23:59:40+00:00,1395244202808680448,1399313027065810944,"[{'type': 'replied_to', 'id': '139931302706581...",1399515957362450432,"[{'domain': {'id': '45', 'name': 'Brand Vertic...",Twitter Web App,1.390241e+18,,,
2,en,everyone,"{'retweet_count': 1, 'reply_count': 0, 'like_c...",Such a beautiful project and congratulations t...,False,"{'hashtags': [{'start': 198, 'end': 213, 'tag'...",2021-05-31 23:59:35+00:00,1397350208489463808,1399515936093204480,"[{'type': 'quoted', 'id': '1399372631267287041'}]",1399515936093204480,"[{'domain': {'id': '45', 'name': 'Brand Vertic...",Twitter for Android,,,,
4,en,everyone,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",I am participating in the CryptoUltraman NFT a...,False,"{'hashtags': [{'start': 60, 'end': 78, 'tag': ...",2021-05-31 23:58:47+00:00,2718560166,1399515734007447552,"[{'type': 'quoted', 'id': '1398277372651081732'}]",1399515734007447552,"[{'domain': {'id': '45', 'name': 'Brand Vertic...",Twitter for Android,,,,
5,en,everyone,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",@apenftorg @CoinMarketCap Nice to find this pr...,False,"{'cashtags': [{'start': 216, 'end': 220, 'tag'...",2021-05-31 23:58:44+00:00,1383794353760391168,1397848170739077120,"[{'type': 'replied_to', 'id': '139784817073907...",1399515723274280960,"[{'domain': {'id': '45', 'name': 'Brand Vertic...",Twitter Web App,1.392094e+18,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
459473,en,everyone,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",$DENA #NFT #Defi #YieldFarming\nGreat \n@lensa...,False,"{'hashtags': [{'start': 6, 'end': 10, 'tag': '...",2021-02-01 11:31:45+00:00,1322618452108931072,1356203583193063424,"[{'type': 'quoted', 'id': '1349729014944972800'}]",1356203583193063424,"[{'domain': {'id': '45', 'name': 'Brand Vertic...",Twitter for Android,,,,
459474,en,everyone,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",@SMATOOS_now @bagasadys @airdrophunter78 @jher...,False,"{'mentions': [{'start': 0, 'end': 12, 'usernam...",2021-02-01 11:05:55+00:00,343817344,1349729014944972800,"[{'type': 'replied_to', 'id': '134972901494497...",1356197080272752640,,Twitter for Android,3.339523e+08,,,
459475,en,everyone,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",@SMATOOS_now @bagasadys @airdrophunter78 @jher...,False,"{'mentions': [{'start': 0, 'end': 12, 'usernam...",2021-02-01 10:49:53+00:00,1099564686788374528,1349729014944972800,"[{'type': 'replied_to', 'id': '134972901494497...",1356193045817872384,,Twitter for Android,3.339523e+08,,,
459476,en,everyone,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",@ZthCrypto @AlienWorlds How are NFT rates dete...,False,"{'mentions': [{'start': 0, 'end': 10, 'usernam...",2021-02-01 09:29:19+00:00,1164164048046514176,1356157119909642240,"[{'type': 'replied_to', 'id': '135615711990964...",1356172769424244736,,Twitter Web App,1.138926e+18,,,


In [8]:
df['date'] = df['created_at'].dt.date
df['time'] = df['created_at'].dt.time

In [9]:
# group tweets by date and count number of entries per day
dates = df.groupby('date').count()

Unnamed: 0_level_0,lang,reply_settings,public_metrics,corpus,possibly_sensitive,entities,created_at,author_id,conversation_id,referenced_tweets,id,context_annotations,source,in_reply_to_user_id,attachments,geo,withheld,time
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2021-02-01,8,8,8,8,8,8,8,8,8,8,8,1,8,7,0,0,0,8
2021-02-02,9,9,9,9,9,9,9,9,9,6,9,7,9,4,0,0,0,9
2021-02-03,13,13,13,13,13,13,13,13,13,12,13,5,13,8,2,0,0,13
2021-02-04,3,3,3,3,3,3,3,3,3,3,3,1,3,1,0,0,0,3
2021-02-05,7,7,7,7,7,7,7,7,7,6,7,1,7,5,2,0,0,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-05-27,6417,6417,6417,6417,6417,6417,6417,6417,6417,5538,6417,5559,6417,3311,59,25,0,6417
2021-05-28,9037,9037,9037,9037,9037,9037,9037,9037,9037,8470,9037,8497,9037,3683,54,36,0,9037
2021-05-29,7751,7751,7751,7751,7751,7751,7751,7751,7751,6346,7751,7199,7751,3019,92,17,0,7751
2021-05-30,9150,9150,9150,9150,9150,9150,9150,9150,9150,7871,9150,8459,9150,4352,83,31,0,9150


In [10]:
dates[dates['created_at']>15000]

Unnamed: 0_level_0,lang,reply_settings,public_metrics,corpus,possibly_sensitive,entities,created_at,author_id,conversation_id,referenced_tweets,id,context_annotations,source,in_reply_to_user_id,attachments,geo,withheld,time
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2021-05-01,17097,17097,17097,17097,17097,17097,17097,17097,17097,3319,17097,16712,17097,2666,53,71,0,17097
2021-05-05,25580,25580,25580,25580,25580,25580,25580,25580,25580,1486,25580,25506,25580,18271,100,120,0,25580
2021-05-22,16535,16535,16535,16535,16535,16535,16535,16535,16535,15700,16535,16314,16535,7960,52,70,0,16535
2021-05-25,17292,17292,17292,17292,17292,17292,17292,17292,17292,11187,17292,15274,17292,8533,92,85,0,17292
2021-05-31,17893,17893,17893,17893,17893,17893,17893,17893,17893,17091,17893,17355,17893,7939,64,60,0,17893


In [11]:
may_1 = df[df['date'] == pd.to_datetime("2021-05-01")]
may_1
may_5 = df[df['date'] == pd.to_datetime("2021-05-05")]
may_5
may_22 = df[df['date'] == pd.to_datetime("2021-05-22")]
may_22
may_25 = df[df['date'] == pd.to_datetime("2021-05-25")]
may_25
may_31 = df[df['date'] == pd.to_datetime("2021-05-31")]
may_31

  result = libops.scalar_compare(x.ravel(), y, op)


Unnamed: 0,lang,reply_settings,public_metrics,corpus,possibly_sensitive,entities,created_at,author_id,conversation_id,referenced_tweets,id,context_annotations,source,in_reply_to_user_id,attachments,geo,withheld,date,time
0,en,everyone,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",Good project \n#BSC @DinoSour #NFT\n@yamin_rah...,False,"{'hashtags': [{'start': 14, 'end': 18, 'tag': ...",2021-05-31 23:59:42+00:00,1397168952908779520,1399515966774530048,"[{'type': 'quoted', 'id': '1398339274953564163'}]",1399515966774530048,"[{'domain': {'id': '45', 'name': 'Brand Vertic...",Twitter Web App,,,,,2021-05-31,23:59:42
1,en,everyone,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",@AromaFinance Great project!\n\n$reset #bsc #B...,False,"{'cashtags': [{'start': 30, 'end': 36, 'tag': ...",2021-05-31 23:59:40+00:00,1395244202808680448,1399313027065810944,"[{'type': 'replied_to', 'id': '139931302706581...",1399515957362450432,"[{'domain': {'id': '45', 'name': 'Brand Vertic...",Twitter Web App,1.390241e+18,,,,2021-05-31,23:59:40
2,en,everyone,"{'retweet_count': 1, 'reply_count': 0, 'like_c...",Such a beautiful project and congratulations t...,False,"{'hashtags': [{'start': 198, 'end': 213, 'tag'...",2021-05-31 23:59:35+00:00,1397350208489463808,1399515936093204480,"[{'type': 'quoted', 'id': '1399372631267287041'}]",1399515936093204480,"[{'domain': {'id': '45', 'name': 'Brand Vertic...",Twitter for Android,,,,,2021-05-31,23:59:35
4,en,everyone,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",I am participating in the CryptoUltraman NFT a...,False,"{'hashtags': [{'start': 60, 'end': 78, 'tag': ...",2021-05-31 23:58:47+00:00,2718560166,1399515734007447552,"[{'type': 'quoted', 'id': '1398277372651081732'}]",1399515734007447552,"[{'domain': {'id': '45', 'name': 'Brand Vertic...",Twitter for Android,,,,,2021-05-31,23:58:47
5,en,everyone,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",@apenftorg @CoinMarketCap Nice to find this pr...,False,"{'cashtags': [{'start': 216, 'end': 220, 'tag'...",2021-05-31 23:58:44+00:00,1383794353760391168,1397848170739077120,"[{'type': 'replied_to', 'id': '139784817073907...",1399515723274280960,"[{'domain': {'id': '45', 'name': 'Brand Vertic...",Twitter Web App,1.392094e+18,,,,2021-05-31,23:58:44
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22751,en,everyone,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",Thanks for shared this wonderful opportunity. ...,False,"{'urls': [{'start': 196, 'end': 219, 'url': 'h...",2021-05-31 00:01:54+00:00,1022339239839641600,1399154129973387264,"[{'type': 'quoted', 'id': '1398339274953564163'}]",1399154129973387264,"[{'domain': {'id': '45', 'name': 'Brand Vertic...",Twitter for Android,,,,,2021-05-31,00:01:54
22752,en,everyone,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",@Enzo__NZO Click the link to receive 1213 $NZO...,False,"{'urls': [{'start': 153, 'end': 176, 'url': 'h...",2021-05-31 00:01:30+00:00,1264387223329345536,1399154029071147008,,1399154029071147008,"[{'domain': {'id': '45', 'name': 'Brand Vertic...",Twitter for Android,1.386201e+18,,,,2021-05-31,00:01:30
22755,en,everyone,"{'retweet_count': 1, 'reply_count': 0, 'like_c...",@DinoSourFamily @BinanceChain Good and strong ...,False,"{'hashtags': [{'start': 87, 'end': 91, 'tag': ...",2021-05-31 00:01:13+00:00,791618831378681856,1398339274953564160,"[{'type': 'replied_to', 'id': '139833927495356...",1399153960083066880,"[{'domain': {'id': '45', 'name': 'Brand Vertic...",Twitter Web App,1.369325e+18,,,,2021-05-31,00:01:13
22758,en,everyone,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",@asifaslam0 \n@MDALAMI16 \n@saiful04420060 \n\...,False,"{'urls': [{'start': 77, 'end': 100, 'url': 'ht...",2021-05-31 00:00:38+00:00,1245731791333289984,1399153812024086528,"[{'type': 'quoted', 'id': '1398339274953564163'}]",1399153812024086528,"[{'domain': {'id': '45', 'name': 'Brand Vertic...",Twitter Web App,1.379746e+18,,,,2021-05-31,00:00:38


In [15]:
selected_columns = ["date", "id", "corpus", "public_metrics", "entities", "author_id", "conversation_id"] # output created_at, id, and cleaned_tweets to csv
may_1.to_csv(MAY_1_DATA_OUT, columns = selected_columns)
may_5.to_csv(MAY_5_DATA_OUT, columns = selected_columns)
may_22.to_csv(MAY_22_DATA_OUT, columns = selected_columns)
may_25.to_csv(MAY_25_DATA_OUT, columns = selected_columns)
may_31.to_csv(MAY_31_DATA_OUT, columns = selected_columns)

In [None]:
    '''
        Clean corpus for sentiment and topic modelling code.
    '''
    print("Cleaning corpus...")
    df = load_data()

    # cleaning for sentiment analysis (keep stop words)
    print("\tSentiment analysis cleaning...")
    remove_stop = False
    df['cleaned_tweet'] = df['corpus'].progress_apply(clean_tweet, remove_stop=remove_stop)

    print("\tWriting sentiment cleaned data to csv...")
    selected_columns = ["created_at", "id", "cleaned_tweet"] # output created_at, id, and cleaned_tweets to csv
    df.to_csv(SENTIMENT_DATA_OUT, columns = selected_columns)

    # cleaning for topic modelling (remove stop words)
    print("\tTopic modelling cleaning...")
    remove_stop = True
    df['cleaned_tweet'] = df['corpus'].progress_apply(clean_tweet, remove_stop=remove_stop)

    print("\tWriting topic modelling cleaned data to csv...")
    df.to_csv(FREQUENCY_DATA_OUT, columns = selected_columns) # frequency data needs dates
    selected_columns = ["id", "cleaned_tweet"] # BTM algorithm R script file format
    df.to_csv(BTM_DATA_OUT, columns = selected_columns, index=None)

    print("Finished cleaning corpus. The next steps will start in a few moments...")