# Sentiment based on tweet frequency

There are 407,489 tweets, which need to be split up into smaller groups.

In [1]:
import os # creating directories
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# read the cleaned tweet corpus and discard its "Unnamed: 0" column in one step
df = pd.read_csv(
    "../datain/topic_modelling/cleaned_tweets_largest_community.csv"
).drop(columns="Unnamed: 0")
df

Unnamed: 0,created_at,id,cleaned_tweet
0,2021-05-31 23:59:42+00:00,1399515966774530048,good project
1,2021-05-31 23:59:40+00:00,1399515957362450432,great project reset
2,2021-05-31 23:59:35+00:00,1399515936093204480,beautiful project congratulations whole team h...
3,2021-05-31 23:58:47+00:00,1399515734007447552,participating cryptoultraman airdrop round
4,2021-05-31 23:58:44+00:00,1399515723274280960,nice find project project great invite many pe...
...,...,...,...
407484,2021-02-01 11:31:45+00:00,1356203583193063424,dena great
407485,2021-02-01 11:05:55+00:00,1356197080272752640,join gays dena
407486,2021-02-01 10:49:53+00:00,1356193045817872384,great project
407487,2021-02-01 09:29:19+00:00,1356172769424244736,rates determined higher chance worse low chanc...


In [4]:
# split df into groups of 7000
def split_df(df, size):
    """Split ``df`` into consecutive segments of at most ``size`` rows.

    Args:
        df: A DataFrame, or any object supporting ``len`` and slicing.
        size: Number of rows per segment.

    Returns:
        A list of slices of ``df`` in their original order. Every segment
        holds ``size`` rows except possibly the last one, which holds the
        remainder. An empty ``df`` gives an empty list.

    Raises:
        ZeroDivisionError: If ``size`` is 0.
    """
    total = len(df)
    r = total % size
    print(size, r)

    # number of full-size segments (the remainder segment is not counted)
    num_segments = total // size
    print(num_segments)

    # Stop at `total`: the previous bound `total - r + 1` produced one extra,
    # empty segment whenever `total` was an exact multiple of `size`.
    segments = [df[start:start + size] for start in range(0, total, size)]
    return segments
        
# run the splitter on the loaded corpus with 7000 rows per segment;
# the returned list of segments is displayed as the cell output
split_df(df, 7000)

7000 1489
58


[                     created_at                   id  \
 0     2021-05-31 23:59:42+00:00  1399515966774530048   
 1     2021-05-31 23:59:40+00:00  1399515957362450432   
 2     2021-05-31 23:59:35+00:00  1399515936093204480   
 3     2021-05-31 23:58:47+00:00  1399515734007447552   
 4     2021-05-31 23:58:44+00:00  1399515723274280960   
 ...                         ...                  ...   
 6995  2021-05-31 09:42:02+00:00  1399300125390827520   
 6996  2021-05-31 09:42:01+00:00  1399300120374513664   
 6997  2021-05-31 09:41:55+00:00  1399300097930760192   
 6998  2021-05-31 09:41:48+00:00  1399300065613680640   
 6999  2021-05-31 09:41:46+00:00  1399300059145981952   
 
                                           cleaned_tweet  
 0                                          good project  
 1                                   great project reset  
 2     beautiful project congratulations whole team h...  
 3            participating cryptoultraman airdrop round  
 4     nice find pr

In [5]:
# split df into n groups of approximately equal length (sizes differ by at most one row)
# https://stackoverflow.com/questions/2130016/splitting-a-list-into-n-parts-of-approximately-equal-length
def split(a, n):
    """Lazily cut ``a`` into ``n`` consecutive slices of near-equal length.

    The first ``len(a) % n`` slices carry one extra element, so slice
    lengths never differ by more than one. Returns a generator; wrap it in
    ``list`` to materialise the pieces.
    """
    base_len, leftover = divmod(len(a), n)

    def bounds(idx):
        # every earlier slice that received an extra element shifts the start by one
        start = idx * base_len + min(idx, leftover)
        stop = start + base_len + (1 if idx < leftover else 0)
        return slice(start, stop)

    return (a[bounds(idx)] for idx in range(n))

In [9]:
# materialise the 58 near-equal chunks produced by split()
sub_dfs = [chunk for chunk in split(df, 58)]
sub_dfs

[                     created_at                   id  \
 0     2021-05-31 23:59:42+00:00  1399515966774530048   
 1     2021-05-31 23:59:40+00:00  1399515957362450432   
 2     2021-05-31 23:59:35+00:00  1399515936093204480   
 3     2021-05-31 23:58:47+00:00  1399515734007447552   
 4     2021-05-31 23:58:44+00:00  1399515723274280960   
 ...                         ...                  ...   
 7021  2021-05-31 09:39:51+00:00  1399299576180342784   
 7022  2021-05-31 09:39:45+00:00  1399299551928807424   
 7023  2021-05-31 09:39:41+00:00  1399299534434435072   
 7024  2021-05-31 09:39:40+00:00  1399299531053813760   
 7025  2021-05-31 09:39:39+00:00  1399299527186677760   
 
                                           cleaned_tweet  
 0                                          good project  
 1                                   great project reset  
 2     beautiful project congratulations whole team h...  
 3            participating cryptoultraman airdrop round  
 4     nice find pr

In [12]:
# NOTE: this cell consumes `created_at` and rebinds `df`, so it can only run once
# per load; re-run the loading cell first if it has to be executed again.

# split created_at into date and time columns
#https://intellipaat.com/community/13909/python-how-can-i-split-a-column-with-both-date-and-time-e-g-2019-07-02-00-12-32-utc-into-two-separate-columns
df = (
    # parse once; nulls and strings that are not timestamps become NaT instead of raising
    df.assign(created_at=pd.to_datetime(df['created_at'], errors='coerce'))
      # keep only genuine 2021 timestamps (NaT never compares equal, so it is dropped here);
      # this replaces the substring test for "2021", which also let junk strings through
      .loc[lambda frame: frame['created_at'].dt.year == 2021]
      # assign() builds a new frame, avoiding writes into a filtered slice
      .assign(
          date=lambda frame: frame['created_at'].dt.date,
          time=lambda frame: frame['created_at'].dt.time,
      )
      .drop(columns='created_at')
)

df

Unnamed: 0,id,cleaned_tweet,date,time
0,1399515966774530048,good project,2021-05-31,23:59:42
1,1399515957362450432,great project reset,2021-05-31,23:59:40
2,1399515936093204480,beautiful project congratulations whole team h...,2021-05-31,23:59:35
3,1399515734007447552,participating cryptoultraman airdrop round,2021-05-31,23:58:47
4,1399515723274280960,nice find project project great invite many pe...,2021-05-31,23:58:44
...,...,...,...,...
407484,1356203583193063424,dena great,2021-02-01,11:31:45
407485,1356197080272752640,join gays dena,2021-02-01,11:05:55
407486,1356193045817872384,great project,2021-02-01,10:49:53
407487,1356172769424244736,rates determined higher chance worse low chanc...,2021-02-01,09:29:19


In [7]:
# for every calendar day, count the non-null entries in each remaining column
dates = df.groupby(by='date').count()
dates

Unnamed: 0_level_0,created_at,id,cleaned_tweet,time
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2021-02-01,8,8,8,8
2021-02-02,9,9,9,9
2021-02-03,13,13,13,13
2021-02-04,3,3,3,3
2021-02-05,7,7,7,7
...,...,...,...,...
2021-05-27,6417,6417,6404,6417
2021-05-28,9037,9037,9028,9037
2021-05-29,7751,7751,7737,7751
2021-05-30,9150,9150,9122,9150
