In [None]:
'''
    Plot tweet frequency over time for a large dataset
'''

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

# ---- file paths ----

# per-topic files: directory prefixes, completed with a file name further down
TOPIC_DATA_IN_PREFIX = "../datain/topic_modelling/"
TOPIC_DATA_OUT_PREFIX = "../dataout/general/"

# overall tweets: input CSV and output figure
OVERALL_DATA_IN = "../datain/topic_modelling/cleaned_tweets_largest_community.csv"
OVERALL_DATA_OUT = "../dataout/general/tweet_frequency_overall.jpeg"

In [2]:
# input CSV and output figure for the topic-11 subset
data_in = f"{TOPIC_DATA_IN_PREFIX}tweet_topic_subdf_topic_11.csv"
data_out = f"{TOPIC_DATA_OUT_PREFIX}tweet_frequency_topic_11.jpeg"

In [4]:
def plot_frequency_time(dates, data_out,
                        title='Overall Tweet Frequency over time: 1 Feb - 31 May'):
    '''
        Plot tweet frequency over time.

        Draws the 'cleaned_tweet' column of `dates` against its index on a
        new figure, with a major x tick placed by a monthly locator.

        Args:
            dates: df with the count of tweets per time bucket; the bucket is
                the index and the count is read from the 'cleaned_tweet' column.
            data_out: path to the file the figure is meant to be written to.
                Currently unused: saving is disabled at the end of the body.
            title: title shown above the plot. Defaults to the overall-dataset
                title; pass a topic-specific string when plotting a topic.
    '''
    fig, ax = plt.subplots()
    ax.plot(dates.index, 'cleaned_tweet', data=dates)
    # Major ticks every month (interval=1).
    # NOTE(review): a month locator presumably expects a date-like index;
    # confirm the ticks look right when `dates` is indexed by week/month number.
    month_locator = mdates.MonthLocator(interval=1)
    ax.xaxis.set_major_locator(month_locator)
    # title and axis labels
    ax.set_title(title)
    ax.set_xlabel('Date')
    ax.set_ylabel('Number of Tweets')
    # Saving is left disabled so the figure renders inline; un-comment the two
    # lines below to write the figure to `data_out` and close it instead.
#     plt.savefig(data_out)
#     plt.close()


In [11]:
# read the tweet corpus and discard the "Unnamed: 0" column
df = pd.read_csv(data_in).drop(columns=["Unnamed: 0"])

# keep only rows that have a created_at value
df = df.dropna(subset=['created_at'])
# keep only rows whose created_at text contains 2021 (filters out stray strings)
df = df.loc[df['created_at'].str.contains("2021")]
df

Unnamed: 0,id,created_at,cleaned_tweet
0,1397474559251476480,2021-05-26 08:47:53+00:00,im proud participate biggest project
1,1397474456101027840,2021-05-26 08:47:28+00:00,opportunity take part airdrop enthusiastic exc...
2,1397204829676650496,2021-05-25 14:56:04+00:00,good project reset
3,1390253049852039168,2021-05-06 10:32:11+00:00,great project
4,1397458349927641088,2021-05-26 07:43:28+00:00,good project
...,...,...,...
121138,1391068583472553984,2021-05-08 16:32:49+00:00,strong project good team
121139,1391068541483462656,2021-05-08 16:32:39+00:00,strong project good team
121142,1391068452647886848,2021-05-08 16:32:18+00:00,good project
121143,1396100690334732288,2021-05-22 13:48:37+00:00,good project strong team predictable transpare...


In [21]:
# split created_at into date, month, week and time columns
df['created_at'] = pd.to_datetime(df['created_at'])
df['date'] = df['created_at'].dt.date
df['month'] = df['created_at'].dt.month
# Series.dt.week is deprecated and removed in pandas 2.0; isocalendar().week
# gives the same ISO week number. isocalendar() returns UInt32, so cast back
# to int64 to keep the column dtype the old accessor produced.
df['week'] = df['created_at'].dt.isocalendar().week.astype('int64')
df['time'] = df['created_at'].dt.time
df

  df['week'] = df['created_at'].dt.week


Unnamed: 0,id,created_at,cleaned_tweet,date,time,month,week
0,1397474559251476480,2021-05-26 08:47:53+00:00,im proud participate biggest project,2021-05-26,08:47:53,5,21
1,1397474456101027840,2021-05-26 08:47:28+00:00,opportunity take part airdrop enthusiastic exc...,2021-05-26,08:47:28,5,21
2,1397204829676650496,2021-05-25 14:56:04+00:00,good project reset,2021-05-25,14:56:04,5,21
3,1390253049852039168,2021-05-06 10:32:11+00:00,great project,2021-05-06,10:32:11,5,18
4,1397458349927641088,2021-05-26 07:43:28+00:00,good project,2021-05-26,07:43:28,5,21
...,...,...,...,...,...,...,...
121138,1391068583472553984,2021-05-08 16:32:49+00:00,strong project good team,2021-05-08,16:32:49,5,18
121139,1391068541483462656,2021-05-08 16:32:39+00:00,strong project good team,2021-05-08,16:32:39,5,18
121142,1391068452647886848,2021-05-08 16:32:18+00:00,good project,2021-05-08,16:32:18,5,18
121143,1396100690334732288,2021-05-22 13:48:37+00:00,good project strong team predictable transpare...,2021-05-22,13:48:37,5,20


In [22]:
# bucket tweets by week number and count the rows in each bucket
dates = df.groupby(by='week').count()
dates

Unnamed: 0_level_0,id,created_at,cleaned_tweet,date,time,month
week,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
5,13,13,13,13,13,13
6,17,17,17,17,17,17
7,64,64,64,64,64,64
8,35,35,35,35,35,35
9,128,128,128,128,128,128
10,308,308,308,308,308,308
11,892,892,892,892,892,892
12,993,993,993,993,993,993
13,2923,2923,2923,2923,2923,2923
14,5414,5414,5414,5414,5414,5414


In [20]:
# bucket tweets by calendar month and count the rows in each bucket
dates = df.groupby(by='month').count()
dates

Unnamed: 0_level_0,id,created_at,cleaned_tweet,date,time
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2,129,129,129,129,129
3,2884,2884,2884,2884,2884
4,15213,15213,15213,15213,15213
5,82653,82653,82653,82653,82653


In [10]:
# Convert the 'date' column of `dates` to a numeric dtype.
# NOTE(review): the recorded run of this cell raised KeyError: 'date'.
# Presumably `dates` had been grouped by 'date' at that point, which would
# move 'date' into the index; confirm the intent or remove this cell.
pd.to_numeric(dates["date"])

KeyError: 'date'

In [None]:
# draw the frequency plot for whichever grouping `dates` currently holds
plot_frequency_time(dates=dates, data_out=data_out)