In [4]:
"""
This is a heavily documented notebook with exploratory data analyses for the Telegram Channel Archive project
https://github.com/githubbar/telegram_channel_archive


For working with the SQLite backend we are choosing Ibis over Pandas for data analysis because it executes most of the work in the database backend and is therefore more suitable for large databases.
see: https://voltrondata.com/blog/ibis-explained-making-dataframes-big-and-small-more-delightful
""" 

# TEMP: conda activate /N/slate/oleykin/.conda/envs/tele
# TEMP: conda list -e > requirements.txt

"""Connect to SQLite DB"""
import os
# Cap OpenBLAS at a single thread. The variable is set before `import numpy`
# below, presumably so the limit is already in place when NumPy loads its BLAS
# backend (TODO confirm this is the intent on the shared host).
os.environ['OPENBLAS_NUM_THREADS'] = '1'
import numpy as np
import ibis

# Interactive mode: expressions are executed when they are displayed/printed,
# and the rendered preview is limited to 20 rows.
ibis.options.interactive = True
ibis.options.repr.interactive.max_rows = 20

# 'db.sqlite' is a relative path, so it is resolved against the kernel's
# current working directory.
con = ibis.sqlite.connect('db.sqlite')
# con.list_tables()

# Table expressions reused by the analysis cells below.
channel  = con.table('channel')
msg = con.table('message')


In [15]:
# >>>>>> Check start/end date for each channel
import datetime
from datetime import timedelta
import duckdb
from ibis import _


def date_range_by_channel(messages, year):
    """Return the first and last message timestamp per channel title for one year.

    Parameters
    ----------
    messages : ibis table expression
        Messages joined to their channel; must expose ``date`` and ``title``.
    year : int
        Calendar year to keep (compared with ``date`` cast to a timestamp).

    Returns
    -------
    ibis table expression
        One row per ``title`` with ``min_time`` and ``max_time`` columns.
    """
    return (
        messages
        .filter(_.date.cast("timestamp").year() == year)
        .group_by('title')  # group by channel name
        .aggregate(
            min_time=_.date.min().cast("timestamp"),
            max_time=_.date.max().cast("timestamp"),
        )
    )


t = msg.join(channel, msg.channel_id == channel.id)

# Same report for every year of interest; add a year here to extend it.
for year in (2022, 2023):
    print(f'Date ranges for {year}')
    print(date_range_by_channel(t, year))

# TODO: timestamp diff not implemented in Ibis?
# t = t.mutate(
#     days_diff = _.max_time.delta(datetime.datetime.now(), 'day')    
# )


Date ranges for 2022


┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┓
┃[1m [0m[1mtitle[0m[1m                                 [0m[1m [0m┃[1m [0m[1mmin_time[0m[1m           [0m[1m [0m┃[1m [0m[1mmax_time[0m[1m           [0m[1m [0m┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━┩
│ [2mstring[0m                                 │ [2mtimestamp[0m           │ [2mtimestamp[0m           │
├────────────────────────────────────────┼─────────────────────┼─────────────────────┤
│ [32mЗЕЛЕНАЯ ЛЕНТА                         [0m │ [35m2022-02-27 11:36:52[0m │ [35m2022-06-06 16:25:29[0m │
│ [32mМедиа Партизаны | Нет войне           [0m │ [35m2022-03-15 18:46:22[0m │ [35m2022-06-22 13:45:39[0m │
│ [32mФеминистское Антивоенное Сопротивление[0m │ [35m2022-02-25 13:35:37[0m │ [35m2022-06-04 18:43:01[0m │
└────────────────────────────────────────┴─────────────────────┴─────────────────────┘
Date ranges for 

┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┓
┃[1m [0m[1mtitle[0m[1m                                 [0m[1m [0m┃[1m [0m[1mmin_time[0m[1m           [0m[1m [0m┃[1m [0m[1mmax_time[0m[1m           [0m[1m [0m┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━┩
│ [2mstring[0m                                 │ [2mtimestamp[0m           │ [2mtimestamp[0m           │
├────────────────────────────────────────┼─────────────────────┼─────────────────────┤
│ [32mЗЕЛЕНАЯ ЛЕНТА                         [0m │ [35m2023-02-27 08:43:48[0m │ [35m2023-06-06 08:04:23[0m │
│ [32mМедиа Партизаны | Нет войне           [0m │ [35m2023-03-15 07:00:00[0m │ [35m2023-06-22 13:59:06[0m │
│ [32mФеминистское Антивоенное Сопротивление[0m │ [35m2023-02-25 07:07:32[0m │ [35m2023-06-04 17:23:06[0m │
└────────────────────────────────────────┴─────────────────────┴─────────────────────┘


In [None]:
# >>>>>> Look at nulls (gap analysis); make sure they make sense.
import datetime
from datetime import timedelta
from ibis import _

t = msg.join(channel, msg.channel_id == channel.id)

# Messages that were never edited. `.isnull()` states the NULL test explicitly
# instead of relying on how `== None` is compiled by the backend.
t = t.filter(_.last_edit_date.isnull()).select(_.title, _.text, _.total_views, _.last_edit_date)

# Materialise the counts so the message does not depend on interactive-mode repr.
n_null = t.title.count().to_pandas()
print(f'Found {n_null} records with the field last_edit_date == NULL. These are mostly pinned posts')

# Subset that still carries text. This filter must be applied, otherwise the
# second number is just the first count repeated.
t_with_text = t.filter(_.text != '')
n_with_text = t_with_text.title.count().to_pandas()
print(f'But some {n_with_text} are not, because they have text in them')
# Looks like a forward announcement for the next post, e.g. https://t.me/femagainstwar/237 and https://t.me/femagainstwar/238
# or a group of images


In [12]:
# >>>>>> Posting frequencies (own posts vs forwards; normalize by .. number of posts in channel/subs?)
import datetime
from rich import print
from datetime import timedelta
from ibis import _

# Identify 100 day periods for each target channel: one starting in 2022 and
# one starting in 2023, keyed by channel id.
PERIOD_DAYS = 100
PERIODS_PER_CHANNEL = 2
period_starts = {
    1166398892: (datetime.datetime(2022, 2, 25, 0, 0), datetime.datetime(2023, 2, 25, 0, 0)),
    1724215937: (datetime.datetime(2022, 3, 15, 0, 0), datetime.datetime(2023, 3, 15, 0, 0)),
    1744097497: (datetime.datetime(2022, 2, 27, 0, 0), datetime.datetime(2023, 2, 27, 0, 0)),
}
# channel id -> [(start, end), (start, end)]
periods = {
    channel_id: [(start, start + timedelta(days=PERIOD_DAYS)) for start in starts]
    for channel_id, starts in period_starts.items()
}
assert all(len(spans) == PERIODS_PER_CHANNEL for spans in periods.values())


def posts_per_day(messages, channel_id, start, end):
    """Average number of posts per day for one channel within [start, end].

    Parameters
    ----------
    messages : ibis table expression
        Must expose ``channel_id`` and ``date``.
    channel_id : int
        Channel to keep.
    start, end : datetime.datetime
        Period bounds; ``between`` is inclusive on both ends.

    Returns
    -------
    The executed scalar: message count divided by the whole days in the period.
    """
    in_period = messages.filter(
        (_.channel_id == channel_id) & _.date.cast("timestamp").between(start, end)
    )
    return (in_period.count() / (end - start).days).to_pandas()


# Drop messages whose last_edit_date is NULL (treated as pins in this notebook).
t = msg.join(channel, msg.channel_id == channel.id).filter(_.last_edit_date.notnull())
# t1 = t.filter((_.fwd_username != None)).select(_.title, _.text, _.total_views, _.last_edit_date)
# print(f'Found {t1.title.count()} user forwards')
# t2 = t.filter((_.fwd_channel_id != None)).select(_.title, _.text, _.total_views, _.last_edit_date)
# print(f'Found {t2.title.count()} channel forwards')
# Remove forwards (`t` is already restricted to non-NULL last_edit_date)
tNoforwards = t.filter(_.fwd_channel_id.isnull() & _.fwd_username.isnull())
# tNoforwards = t
print('Post frequency per channel (forwards and pins removed)')
# Divide by the total number of days covered (two periods of PERIOD_DAYS each).
# The parentheses matter: `count / PERIOD_DAYS * 2` would multiply by 2 instead.
total_days = PERIOD_DAYS * PERIODS_PER_CHANNEL
print(tNoforwards.group_by('title').aggregate(posts_per_day=_.count() / total_days))


# Use separate period for each channel
print('Post frequency per period (forwards and pins removed)')
data = []
for key, (first_period, second_period) in periods.items():
    ttl = channel.filter(_.id == key).title.to_pandas()[0]
    data.append({
        "channel": ttl,
        "period1": posts_per_day(tNoforwards, key, *first_period),
        "period2": posts_per_day(tNoforwards, key, *second_period),
    })
print(data)
t = ibis.memtable(data)
# NOTE(review): the file name is spelled 'peroid'; kept as-is because other
# code may read this exact path.
t.to_csv('peroid_freq.csv')


: 

In [None]:
# >>>>>> Visualize total posts; visualize over time
import altair as alt
from ibis import _

t = msg.join(channel, msg.channel_id == channel.id)
# group by channel name
# chart = (
#     alt.Chart(t.group_by("title").aggregate(count=_.count()))
#     .mark_bar()
#     .encode(
#         x="title",
#         y="count",
#         tooltip=["title", "count"],
#     )
#     .properties(width=1024, height=600)
#     .interactive()
# )

# Posts per calendar day for one channel. The chart below encodes `date`,
# `count` and `title`, so the data must be an aggregated table with exactly
# those columns rather than a single bucketed column.
CHANNEL_TITLE = "Феминистское Антивоенное Сопротивление"
daily_posts = (
    t.filter(_.title == CHANNEL_TITLE)
    # `date` is cast to a timestamp as in the other cells, then cut to the day.
    .mutate(date=_.date.cast("timestamp").truncate("D"))
    .group_by(["title", "date"])
    .aggregate(count=_.count())
    .order_by("date")
)

chart = (
    # Hand Altair a materialised pandas frame.
    alt.Chart(daily_posts.to_pandas())
    .mark_line()
    .encode(
        x="date:T",
        y="count:Q",
        tooltip=["title", "date:T", "count"],
    )
    .properties(width=1024, height=600)
    .interactive()
)
chart