# Hashtag Counter
Each dataset is stored under its own db-id. The related tables are listed below:

ret_available_db
ret_tweet

The result is stored in the hashtag_counter table, which has the following structure:
id, int -> primary key
db_id, int -> foreign key to ret_available_db.db_id
tweet_id, int -> foreign key to ret_tweet.id
hashtag_content, text -> the hashtag
hashtag_counter, int -> how many times hashtag is encountered
process_time, timestamp -> when the process is done


## First Step
Query the database for the entries that are not yet recorded in table hashtag_counter, taking the most recent one first.

In [1]:
import data_connector as dconn
import re
import pandas as pd
import string
from collections import Counter
from tqdm import tqdm
import psycopg2
from psycopg2 import sql, extras

# Pick the newest monitoring_search row that has no entry yet in
# hashtag_counter_job_rec, i.e. the next search waiting to be processed.
# The query text lives in its own variable so it does not overwrite the
# `sql` module imported from psycopg2.
pending_search_query = """
select 	*
from 	monitoring_search a 
where 	a.id not in (select monitoring_search_id from hashtag_counter_job_rec)
order by created_date desc
limit 1
"""

df_db_id = dconn.execute_query_psql(pending_search_query)

# Without this guard an empty result fails in .iloc[0] with an opaque
# "single positional indexer is out-of-bounds" IndexError. The exception type
# is kept, only the message is made explicit.
if len(df_db_id) == 0:
    raise IndexError(
        "no monitoring_search row is waiting for hashtag processing"
    )

const_monitoring_search_id = df_db_id.iloc[0]['id']
print(const_monitoring_search_id)

# NOTE(review): the id is interpolated into the SQL text. It comes from the
# query above rather than from user input, but bound parameters would be
# safer if dconn.execute_query_psql supports them -- confirm its signature.

# query monitoring-media-social
media_social_query = """
select 	*
from monitoring_media_social 
where monitoring_search_id = '%s'
""" % (const_monitoring_search_id)
df_monitoring_media_social = dconn.execute_query_psql(media_social_query)

# query monitoring-media-online
media_online_query = """
select 	*
from monitoring_media_online
where monitoring_search_id = '%s'
""" % (const_monitoring_search_id)
df_monitoring_media_online = dconn.execute_query_psql(media_online_query)

20c6b64a-a558-46af-8b8c-8a2c69ef4835


In [2]:
import re

def expand_hashtag(text):
    """Return every hashtag found in ``text``, without the leading ``#``.

    A hashtag is a ``#`` immediately followed by one or more regex word
    characters. Tags are returned in order of appearance and duplicates are
    kept, so a tag used twice in the same text appears twice.

    Args:
        text: The post content to scan.

    Returns:
        A list of hashtag strings; empty when ``text`` contains none.
    """
    # re.findall already returns a new list holding the captured group of
    # each match, so no manual copy loop is needed.
    return re.findall(r'#(\w+)', text)

def process_posts(df_source, platform_id):
    """Build one record per hashtag occurrence found in a frame of posts.

    Args:
        df_source: DataFrame providing the ``post_content``,
            ``monitoring_id`` and ``post_id`` columns.
        platform_id: Platform identifier copied unchanged into every record.

    Returns:
        A list of dicts with the keys ``monitoring_id``, ``hashtag_content``,
        ``platform_id`` and ``post_id``. Posts that are blank or contain no
        hashtag contribute nothing.
    """
    records = []

    for row in tqdm(range(len(df_source)), desc="Processing posts"):
        post_text = str(df_source['post_content'].iloc[row])
        owner_id = str(df_source['monitoring_id'].iloc[row])

        # Whitespace-only content cannot hold a hashtag.
        if post_text.strip() == "":
            continue

        tags = expand_hashtag(post_text)
        if len(tags) == 0:
            continue

        # post_id is only looked up for posts that actually carry hashtags.
        for tag in tags:
            records.append({
                'monitoring_id': owner_id,
                'hashtag_content': tag,
                'platform_id': platform_id,
                'post_id': df_source['post_id'].iloc[row],
            })

    return records

class DatabaseConnection:
    """Lazily opened, reusable psycopg2 connection to one PostgreSQL database.

    The connection settings are stored at construction time; no connection is
    opened until ``get_connection`` is first called.
    """

    def __init__(self, dbname, user, password, host, port):
        """Store the connection settings without connecting.

        Args:
            dbname: Database name.
            user: Login role.
            password: Password for ``user``.
            host: Server host name or address.
            port: Server port.
        """
        self.dbname = dbname
        self.user = user
        self.password = password
        self.host = host
        self.port = port
        self.conn = None  # opened on demand by get_connection()

    def _connect(self):
        """Open a brand-new psycopg2 connection from the stored settings."""
        return psycopg2.connect(
            dbname=self.dbname,
            user=self.user,
            password=self.password,
            host=self.host,
            port=self.port,
        )

    def get_connection(self):
        """Return the cached connection, (re)opening it when necessary.

        Returns:
            An open psycopg2 connection.
        """
        # psycopg2 sets ``closed`` to a non-zero value once a connection has
        # been closed or has broken. Reusing such a handle would make every
        # later statement fail, so a fresh connection is opened instead.
        if self.conn is None or self.conn.closed:
            self.conn = self._connect()
        return self.conn

    def execute_query_psql(self, query, values=None):
        """Execute one statement and commit it.

        Failures are handled on a best-effort basis: the transaction is
        rolled back and the error is printed, not raised.

        Args:
            query: SQL text, optionally with psycopg2 placeholders.
            values: Parameters for the placeholders. When omitted or empty
                the query is executed without parameters.

        Returns:
            None. No rows are fetched.
        """
        conn = self.get_connection()
        cursor = conn.cursor()
        try:
            if values:
                cursor.execute(query, values)
            else:
                cursor.execute(query)
            conn.commit()
        except Exception as e:
            conn.rollback()
            print(f"Error executing query: {e}")
        finally:
            cursor.close()

    def close_connection(self):
        """Close the cached connection, if any, and forget it."""
        if self.conn is not None:
            self.conn.close()
            self.conn = None

def insert_data(result_list, dc, batch_size=100):
    """Bulk-insert hashtag records into ``public.hashtag_counter_monitor``.

    Records are written in slices of ``batch_size`` and each slice is
    committed on its own. If anything raises, the open transaction is rolled
    back and the error is printed rather than re-raised. The cursor is always
    closed and ``dc.close_connection()`` is always called before returning.

    Args:
        result_list: List of dicts with the keys ``monitoring_id``,
            ``post_id``, ``hashtag_content`` and ``platform_id``.
        dc: Object exposing ``get_connection()`` and ``close_connection()``.
        batch_size: Number of records sent per ``execute_values`` call.
    """
    connection = dc.get_connection()
    cur = connection.cursor()

    insert_query = """
        INSERT INTO public.hashtag_counter_monitor (media_social_monitoring_id, post_id, hashtag_content, platform_id, process_time)
        VALUES %s
    """

    # process_time is filled in by the database through NOW().
    values_template = "(%s, %s, %s, %s, NOW())"

    try:
        for start in tqdm(range(0, len(result_list), batch_size), desc="Inserting data"):
            rows = []
            for rec in result_list[start:start + batch_size]:
                # post_id is always sent as text.
                rows.append((
                    rec['monitoring_id'],
                    str(rec['post_id']),
                    rec['hashtag_content'],
                    rec['platform_id'],
                ))

            extras.execute_values(cur, insert_query, rows, template=values_template)

            # Each slice becomes durable as soon as it has been written.
            connection.commit()

    except Exception as e:
        connection.rollback()
        print(f"Error occurred: {e}")
    finally:
        cur.close()
        dc.close_connection()
    

In [3]:
# Connection used for writing the extracted hashtags.
#
# Every setting can be overridden through a MEDOLS_DB_* environment variable;
# when a variable is unset the previous literal value is used, so behaviour
# is unchanged by default.
#
# NOTE(review): the fallback password and host are committed in source. The
# credential should be rotated and the literal removed once the environment
# variables are configured wherever this notebook runs.
import os

dc = DatabaseConnection(
    dbname=os.environ.get('MEDOLS_DB_NAME', 'medols'),
    user=os.environ.get('MEDOLS_DB_USER', 'postgres'),
    password=os.environ.get('MEDOLS_DB_PASSWORD', 'FEWcTB3JIX5gK4T06c1MdkM9N2S8w9pb'),
    host=os.environ.get('MEDOLS_DB_HOST', '98.98.117.105'),
    port=os.environ.get('MEDOLS_DB_PORT', '5432')
)

In [4]:
# Running total of hashtag records extracted across all platforms.
# NOTE(review): insert_data() prints database errors instead of raising them,
# so this total counts extracted records even if an insert failed.
iCtr = 0

# Social platforms to scan, in processing order. Each entry is
# (label, flag column on the monitoring_search row, platform_id,
#  source table, SQL expression that yields the post text).
#
# platform_id codes:
#   10 = tiktok
#   20 = youtube
#   30 = instagram_post
#   40 = facebook_post
#   50 = google_result
#   60 = twitter_tweets
PLATFORM_SOURCES = (
    ("Twitter", 'is_twitter', '60', 'twitter_tweets', 'tweet'),
    ("Tiktok", 'is_tiktok', '10', 'tiktok', 'tiktok."desc"'),
    ("Facebook", 'is_facebook', '40', 'facebook_post', 'description'),
    ("Instagram", 'is_instagram', '30', 'instagram_post', 'content'),
    ("Youtube", 'is_youtube', '20', 'youtube', 'title'),
)

# Filled with (content expression, table, monitoring id). Every platform is
# read through the same three output columns so process_posts() can treat
# them uniformly.
# NOTE(review): the monitoring id is interpolated into the SQL text. It comes
# from monitoring_media_social rather than from user input, but bound
# parameters would be safer if dconn.execute_query_psql supports them.
POST_QUERY_TEMPLATE = """
        select  id as post_id,
                %s as post_content,
                monitoring_id
        from    %s
        where   monitoring_id = '%s'
        """

if len(df_monitoring_media_social) > 0:
    s_m_id = df_monitoring_media_social.iloc[0]['id']
    search_row = df_db_id.iloc[0]

    # Show which platforms are enabled for this search.
    for label, flag_column, _, _, _ in PLATFORM_SOURCES:
        print("%s: %s" % (label, search_row[flag_column]))

    for label, flag_column, platform_id, table, content_expr in PLATFORM_SOURCES:
        if not search_row[flag_column]:
            continue

        # query the data source
        sql = POST_QUERY_TEMPLATE % (content_expr, table, s_m_id)
        print(sql)
        df_source = dconn.execute_query_psql(sql)
        if len(df_source) == 0:
            continue

        df_result_hashtag = process_posts(df_source, platform_id)
        insert_data(df_result_hashtag, dc)
        iCtr = iCtr + len(df_result_hashtag)

    # TODO: df_monitoring_media_online is not processed here yet.
        

    

Twitter: True
Tiktok: False
Facebook: False
Instagram: False
Youtube: False

        select 	a.id as post_id,
        		a.tweet as post_content,
                a.monitoring_id
        from 	twitter_tweets a
        where 	monitoring_id = '401e8c50-373d-470f-b141-2d98de024a74'
        


Processing posts:   0%|          | 0/66 [00:00<?, ?it/s]

Processing posts: 100%|██████████| 66/66 [00:00<00:00, 8550.55it/s]




Inserting data:   0%|          | 0/4 [00:00<?, ?it/s]

Inserting data: 100%|██████████| 4/4 [00:00<00:00, 97.61it/s]




## Record the result
into table hashtag_counter_job_rec

In [5]:
# Record that this monitoring search has been processed, together with the
# number of hashtag records counted for it.
job_record_template = """
insert into hashtag_counter_job_rec (monitoring_search_id, process_time, hashtag_count)
values ('%s',now(),'%s')
"""
job_record_values = (const_monitoring_search_id, iCtr)
ssql = job_record_template % job_record_values
print(ssql)
# Left as the final expression so the notebook displays the call's result.
dconn.execute_query_psql(ssql)


insert into hashtag_counter_job_rec (monitoring_search_id, process_time, hashtag_count)
values ('20c6b64a-a558-46af-8b8c-8a2c69ef4835',now(),'335')



1

In [6]:
# Pause for 1 second before the notebook finishes.
# NOTE(review): the delay may have been meant to be 10 seconds -- confirm.
import time
time.sleep(1)