In [1]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import json

import data_connector

# Data cleaning function
def clean_text(text):
    text = text.lower()
    text = re.sub(r'@\w+', '', text)  # Remove mentions
    text = re.sub(r'#\w+', '', text)  # Remove hashtags
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'[^a-z\s]', '', text)  # Remove punctuation and numbers
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text

# Function to perform clustering and return the DataFrame with cluster numbers
def cluster_and_return(df, max_clusters=15):
    # Clean the text data
    df['cleaned_text'] = df['text_content'].apply(clean_text)

    # Feature extraction using TF-IDF
    vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
    X = vectorizer.fit_transform(df['cleaned_text'])

    # Determine the optimal number of clusters using Elbow Method and Silhouette Score
    def determine_optimal_clusters(X, max_clusters):
        wcss = []
        silhouette_scores = []
        for i in range(2, max_clusters):  # start from 2 clusters
            kmeans = KMeans(n_clusters=i, random_state=42)
            kmeans.fit(X)
            wcss.append(kmeans.inertia_)
            if i > 1:
                silhouette_avg = silhouette_score(X, kmeans.labels_)
                silhouette_scores.append(silhouette_avg)

        # Determine optimal number of clusters based on highest silhouette score
        optimal_clusters = silhouette_scores.index(max(silhouette_scores)) + 2  # +2 because silhouette_scores starts from 2 clusters
        return optimal_clusters

    optimal_clusters = determine_optimal_clusters(X, max_clusters)
    print(f'Optimal number of clusters: {optimal_clusters}')

    # Calculate average similarity score for each cluster
    cosine_sim_matrix = 1 - pairwise_distances(X, metric='cosine')
    cluster_scores = []

    for cluster in range(optimal_clusters):
        indices = np.where(kmeans.labels_ == cluster)[0]
        if len(indices) > 1:
            cluster_sim = cosine_sim_matrix[np.ix_(indices, indices)]
            avg_sim = cluster_sim.mean()
        else:
            avg_sim = 1.0  # If there's only one member in the cluster, similarity is 1
        cluster_scores.extend([avg_sim] * len(indices))

    df['cluster_score'] = cluster_scores

    # Perform clustering with the optimal number of clusters
    kmeans = KMeans(n_clusters=optimal_clusters, random_state=42)
    kmeans.fit(X)
    df['cluster_no'] = kmeans.labels_

    # Drop the temporary cleaned_text column
    df = df.drop(columns=['cleaned_text'])

    return df



In [2]:
strSQL = """
select 	a.id as jobsid,
        a.*,
		c.id as key_monitoring_media_social, 
		d.id as key_monitoring_media_online,
		c.*
from 	screen_analisis_ai a 
		inner join monitoring_search b
			on cast(a.monitoring_id as varchar) = cast(b.id as varchar)
		inner join monitoring_media_social c
			on b.id = c.monitoring_search_id 
		left outer join  monitoring_media_online d
			on d.monitoring_search_id = c.monitoring_search_id 
where a.jenis_analisa = '10'
and a.status = 1
order by a.created desc 
limit 1
"""

df_job = data_connector.execute_query_psql(strSQL)
if len(df_job) == 0:
    # get out, nothing to do
    print('Zero jobs, quitting now')
    quit()
    
similarity_treshold = 0.9
i_process_id = df_job['jobsid'][0]
screen_name = ''
database_keyword_id = df_job['key_monitoring_media_social'][0]
social_media_monitoring_id = df_job['key_monitoring_media_social'][0]
media_online_monitoring_id = df_job['key_monitoring_media_online'][0]

# print(database_keyword_id)
print(similarity_treshold)
print(i_process_id)
print(social_media_monitoring_id)
print(media_online_monitoring_id)


# Prepare SQL Statement
print(i_process_id)
sql = "update screen_analisis_ai set status = 2, last_status_update = now(), start_process = now() where id = %s"
sql = sql.replace('%s', str(i_process_id))

print(sql)
row_count = data_connector.execute_query_psql(sql)
print('update ' + str(row_count) + ' rows')


Zero jobs, quitting now


KeyError: 0

In [None]:
# marking jobs
# Prepare SQL Statement
print(i_process_id)
sql = "update screen_analisis_ai set status = 2, last_status_update = now(), start_process = now() where id = %s"
sql = sql.replace('%s', str(i_process_id))

print(sql)
row_count = data_connector.execute_query_psql(sql)
print('update ' + str(row_count) + ' rows')

#
# Create Header Record
sql = "insert into ret_analysis_header (job_id, datetime_start, user_id) values (" + str(i_process_id) + ", now(), '1')"
# Execute the query
print(sql)
row_count = data_connector.execute_query_psql(sql)
#
# Create Parameter Record
sql = """
insert into ret_analysis_parameter 
(job_id, param_id, param_name, param_value) 
values (%s, %s, %s, %s)
""" % (str(i_process_id), '1', "'Similarity Treshold'", str(similarity_treshold))
# Execute the query
print(sql)
row_count = data_connector.execute_query_psql(sql)

sql = """
insert into ret_analysis_parameter 
(job_id, param_id, param_name, param_value) 
values (%s, %s, %s, '%s')
""" % (str(i_process_id), '1', "'DB_ID'", str(database_keyword_id))
print(sql)
row_count = data_connector.execute_query_psql(sql)



In [None]:
def record_result(df_result, platform_id):
    # id_key and text_content fields.
    # create statement template
    sql = """
    insert into ret_cluster_result_monitor (ref_id, cluster_no, platform_id, job_id) 
    values ('%s', %s, %s, %s)
    """
    for index, row in save_df.iterrows():    
        if row['cluster_number'] == '':
            s_cluster_number = "NULL"
        else:
            s_cluster_number = row['cluster_number']
            
        sql = sql % (row['ref_id'],row['cluster_no'], platform_id, i_process_id)
    
    print(sql)
    # data_connector.execute_query_psql(sql)
    
    

In [None]:
# 10 = tiktok
# 20 = youtube
# 30 = instagram_post
# 40 = facebook_post
# 50 = google_result

# Processing jobs for each platform
iRowCount = 0

if df_job['is_tiktok'][0]:
    sql = '''
    select id as ref_id, "desc" as text_content 
    from tiktok 
    where monitoring_id = '%s' -- 10
    '''
    df = data_connector.execute_query_psql(sql % (social_media_monitoring_id))
    if len(df) != 0:
        result_df = cluster_and_return(df)
        iRowCount = iRowCount + len(df)

if df_job['is_youtube'][0]:
    sql = '''
    select 	id as ref_id, title as text_content
    from youtube 
    where monitoring_id = '%s' -- 20
    '''
    df = data_connector.execute_query_psql(sql % (social_media_monitoring_id))
    if len(df) != 0:
        result_df = cluster_and_return(df)
        iRowCount = iRowCount + len(df)

if df_job['is_instagram'][0]:
    sql = '''
    select id as ref_id, content as text_content
    from instagram_post 
    where monitoring_id = '%s' -- 30
    '''
    df = data_connector.execute_query_psql(sql % (social_media_monitoring_id))
    if len(df) != 0:
        result_df = cluster_and_return(df)
        iRowCount = iRowCount + len(df)

if df_job['is_facebook'][0]:
    sql = '''
    select id as ref_id, "description" as text_content
    from facebook_post 
    where monitoring_id = '%s' -- 40
    and length(trim("description")) > 0
    '''
    df = data_connector.execute_query_psql(sql % (social_media_monitoring_id))
    if len(df) != 0:
        result_df = cluster_and_return(df)
        iRowCount = iRowCount + len(df)

if df_job['is_google'][0]:
    sql = '''
    select 	id as ref_id, "description" as text_content
    from google_result 
    where monitoring_id = '%s' -- 50
    '''
    df_google = data_connector.execute_query_psql(sql % (media_online_monitoring_id))
    if len(df) != 0:
        result_df = cluster_and_return(df)
        iRowCount = iRowCount + len(df)

if df_job['is_twitter'][0]:
    sql = '''
    select id as ref_id, tweet as text_content
    from twitter_tweets 
    where monitoring_id = '%s' -- 10
    '''
    df = data_connector.execute_query_psql(sql % (social_media_monitoring_id))
    if len(df) != 0:
        result_df = cluster_and_return(df)
        iRowCount = iRowCount + len(df)

print("Total Processed Data: " + str(iRowCount))

In [None]:
# Finishing Jobs
# Create Parameter Record
sql = "insert into ret_analysis_parameter (job_id, param_id, param_name, param_value) values (%s, %s, %s, %s)"
# Execute the query
data_connector.execute_query_psql(sql % (i_process_id, 1, "'#Content Processed'",iRowCount))

# Create Tweet Cluster Record
sql = "update ret_analysis_header set datetime_finish = NOW() where job_id = %s"
# Executing query
data_connector.execute_query_psql(sql % (i_process_id) )

sql = "update screen_analisis_ai set status = 3, duration = EXTRACT(EPOCH FROM (now() - start_process)), end_process = NOW() where id = %s"
data_connector.execute_query_psql(sql % (i_process_id))

print('inserting result finished')

# Wait 10 sec before release
time.sleep(10)

In [None]:
# # Perform clustering and return the updated DataFrame
# result_df = cluster_and_return(df)
# print(result_df)