In [1]:
import pandas as pd
import pickle
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the raw keyword trend data.
# The path is a raw string: sequences such as "\D", "\J" and "\d" are not valid
# escapes in a normal string literal and trigger Deprecation/SyntaxWarnings on
# recent Python versions. The resulting path value is unchanged.
# NOTE(review): this is a machine-specific Windows path — consider a configurable data directory.
data = pd.read_csv(
    r"~\Desktop\JOTFORM\Summer2023\Anomaly-Detection\data\jotform_trends_keywords_intern.csv"
)

In [3]:
# Unique keywords, keeping the first occurrence of each (original index preserved).
# NOTE(review): a missing keyword value, if present, survives drop_duplicates and
# is only removed later via dropna on the mapping frame — confirm this is intended.
keywords = data["keyword"].drop_duplicates()

In [4]:
# Number of rows in the keyword column, duplicates included.
len(data["keyword"])

4185259

In [5]:
# Number of distinct keywords left after de-duplication.
keywords.shape[0]

227250

In [6]:
# Sentence-embedding model used for both the keywords and the cluster labels.
model = SentenceTransformer(model_name_or_path='all-MiniLM-L6-v2')

In [7]:
# Embed every unique keyword on the CPU; row i corresponds to the i-th entry
# of `keywords`. The result is returned as a tensor.
sentence_embeddings = model.encode(
    keywords.tolist(),
    convert_to_tensor=True,
    device="cpu",
    show_progress_bar=True,
)

In [8]:
# Candidate cluster labels: the `industry` column of the industry CSV as a list.
# Raw string avoids the invalid escape sequences ("\D", "\J", "\i", ...) that a
# normal literal with Windows backslashes contains; the path value is unchanged.
cluster_keywords = pd.read_csv(
    r'~\Desktop\JOTFORM\Summer2023\Anomaly-Detection\data\industry_data.csv'
)['industry'].tolist()

In [9]:
# Embed the cluster labels with the same model so they share the keywords'
# embedding space; row j corresponds to cluster_keywords[j].
cluster_embeddings = model.encode(
    cluster_keywords,
    convert_to_tensor=True,
    device="cpu",
    show_progress_bar=True,
)

Batches: 100%|██████████| 2/2 [00:00<00:00, 25.55it/s]


In [28]:
# Persist the model object to model.pickle in the current working directory.
with open('model.pickle', mode='wb') as model_file:
    pickle.dump(model, model_file)

In [29]:
# Persist the keyword embeddings to sentence_embeddings.pickle in the current working directory.
with open('sentence_embeddings.pickle', mode='wb') as embeddings_file:
    pickle.dump(sentence_embeddings, embeddings_file)

In [30]:
# Persist the cluster-label embeddings to cluster_embeddings.pickle in the current working directory.
with open('cluster_embeddings.pickle', mode='wb') as cluster_file:
    pickle.dump(cluster_embeddings, cluster_file)

In [10]:
# Assign each keyword to the cluster label whose embedding is most similar.
# One cosine_similarity call over the full embedding matrices replaces one call
# per keyword: the result has one row per keyword and one column per cluster
# label, and argmax(axis=1) picks the best-scoring label for every row.
similarity_scores = cosine_similarity(sentence_embeddings, cluster_embeddings)
best_cluster_indices = similarity_scores.argmax(axis=1)

# `keywords` and the rows of `sentence_embeddings` share the same order, so
# zipping them pairs each keyword with its own best cluster index.
data_to_cluster_mapping = {
    data_keyword: cluster_keywords[best_cluster_index]
    for data_keyword, best_cluster_index in zip(keywords, best_cluster_indices)
}

In [11]:
# Empty keyword -> cluster table with the two expected columns.
mapping_columns = ['keyword', 'Cluster_Keyword']
df = pd.DataFrame(columns=mapping_columns)

In [12]:
# Build the keyword -> cluster table in a single constructor call.
# DataFrame.append was removed in pandas 2.0, and appending one row at a time
# copies the whole frame on every iteration (quadratic in the number of rows).
df = pd.DataFrame(
    list(data_to_cluster_mapping.items()),
    columns=['keyword', 'Cluster_Keyword'],
)

In [13]:
# Summarise the mapping frame: column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 227250 entries, 0 to 227249
Data columns (total 2 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   keyword          227249 non-null  object
 1   Cluster_Keyword  227250 non-null  object
dtypes: object(2)
memory usage: 3.5+ MB


In [16]:
# Discard any mapping row that has a missing value in either column.
df = df.dropna(axis=0, how='any')

In [17]:
# Re-check the mapping frame after dropping rows with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 227249 entries, 0 to 227249
Data columns (total 2 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   keyword          227249 non-null  object
 1   Cluster_Keyword  227249 non-null  object
dtypes: object(2)
memory usage: 5.2+ MB


In [18]:
# Work on an independent copy so the frame loaded from disk stays untouched.
data_df = data.copy(deep=True)

In [19]:
# Attach Cluster_Keyword to every data row by joining on `keyword`.
# The inner join keeps only rows whose keyword is present in both frames.
merged_df = data_df.merge(df, how='inner', on='keyword')

In [20]:
# Summarise the merged frame: row count, columns, dtypes and memory usage.
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4185186 entries, 0 to 4185185
Data columns (total 5 columns):
 #   Column           Dtype 
---  ------           ----- 
 0   Unnamed: 0       int64 
 1   date             object
 2   keyword          object
 3   keyword_sum      int64 
 4   Cluster_Keyword  object
dtypes: int64(2), object(3)
memory usage: 191.6+ MB


In [21]:
# Preview the first five merged rows.
merged_df.head(5)

Unnamed: 0.1,Unnamed: 0,date,keyword,keyword_sum,Cluster_Keyword
0,0,2023-07-22,presente,97,accommodation
1,13171,2023-07-21,presente,184,accommodation
2,36447,2023-07-20,presente,426,accommodation
3,60694,2023-07-19,presente,211,accommodation
4,79497,2023-07-18,presente,17,accommodation


In [22]:
# Parse the `date` column into datetime values so it sorts chronologically.
parsed_dates = pd.to_datetime(merged_df['date'])
merged_df['date'] = parsed_dates

In [23]:
# Order rows from earliest to latest date; the original index labels are kept.
merged_df_sorted = merged_df.sort_values('date', ascending=True)

In [24]:
# Summarise the date-sorted frame: row count, columns, dtypes and memory usage.
merged_df_sorted.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4185186 entries, 4185185 to 0
Data columns (total 5 columns):
 #   Column           Dtype         
---  ------           -----         
 0   Unnamed: 0       int64         
 1   date             datetime64[ns]
 2   keyword          object        
 3   keyword_sum      int64         
 4   Cluster_Keyword  object        
dtypes: datetime64[ns](1), int64(2), object(2)
memory usage: 191.6+ MB


In [25]:
# Preview the five earliest rows after sorting by date.
merged_df_sorted.head(5)

Unnamed: 0.1,Unnamed: 0,date,keyword,keyword_sum,Cluster_Keyword
4185185,4290330,2023-01-01,scout cookie,5,fashion
289913,4288385,2023-01-01,payment method,365,finance
1482274,4288016,2023-01-01,sum,26,finance
3996302,4287198,2023-01-01,describes medical,5,beauty & personal care
3200627,4285615,2023-01-01,cords,9,telecommunications


In [26]:
# Print a completion marker.
print("done")

done


In [27]:
# Write the clustered, date-sorted data back to disk.
# Raw string avoids the invalid escape sequences ("\D", "\J", "\c", ...) that a
# normal literal with Windows backslashes contains; the path value is unchanged.
# NOTE(review): with the default index=True the row index is written as an
# unnamed first column; pass index=False if downstream readers do not need it.
merged_df_sorted.to_csv(
    r"~\Desktop\JOTFORM\Summer2023\Anomaly-Detection\data\clustered_data.csv"
)