In [1]:
# Install necessary dependencies for the project
!pip install -r ../requirements.txt



In [2]:
# Import third-party libraries and the project's preprocessing helpers
import pandas as pd
# NOTE(review): `resample` is not referenced in the cells visible here — confirm it is still needed.
from sklearn.utils import resample
# NOTE(review): wildcard import. The pipeline helpers called in later cells
# (load_and_clean_data, filter_and_process_data, process_for_clustering,
# filter_top_categories_and_balance, save_cleaned_data) are never imported by
# name, so they presumably come from this module — consider importing them explicitly.
from preprocessing import *

[nltk_data] Downloading package punkt to /Users/gaia/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/gaia/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
# File locations used by the pipeline (relative paths; adjust if your data lives elsewhere)
data_path = '../dataset/events.csv'  # raw input events
output_path = '../dataset/cleaned_events.csv'  # destination for the cleaned events
balanced_output_path = '../dataset/balanced_events.csv'  # destination for the balanced events


In [4]:
# Step 1: Read the raw events file as-is so its size can be compared with the
# cleaned versions produced by the following steps.
all_events = pd.read_csv(
    data_path,
    encoding='ISO-8859-1',
    encoding_errors='replace',
)
print("Initial data shape:", all_events.shape)

Initial data shape: (18154, 31)


In [5]:
# Step 2: Run the initial cleaning helper on the raw file.
# Note that the helper is given the file path, not the frame loaded in Step 1.
cleaned_events_df = load_and_clean_data(data_path)

# Keep this stage's shape under its own name: `cleaned_events_df` is
# reassigned by later cells, so the shape would otherwise be lost.
shape_after_cleaning = cleaned_events_df.shape
print("Data shape after initial cleaning:", shape_after_cleaning)

Data shape after initial cleaning: (15697, 31)


In [6]:
# Step 3: Filter and process the cleaned events.
# NOTE: this cell reads and overwrites `cleaned_events_df`, so re-running it on
# its own feeds already-processed data back into the helper; re-run from
# Step 2 instead.
cleaned_events_df = filter_and_process_data(cleaned_events_df)

# Keep this stage's shape under its own name, since the frame variable is
# reassigned again by later cells.
shape_after_preprocessing = cleaned_events_df.shape
print("Data shape after preprocessing:", shape_after_preprocessing)

Data shape after preprocessing: (10128, 33)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop_duplicates(inplace=True)


In [7]:
# Step 4: Prepare the events for clustering.
# NOTE: this cell reads and overwrites `cleaned_events_df`, so it is not safe
# to re-run in isolation; re-run from Step 2 instead.
cleaned_events_df = process_for_clustering(cleaned_events_df)

# Capture the resulting shape once, then report it.
shape_after_clustering_prep = cleaned_events_df.shape
print("Data shape after clustering preparation:", shape_after_clustering_prep)

Data shape after clustering preparation: (10128, 37)


In [9]:
# Step 5: Apply balancing to the top-N categories.
# The result is stored separately, so `cleaned_events_df` is left as it was.
# NOTE(review): if the helper balances by random resampling, the output is only
# reproducible when it is seeded — confirm inside the preprocessing module.
top_n_categories = 5  # how many categories to keep
balanced_df = filter_top_categories_and_balance(
    cleaned_events_df,
    top_n=top_n_categories,
)
print("Data shape after balancing:", balanced_df.shape)

Data shape after balancing: (2835, 38)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['category_label'] = filtered_df['categories'].map(category_mapping)


In [10]:
# Step 6: Write both results to disk — first the cleaned events, then the
# balanced subset — each to its configured output path.
for frame_to_save, destination in (
    (cleaned_events_df, output_path),
    (balanced_df, balanced_output_path),
):
    save_cleaned_data(frame_to_save, destination)

The cleaned data has been saved to ../dataset/cleaned_events.csv
The cleaned data has been saved to ../dataset/balanced_events.csv


In [11]:
# Preview the first 35 rows of the cleaned events to verify the pipeline output.
# NOTE(review): the rendered preview includes `user_id` and `username` columns —
# consider clearing this output before sharing the notebook.
cleaned_events_df.head(35)

Unnamed: 0,thread_id,page_id,comment_id,user_id,username,post_time,replies,views,active,categories,...,participation_comment_count,mixed_comment_count,other_comment_count,author_comment_count,processed_clean_title,processed_post_tex,clean_title_tokens,post_tex_tokens,general_rules_tokens,combined_text
1,4895638,0,1,1281314.0,Risty Moon,14/08/2018 18:21,210.0,4260.0,False,Bounty,...,135.0,1.0,69.0,1.0,Angenium Project Bounty Thread,Angenium Bounty Campaign officially ended ...,project bounty thread,bounty campaign officially ended please submit...,week general join bounty first register via ca...,project bounty thread bounty campaign official...
2,2679019,0,1,86907.0,needmoney,02/01/2018 16:08,6144.0,47325.0,False,Bounty,...,5350.0,29.0,721.0,38.0,YOURBLOCK BOUNTY IS OVER IN TH WEEK,HELLO EVERYONE TEAM HAS DECIDED TO END THE...,bounty th week,everyone team decided end bounty count haste l...,first week bounty campaign start signature dis...,bounty th week everyone team decided end bount...
3,5252517,0,1,2794182.0,PorExchange,01/06/2020 09:43,1.0,97.0,False,Other,...,0.0,0.0,0.0,0.0,Campaign ProEX Magic Pool,ProEXs new campaign ProEX Magic Pool online J...,campaign magic pool,new campaign magic join roi information campai...,,campaign magic pool new campaign magic join ro...
4,3311610,0,1,1904909.0,LusoCoin,12/04/2018 22:45,13.0,379.0,False,"Bounty(LowQuality), ICO",...,0.0,0.0,0.0,0.0,LusoCoin reserved referral Phase,LUSO ICOBounty program We reserved a...,reserved phase,reserved available bounty amount bounty progra...,,reserved phase reserved available bounty amoun...
8,5032135,0,1,1153662.0,jimsteel,18/09/2018 15:09,13.0,415.0,False,Bounty,...,0.0,0.0,13.0,1.0,ONLINEJURY FEEDBACK BOUNTY,Y JURYONLINE THE PLACE FOR RESP...,feedback bounty,place responsible bounty powered platform k si...,,feedback bounty place responsible bounty power...
9,5112064,0,1,2356569.0,BountyZZ,20/02/2019 00:40,7662.0,52271.0,False,Bounty,...,6931.0,57.0,502.0,9.0,DigitalBits An Economy Reimagined,This image longer available Visit tinyp...,economy,image longer protocol layer designed help faci...,necessary join telegram main chat bounty chat ...,economy image longer protocol layer designed h...
10,5041613,0,1,2021066.0,B21Official,01/10/2018 13:33,5.0,471.0,False,"Bounty, Airdrop",...,0.0,0.0,3.0,3.0,BLife Cryptocurrency Education Training App,B ICO ailable AV ae App Store ...,education training,celebrate launch education training community ...,,education training celebrate launch education ...
11,2357581,0,1,1267364.0,etherflyerofficial,04/11/2017 15:01,131.0,15612.0,False,"Bounty, ICO",...,78.0,0.0,51.0,3.0,TCASH New Decentralized Exchange Platform,NOTE TCASH Official Main Thread TCASH Eth...,new exchange platform,official main thread truly solid exchange powe...,,new exchange platform official main thread tru...
13,2457005,0,1,1280873.0,kimmanuel,24/11/2017 21:28,2.0,211.0,False,Other,...,0.0,0.0,0.0,0.0,I newbie I would like join Bounties program,I new I understand much technical part I w...,would like join program,new understand much technical would like enter...,,would like join program new understand much te...
14,2767794,0,1,527272.0,Wapinter,16/01/2018 20:22,34.0,1166.0,False,Bounty,...,0.0,0.0,27.0,3.0,NapoleonX Translation Paid In ETH,Our community present following platform ...,translation,community present following twitter medium sla...,start working translation get approval,translation community present following twitte...
