# 1. Loading and Exploring the Data

In [16]:
import pandas as pd

# Load your CSV data
file_path = 'data/01.original/original_data.csv' 
data = pd.read_csv(file_path)

# Check the first few rows of the dataset to understand its structure
data.head()


Unnamed: 0,type,posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired.|||That's another silly misconce...


# 2. Break down each row with some related posts

In [18]:
# Function to split posts and preserve the unique identifier
def split_posts(row):
    posts = row['posts'].split('|||')
    return pd.DataFrame({'type': row['type'], 'post': posts})

# Apply the function to split posts and concatenate the results
split_df = pd.concat(data.apply(lambda row: split_posts(row), axis=1).tolist(), ignore_index=True)

# Save the split DataFrame to a new CSV file
split_df.to_csv('split_data.csv', index=False)
split_df.head()

Unnamed: 0,type,post
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw
1,INFJ,http://41.media.tumblr.com/tumblr_lfouy03PMA1q...
2,INFJ,enfp and intj moments https://www.youtube.com...
3,INFJ,What has been the most life-changing experienc...
4,INFJ,http://www.youtube.com/watch?v=vXZeYwwRDw8 h...
...,...,...
995,INTP,"But either way, he'd get there by use of his N..."
996,INTP,Every edgy NTP and their mother acts on their ...
997,INTP,it's cause you spend all your time irl ignorin...
998,INTP,just gonna ignore all your rules. Si - Normal...


# 3. Cleaning the Data

In [20]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Download NLTK resources (if not done already)
nltk.download('punkt')
nltk.download('stopwords')

# Tokenization, stopwords removal, and stemming function
def preprocess_text(text):
    tokens = nltk.word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]
    return ' '.join(stemmed_tokens)

# Apply preprocessing to the 'posts' column
split_df['post'] = split_df['post'].apply(preprocess_text)
split_df.head()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Alireza\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Alireza\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,type,post
0,INFJ,'http : //www.youtube.com/watch ? v=qsxhcwe3krw
1,INFJ,http : //41.media.tumblr.com/tumblr_lfouy03pma...
2,INFJ,enfp intj moment http : //www.youtube.com/watc...
3,INFJ,life-chang experi life ?
4,INFJ,http : //www.youtube.com/watch ? v=vxzeywwrdw8...


# 4. Remove URLs

In [22]:
import re

# Function to remove URLs from text
def remove_urls(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)

# Function to remove invalid characters and non-alphabetic words
def clean_text(text):
    # Remove URLs
    text = remove_urls(text)
    # Remove non-alphabetic characters and extra spaces
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = re.sub(' +', ' ', text)  # Replace multiple spaces with a single space
    return text.strip()

# Apply URL removal and text cleaning to the 'posts' column
split_df['post'] = split_df['post'].apply(clean_text)
split_df.head()

Unnamed: 0,type,post
0,INFJ,http vqsxhcwekrw
1,INFJ,http mediatumblrcomtumblrlfouypmaqarooojpg
2,INFJ,enfp intj moment http vizlegxm sportscent top ...
3,INFJ,lifechang experi life
4,INFJ,http vvxzeywwrdw http vuejamdp repeat today


# 5. Remove empty posts

In [25]:
# Remove records with empty post values
split_df = split_df[split_df['post'].str.strip() != '']

# 6. Saving the processed data 

In [26]:
split_df.to_csv('data/02.processed/processed_data.csv', index=False)