In [1]:
# Import necessary modules

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from nrclex import NRCLex
import re
import emoji
import nltk

In [2]:
# NLTK downloads

# Fetch every NLTK resource this notebook relies on, quietly and in order.
# Indexing with [-1] leaves the return value of the last download as the
# cell's value, so the notebook echoes it.
[
    nltk.download(resource, quiet=True)
    for resource in (
        'stopwords',
        'punkt',      # Needed for tokenization
        'wordnet',    # Needed for lemmatization
        'punkt_tab',  # Needed for nrclex
    )
][-1]

True

In [3]:
def process_hashtags(text):
    """Replace every ``#hashtag`` in ``text`` with its plain, lower-case words.

    The leading ``#`` is dropped, a space is inserted at each boundary where a
    lower-case ASCII letter is followed by an upper-case one (so ``#SadDay``
    becomes ``sad day``), and the result is lower-cased. Text outside hashtags
    is returned unchanged.

    All hashtags are rewritten in a single left-to-right pass, so a tag that
    is a prefix of a later tag (``#Sad`` followed by ``#SadDay``) cannot
    partially overwrite the longer one.

    Args:
        text: The string to process.

    Returns:
        A new string with each hashtag expanded as described above.
    """
    def _expand(match):
        # group(1) is the tag body without the leading '#'.
        spaced = re.sub(r'(?<=[a-z])(?=[A-Z])', ' ', match.group(1))
        return spaced.lower()  # Lower-case only after the split points are found

    return re.sub(r'#(\w+)', _expand, text)


In [4]:
# Read bin_reddit1.csv into a DataFrame and preview its first five rows

df = pd.read_csv(filepath_or_buffer='bin_reddit1.csv')
df.head(n=5)

Unnamed: 0,text,label,Unnamed: 3
0,aa glad fun paint night sky,0,
1,abandonment massive fear trigger suicidal,1,
2,ability induce anxiety gift god,0,
3,ability write complex business,0,
4,Q,0,


In [5]:
import re
import emoji
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string


# Lemmatizer used for the final token-normalization step
lemmatizer = WordNetLemmatizer()


# Clean the `text` column step by step, build a per-row `tokens` list,
# then divide the rows into two frames according to `label`.

# Remove all URLs
df['text'] = df['text'].str.replace(r'http\S+', '', regex=True)

# Remove all mentions
df['text'] = df['text'].str.replace(r'@\S+', '', regex=True)

# Convert hashtags to normal text.
# This has to happen BEFORE lower-casing: process_hashtags splits CamelCase
# tags at lower-case -> upper-case boundaries, and those boundaries no longer
# exist once the text is lower-cased.
df['text'] = df['text'].apply(process_hashtags)

# Normalize case
df['text'] = df['text'].str.lower()

# Remove numbers
df['text'] = df['text'].str.replace(r'\d+', '', regex=True)

# Replace emojis with their text meaning (space-delimited instead of ':name:')
df['text'] = df['text'].apply(lambda x: emoji.demojize(x, delimiters=(" ", " ")))

# Tokenize the text on whitespace
df['tokens'] = df['text'].apply(lambda x: x.split())

# Remove punctuation from tokens.
# Leading/trailing punctuation is stripped from every token ("sad," -> "sad")
# and tokens that consist only of punctuation are dropped. Testing
# `word in string.punctuation` would be a substring check against the
# punctuation string and would leave punctuation attached to words in place.
df['tokens'] = df['tokens'].apply(
    lambda tokens: [
        stripped
        for stripped in (word.strip(string.punctuation) for word in tokens)
        if stripped
    ]
)

# Remove stop words from tokens
stop_words = set(stopwords.words('english'))
df['tokens'] = df['tokens'].apply(lambda tokens: [word for word in tokens if word not in stop_words])

# TODO: replacing elongated words (e.g. "soooo") with their normal form is
# not implemented yet.


# Perform lemmatization on tokens
df['tokens'] = df['tokens'].apply(lambda tokens: [lemmatizer.lemmatize(word) for word in tokens])


# Split the rows by label value
df_class_0 = df[df['label'] == 0]
df_class_1 = df[df['label'] == 1]

print("DataFrame for label 0:")
display(df_class_0.head())

print("\nDataFrame for label 1:")
display(df_class_1.head())

print("\nDataFrame with cleaned tokens:")
display(df.head())

DataFrame for label 0:


Unnamed: 0,text,label,Unnamed: 3,tokens
0,aa glad fun paint night sky,0,,"[aa, glad, fun, paint, night, sky]"
2,ability induce anxiety gift god,0,,"[ability, induce, anxiety, gift, god]"
3,ability write complex business,0,,"[ability, write, complex, business]"
4,q,0,,[q]
5,abortion contraception sex work sex educatio...,0,,"[abortion, contraception, sex, work, sex, educ..."



DataFrame for label 1:


Unnamed: 0,text,label,Unnamed: 3,tokens
1,abandonment massive fear trigger suicidal,1,,"[abandonment, massive, fear, trigger, suicidal]"
7,absence mental illness doesnt presence menta...,1,,"[absence, mental, illness, doesnt, presence, m..."
11,absolute bastard odd,1,,"[absolute, bastard, odd]"
14,absolute despair,1,,"[absolute, despair]"
17,absolute not enjoy,1,,"[absolute, enjoy]"



DataFrame with cleaned tokens:


Unnamed: 0,text,label,Unnamed: 3,tokens
0,aa glad fun paint night sky,0,,"[aa, glad, fun, paint, night, sky]"
1,abandonment massive fear trigger suicidal,1,,"[abandonment, massive, fear, trigger, suicidal]"
2,ability induce anxiety gift god,0,,"[ability, induce, anxiety, gift, god]"
3,ability write complex business,0,,"[ability, write, complex, business]"
4,q,0,,[q]


In [6]:
# Report the shape of each per-label frame
for label_value, class_frame in ((0, df_class_0), (1, df_class_1)):
    print(f"Size of DataFrame for label {label_value}:", class_frame.shape)

Size of DataFrame for label 0: (58405, 4)
Size of DataFrame for label 1: (41185, 4)


In [7]:
from imblearn.over_sampling import RandomOverSampler

# Input text and target labels used by the feature-extraction cells.
# NOTE(review): this reads the `text` column, so the stop-word removal and
# lemmatization stored in `tokens` do not reach the features - confirm that
# this is intended.
X, y = df['text'], df['label']


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build TF-IDF features from X, keeping at most 5000 vocabulary terms
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_tfidf = tfidf_vectorizer.fit_transform(X)

print(f"Shape of TF-IDF features: {X_tfidf.shape}")

Shape of TF-IDF features: (99590, 5000)


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
from nrclex import NRCLex


# X holds the preprocessed text; nrclex is fed one plain Python string at a
# time, so materialize the Series as a list first
text_list = list(X)

# Function to extract NRC emotions and create a DataFrame
def extract_nrc_features(text_list):
    """Build a table of raw NRC emotion counts, one row per input text.

    Each text is analysed with ``NRCLex`` and its ``raw_emotion_scores``
    dictionary becomes one row of the result. An emotion that a text does not
    score for is recorded as 0.

    The result always has a column for each category in ``nrc_emotions``
    below, so its width does not depend on which emotions happen to occur in
    ``text_list``. Columns that occur in the data keep their first-seen
    order; any listed category that never occurs is appended as an all-zero
    column.

    Args:
        text_list: Iterable of strings to analyse.

    Returns:
        pandas.DataFrame with one row per text and one column per emotion.
    """
    # Categories that appear as columns in this notebook's displayed output.
    nrc_emotions = (
        'anticipation', 'joy', 'positive', 'anger', 'fear',
        'negative', 'sadness', 'surprise', 'disgust', 'trust',
    )

    # One {emotion: count} dictionary per text
    records = [NRCLex(text).raw_emotion_scores for text in text_list]

    # Emotions missing from a given text show up as NaN; count them as 0
    nrc_df = pd.DataFrame(records).fillna(0)

    # Guarantee a stable schema even if a category never occurred in the data
    for emotion in nrc_emotions:
        if emotion not in nrc_df.columns:
            nrc_df[emotion] = 0.0
    return nrc_df

# Extract NRC emotion counts for every text in text_list
X_nrc_features = extract_nrc_features(text_list)

print(f"Shape of NRC features: {X_nrc_features.shape}")
display(X_nrc_features.head())

# Rescale each NRC column with min-max scaling
scaler = MinMaxScaler()
scaler.fit(X_nrc_features)
X_nrc_features_scaled = scaler.transform(X_nrc_features)

# Dense copy of the sparse TF-IDF matrix (X_tfidf), for column-wise
# concatenation with the dense NRC features.
# NOTE(review): densifying every row times 5000 TF-IDF columns is
# memory-heavy - confirm the dense form is really required.
X_tfidf_dense = X_tfidf.todense()


Shape of NRC features: (99590, 10)


Unnamed: 0,anticipation,joy,positive,anger,fear,negative,sadness,surprise,disgust,trust
0,2.0,2.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,3.0,3.0,3.0,2.0,1.0,1.0,0.0
2,3.0,2.0,3.0,1.0,2.0,1.0,1.0,1.0,0.0,1.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
from scipy.sparse import csr_matrix, hstack as sparse_hstack

# Append the scaled NRC columns to the right of the TF-IDF columns.
# The concatenation is done on sparse matrices, so the mostly-zero TF-IDF
# block is never expanded into a full dense array just to be converted back
# to CSR afterwards.
X_combined_sparse = sparse_hstack(
    [X_tfidf, csr_matrix(X_nrc_features_scaled)],
    format='csr',
)

# Same matrix under the name the shape report below uses; it is sparse, not
# a dense copy.
X_combined_features = X_combined_sparse
print("\nShape of combined features:", X_combined_features.shape)


Shape of combined features: (99590, 5010)


In [11]:
save_folder = 'processed'

from scipy.sparse import save_npz
import numpy as np
import pandas as pd
import os

# Create the save folder if it doesn't exist. exist_ok=True makes this a
# single call with no separate existence check.
os.makedirs(save_folder, exist_ok=True)

# Save the TF-IDF features (sparse matrix, .npz)
save_npz(os.path.join(save_folder, 'X_tfidf.npz'), X_tfidf)

# Save the scaled NRC features (as a dense numpy array)
np.save(os.path.join(save_folder, 'X_nrc_features_scaled.npy'), X_nrc_features_scaled)

# Save the combined TF-IDF + NRC features (sparse matrix, .npz)
save_npz(os.path.join(save_folder, 'X_combined_sparse.npz'), X_combined_sparse)

# Save the target variable
np.save(os.path.join(save_folder, 'y.npy'), y)