In [2]:

import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import download as nltk_download
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import words as nltk_words

# Download necessary resources
nltk_download('stopwords')
nltk_download('punkt')
nltk_download('wordnet')
nltk_download('words')  # To use a dictionary-based filtering

# Load dataset
df = pd.read_csv('C:\\Users\\death\\Desktop\\minor_project\\datasets\\combined_dataset_before_pre_processing\\combined_data_before_preprocessing.csv')

# Dictionary of valid English words (lowercase for comparison)
valid_words = set(w.lower() for w in nltk_words.words())

# Function to preprocess tweets
def preprocess_tweet(tweet):
    # Remove URLs
    tweet = re.sub(r'http\S+|www\S+|https\S+', '', tweet)
    # Remove hashtags
    tweet = re.sub(r'#', '', tweet)
    # Replace mentions (@user)
    tweet = re.sub(r'@[^\s]+', 'USER', tweet)
    # Remove continuously repeated symbols or characters
    tweet = re.sub(r'(.)\1+', r'\1', tweet)  # AAAB -> AA
    # Convert to lowercase
    tweet = tweet.lower()
    return tweet

# Apply preprocessing to tweets
df['clean_tweet'] = df['tweet'].apply(preprocess_tweet)

# Tokenization, Lemmatization, and Stopwords removal
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def tokenize_and_lemmatize(text):
    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    # Remove stopwords, single characters, and gibberish-like words
    tokens = [token for token in tokens if len(token) > 1 and token not in stop_words and (token in valid_words or re.match(r'^[a-z]+$', token))]
    return tokens

# Apply tokenization and lemmatization
df['tokenized_tweet'] = df['clean_tweet'].apply(tokenize_and_lemmatize)

# Convert tokenized tweets back to strings
df['clean_tweet_processed'] = df['tokenized_tweet'].apply(lambda x: ' '.join(x))

# Keep only the 'class', 'tweet', and 'clean_tweet_processed' columns
df = df[['class', 'tweet', 'clean_tweet_processed']]

# Save preprocessed data
df.to_csv('preprocessed_data_multiclass.csv', index=False)



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\death\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\death\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\death\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\death\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


In [3]:
import pandas as pd

# Load the CSV file into a DataFrame
csv_file_path = 'preprocessed_data_multiclass.csv'  # Change to your CSV file path
df = pd.read_csv(csv_file_path)

# Update the 'class' column: Keep 'CONTROL' as is, change others to 'DIAGNOSED'
df['class'] = df['class'].apply(lambda x: 'CONTROL' if x.upper() == 'CONTROL' else 'DIAGNOSED')

# Save the updated DataFrame to a new CSV file
output_csv_path = 'preprocessed_data_binary.csv'  # Output CSV file path
df.to_csv(output_csv_path, index=False)

print(f"Updated CSV file saved to {output_csv_path}")


Updated CSV file saved to preprocessed_data_binary.csv


In [1]:
import pandas as pd

# Load your CSV file
csv_file_path = 'preprocessed_data_multiclass.csv'  # Replace with your CSV file path
df = pd.read_csv(csv_file_path)

# Display the number of rows before removing missing data
print("Number of rows before removing rows with null values:", len(df))

# Drop rows with any missing/null values
df_cleaned = df.dropna()

# Display the number of rows after removing missing data
print("Number of rows after removing rows with null values:", len(df_cleaned))

# Optionally, you can save the cleaned DataFrame to a new CSV file
output_csv_path = 'C:\\Users\\death\\Desktop\\minor_project\\datasets\\main_dataset\\preprocessed_data_multiclass.csv'  # Change this to your desired output path
df_cleaned.to_csv(output_csv_path, index=False)

print("First few rows of the cleaned DataFrame:")
print(df_cleaned.head())  # Display the first few rows of the cleaned DataFrame


Number of rows before removing rows with null values: 5170493
Number of rows after removing rows with null values: 5163974
First few rows of the cleaned DataFrame:
  class                                              tweet  \
0  ADHD  "@USER AAABDVSGJS NOO you're too kind 😭🥺 more ...   
1  ADHD                    "wow! 🤩 two years here HTTPURL"   
2  ADHD  "nothing can compare to RK900 manboobies thoug...   
3  ADHD  "too bad his titties weren't bara sized huge a...   
4  ADHD  "ended up paying more than what was initially ...   

                               clean_tweet_processed  
0  user abdvsgjs kind like plagued everyone gdjsh...  
1                                wow two year htpurl  
2             nothing compare manbobies though setle  
3  bad tities bara sized huge rather wa flat cant...  
4  ended paying wa initialy agred hairdreser guy ...  


In [2]:
import pandas as pd

# Load your CSV file
csv_file_path = 'preprocessed_data_binary.csv'  # Replace with your CSV file path
df = pd.read_csv(csv_file_path)

# Display the number of rows before removing missing data
print("Number of rows before removing rows with null values:", len(df))

# Drop rows with any missing/null values
df_cleaned = df.dropna()

# Display the number of rows after removing missing data
print("Number of rows after removing rows with null values:", len(df_cleaned))

# Optionally, you can save the cleaned DataFrame to a new CSV file
output_csv_path = 'C:\\Users\\death\\Desktop\\minor_project\\datasets\\main_dataset\\preprocessed_data_binary.csv'  # Change this to your desired output path
df_cleaned.to_csv(output_csv_path, index=False)

print("First few rows of the cleaned DataFrame:")
print(df_cleaned.head())  # Display the first few rows of the cleaned DataFrame


Number of rows before removing rows with null values: 5170493
Number of rows after removing rows with null values: 5163974
First few rows of the cleaned DataFrame:
       class                                              tweet  \
0  DIAGNOSED  "@USER AAABDVSGJS NOO you're too kind 😭🥺 more ...   
1  DIAGNOSED                    "wow! 🤩 two years here HTTPURL"   
2  DIAGNOSED  "nothing can compare to RK900 manboobies thoug...   
3  DIAGNOSED  "too bad his titties weren't bara sized huge a...   
4  DIAGNOSED  "ended up paying more than what was initially ...   

                               clean_tweet_processed  
0  user abdvsgjs kind like plagued everyone gdjsh...  
1                                wow two year htpurl  
2             nothing compare manbobies though setle  
3  bad tities bara sized huge rather wa flat cant...  
4  ended paying wa initialy agred hairdreser guy ...  
