In [18]:
import os
import re
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords

# Stop words that must survive filtering (negation / modality markers).
_KEPT_STOPWORDS = frozenset({'not', 'can'})
# Lazily-built set of stop words to drop; stays None until the first call.
_STOPWORD_CACHE = None


def _removable_stopwords():
    """Return the English stop words to drop, reading the NLTK list only once."""
    global _STOPWORD_CACHE
    if _STOPWORD_CACHE is None:
        # A frozenset gives constant-time membership tests; the original
        # re-evaluated stopwords.words('english') for every single word.
        _STOPWORD_CACHE = frozenset(stopwords.words('english')) - _KEPT_STOPWORDS
    return _STOPWORD_CACHE


def text_preprocessing(s):
    """
    Normalise one piece of raw text for downstream modelling.

    Steps, in order:
    - Lowercase the text
    - Rewrite a contraction ending in "'t" (straight or typographic
      apostrophe) as " not", e.g. "can't" -> "can not"
    - Remove "@name" mentions, including one at the very end of the text
    - Keep "?" as a separate token; replace every other character that is
      not a word character or whitespace with a space
    - Remove English stop words except "not" and "can"
    - Collapse whitespace runs to single spaces and strip both ends

    Args:
        s (str): raw text.

    Returns:
        str: the cleaned text; may be an empty string when nothing survives.
    """
    s = s.lower()
    # Only rewrite 't when it follows a word character and ends the word, so
    # an opening quote before a t-word ("'the") is not turned into " nothe".
    # \u2019 is the typographic apostrophe. The leftover stem ("don", "isn",
    # ...) is presumably removed by the stop-word filter below -- this
    # depends on the contents of the NLTK list.
    s = re.sub(r"(?<=\w)['\u2019]t\b", " not", s)
    # Remove @name. Matching up to the next whitespace OR the end of the text
    # also catches a trailing mention, which the previous pattern missed.
    s = re.sub(r'@\S*', ' ', s)
    # Isolate '?' so it becomes its own token.
    s = re.sub(r'\?', ' ? ', s)
    # Everything that is not a word character, whitespace or '?' becomes a
    # space. This already covers ; : | and similar symbols, so no separate
    # special-character pass is needed.
    s = re.sub(r'[^\w\s?]', ' ', s)
    # Drop stop words ('not' and 'can' are excluded from the drop set).
    # split()/join() also collapses whitespace and trims both ends.
    drop = _removable_stopwords()
    return " ".join(word for word in s.split() if word not in drop)

def _report_and_drop_nulls(frame):
    """Print per-column null counts for ``frame`` and return it without null rows."""
    print("Total null values in CSV:")
    print(frame.isnull().sum())
    return frame.dropna()


def preprocess_data(input_file, output_file):
    """
    Clean the ``post_text`` column of a CSV file and write the result to a new CSV.

    The file is loaded, rows containing nulls are reported and dropped,
    ``text_preprocessing`` is applied to every ``post_text`` value, rows whose
    text ended up empty are reported and dropped, and the remaining rows are
    written to ``output_file`` without the index.

    Args:
        input_file: path of the CSV to read; it must contain a ``post_text`` column.
        output_file: path of the CSV to write.

    Returns:
        None. The null-count reports are printed to stdout.
    """
    raw = pd.read_csv(input_file)

    # First pass: drop rows that are already incomplete in the source file.
    complete = _report_and_drop_nulls(raw)

    texts = complete['post_text'].apply(text_preprocessing)
    # Preprocessing can reduce a text to '' (e.g. only stop words or
    # punctuation). An empty string is not null, so the second drop below
    # would keep it, yet it is written as an empty CSV field that read_csv
    # loads back as NaN. Mark such texts as missing so they are removed here.
    texts = texts.mask(texts == '')

    # Second pass: drop rows whose text was emptied by preprocessing.
    # assign() builds a new frame, avoiding in-place mutation of a dropna() result.
    cleaned = _report_and_drop_nulls(complete.assign(post_text=texts))

    # Save the preprocessed data to a new CSV file
    cleaned.to_csv(output_file, index=False)

# Run the full cleaning pipeline when this cell/script is executed directly.
if __name__ == "__main__":
    preprocess_data(
        input_file="dataset_3.csv",
        output_file="preprocessed_data.csv",
    )


Total null values in CSV:
post_text    0
label        0
dtype: int64
Total null values in CSV:
post_text    0
label        0
dtype: int64


In [19]:
# Reload the file written by preprocess_data and preview its first five rows.
df = pd.read_csv('preprocessed_data.csv')
df.head(n=5)

Unnamed: 0,post_text,label
0,2 years since diagnosed anxiety depression tod...,depression
1,sunday need break planning spend little time p...,depression
2,awake tired need sleep brain ideas,depression
3,rt retro bears make perfect gifts great beginn...,depression
4,hard say whether packing lists making life eas...,depression
