In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import linkage
import re
import numpy as np
from matplotlib import pyplot as plt
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize


# sklearn imports
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.decomposition import NMF
from sklearn.cluster import DBSCAN
from sklearn.cluster import AffinityPropagation
from sklearn.cluster import MeanShift
from sklearn.cluster import SpectralClustering
from sklearn.cluster import OPTICS
from sklearn.cluster import Birch
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import MiniBatchKMeans

# adaboost imports
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
from sklearn.metrics import f1_score
# import cross_val_score
from sklearn.model_selection import cross_val_score

# LogisticRegression
from sklearn.linear_model import LogisticRegression

# Tree imports
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn import tree
from sklearn.tree import plot_tree
from sklearn.tree import export_text

from tqdm import tqdm
# from alive_progress import alive_bar

# import CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
cancel_words = ['ocd','aut*','autism','obsess*','compuls*','disorder','diagnosis']
autism_columns_to_keep = ['author', 'author_flair_richtext', 'author_flair_type','created_utc', 'id', 'is_video', 'selftext', 'title', 'is_original_content','media_only', 'author_fullname','target']
ocd_columns_to_keep = ['author', 'author_flair_richtext', 'author_flair_type','created_utc', 'id', 'is_video', 'selftext', 'title', 'is_original_content','media_only', 'author_fullname','target']

# Helper Functions

In [4]:

def get_keywords(post):
    """Get the keywords from a post"""
    # Get the keywords from the post
    keywords = set()
    for word in re.split("\W+", post.text):
        if word in keywords:
            continue
        else:
            keywords.add(word)
    return keywords


# define the stop words list
stop_words = set(stopwords.words("english"))

# Remove Punctuation
def remove_punctuation(text):
    """Remove punctuation from a string"""
    return ''.join(ch for ch in text if ch not in stop_words)

# Lower Case
def lowercase(text):
    """Lower case a string"""
    return text.lower()

def censor_words(text):
    """
    censor_words takes in a string and replaces all words that are in the cancel_words list with ''

    Parameters

    :param text: string to be censored
    :type text: str
    :return: the censored string
    :rtype: str
    """
    text = str(text) # convert to string if not already

    text = text.lower()
    # Remove all words that begin with 'aut' from the sentence and return the result
    # regex pattern
    pattern = r'aut(.*?)[^a-zA-Z]' # aut followed by any number of characters then ending in any character that is not a letter
    # replace those pattern matches with '' (nothing)
    text =  re.sub(pattern, '', text) # replace the pattern matches with '' (nothing)
    
    # pattern 2 - remove all words that begin with 'ocd' from the sentence and return the result
    pattern = r'ocd(.*?)[^a-zA-Z]' # ocd followed by any number of characters then ending in any character that is not a letter
    # replace those pattern matches with '' (nothing)
    text =  re.sub(pattern, '', text) # replace the pattern matches with '' (nothing)

    # pattern 3 - remove all words that begin with 'obsess' from the sentence and return the result
    pattern = r'obsess|compuls(.*?)[^a-zA-Z]' # obsess followed by any number of characters then ending in any character that is not a letter
    # replace those pattern matches with '' (nothing)
    text =  re.sub(pattern, '', text) # replace the pattern matches with '' (nothing)
    return text # return the result


#* Process the text with these functions

def preprocess(df_ocd, df_autism, cancel_words, ocd_columns_to_keep, autism_columns_to_keep):
    """
    preprocess the dataframes by removing the columns that are not needed, removing the rows that have null values, and removing the rows that contain the cancel words

    _extended_summary_

    :param df_ocd: _description_
    :type df_ocd: _type_
    :param df_autism: _description_
    :type df_autism: _type_
    :param cancel_words: _description_
    :type cancel_words: _type_
    :param ocd_columns_to_keep: _description_
    :type ocd_columns_to_keep: _type_
    :param autism_columns_to_keep: _description_
    :type autism_columns_to_keep: _type_
    """
    # drop columns with more than 50% missing values from the dataframes
    print(f'Dimensions before dropping columns with more than 50% missing values: {df_ocd.shape} for OCD and {df_autism.shape} for Autism')
    df_ocd = df_ocd.dropna(thresh=0.5*len(df_ocd), axis=1)
    df_autism = df_autism.dropna(thresh=0.5*len(df_autism), axis=1)
    print(f'Dimensions after dropping columns with more than 50% missing values: {df_ocd.shape} for OCD and {df_autism.shape} for Autism')
    print(f'columns in df_ocd: {df_ocd.columns}')

    #* Only keep the columns in these two dataframes that are in both dataframes and are in the lists below

    # drop columns that are not in the lists above
    print(f'Dimensions before dropping columns that are not in the lists above: {df_ocd.shape} for OCD and {df_autism.shape} for Autism')
    df_ocd = df_ocd[ocd_columns_to_keep] 
    df_autism = df_autism[autism_columns_to_keep]
    print(f'Dimensions after dropping columns that are not in the lists above: {df_ocd.shape} for OCD and {df_autism.shape} for Autism')

    # Now remove any posts from these dataframes where the `is_video` or `media_only` columsn are True
    print(f'Dimensions before removing posts where `is_video` or `media_only` columns are True: {df_ocd.shape} for OCD and {df_autism.shape} for Autism')
    df_ocd = df_ocd[(df_ocd['is_video'] == False) & (df_ocd['media_only'] == False)]
    df_autism = df_autism[(df_autism['is_video'] == False) & (df_autism['media_only'] == False)]
    print(f'Dimensions after removing posts where `is_video` or `media_only` columns are True: {df_ocd.shape} for OCD and {df_autism.shape} for Autism')

    # and now we can drop the `is_video` and `media_only` columns
    df_ocd = df_ocd.drop(columns=['is_video', 'media_only'])
    df_autism = df_autism.drop(columns=['is_video', 'media_only'])
    print(f'Dropped the `is_video` and `media_only` columns')

    # some posts are in the title column and some are in the selftext column so we need to combine these columns into one column if they are long enough.
    # find the median length of the title and selftext columns combined for each dataframe
    med_len_title_selftext_ocd = df_ocd.title.str.len().add(df_ocd.selftext.str.len()).median()
    med_len_title_selftext_autism = df_autism.title.str.len().add(df_autism.selftext.str.len()).median()
    print(f'Median length of title and selftext columns combined for OCD: {med_len_title_selftext_ocd}')
    print(f'Median length of title and selftext columns combined for Autism: {med_len_title_selftext_autism}')

    # how many posts have a title and selftext combined that are longer than the median length of the title and selftext columns combined for each dataframe?
    print(f'Acceptable number of OCD posts: {len(df_ocd[df_ocd.title.str.len().add(df_ocd.selftext.str.len()) > med_len_title_selftext_ocd])}')
    print(f'Acceptable number of Autism posts: {len(df_autism[df_autism.title.str.len().add(df_autism.selftext.str.len()) > med_len_title_selftext_autism])}')

    # drop author_flair_richtext
    df_ocd = df_ocd.drop(columns=['author_flair_richtext'])
    df_autism = df_autism.drop(columns=['author_flair_richtext'])

    # how many posts are there for the top 100 authors in each dataframe?
    top_authors_ocd = df_ocd.author.value_counts().head(100)
    top_authors_byfullname_ocd = df_ocd.author_fullname.value_counts().head(100)
    top_authors_autism = df_autism.author.value_counts().head(100)
    top_authors_byfullname_autism = df_autism.author_fullname.value_counts().head(100)

    # are there any authors that are in both dataframes?
    print(f'Number of authors that are in both dataframes: {len(set(top_authors_ocd.index).intersection(set(top_authors_autism.index)))}')
    list_of_cross_posters = list(set(top_authors_ocd.index).intersection(set(top_authors_autism.index)))
    print(f'List of authors that are in both dataframes: {list_of_cross_posters}')

    # drop author_flair_type and author_fullname columns from both dataframes
    df_ocd = df_ocd.drop(columns=['author_flair_type', 'author_fullname'])
    df_autism = df_autism.drop(columns=['author_flair_type', 'author_fullname'])

    # combine the title and self text columns into one column with the format `title - selftext`
    df_ocd['title_selftext'] = df_ocd.title + ' - ' + df_ocd.selftext
    df_autism['title_selftext'] = df_autism.title + ' - ' + df_autism.selftext

    # drop the title and selftext columns
    df_ocd = df_ocd.drop(columns=['title', 'selftext'])
    df_autism = df_autism.drop(columns=['title', 'selftext'])

    # rename the `title_selftext` column to `selftext`
    df_ocd = df_ocd.rename(columns={'title_selftext': 'selftext'})
    df_autism = df_autism.rename(columns={'title_selftext': 'selftext'})

    # apply the censor_words function to the selftext column of each dataframe
    df_ocd['selftext'] = df_ocd['selftext'].apply(censor_words)

    # remove punctuation
    df_ocd['selftext'] = df_ocd['selftext'].str.replace('[^\w\s]','')
    # remove numbers
    df_ocd['selftext'] = df_ocd['selftext'].str.replace('\d+', '')
    # remove whitespace
    df_ocd['selftext'] = df_ocd['selftext'].str.replace('\s+', ' ')

    # do the same for the autism dataframe
    df_autism['selftext'] = df_autism['selftext'].apply(censor_words)
    # remove punctuation
    df_autism['selftext'] = df_autism['selftext'].str.replace('[^\w\s]','')
    # remove numbers
    df_autism['selftext'] = df_autism['selftext'].str.replace('\d+', '')
    # remove whitespace
    df_autism['selftext'] = df_autism['selftext'].str.replace('\s+', ' ')

    # remove words from posts that are in the cancel_words list. There are regex patterns in the cancel_words list so we need to use the `regex=True` parameter

    # then remove double spaces
    df_ocd['selftext'] = df_ocd['selftext'].str.replace('  ', ' ')
    df_autism['selftext'] = df_autism['selftext'].str.replace('  ', ' ')

    # make a new dataframe called df_reddit that combines the two dataframes

    df_reddit = pd.DataFrame(columns=df_ocd.columns)
    # what is the length of the shorter dataframe?
    if len(df_ocd) < len(df_autism): # if the OCD dataframe is shorter
        shorter_df = df_ocd # set the shorter dataframe to the OCD dataframe
        longer_df = df_autism # set the longer dataframe to the Autism dataframe
        df_reddit.append()
    else: # if the Autism dataframe is shorter
        shorter_df = df_autism
        longer_df = df_ocd

    # add the shorter dataframe to the new dataframe using concat
    df_reddit = pd.concat([df_reddit, shorter_df], axis=0)
    # shorten the longer dataframe to the length of the shorter dataframe
    longer_df = longer_df.head(len(shorter_df))
    # add the shortened longer dataframe to the new dataframe using concat
    df_reddit = pd.concat([df_reddit, longer_df], axis=0)

    # reset the index
    df_reddit = df_reddit.reset_index(drop=True)

    # shuffle the dataframe
    df_reddit = df_reddit.sample(frac=1).reset_index(drop=True)
    # check the dimensions of the new dataframe
    print(f'Dimensions of the new dataframe: {df_reddit.shape}')

    # double check that the number of posts for each subreddit is the same
    print(f'Number of posts for OCD: {len(df_reddit[df_reddit.target == 1])}')
    print(f'Number of posts for Autism: {len(df_reddit[df_reddit.target == 0])}')



# Section 1: Importing and Loading Data

In [5]:
# Section 1: Importing and Loading Data
# opening the scraped data saved in csv files and creating a dataframe for each
df_ocd = pd.read_csv('../data/ocd_thread.csv')
df_autism = pd.read_csv('../data/autism_thread.csv')

# creating a target column for each dataframe
df_ocd['target'] = 1
df_autism['target'] = 0

  df_ocd = pd.read_csv('../data/ocd_thread.csv')
  df_autism = pd.read_csv('../data/autism_thread.csv')


# Section One. Data Cleaning

In [7]:
# Section 2: Data Cleaning
preprocess(df_ocd, df_autism, cancel_words, ocd_columns_to_keep, autism_columns_to_keep) # calling the preprocess function


Dimensions before dropping columns with more than 50% missing values: (41449, 93) for OCD and (25750, 90) for Autism
Dimensions after dropping columns with more than 50% missing values: (41449, 51) for OCD and (25750, 52) for Autism
columns in df_ocd: Index(['author', 'author_flair_richtext', 'author_flair_type', 'can_mod_post',
       'contest_mode', 'created_utc', 'domain', 'full_link', 'id',
       'is_crosspostable', 'is_reddit_media_domain', 'is_self', 'is_video',
       'link_flair_richtext', 'link_flair_text_color', 'link_flair_type',
       'locked', 'num_comments', 'num_crossposts', 'over_18',
       'parent_whitelist_status', 'permalink', 'pinned', 'retrieved_on',
       'score', 'selftext', 'spoiler', 'stickied', 'subreddit', 'subreddit_id',
       'subreddit_type', 'thumbnail', 'title', 'url', 'whitelist_status',
       'send_replies', 'no_follow', 'subreddit_subscribers',
       'is_original_content', 'pwls', 'wls', 'media_only', 'is_meta',
       'author_fullname', 'gildi

AttributeError: 'float' object has no attribute 'lower'

#### A preview of each dataframe after cleaning

In [None]:
df_autism.head(2)

In [None]:
df_ocd.head(2)

In [None]:
# how many authors are in each dataframe?
print(f'Number of authors in df_ocd: {len(df_ocd.author.unique())}')
print(f'Number of authors in df_autism: {len(df_autism.author.unique())}')
