# Data Cleaning and Preprocessing for Modeling

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import linkage
import re
import numpy as np
from matplotlib import pyplot as plt
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import pandas_profiling
from pandas_profiling import ProfileReport

# sklearn imports
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.decomposition import NMF
from sklearn.cluster import DBSCAN
from sklearn.cluster import AffinityPropagation
from sklearn.cluster import MeanShift
from sklearn.cluster import SpectralClustering
from sklearn.cluster import OPTICS
from sklearn.cluster import Birch
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import MiniBatchKMeans

# adaboost imports
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
from sklearn.metrics import f1_score
# import cross_val_score
from sklearn.model_selection import cross_val_score

from math import floor
from sklearn.preprocessing import StandardScaler

# LogisticRegression
from sklearn.linear_model import LogisticRegression

# Tree imports
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn import tree
from sklearn.tree import plot_tree
from sklearn.tree import export_text

from tqdm import tqdm
# from alive_progress import alive_bar

# import CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Functions

In [2]:
def censor_words(text):
    # Remove all words that begin with 'aut' from the sentence and return the result
    # regex pattern
    pattern = r'aut(.*?)[^a-zA-Z]' # aut followed by any number of characters then ending in any character that is not a letter
    # replace those pattern matches with '' (nothing)
    text =  re.sub(pattern, '', text) # replace the pattern matches with '' (nothing)
    
    # pattern 2 - remove all words that begin with 'ocd' from the sentence and return the result
    pattern = r'ocd(.*?)[^a-zA-Z]' # ocd followed by any number of characters then ending in any character that is not a letter
    # replace those pattern matches with '' (nothing)
    text =  re.sub(pattern, '', text) # replace the pattern matches with '' (nothing)

    # pattern 3 - remove all words that begin with 'obsess' from the sentence and return the result
    pattern = r'obsess|compuls(.*?)[^a-zA-Z]' # obsess followed by any number of characters then ending in any character that is not a letter
    # replace those pattern matches with '' (nothing)
    text =  re.sub(pattern, '', text) # replace the pattern matches with '' (nothing)
    #  remove nonalphanumeric characters
    text = re.sub(r'[^a-zA-Z ]', '', text)
    
    return text # return the result

def get_keywords(post):
    """Get the keywords from a post"""
    # Get the keywords from the post
    keywords = set()
    for word in re.split("\W+", post.text):
        if word in keywords:
            continue
        else:
            keywords.add(word)
    return keywords


# define the stop words list
stop_words = set(stopwords.words("english"))

# Remove Punctuation
def remove_punctuation(text):
    """Remove punctuation from a string"""
    return ''.join(ch for ch in text if ch not in stop_words)

# Lower Case
def lowercase(text):
    """Lower case a string"""
    return text.lower()



# Importing the Data

In [3]:
# drop rows where `is_video` or `media_only` columns are True
df_ocd = df_ocd[(df_ocd['is_video'] == False) & (df_ocd['media_only'] == False)]
df_autism = df_autism[(df_autism['is_video'] == False) & (df_autism['media_only'] == False)]
print(f'Dimensions after removing posts where `is_video` or `media_only` columns are True: {df_ocd.shape} for OCD and {df_autism.shape} for Autism')

# opening the scraped data saved in csv files and creating a dataframe for each
df_ocd = pd.read_csv('../data/ocd_thread.csv')
df_autism = pd.read_csv('../data/autism_thread.csv')

# creating a target column for each dataframe
df_ocd['target'] = 1
df_autism['target'] = 0

# drop columns with more than 50% missing values from the dataframes
print(f'Dimensions before dropping columns with more than 50% missing values: {df_ocd.shape} for OCD and {df_autism.shape} for Autism')
df_ocd = df_ocd.dropna(thresh=0.5*len(df_ocd), axis=1)
df_autism = df_autism.dropna(thresh=0.5*len(df_autism), axis=1)
print(f'Dimensions after dropping columns with more than 50% missing values: {df_ocd.shape} for OCD and {df_autism.shape} for Autism')

print(f'columns in df_ocd: {df_ocd.columns}')
#* Only keep the columns in these two dataframes that are in both dataframes and are in the lists below
autism_columns_to_keep = ['author', 'author_flair_richtext', 'author_flair_type','created_utc', 'id', 'is_video', 'selftext', 'title', 'is_original_content','media_only', 'author_fullname','target']
ocd_columns_to_keep = ['author', 'author_flair_richtext', 'author_flair_type','created_utc', 'id', 'is_video', 'selftext', 'title', 'is_original_content','media_only', 'author_fullname','target']

# drop columns that are not in the lists above
print(f'Dimensions before dropping columns that are not in the lists above: {df_ocd.shape} for OCD and {df_autism.shape} for Autism')
df_ocd = df_ocd[ocd_columns_to_keep] 
df_autism = df_autism[autism_columns_to_keep]
print(f'Dimensions after dropping columns that are not in the lists above: {df_ocd.shape} for OCD and {df_autism.shape} for Autism')

# Now remove any posts from these dataframes where the `is_video` or `media_only` columns are True
print(f'Dimensions before removing posts where `is_video` or `media_only` columns are True: {df_ocd.shape} for OCD and {df_autism.shape} for Autism')
df_ocd = df_ocd[(df_ocd['is_video'] == False) & (df_ocd['media_only'] == False)]
df_autism = df_autism[(df_autism['is_video'] == False) & (df_autism['media_only'] == False)]
print(f'Dimensions after removing posts where `is_video` or `media_only` columns are True: {df_ocd.shape} for OCD and {df_autism.shape} for Autism')

# and now we can drop the `is_video` and `media_only` columns
df_ocd = df_ocd.drop(columns=['is_video', 'media_only'])
df_autism = df_autism.drop(columns=['is_video', 'media_only'])
print(f'Dropped the `is_video` and `media_only` columns')

# some posts are in the title column and some are in the selftext column so we need to combine these columns into one column if they are long enough.
# find the median length of the title and selftext columns combined for each dataframe
med_len_title_selftext_ocd = df_ocd.title.str.len().add(df_ocd.selftext.str.len()).median()
med_len_title_selftext_autism = df_autism.title.str.len().add(df_autism.selftext.str.len()).median()
print(f'Median length of title and selftext columns combined for OCD: {med_len_title_selftext_ocd}')
print(f'Median length of title and selftext columns combined for Autism: {med_len_title_selftext_autism}')


# how many posts have a title and selftext combined that are longer than the median length of the title and selftext columns combined for each dataframe?
print(f'Acceptable number of OCD posts: {len(df_ocd[df_ocd.title.str.len().add(df_ocd.selftext.str.len()) > med_len_title_selftext_ocd])}')
print(f'Acceptable number of Autism posts: {len(df_autism[df_autism.title.str.len().add(df_autism.selftext.str.len()) > med_len_title_selftext_autism])}')

# remove posts where the title and selftext combined are shorter than the median length of the title and selftext columns combined for each dataframe
print(f'Dimensions before: {df_ocd.shape} for OCD and {df_autism.shape} for Autism')
df_ocd = df_ocd[df_ocd.title.str.len().add(df_ocd.selftext.str.len()) > med_len_title_selftext_ocd]
df_autism = df_autism[df_autism.title.str.len().add(df_autism.selftext.str.len()) > med_len_title_selftext_autism]
print(f'Dimensions before: {df_ocd.shape} for OCD and {df_autism.shape} for Autism')

# drop author_flair_richtext
df_ocd = df_ocd.drop(columns=['author_flair_richtext'])
df_autism = df_autism.drop(columns=['author_flair_richtext'])

# how many authors are in each dataframe?
print(f'Number of authors in df_ocd: {len(df_ocd.author.unique())}')
print(f'Number of authors in df_autism: {len(df_autism.author.unique())}')


# how many posts are there for the top 100 authors in each dataframe?
top_authors_ocd = df_ocd.author.value_counts().head(100)
top_authors_byfullname_ocd = df_ocd.author_fullname.value_counts().head(100)
top_authors_autism = df_autism.author.value_counts().head(100)
top_authors_byfullname_autism = df_autism.author_fullname.value_counts().head(100)


  df_ocd = pd.read_csv('../data/ocd_thread.csv')
  df_autism = pd.read_csv('../data/autism_thread.csv')


Dimensions before dropping columns with more than 50% missing values: (41449, 93) for OCD and (25750, 90) for Autism
Dimensions after dropping columns with more than 50% missing values: (41449, 51) for OCD and (25750, 52) for Autism
columns in df_ocd: Index(['author', 'author_flair_richtext', 'author_flair_type', 'can_mod_post',
       'contest_mode', 'created_utc', 'domain', 'full_link', 'id',
       'is_crosspostable', 'is_reddit_media_domain', 'is_self', 'is_video',
       'link_flair_richtext', 'link_flair_text_color', 'link_flair_type',
       'locked', 'num_comments', 'num_crossposts', 'over_18',
       'parent_whitelist_status', 'permalink', 'pinned', 'retrieved_on',
       'score', 'selftext', 'spoiler', 'stickied', 'subreddit', 'subreddit_id',
       'subreddit_type', 'thumbnail', 'title', 'url', 'whitelist_status',
       'send_replies', 'no_follow', 'subreddit_subscribers',
       'is_original_content', 'pwls', 'wls', 'media_only', 'is_meta',
       'author_fullname', 'gildi

In [4]:
top_authors_ocd.head(2)

Userur      143
corinaah     44
Name: author, dtype: int64

In [5]:
top_authors_autism.head(2)

Jupiter642           47
anonaskingaccount    32
Name: author, dtype: int64

In [6]:
# are there any authors that are in both dataframes?
print(f'Number of authors that are in both dataframes: {len(set(top_authors_ocd.index).intersection(set(top_authors_autism.index)))}')
list_of_cross_posters = list(set(top_authors_ocd.index).intersection(set(top_authors_autism.index)))
print(f'List of authors that are in both dataframes: {list_of_cross_posters}')

Number of authors that are in both dataframes: 0
List of authors that are in both dataframes: []


In [7]:
# drop author_flair_type and author_fullname columns from both dataframes
df_ocd = df_ocd.drop(columns=['author_flair_type', 'author_fullname'])
df_autism = df_autism.drop(columns=['author_flair_type', 'author_fullname'])
# combine the title and self text columns into one column with the format `title - selftext`
df_ocd['title_selftext'] = df_ocd.title + ' - ' + df_ocd.selftext
df_autism['title_selftext'] = df_autism.title + ' - ' + df_autism.selftext
# drop the title and selftext columns
df_ocd = df_ocd.drop(columns=['title', 'selftext'])
df_autism = df_autism.drop(columns=['title', 'selftext'])

# rename the `title_selftext` column to `selftext`
df_ocd = df_ocd.rename(columns={'title_selftext': 'selftext'})
df_autism = df_autism.rename(columns={'title_selftext': 'selftext'})

# randomly sample one post from each dataframe and print it
print(f'Random OCD post: {df_ocd.sample(1).selftext.values[0]}')
print('='*100)
print(f'Random Autism post: {df_autism.sample(1).selftext.values[0]}')

cancel_words = ['ocd','aut*','autism','obsess*','compuls*','disorder','executive dysfunction','adhd','diagnosis','ive been taking','spectrum','intrusive thoughts','germaphobes','depression']

# apply the censor_words function to the selftext column of each dataframe
df_ocd['selftext'] = df_ocd['selftext'].apply(censor_words)

# remove punctuation
df_ocd['selftext'] = df_ocd['selftext'].str.replace('[^\w\s]','')
# remove numbers
df_ocd['selftext'] = df_ocd['selftext'].str.replace('\d+', '')
# remove whitespace
df_ocd['selftext'] = df_ocd['selftext'].str.replace('\s+', ' ')

# do the same for the autism dataframe
df_autism['selftext'] = df_autism['selftext'].apply(censor_words)
# remove punctuation
df_autism['selftext'] = df_autism['selftext'].str.replace('[^\w\s]','')
# remove numbers
df_autism['selftext'] = df_autism['selftext'].str.replace('\d+', '')
# remove whitespace
df_autism['selftext'] = df_autism['selftext'].str.replace('\s+', ' ')

# remove words from posts that are in the cancel_words list. There are regex patterns in the cancel_words list so we need to use the `regex=True` parameter

# then remove double spaces
df_ocd['selftext'] = df_ocd['selftext'].str.replace('  ', ' ')
df_autism['selftext'] = df_autism['selftext'].str.replace('  ', ' ')

# make a new dataframe called df_reddit that combines the two dataframes



Random OCD post: Why do some people hate Brain Lock? - My main struggle is with the compulsion to ruminate and so I am trying to discover new techniques to lessen this. I stumbled upon Schwartz and his 4 step plan which goes pretty much like this: Relabel the thought as OCD, reattribute the cause of OCD to a mental disorder instead of yourself, refocus on another activity, revalue the OCD thoughts as meaningless. In another variation, three steps are applied and pretty much go like this: label the thought as a thought/urge/feeling, determine if it is helpful to you right now in the present, refocus on another activity. I also discovered that many people do not use the 4 step plan because it relies on saying “it’s not me, it’s OCD.” Which method, if any, should I use?
Random Autism post: Has anyone else experienced this? I'm a new step father to a child with autism - Just a quick question, more curious than anything. My wife's son/my step son is 7 years old with autism. Very high functi

  df_ocd['selftext'] = df_ocd['selftext'].str.replace('[^\w\s]','')
  df_ocd['selftext'] = df_ocd['selftext'].str.replace('\d+', '')
  df_ocd['selftext'] = df_ocd['selftext'].str.replace('\s+', ' ')
  df_autism['selftext'] = df_autism['selftext'].str.replace('[^\w\s]','')
  df_autism['selftext'] = df_autism['selftext'].str.replace('\d+', '')
  df_autism['selftext'] = df_autism['selftext'].str.replace('\s+', ' ')


## Equilibrium 

In [8]:
df_reddit = pd.DataFrame(columns=df_ocd.columns)
# what is the length of the shorter dataframe?
if len(df_ocd) < len(df_autism): # if the OCD dataframe is shorter
    shorter_df = df_ocd # set the shorter dataframe to the OCD dataframe
    longer_df = df_autism # set the longer dataframe to the Autism dataframe
    df_reddit.append()
else: # if the Autism dataframe is shorter
    shorter_df = df_autism
    longer_df = df_ocd

# add the shorter dataframe to the new dataframe using concat
df_reddit = pd.concat([df_reddit, shorter_df], axis=0)
# shorten the longer dataframe to the length of the shorter dataframe
longer_df = longer_df.head(len(shorter_df))
# add the shortened longer dataframe to the new dataframe using concat
df_reddit = pd.concat([df_reddit, longer_df], axis=0)

# reset the index
df_reddit = df_reddit.reset_index(drop=True)


# shuffle the dataframe
df_reddit = df_reddit.sample(frac=1).reset_index(drop=True)
# check the dimensions of the new dataframe
print(f'Dimensions of the new dataframe: {df_reddit.shape}')
df_reddit.head(5)

Dimensions of the new dataframe: (18042, 6)


  df_reddit = pd.concat([df_reddit, longer_df], axis=0)


Unnamed: 0,author,created_utc,id,is_original_content,target,selftext
0,HyperWendingo,1547408376,afmxb9,False,0,romance sucks i honestly am starting to believ...
1,forest_purple,1551164597,auwfm7,False,0,how to cope with a simple change and frustrati...
2,alternativesnek,1588998087,gg8o90,False,1,the guilt behind the rules i hate how much unn...
3,maxturbated,1568069149,d1yi2z,False,0,my newtype theory my newtype theorynote skip i...
4,ladychick84,1555951440,bg4ag0,False,0,help need advice for asd year old hi my daught...


In [9]:
# double check that the number of posts for each subreddit is the same
print(f'Number of posts for OCD: {len(df_reddit[df_reddit.target == 1])}')
print(f'Number of posts for Autism: {len(df_reddit[df_reddit.target == 0])}')


Number of posts for OCD: 9021
Number of posts for Autism: 9021


In [10]:
df_ocd.head(2)

Unnamed: 0,author,created_utc,id,is_original_content,target,selftext
2016,mindkingdom,1527063603,8lhprn,False,1,thoughts that things could get corrupted or br...
2020,SteelSlayer7,1527074859,8lil9v,False,1,intrusive thoughts devaluing myself i am plagu...


In [11]:
df_autism.head(2)

Unnamed: 0,author,created_utc,id,is_original_content,target,selftext
0,Equadex,1546777579,ad56om,False,0,how can people be considered equals if they ar...
5,936R,1546786707,ad68am,False,0,questions for females with hi there im looking...


In [12]:
# find any of the medications in the selftext column that are in the data/drug_info.csv file under the Medication Name column and replace them with ' ' (empty string)
drug_info = pd.read_csv('../data/drug_info.csv')
drug_info['Medication Name'] = drug_info['Medication Name'].str.lower()
# create a list of the medications
medications = drug_info['Medication Name'].tolist()
print(f'Number of medications: {len(medications)}')
# how many posts contain a medication?
print(f'Number of posts that contain a medication: {len(df_reddit[df_reddit.selftext.str.contains("|".join(medications), regex=True)])}')

Number of medications: 3047
Number of posts that contain a medication: 18039


In [13]:
medications[0]

'abacavir sulfate'

In [14]:
medications = [med for med in medications if len(med) > 5]
# create a list of rows and the medications mentioned in each row
import os
medications_mentioned = []
if os.path.exists('../data/cleaned_reddit.csv'):
    pass
else:
    # with alive_bar (len(df_reddit)) as bar:
    for index, row in df_reddit.iterrows(): # iterate through each row in the dataframe
        # use regex to find all of the medications in the selftext column
        meds = re.findall(r'\b(?:{})\b'.format('|'.join(medications)), row['selftext'])
        if len(meds) > 0: # if there are medications mentioned in the post
            # replace the medications with ' ' (empty string)
            row['selftext'] = re.sub(r'\b(?:{})\b'.format('|'.join(medications)), ' ', row['selftext'])
            medications_mentioned.extend(meds) # add the medications to the medications_mentioned list
            # remove duplicate medications
            medications_mentioned = list(set(medications_mentioned))
            # bar()
# remove the words from the selftext column that are in the medications list
# if the file does not already exist, create it
if os.path.exists('../data/cleaned_reddit.csv'):
    # load the file
    df_reddit = pd.read_csv('../data/cleaned_reddit.csv')
else:
    print('File does not exist. Creating it now. Before meds removed from selftext the length of the dataframe is: ', len(df_reddit))
    print(f' Removed {len(medications_mentioned)} medications from the selftext column')
    # save the dataframe to a csv file
    df_reddit.to_csv('../data/cleaned_reddit.csv', index=False)
# Now we want to clean the text in the self text column
# remove punctuation
df_reddit['selftext'] = df_reddit['selftext'].str.replace('[^\w\s]','')
# remove numbers
df_reddit['selftext'] = df_reddit['selftext'].str.replace('\d+', '')
# remove double spaces
df_reddit['selftext'] = df_reddit['selftext'].str.replace('  ', ' ')
# remove single characters
df_reddit['selftext'] = df_reddit['selftext'].str.replace(r'\b\w\b', '').str.replace(r'\s+', ' ')
# remove newlines
df_reddit['selftext'] = df_reddit['selftext'].str.replace(r'\n', ' ')
# remove urls
df_reddit['selftext'] = df_reddit['selftext'].str.replace(r'http\S+', '')
# remove html tags
df_reddit['selftext'] = df_reddit['selftext'].str.replace(r'<.*?>', '')
# remove extra spaces
df_reddit['selftext'] = df_reddit['selftext'].str.replace(r'\s+', ' ')
# remove extra spaces at the beginning of the string
df_reddit['selftext'] = df_reddit['selftext'].str.replace(r'^\s+', '')
# remove extra spaces at the end of the string
df_reddit['selftext'] = df_reddit['selftext'].str.replace(r'\s+$', '')

# read the file into a dataframe
df_reddit = pd.read_csv('../data/cleaned_reddit.csv')
# remove any rows that have a null value in the selftext column
df_reddit = df_reddit.dropna(subset=['selftext'])
# reset the index
df_reddit = df_reddit.reset_index(drop=True)
# check the dimensions of the dataframe
print(f'Dimensions of the dataframe: {df_reddit.shape}')
df_reddit.head(5)


def num_distinct_words(text,df):
    # for this text, find the words that do not appear in any other text in the dataframe column 'selftext'
    # split the text into a list of words
    if type(text) == str:
        text = text.split(' ')
        # find the number of words that are not in any other text in the dataframe
    else:
        # the text is a list of words
        words = text
        pass
    #words = text.split(' ')
    # find the number of words that are not in any other text in the dataframe
    distinct_words = [word for word in words if word not in df['selftext'].str.split(' ').sum()]
    number_distinct_words = len(distinct_words) # find the number of distinct words
    return number_distinct_words, distinct_words



df = df_reddit.copy() # make a copy of the dataframe

#* flag
block_active = False # do not run this section of code unless the flag is set to True.


# if ../data/distinct_words_and_extras.csv does not exist, then create it with the following code

if block_active:
    if os.path.exists('../data/distinct_words_and_extras.csv'):
        # load the file as df
        df = pd.read_csv('../data/distinct_words_and_extras.csv')
        print(f'Loaded the file ../data/distinct_words_and_extras.csv')
        print(f'   the included features are {df.columns}')
    else:
        # add selftext_length as a column
        df['selftext_length'] = df['selftext'].str.len()
        # add number of words as a column (split the selftext column on spaces)
        df['num_words'] = df['selftext'].str.split(' ').str.len()
        # add number of sentences as a column (split the selftext column on periods)
        df['num_sentences'] = df['selftext'].str.split('.').str.len()
        # add number of distinct words as a column (words that only this user used)

        df['num_distinct_words'] = df['selftext'].str.split(' ').apply(lambda x: num_distinct_words(x,df)[0]) # add the number of distinct words as a column
        df['distinct_words'] = df['selftext'].str.split(' ').apply(lambda x: num_distinct_words(x,df)[1]) # add the distinct words as a column


        scaler = StandardScaler()
        df['selftext_length'] = scaler.fit_transform(df['selftext_length'].values.reshape(-1, 1))

        print(f'Scaled the new "selftext_length" column. The mean is {df["selftext_length"].mean()} and the standard deviation is {df["selftext_length"].std()}')
        print(f' The median is {df["selftext_length"].median()} and the max is {df["selftext_length"].max()}')


        # save these columns to a csv file
        df.to_csv('../data/distinct_words_and_extras.csv', index=False)

        print(f'Saved the distinct words used by each user to a csv file "distinct_words_and_extras.csv" in the data folder')
        print(f'   this file also includes the number of words, sentences, scaled selftext_length, and distinct words used by each user')



  df_reddit['selftext'] = df_reddit['selftext'].str.replace('[^\w\s]','')
  df_reddit['selftext'] = df_reddit['selftext'].str.replace('\d+', '')
  df_reddit['selftext'] = df_reddit['selftext'].str.replace(r'\b\w\b', '').str.replace(r'\s+', ' ')
  df_reddit['selftext'] = df_reddit['selftext'].str.replace(r'\n', ' ')
  df_reddit['selftext'] = df_reddit['selftext'].str.replace(r'http\S+', '')
  df_reddit['selftext'] = df_reddit['selftext'].str.replace(r'<.*?>', '')
  df_reddit['selftext'] = df_reddit['selftext'].str.replace(r'\s+', ' ')
  df_reddit['selftext'] = df_reddit['selftext'].str.replace(r'^\s+', '')
  df_reddit['selftext'] = df_reddit['selftext'].str.replace(r'\s+$', '')


Dimensions of the dataframe: (12544, 33)


# data

* [X_df.csv](./data/X_df.csv) - The features used in the model.
* [autism_thread.csv](./data/autism_thread.csv) - The raw data from the autism thread.
* [best_scores.csv](./data/best_scores.csv) - The best scores from the model.
* [cleaned_reddit.csv](./data/cleaned_reddit.csv) - The cleaned data from the autism thread.
* [cleaned_reddit_withsentiment.csv](./data/cleaned_reddit_withsentiment.csv) - The cleaned data from the autism thread with sentiment analysis. Also includes the OCD thread.
* [cvec.csv](./data/cvec.csv) - The cvec data. Count Vectorized data. This is used in the model.
* [cvec_vocab.txt](./data/cvec_vocab.txt) - The vocabulary used in the cvec data.
* [df_after_feature_engineering.csv](./data/df_after_feature_engineering.csv) - The data after feature engineering. This may not be used in the model.
* [df_cleaned.csv](./data/df_cleaned.csv) - The cleaned data from the autism and OCD threads.
* [drug_info.csv](./data/drug_info.csv) - The drug information from the drugbank database.
* [global_variables.csv](./data/global_variables.csv) - The global variables used in the model.
* [master_results_dataframe.csv](./data/master_results_dataframe.csv) - The master results dataframe.
* [ocd_thread.csv](./data/ocd_thread.csv) - The raw data from the OCD thread.
* [reddit_threads.csv](./data/reddit_threads.csv) - The raw data from the autism and OCD threads.
* [tfidf.csv](./data/tfidf.csv) - The tfidf data. Term Frequency Inverse Document Frequency data. This is used in the model.
* [tfidf_vocab.txt](./data/tfidf_vocab.txt) - The vocabulary used in the tfidf data.
* [y.csv](./data/y.csv) - The target variable used in the model.

In [None]:
# files and their contents
