### Import Libraries: ###

In [7]:
# imports necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import nltk
#nltk.download('punkt_tab') # RUN WITH THIS ONCE
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import re
from imblearn.over_sampling import RandomOverSampler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

### Setting up the Dataframe: ###

In [8]:
# load the CSV into a DataFrame, using its first column as the row index
data = pd.read_csv('Data.csv', index_col=0)

# discard, in place, every row that has a missing value in any column
data.dropna(axis=0, how='any', inplace=True)

### Clean Data: ###

In [9]:
# function used to remove unwanted patterns (url, punctuation, etc)
def rPatterns(text):
    """Strip markdown links, URLs, @handles and punctuation from ``text``.

    The substitutions run in a fixed order:

    1. Markdown links ``[label](target)`` are dropped whole. This has to run
       before the bare-URL pass: otherwise that pass consumes the target
       together with the closing ``)`` and the link pattern can no longer
       match, which leaves the label behind in the text.
    2. Remaining ``http://`` / ``https://`` URLs (lowercase scheme only).
    3. ``@handle`` mentions.
    4. Every character that is neither a word character nor whitespace.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string with leading and trailing whitespace removed.
        Whitespace inside the string is left untouched, so removed spans can
        leave runs of spaces behind.
    """
    # removes mark-down links (must precede URL removal, see docstring)
    text = re.sub(r'\[.*?\]\(.*?\)', '', text)

    # removes URLs
    text = re.sub(r'http[s]?://\S+', '', text)

    # removes handles
    text = re.sub(r'@\w+', '', text)

    # removes punctuation
    text = re.sub(r'[^\w\s]', '', text)
    return text.strip()

# used for stemming words
def stem_words(words):
    """Stem every item of ``words`` and join the stems with single spaces.

    Each item is converted with ``str`` before being passed to the
    module-level ``stemmer``, which must exist by the time this is called.
    """
    stems = [stemmer.stem(str(token)) for token in words]
    return ' '.join(stems)

# main code used for cleaning up csv:
# length features taken from the raw statement: character and sentence counts
data['num_of_chars'] = data['statement'].str.len()
data['num_of_sents'] = data['statement'].map(nltk.sent_tokenize).map(len)

# lowercases the statement, then strips unwanted text (punctuation, urls, etc.)
data['lc_statement'] = data['statement'].str.lower().apply(rPatterns)

# tokenization: each cleaned statement becomes a list of words
data['tokens'] = data['lc_statement'].map(word_tokenize)

# stemming: stem_words reads the module-level `stemmer`, so it is created
# before the column is computed
stemmer = PorterStemmer()
data['tokens_stemmed'] = data['tokens'].map(stem_words)

print(data.head())


                                           statement   status  num_of_chars  \
0                                         oh my gosh  Anxiety            10   
1  trouble sleeping, confused mind, restless hear...  Anxiety            64   
2  All wrong, back off dear, forward doubt. Stay ...  Anxiety            78   
3  I've shifted my focus to something else but I'...  Anxiety            61   
4  I'm restless and restless, it's been a month n...  Anxiety            72   

   num_of_sents                                       lc_statement  \
0             1                                         oh my gosh   
1             2  trouble sleeping confused mind restless heart ...   
2             2  all wrong back off dear forward doubt stay in ...   
3             1  ive shifted my focus to something else but im ...   
4             2  im restless and restless its been a month now ...   

                                              tokens  \
0                                     [oh, my, g

### Prepare data for machine learning models: ###

In [10]:
# features are the stemmed statements; targets are the status labels,
# integer-encoded by a LabelEncoder
X = data['tokens_stemmed']
l_enc = LabelEncoder()
y = l_enc.fit_transform(data['status'].values)

# hold out 20% of the rows as the test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=101
)

# TF-IDF over unigrams and bigrams, capped at 50000 features;
# fitted on the training split only, then applied to the test split
vect = TfidfVectorizer(ngram_range=(1, 2), max_features=50000)
X_train_vect = vect.fit_transform(X_train)
X_test_vect = vect.transform(X_test)

# random oversampling, applied to the training features only
ros = RandomOverSampler(random_state=101)
X_train_resampled, y_train_resampled = ros.fit_resample(X_train_vect, y_train)

### Train, Predict, and Score: ###

In [None]:
# candidate models, keyed by the short name used in the printed reports
classifiers = {
    'Bernoulli': BernoulliNB(alpha=0.1, binarize=0.0),
    'DecTree': DecisionTreeClassifier(max_depth=9, min_samples_split=5, random_state=101),
    'LogReg': LogisticRegression(solver='liblinear', penalty='l1', C=10, random_state=101),
}

# one accuracy per classifier, in the iteration order of `classifiers`
accuracy_scores = []

# class names are the same for every model, so they are looked up once
labels = l_enc.classes_

# trains, tests, and evaluates every classifier
for name, clf in classifiers.items():
    # fit on the oversampled training set, predict on the test split
    y_pred = clf.fit(X_train_resampled, y_train_resampled).predict(X_test_vect)
    accuracy = accuracy_score(y_test, y_pred)
    accuracy_scores.append(accuracy)

    # text report: accuracy followed by the per-class metrics
    print('')
    print(f"For {name}")
    print(f"Accuracy: {accuracy}")
    print(classification_report(y_test, y_pred, target_names=labels))

    # confusion matrix drawn as an annotated heatmap
    conf_matrix = confusion_matrix(y_test, y_pred)
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='viridis', xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title(f'Confusion Matrix for {name}')
    plt.show()


Training only the LogisticRegression model:

In [None]:
# L1-penalised logistic regression fitted on the oversampled training set
model = LogisticRegression(
    penalty='l1',
    C=10,
    solver='liblinear',
    random_state=101,
)
model.fit(X_train_resampled, y_train_resampled)

# accuracy on the test split
y_pred = model.predict(X_test_vect)
print(accuracy_score(y_test, y_pred))

0.7595140931954066


Saving the model, the TF-IDF vectorizer, and the label encoder as pickle files for use in other files:

In [None]:
import pickle

# (file name, object) pairs, written in this order: the classifier,
# the fitted TF-IDF vectorizer, and the label encoder
artifacts = (
    ('model_pkl', model),
    ('tfidf_pkl', vect),
    ('labels_pkl', l_enc),
)

for path, obj in artifacts:
    with open(path, 'wb') as fh:
        pickle.dump(obj, fh)
