Note that in your implementation, we recommend you to:

*    explore both DT and RF models
*    explore different parameter settings to find a model that works best on your data
*    comment on important steps
*    interpret model performance
*    discuss pros and cons of each model


#### Important modules for import ####

In [13]:
import os
import numpy as np
import pandas as pd
import re, string
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# import spacy
# nlp = spacy.load("en_core_web_sm")

from nltk.corpus import stopwords
import nltk
# nltk.download("stopwords")
from nltk.stem import PorterStemmer


from tqdm import tqdm
from bs4 import BeautifulSoup

# for counting
from collections import Counter

# ignore warnings
import warnings
warnings.filterwarnings('ignore')

import seaborn as sns
import graphviz
import matplotlib.pyplot as plt
%matplotlib inline

#### IGNORE THIS SET UP: only needed to run once to save the file for later use ####

In [2]:
# #### DON'T RUN AGAIN ####
# # for this task, merge them


# # import the correct csv files from another file
# import sys
# sys.path.append("D:\\hw\\adopt-proj\\")
# from adopt_setup import adopted_posts, adopted_comms, adoption_posts, adoption_comms

# # get the dataframes
# adp_df = adopted_posts
# adc_df = adopted_comms
# anp_df = adoption_posts
# anc_df = adoption_comms

# # concat all the dfs
# all_df = pd.concat([adp_df, adc_df, anp_df, anc_df]).reset_index(drop=True)


# ### function from HW2 of Content Analysis ###


# def word_tokenize(word_list):
#     tokenized = []
#     # pass word list through language model.
#     doc = nlp(word_list)
#     for token in doc:
#         if not token.is_punct and len(token.text.strip()) > 0:
#             tokenized.append(token.text)
#     return tokenized


# def normalizeTokens(word_list, extra_stop=[]):
#     #We can use a generator here as we just need to iterate over it
#     normalized = []
#     if type(word_list) == list and len(word_list) == 1:
#         word_list = word_list[0]

#     if type(word_list) == list:
#         word_list = ' '.join([str(elem) for elem in word_list])

#     doc = nlp(word_list.lower())

#     # add the property of stop word to words considered as stop words
#     if len(extra_stop) > 0:
#         for stopword in extra_stop:
#             lexeme = nlp.vocab[stopword]
#             lexeme.is_stop = True

#     for w in doc:
#         # if it's not a stop word or punctuation mark, add it to our article
#         if w.text != '\n' and not w.is_stop and not w.is_punct and not w.like_num and len(w.text.strip()) > 0:
#             # we add the lematized version of the word
#             normalized.append(str(w.lemma_))

#     return normalized


# ### TOKENIZE AND NORMALIZE WORDS ###
# # takes around 66 minutes to run
# all_df["tokenized_text"] = all_df.loc[:,'post_text'].apply(lambda x: word_tokenize(x))
# all_df['word_counts'] = all_df.loc[:,'tokenized_text'].apply(lambda x: len(x))
# # takes around 61 minutes to run
# all_df['normalized_tokens'] = all_df['tokenized_text'].apply(lambda x: normalizeTokens(x))
# all_df['normalized_tokens_count'] = all_df['normalized_tokens'].apply(lambda x: len(x))

# # drop duplicates
# all_df = all_df.drop_duplicates(subset=['post_text', "user"])
# all_df.reset_index(drop=True)

# need to remove 
# all_df.loc[all_df["user_flair"] == "nan","is_adoptee"] = np.nan # might need to add .astype(str)
# all_df.loc[all_df["user_flair"] == "nan","user_flair"] = np.nan # might need to add .astype(str)

# # save data as pkl file (to keep data type, as opposed to csv which casts all to str)
# all_df.to_pickle("all_df.pkl")

## THIS is where the code begins ##

In [14]:
## IMPORT OUR CLEANED DATA ##
all_df = pd.read_pickle("D:\\hw\\adopt-proj\\newest_all_df.pkl")

In [4]:
# check the number of nulls in each column
# since most text is from comments, they do not have title, n_comments, or post_flair
all_df.isnull().sum()

user                  979
user_flair         247339
post_date               0
post_flair         297083
score                  74
n_comments         287462
link                51459
is_comment              0
subreddit               0
full_text               0
cleaner_text            0
full_tokens             0
word_count              0
norm_tokens             0
tokens_sents            0
norm_sents              0
POS_sents               0
is_adoptee              0
num_tokens              0
num_norm_tokens         0
sentiment               0
dtype: int64

In [5]:
# functions from TextClassification.ipynb

def decontracted(phrase):
    """
    Expand the contracted phrase into normal words
    """
    # specific
    phrase = re.sub(r"won't", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase) # prime 
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    
    return phrase


def clean_text(df):
    """
    Clean the review texts
    """
    cleaned_post_text = []

    for post_text in tqdm(df['post_text']):
        
        # expand the contracted words
        post_text = decontracted(post_text)
        
        #remove html tags
        post_text = BeautifulSoup(post_text, 'lxml').get_text().strip() # re.sub(r'<.*?>', '', text)
        
        #remove non-alphabetic characters
        post_text = re.sub("[^a-zA-Z]"," ", post_text)
    
        #remove url 
        post_text = re.sub(r'https?://\S+|www\.\S+', '', post_text)
        
        #Removing punctutation, string.punctuation in python consists of !"#$%&\'()*+,-./:;<=>?@[\\]^_{|}~`
        post_text = post_text.translate(str.maketrans('', '', string.punctuation))
        # ''.join([char for char in movie_text_data if char not in string.punctuation])
        
        # remove emails
        post_text = re.sub(r"(^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$)", '', post_text)
    
        cleaned_post_text.append(post_text)

    return cleaned_post_text

In [6]:
# all_df['cleaner_text'] = clean_text(all_df)

First we need to find the subset of the data that contains labeled data, the ground truth. We are assuming here that users who add flairs to their names are doing so truthfully. All flairs were checked using regular expressions and manually inspected to determine whether each use was an adoptee or not. Furthermore, any flair that could not be determined to be some form of adoptee, which includes adoptive parents, etc. was coded as False. All posts who had no attached user flair were deemed NaN.

In [15]:
all_df = all_df[all_df.num_tokens > 25]

In [17]:
truth_df = all_df[all_df.is_adoptee < 2]

CHanges "is_adoptee" bools to binary 0 and 1 in a new column called "labels"

Find our stopwords

In [18]:
stop_words = stopwords.words('english')

#### GETTING INTO MACHINE LEARNING, VECTORIZATION, DECISION TREES, ETC ####

It is important to tune the max_df, min_df parameters and possibly add more stop words. For this example, I will use a count vectorizer (bag of words) for this classification task

In [19]:
vectorizer = CountVectorizer(lowercase=True, stop_words=stop_words, max_df=0.9, min_df=3, ngram_range=(1,1))

# convert the cleaned reviews to vectors
X = vectorizer.fit_transform(truth_df.cleaner_text)
y = truth_df.is_adoptee

#### Train and test splitting ####

In [20]:
train_idx, test_idx = train_test_split(np.arange(truth_df.shape[0]), test_size=0.3, 
                                       shuffle=True, random_state=42)

X_train = X[train_idx]
y_train = y.iloc[train_idx]

X_test = X[test_idx]
y_test = y.iloc[test_idx]

print("Training data: X_train : {}, y_train : {}".format(X_train.shape, y_train.shape))
print("Testing data: X_test : {}, y_test : {}".format(X_test.shape, y_test.shape))

Training data: X_train : (17021, 16016), y_train : (17021,)
Testing data: X_test : (7296, 16016), y_test : (7296,)


We can see that the training data is 70% where as testing is 30%.

In [21]:
count_y_train = Counter(y_train)
count_y_test = Counter(y_test)
print(f"Training data distribution: {count_y_train[0]} (not-adoptee), {count_y_train[1]} (adoptee)")
print(f"Testing data distribution: {count_y_test[0]} (not-adoptee), {count_y_test[1]} (adoptee)")

Training data distribution: 7019 (not-adoptee), 10002 (adoptee)
Testing data distribution: 2982 (not-adoptee), 4314 (adoptee)


We can also note that the distributions for testing and training regarding adoptee and non adoptee are roughly the same

#### CREATING A DECISION TREE MODEL ####

We see that the DT classifier correctly classifies the texts to the correct label in .65 of all cases. I will now change some of the DT parameters to see if this improves model fit. This model only barely does better than a model that would predict all posts as being written by an adoptee. I fear that not setting max_depth has lead to the tree overfitting the training data. Therefore it is important to tune parameters for the DT classifier. 

In [22]:
dt_clf = DecisionTreeClassifier(criterion="gini",random_state=42)

dt_clf.fit(X_train, y_train)

# then predict on test set
print(float("{:.5f}".format(dt_clf.score(X_test, y_test))))

0.65228


We see that after limiting the max_depth to 12, changing splitting criterion to entropy, setting max_leaf_notes to 100, and setting max features to a proportion of .8; the fit has improved (.682) to be slightly better than 2/3. Also I tinkered around with other settings, but I couldn't drastically improve the model more than this. Changing the max features, max depth, and max leaf nodes probably reduced the ability of the tree from overfitting the training data. 

In [23]:
dt_clf = DecisionTreeClassifier(criterion="entropy", max_depth=12,
                                random_state=42, max_leaf_nodes=100,
                                max_features=.8)

dt_clf.fit(X_train, y_train)

# then predict on test set
print(float("{:.5f}".format(dt_clf.score(X_test, y_test))))

0.68503


We see that after adjusting for class weights (since there are about double the amount of non adoptees as adoptees in the training and testing splits), our model performs worse again (.647)

In [24]:
dt_clf = DecisionTreeClassifier(criterion="entropy", max_depth=12,
                                random_state=42, max_leaf_nodes=100,
                                max_features=.8, class_weight="balanced")

dt_clf.fit(X_train, y_train)

# then predict on test set
print(float("{:.5f}".format(dt_clf.score(X_test, y_test))))

0.67859


#### Feature Importance ####

Because this model is using text data, getting at which features are important for the model is made more difficult. I am not totally sure how I would go about doing this for text data where the features are individual word counts.

In [25]:
y_predprob_test = dt_clf.predict_proba(X_test)
probs = pd.DataFrame(y_predprob_test[1:], columns= ["predictprob1", "predict_prob0"])

#### Model Evaluation ####

In [26]:
y_pred_test = dt_clf.predict(X_test)
print(classification_report(y_test, y_pred_test))

              precision    recall  f1-score   support

           0       0.61      0.58      0.59      2982
           1       0.72      0.75      0.73      4314

    accuracy                           0.68      7296
   macro avg       0.67      0.66      0.66      7296
weighted avg       0.68      0.68      0.68      7296



From this table we can see that the recall for 1 is quite low (of all adoptee posts, only 35% were labeled as such). The precision is also low (less than half of all posts classified as being adopted actually were). Overall, the f1-score is also low for 1. This means that the model performs worse than random chance. For 0, non-adopted posts, precision, recall, and f1 score are generally much better.

## Trying a different method for word vectorizer ##

Changing the vectorizer

In [27]:
vectorizer = TfidfVectorizer(lowercase=True, stop_words=stop_words, max_df=0.9, min_df=3, ngram_range=(1,2))

# convert the cleaned reviews to vectors
X = vectorizer.fit_transform(truth_df.cleaner_text)
y = truth_df.is_adoptee

Train/Test split

In [28]:
train_idx, test_idx = train_test_split(np.arange(truth_df.shape[0]), test_size=0.3, 
                                       shuffle=True, random_state=42)
X_train = X[train_idx]
y_train = y.iloc[train_idx]
X_test = X[test_idx]
y_test = y.iloc[test_idx]

### Reapply the classifier and fit the model ###

Under the same classifier criteria, the DT using tf-idf vectorizer has slightly improved the overall model performance compared to countevectorizer. To note, these models take much longer to run than countvectorizer

In [29]:
dt_clf = DecisionTreeClassifier(criterion="gini",random_state=42)

dt_clf.fit(X_train, y_train)

# then predict on test set
print(float("{:.5f}".format(dt_clf.score(X_test, y_test))))

0.64556


#### Adjust the classifier hyperparams ####

Again, compared to the best model using countvectorizer, using tf-idf vectorizer has very slightly improved the model performance. The model will correctly guess that a post is made by an adoptee or not in 68.21% of cases.

In [30]:
dt_clf = DecisionTreeClassifier(criterion="entropy", max_depth=12,
                                random_state=42, max_leaf_nodes=100,
                                max_features=.8, class_weight="balanced")

dt_clf.fit(X_train, y_train)

# then predict on test set
print(float("{:.5f}".format(dt_clf.score(X_test, y_test))))

0.68147


#### Let's try to improve this model by adjusting the parameters in different ways ####

This is the best tuned model I could get by changing around the hyperparameters. It seems that a combination of changing the splitting criterion to GINI, increasing the max depth to 20, and reducing the max_features to half of all features (.5) have produced the best results. Evidently, this model produces a score of .684, the best yet. It will correctly label posts in 68.42% of cases.

In [31]:
dt_clf = DecisionTreeClassifier(criterion="gini", max_depth=20,
                                random_state=42, max_leaf_nodes=100,
                                max_features=.5, class_weight="balanced")

dt_clf.fit(X_train, y_train)

# then predict on test set
print(float("{:.5f}".format(dt_clf.score(X_test, y_test))))

0.68353


In [32]:
y_pred_test = dt_clf.predict(X_test)
print(classification_report(y_test, y_pred_test))

              precision    recall  f1-score   support

           0       0.61      0.61      0.61      2982
           1       0.73      0.73      0.73      4314

    accuracy                           0.68      7296
   macro avg       0.67      0.67      0.67      7296
weighted avg       0.68      0.68      0.68      7296



We can see that compared to the other vectorizer, tf-idf performs better at precision for adoptee posts, but worse for non adoptees. Recall for non adoptee posts is .95, which means that almost all non adoptee posts were labeled correctly. The f1 score for non adoptee posts is slightly higher than countvectorizer at .80 (vs .75). However, this model fairs worse for recall and f1 score of adoptee posts. Recall for adoptee posts is half of that of the other model (.16 vs .35). Furthermore the f1-score has decreased as a result as well. 

## RANDOM FORESTS ##

Though the accuracy of the other model using the tf-idf vectorizer was better, the f1-score was much worse. Therefore let us go back to using countvectorizer for the time being.

In [33]:
vectorizer = CountVectorizer(lowercase=True, stop_words=stop_words, max_df=0.9, 
                             min_df=3, ngram_range=(1,1))
X = vectorizer.fit_transform(truth_df.cleaner_text)
y = truth_df.is_adoptee
train_idx, test_idx = train_test_split(np.arange(truth_df.shape[0]), test_size=0.3, 
                                       shuffle=True, random_state=42)
X_train = X[train_idx]
y_train = y.iloc[train_idx]
X_test = X[test_idx]
y_test = y.iloc[test_idx]

In [34]:
rf_clf = RandomForestClassifier(n_estimators = 100, criterion = 'entropy', 
                                random_state = 42, class_weight="balanced")

# train RFC
rf_clf.fit(X_train, y_train)

# evaluate the random forest classifier on test set 
print(float("{:.5f}".format(rf_clf.score(X_test, y_test))))

0.74973


In [35]:
y_pred_test = rf_clf.predict(X_test)
print(classification_report(y_test, y_pred_test))

              precision    recall  f1-score   support

           0       0.77      0.56      0.65      2982
           1       0.74      0.88      0.81      4314

    accuracy                           0.75      7296
   macro avg       0.75      0.72      0.73      7296
weighted avg       0.75      0.75      0.74      7296



This score is an improvement over any of the classifiers performed thus far. The accuracy of the model is 70% meaning that it accurately labels adoptee and non-adoptee posts 69.7% of the time. Which is still not super good, considering if one predicted all posts were made by non adoptees, the accuracy of the model would still be 66.7%.

### Trying with the other vectorizer ###

In [36]:
vectorizer = TfidfVectorizer(lowercase=True, stop_words=stop_words, max_df=0.9, 
                             min_df=3, ngram_range=(1,2))
X = vectorizer.fit_transform(truth_df.cleaner_text)
y = truth_df.is_adoptee
train_idx, test_idx = train_test_split(np.arange(truth_df.shape[0]), test_size=0.3, 
                                       shuffle=True, random_state=42)
X_train = X[train_idx]
y_train = y.iloc[train_idx]
X_test = X[test_idx]
y_test = y.iloc[test_idx]

In [37]:
rf_clf = RandomForestClassifier(n_estimators = 100, criterion = 'entropy', 
                                random_state = 42, class_weight="balanced")

# train RFC
rf_clf.fit(X_train, y_train)

# evaluate the random forest classifier on test set 
print(float("{:.5f}".format(rf_clf.score(X_test, y_test))))

0.74904


In [38]:
y_pred_test = rf_clf.predict(X_test)
print(classification_report(y_test, y_pred_test))

              precision    recall  f1-score   support

           0       0.77      0.55      0.64      2982
           1       0.74      0.89      0.81      4314

    accuracy                           0.75      7296
   macro avg       0.75      0.72      0.72      7296
weighted avg       0.75      0.75      0.74      7296



Overall, the precision is actually better for the adoptee posts this time. Otherwise the recall and f1 score fairly similar to the other random forest with countvectorizer. 