<a href="https://colab.research.google.com/github/cindylay/cs159-final-proj/blob/main/Experiments/sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# ! pip install transformers==4.6.1

In [2]:
!pip install tensorflow-gpu==2.6.2



In [3]:
import tensorflow as tf
print(tf.version.VERSION)

2.6.2


In [4]:
import numpy as np
import seaborn as sns
sns.set()
import sys
sys.path.append('models')
import matplotlib.pyplot as plt
%matplotlib inline

In [5]:
from google.colab import drive
drive.mount('/content/drive')
!cp '/content/drive/MyDrive/CS159/BERT_DATA_DIR/train.csv' train.csv
!cp '/content/drive/MyDrive/CS159/BERT_DATA_DIR/valid.csv' valid.csv

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
import pandas as pd
import csv
import tensorflow as tf
# Using  error_bad_lines=False  fixed the problem somehow?
train_df = pd.read_csv('train.csv',encoding='utf-8')
valid_df = pd.read_csv('valid.csv', encoding='utf-8')
concat_data = pd.concat((train_df, valid_df))
print(concat_data.shape)

(60000, 6)


In [7]:
tf.test.gpu_device_name()

'/device:GPU:0'

In [8]:
import re

def drop(body):
    return body.replace("\n", " ")

concat_data['Body'] = concat_data['Body'].apply(lambda x: drop(x))

In [9]:
concat_data.Y = pd.Categorical(concat_data.Y)

In [10]:
concat_data['Y_cat_code'] = concat_data.Y.cat.codes

In [11]:
concat_data.head(5)

Unnamed: 0,Id,Title,Body,Tags,CreationDate,Y,Y_cat_code
0,34552656,Java: Repeat Task Every Random Seconds,<p>I'm already familiar with repeating tasks e...,<java><repeat>,2016-01-01 00:21:59,LQ_CLOSE,1
1,34553034,Why are Java Optionals immutable?,<p>I'd like to understand why Java 8 Optionals...,<java><optional>,2016-01-01 02:03:20,HQ,0
2,34553174,Text Overlay Image with Darkened Opacity React...,<p>I am attempting to overlay a title over an ...,<javascript><image><overlay><react-native><opa...,2016-01-01 02:48:24,HQ,0
3,34553318,Why ternary operator in swift is so picky?,"<p>The question is very simple, but I just cou...",<swift><operators><whitespace><ternary-operato...,2016-01-01 03:30:17,HQ,0
4,34553755,hide/show fab with scale animation,<p>I'm using custom floatingactionmenu. I need...,<android><material-design><floating-action-but...,2016-01-01 05:21:48,HQ,0


In [12]:
concat_data = concat_data.drop(columns=['Y', 'Id', 'CreationDate'])

In [13]:
concat_data.head(5)

Unnamed: 0,Title,Body,Tags,Y_cat_code
0,Java: Repeat Task Every Random Seconds,<p>I'm already familiar with repeating tasks e...,<java><repeat>,1
1,Why are Java Optionals immutable?,<p>I'd like to understand why Java 8 Optionals...,<java><optional>,0
2,Text Overlay Image with Darkened Opacity React...,<p>I am attempting to overlay a title over an ...,<javascript><image><overlay><react-native><opa...,0
3,Why ternary operator in swift is so picky?,"<p>The question is very simple, but I just cou...",<swift><operators><whitespace><ternary-operato...,0
4,hide/show fab with scale animation,<p>I'm using custom floatingactionmenu. I need...,<android><material-design><floating-action-but...,0


Use NLTK's Pre-Trained Sentiment Analyzer.
NLTK already has a built-in, pretrained sentiment analyzer called VADER (Valence Aware Dictionary and sEntiment Reasoner).
To use VADER, first create an instance of nltk.sentiment.SentimentIntensityAnalyzer, then use .polarity_scores() on a raw string:
You’ll get back a dictionary of different scores. The negative, neutral, and positive scores are related: They all add up to 1 and can’t be negative. The compound score is calculated differently. It’s not just an average, and it can range from -1 to 1.

In [14]:
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()

# checking to make sure sentiment analysis works
sia.polarity_scores("Wow, NLTK is really powerful! ")

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!




{'compound': 0.8012, 'neg': 0.0, 'neu': 0.295, 'pos': 0.705}

In [15]:
def nltk_sentiment(post):
  sentiment_dict = sia.polarity_scores(post)
  return sentiment_dict['compound']

In [16]:
concat_data['Title'] = concat_data['Title'].apply(lambda x: nltk_sentiment(x))

In [17]:
concat_data['Body'] = concat_data['Body'].apply(lambda x: nltk_sentiment(x))

In [18]:
concat_data.head()

Unnamed: 0,Title,Body,Tags,Y_cat_code
0,0.0,0.2177,<java><repeat>,1
1,0.0,0.3612,<java><optional>,0
2,0.0,0.6369,<javascript><image><overlay><react-native><opa...,0
3,0.2023,-0.4839,<swift><operators><whitespace><ternary-operato...,0
4,0.4588,0.3612,<android><material-design><floating-action-but...,0


In [19]:
def tag_clean(body):
    return body.replace("><", "> <").split()

concat_data['Tags'] = concat_data['Tags'].apply(lambda x: tag_clean(x))

In [20]:
concat_data.head()

Unnamed: 0,Title,Body,Tags,Y_cat_code
0,0.0,0.2177,"[<java>, <repeat>]",1
1,0.0,0.3612,"[<java>, <optional>]",0
2,0.0,0.6369,"[<javascript>, <image>, <overlay>, <react-nati...",0
3,0.2023,-0.4839,"[<swift>, <operators>, <whitespace>, <ternary-...",0
4,0.4588,0.3612,"[<android>, <material-design>, <floating-actio...",0


In [21]:
from collections import Counter
tag_count = Counter()
tag_idx_dict = {}

'''
Takes in a list and updates
counter and dictionary.
'''
def unique_tags(df_list):
  t_count = -1
  new_list = []
  for tag in df_list:
    tag_count[tag] += 1

  for tag in tag_count:
    t_count += 1
    tag_idx_dict[tag] = t_count

  for tag in df_list:
    new_list.append(tag_idx_dict[tag])
  
  return new_list

# Testing the unique_tags function
print(unique_tags(['<php>', '<mysql>', '<sql>', '<codeigniter>', '<mysqli>']))
print(len(tag_count), len(tag_idx_dict))

[0, 1, 2, 3, 4]
5 5


In [22]:
## Add everything to the Counter
concat_data['Tags'] = concat_data['Tags'].apply(lambda x: unique_tags(x))

In [23]:
concat_data.head()

Unnamed: 0,Title,Body,Tags,Y_cat_code
0,0.0,0.2177,"[5, 6]",1
1,0.0,0.3612,"[5, 7]",0
2,0.0,0.6369,"[8, 9, 10, 11, 12]",0
3,0.2023,-0.4839,"[13, 14, 15, 16, 7]",0
4,0.4588,0.3612,"[17, 18, 19]",0


In [24]:
# On the assumption that the first tag is the most important, we will only keep the first tag_idx
concat_data['Tags'] = concat_data['Tags'].apply(lambda x: x[0])
concat_data.head()

Unnamed: 0,Title,Body,Tags,Y_cat_code
0,0.0,0.2177,5,1
1,0.0,0.3612,5,0
2,0.0,0.6369,8,0
3,0.2023,-0.4839,13,0
4,0.4588,0.3612,17,0


In [25]:
#
# let's convert our dataframe to a numpy array, named A
#    Our ML library, scikit-learn operates entirely on numpy arrays.
#
A = concat_data.values    # .values gets the numpy array
print(A)

[[ 0.000e+00  2.177e-01  5.000e+00  1.000e+00]
 [ 0.000e+00  3.612e-01  5.000e+00  0.000e+00]
 [ 0.000e+00  6.369e-01  8.000e+00  0.000e+00]
 ...
 [-4.019e-01  4.576e-01  1.320e+02  2.000e+00]
 [ 0.000e+00  7.720e-02  8.000e+00  1.000e+00]
 [ 0.000e+00  7.635e-01  4.200e+01  1.000e+00]]


In [26]:
#
# let's make sure it's all floating-point, so we can multiply and divide
#
A = A.astype('float64')  # so many:  www.tutorialspoint.com/numpy/numpy_data_types.htm
print(A)

[[ 0.000e+00  2.177e-01  5.000e+00  1.000e+00]
 [ 0.000e+00  3.612e-01  5.000e+00  0.000e+00]
 [ 0.000e+00  6.369e-01  8.000e+00  0.000e+00]
 ...
 [-4.019e-01  4.576e-01  1.320e+02  2.000e+00]
 [ 0.000e+00  7.720e-02  8.000e+00  1.000e+00]
 [ 0.000e+00  7.635e-01  4.200e+01  1.000e+00]]


In [27]:
#
# nice to have NUM_ROWS and NUM_COLS around
#
NUM_ROWS, NUM_COLS = A.shape
print(f"\nThe dataset has {NUM_ROWS} rows and {NUM_COLS} cols")


The dataset has 60000 rows and 4 cols


In [28]:
print("+++ Start of data definitions +++\n")

X_all = A[:,0:3].copy()  # X (features) ... is all rows, columns 0, 1, 2
y_all = A[:,3].copy()    # y (labels) ... is all rows, column 3 only

# Create a copy of X_all and y_all to apply rf analysis to later
X_all_rf = X_all.copy()
y_all_rf = y_all.copy()

print(f"X_all (just features: 10 rows) is \n {X_all[:10,:]}")
print(f"y_all (just labels)   is \n {y_all}") 

+++ Start of data definitions +++

X_all (just features: 10 rows) is 
 [[ 0.      0.2177  5.    ]
 [ 0.      0.3612  5.    ]
 [ 0.      0.6369  8.    ]
 [ 0.2023 -0.4839 13.    ]
 [ 0.4588  0.3612 17.    ]
 [ 0.     -0.3919 20.    ]
 [ 0.4019  0.7906  8.    ]
 [ 0.      0.9336 23.    ]
 [ 0.      0.     28.    ]
 [ 0.      0.765  32.    ]]
y_all (just labels)   is 
 [1. 0. 0. ... 2. 1. 1.]


In [29]:
#
# we scramble the data, to give a different TRAIN/TEST split each time...
# 
indices = np.random.permutation(len(y_all))  # indices is a permutation-list

# we scramble both X and y, necessarily with the same permutation
X_all = X_all[indices]              # we apply the _same_ permutation to each!
y_all = y_all[indices]              # again...


print("features\n", X_all[:10,:])
print("labels\n",y_all)

features
 [[ 0.000e+00  0.000e+00  9.500e+01]
 [ 0.000e+00  0.000e+00  4.200e+01]
 [ 0.000e+00  5.411e-01  1.280e+02]
 [ 0.000e+00  0.000e+00  9.500e+01]
 [ 0.000e+00 -5.340e-02  8.000e+00]
 [ 3.400e-01  7.882e-01  8.000e+00]
 [ 4.939e-01  7.520e-02  2.030e+02]
 [ 0.000e+00 -8.911e-01  4.200e+01]
 [ 0.000e+00  2.500e-01  1.420e+02]
 [ 0.000e+00  7.105e-01  8.000e+00]]
labels
 [2. 0. 2. ... 2. 2. 2.]


In [30]:

# We next separate into test data and training data ... 
#    + We will train on the training data...
#    + We will _not_ look at the testing data to build the model
#
# Then, afterward, we will test on the testing data -- and see how well we do!
#

#
# a common convention:  train on 80%, test on 20%    Let's define the TEST_PERCENT
#
NUM_ROWS = X_all.shape[0]     # the number of rows
TEST_PERCENT = 0.20
TEST_SIZE = int(TEST_PERCENT*NUM_ROWS)   # no harm in rounding down

X_test = X_all[:TEST_SIZE]    # first section are for testing
y_test = y_all[:TEST_SIZE]

X_train = X_all[TEST_SIZE:]   # all the rest are for training
y_train = y_all[TEST_SIZE:]

print(f"training with {len(y_train)} rows;  testing with {len(y_test)} rows" )

training with 48000 rows;  testing with 12000 rows


In [31]:
#
# for most NNets, it's important to keep the feature values in the -1-to-1 range
#    This is done through the "StandardScaler" in scikit-learn
# 
USE_SCALER = True   # this variable is important! It tracks if we need to use the scaler...

# we "train the scaler"  (computes the mean and standard deviation)
if USE_SCALER == True:
    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()
    # Scale with the training data! ave becomes 0; stdev becomes 1
    scaler.fit(X_train)   
    
if USE_SCALER == True:    # if we're using the scaler, all inputs need to be scaled...
    X_train_scaled = scaler.transform(X_train) # scale!
    X_test_scaled = scaler.transform(X_test) # scale!
    print("USING the input scaler:\n")
else:
    X_train_scaled = X_train.copy()  # not using the scaler
    X_test_scaled = X_test.copy()  # not using the scaler
    print("NOT using the input scaler:\n")
    
y_train_scaled = y_train  # the predicted/desired labels are not scaled
y_test_scaled = y_test  # not using the scaler
    
# ascii_table(X_train_scaled,y_train_scaled)  # unweildy
print(f"X_train_scaled (first 5 rows): {X_train_scaled[:5,:]}")

USING the input scaler:

X_train_scaled (first 5 rows): [[ 1.88900039  1.35017732 -0.17290213]
 [ 1.37972168 -0.12656316 -0.28722915]
 [-0.09519113 -0.46139891 -0.26182314]
 [ 0.67653531 -2.03431762 -0.0534939 ]
 [-0.09519113  0.42246208 -0.28341825]]


### Using A MLPClassifier


In [32]:
from sklearn.neural_network import MLPClassifier

nn_classifier = MLPClassifier(hidden_layer_sizes=(9,9), max_iter=400, activation="tanh",
                    solver='sgd', verbose=False, shuffle=True,
                    random_state=None, # reproduceability!
                    learning_rate_init=.1, learning_rate = 'adaptive')

print("\n\n++++++++++  TRAINING:  begin  +++++++++++++++\n\n")
nn_classifier.fit(X_train_scaled, y_train_scaled)
print("\n\n++++++++++  TRAINING:   end  +++++++++++++++\n\n")

print(f"The (log) prediction error (the loss) is {nn_classifier.loss_}") 



++++++++++  TRAINING:  begin  +++++++++++++++




++++++++++  TRAINING:   end  +++++++++++++++


The (log) prediction error (the loss) is 1.0341806537702933


In [33]:
#
# how did it do on the testing data?
#

def ascii_table(X,y,nn):
    """ a table including predictions using nn.predict """
    predictions = nn.predict(X) # all predictions
    prediction_probs = nn.predict_proba(X) # all prediction probabilities
    # count correct
    num_correct = 0
    # printing
    print(f"{'question quatliy features ':>18s} -> {'pred':^6s} {'desr':^6s}") 
    for i in range(len(y)):
        pred = predictions[i]
        pred_probs = prediction_probs[i,:]
        desired = y[i]
        if pred != desired: result = "  incorrect: " + str(pred_probs)
        else: result = ""; num_correct += 1
        # can "cheat" in our view by using X_train here...
        # print(f"{X[i,:]!s:>18s} -> {pred:^6.0f} {desired:^6.0f} {result:^10s}") <- Only print this for debugging purposes. Otherwise, the output is too long
    print(f"\ncorrect predictions: {num_correct} out of {len(y)}")
    
# let's see how it did on the test data (also the training data!)
#
ascii_table(X_test_scaled,y_test_scaled,nn_classifier)
# other things...
#

nn = nn_classifier
print("\n\n+++++ parameters, weights, etc. +++++\n")
print(f"\nweights/coefficients:\n")
for wts in nn.coefs_:
    print(wts)
print(f"\nintercepts: {nn.intercepts_}")
print(f"\nall parameters: {nn.get_params()}")

question quatliy features  ->  pred   desr 

correct predictions: 5572 out of 12000


+++++ parameters, weights, etc. +++++


weights/coefficients:

[[ 0.03553822  0.48896657 -0.2079947  -0.60481989 -0.211439    0.10632876
   0.51256575 -0.14696484  0.0814194 ]
 [-0.33626942  0.06053263 -1.21623391 -0.05268914  0.61048725  0.2266948
  -0.65796482 -0.06605096  0.58501613]
 [ 3.20199112  1.53033477 -0.47053059  0.68395436 -1.63422572  3.86920169
   0.12440012  3.5706475  -0.02004523]]
[[-0.12974257 -0.3551833  -0.16700184 -0.32805542  0.254743   -0.57471504
   1.11993452 -0.02160104  0.53655478]
 [ 0.13019346  0.33225458  0.56543354 -0.73586044 -0.30341495  0.50356007
   0.10276119 -0.01201328  0.09942617]
 [ 0.01284904  0.35754154  0.12862309 -0.33666526 -0.90214102 -0.23411651
   0.01206563  0.08596778 -0.20100467]
 [ 0.14523127  0.05482888  0.3833594  -0.50835457 -0.13414171  1.47745261
   0.45231362 -0.10519986 -0.60978573]
 [ 0.11809144  0.38942729 -0.34211058  0.95872061  0.8135771

### Using a MLPRegressor

In [34]:
from sklearn.neural_network import MLPRegressor

#
# Here's where you can change the number of hidden layers
# and number of neurons!
#
nn_regressor = MLPRegressor(hidden_layer_sizes=(9,9), max_iter=400, activation="tanh",
                    solver='sgd', verbose=False, shuffle=True,
                    random_state=None, # reproduceability!
                    learning_rate_init=.1, learning_rate = 'adaptive')

print("\n\n++++++++++  TRAINING:  begin  +++++++++++++++\n\n")
nn_regressor.fit(X_train_scaled, y_train_scaled)
print("\n\n++++++++++  TRAINING:   end  +++++++++++++++\n\n")

print(f"The (sq) prediction error (the loss) is {nn_regressor.loss_}") 



++++++++++  TRAINING:  begin  +++++++++++++++




++++++++++  TRAINING:   end  +++++++++++++++


The (sq) prediction error (the loss) is 0.30784068017755073


In [35]:
def ascii_table_for_regressor(X,y,nn):
    """ a table including predictions using nn.predict """
    predictions = nn.predict(X) # all predictions
    # measure error
    error = 0.0
    # printing
    print(f"{'input ':>18s} ->  {'pred':^6s}  {'desr':^6s}  {'absdiff':^10s}") 
    for i in range(len(y)):
        pred = predictions[i]
        desired = y[i]
        result = abs(desired - pred)
        error += result
        # of use X_train to see the unscaled...
        # print(f"{X_train[i,:]!s:>18s} ->  {pred:<+6.3f}  {desired:<+6.3f}  {result:^10.3f}") <- Only print this for debugging purposes. Otherwise, the output is too long
    num_correct = len(y) - error
    print(f"\ncorrect predictions: {num_correct} out of {len(y)}")
    print(f"\naverage abs error: {error/len(y)}")

# let's see how it did on the test data (also the training data!)
#
ascii_table_for_regressor(X_test_scaled,y_test_scaled,nn_regressor)
# other things...
#

nn = nn_regressor
print("\n\n+++++ parameters, weights, etc. +++++\n")
print(f"\nweights/coefficients:\n")
for wts in nn.coefs_:
    print(wts)
print(f"\nintercepts: {nn.intercepts_}")
print(f"\nall parameters: {nn.get_params()}")

            input  ->   pred    desr    absdiff  

correct predictions: 3890.2107830319837 out of 12000

average abs error: 0.675815768080668


+++++ parameters, weights, etc. +++++


weights/coefficients:

[[ 1.04710463e-01  7.59297451e-02  9.54391689e-02  7.00413028e-01
   1.68562642e-01  4.13974330e-01 -1.35393937e-01 -4.27078007e-01
   6.97118309e-02]
 [-1.35402615e-01 -1.16551730e-03  1.51297196e-01 -2.65939150e-01
   1.96465352e-01 -1.16516701e-01  5.81922495e-03  2.97847756e-02
   7.95289147e-02]
 [-4.21195218e-01  3.81811612e-01 -1.33285405e+00  9.78053105e-01
  -3.33204328e-02 -1.20181958e+00  3.01485112e+00  8.10868957e-01
  -3.36147684e+00]]
[[-0.36789951  0.35191502 -0.38700863 -0.25421077  0.06698544 -0.18088219
  -0.41010453  0.15079192 -0.0209114 ]
 [ 0.14094943 -0.26066719 -0.17995816 -0.139692   -0.36765211  0.05907383
   0.48580755  0.15282544 -0.60602232]
 [ 0.29516925 -0.54714602  0.17713822  0.02198352 -0.44038219 -0.15270434
   0.21084584  0.35612272 -0.1727077 ]


### Random Forest Analysis


In [36]:
#
# we can re-weight different features here...
#
COL_WEIGHT_DICT = {}

for colname in concat_data.columns[:-1]:
    i = concat_data.columns.get_loc(colname)
    weight = 1
    COL_WEIGHT_DICT[colname] = 1
    print("Weighting", colname, "by", weight) 
    X_all_rf[:,i] *= weight # multiply by the weight to give this column ("feature")


Weighting Title by 1
Weighting Body by 1
Weighting Tags by 1


In [37]:
#
# we scramble the data, to give a different TRAIN/TEST split each time...
# 
indices = np.random.permutation(len(y_all_rf))  # indices is a permutation-list

# we scramble both X and y, necessarily with the same permutation
X_labeled = X_all_rf[indices]              # we apply the _same_ permutation to each!
y_labeled = y_all_rf[indices]              # again...
print(X_labeled)
print(y_labeled)


[[ 2.023e-01  8.608e-01  3.900e+01]
 [ 0.000e+00  8.555e-01  4.590e+02]
 [ 2.732e-01  3.071e-01  5.000e+00]
 ...
 [ 2.732e-01  0.000e+00  8.000e+00]
 [ 0.000e+00 -4.696e-01  2.000e+00]
 [-3.089e-01  7.915e-01  0.000e+00]]
[0. 2. 0. ... 1. 2. 2.]


In [38]:
#
# We next separate into test data and training data ... 
#    + We will train on the training data...
#    + We will _not_ look at the testing data to build the model
#
# Then, afterward, we will test on the testing data -- and see how well we do!
#

#
# a common convention:  train on 80%, test on 20%    Let's define the TEST_PERCENT
#
NUM_ROWS = X_labeled.shape[0]     # the number of labeled rows
TEST_PERCENT = 0.20
TEST_SIZE = int(TEST_PERCENT*NUM_ROWS)   # no harm in rounding down

X_test = X_labeled[:TEST_SIZE]    # first section are for testing
y_test = y_labeled[:TEST_SIZE]

X_train = X_labeled[TEST_SIZE:]   # all the rest are for training
y_train = y_labeled[TEST_SIZE:]

print(f"training with {len(y_train)} rows;  testing with {len(y_test)} rows" )


training with 48000 rows;  testing with 12000 rows


In [39]:
#
# Function to print testing results in a vertical table (or, an excuse to f-string?)
#

def compare_labels(predicted_labels, actual_labels):
    """ a more neatly formatted comparison """
    NUM_LABELS = len(predicted_labels)
    num_correct = 0
    
    for i in range(NUM_LABELS): 
        p = int(round(predicted_labels[i]))         # round protects from fp error 
        a = int(round(actual_labels[i]))
        result = "incorrect"
        if p == a:  # if they match,
            result = ""       # no longer incorrect
            num_correct += 1  # and we count a match!

        # print(f"row {i:>3d} : {p} {a}   {result}") <- Use this line only for debugging purposes. Otherwise, the output is too long  

    print()
    print("Correct:", num_correct, "out of", NUM_LABELS)
    return num_correct

In [40]:
from sklearn import tree      # for decision trees
from sklearn import ensemble  # for random forests
from sklearn.model_selection import cross_val_score

In [41]:

# +++ This is the "Model-building and Model-training Cell"
#       
# Create a RF model and train it! 
#
best_depth = 6        # we don't know what depth to use, so we guess...
best_num_trees = 42   # again, we guess
rforest_model = ensemble.RandomForestClassifier(max_depth=best_depth, 
                                                n_estimators=best_num_trees)

# we train the model (it's one line!)
rforest_model.fit(X_train, y_train)                              # yay!  trained!
print(f"Built an RF with depth={best_depth} and #trees={best_num_trees}") 


Built an RF with depth=6 and #trees=42


In [42]:
#
# +++ This is the "Model-testing Cell"
#
# Now, let's see how well we did on our "held-out data" (the testing data)
#

# We run our test set!
predicted_labels = rforest_model.predict(X_test)
actual_labels = y_test

# Let's print them so we can compare...
print("Predicted labels:", predicted_labels)
print("Actual  labels  :", actual_labels)

# And, some overall results
num_correct = sum(predicted_labels == actual_labels)
total = len(actual_labels)
print(f"\nResults on test set:  {num_correct} correct out of {total} total.")

# and, let's print our table, too... 
compare_labels(predicted_labels,actual_labels)


Predicted labels: [2. 0. 1. ... 1. 2. 0.]
Actual  labels  : [0. 2. 0. ... 2. 2. 1.]

Results on test set:  5826 correct out of 12000 total.

Correct: 5826 out of 12000


5826

In [43]:
#
# Now, to TUNE the model (with cross-validation)...
#
#
# We used a depth of 1  and #trees of 42  
#
# So, we try several depths and # of trees
# 
# Again, the tradeoff is underfitting/overfitting...
#

In [44]:
#
# So, to compare different parameters, let's use cv
#

from sklearn.model_selection import cross_val_score

#
# cross-validation splits the training set into two pieces:
#   + model-building and model-validation. We'll use "build" and "validate"
#

#
# lab task:  wrap this loop in another one! (or create an inner one...)
#

best_accuracy = 0
best_depth = 1
best_num_trees = 42
for ntrees in range(50,300,100):  # range(50,300,100)
    for d in range(1,20):
        rforest_model = ensemble.RandomForestClassifier(max_depth=d, 
                                                        n_estimators=ntrees)
        cv_scores = cross_val_score( rforest_model, X_train, y_train, cv=5 ) # 5 means 80/20 split
        # print(cv_scores)  # if we want to see the five individual scores 
        average_cv_accuracy = cv_scores.mean()  # more likely, only their average
        if average_cv_accuracy >= best_accuracy:
            best_accuracy = average_cv_accuracy
            best_depth = d
            best_num_trees = ntrees
        print(f"depth: {d:2d} ntrees: {ntrees:3d} cv accuracy: {average_cv_accuracy:7.4f}")




print()
print(f"best_depth: {best_depth} and best_num_trees: {best_num_trees} are our choices, with accuracy {best_accuracy}.")  

#
# remember that the lab task is to complete this two-dimensional cv loop!
#


depth:  1 ntrees:  50 cv accuracy:  0.4381
depth:  2 ntrees:  50 cv accuracy:  0.4531
depth:  3 ntrees:  50 cv accuracy:  0.4650
depth:  4 ntrees:  50 cv accuracy:  0.4705
depth:  5 ntrees:  50 cv accuracy:  0.4761
depth:  6 ntrees:  50 cv accuracy:  0.4825
depth:  7 ntrees:  50 cv accuracy:  0.4906
depth:  8 ntrees:  50 cv accuracy:  0.4979
depth:  9 ntrees:  50 cv accuracy:  0.5043
depth: 10 ntrees:  50 cv accuracy:  0.5065
depth: 11 ntrees:  50 cv accuracy:  0.5086
depth: 12 ntrees:  50 cv accuracy:  0.5098
depth: 13 ntrees:  50 cv accuracy:  0.5113
depth: 14 ntrees:  50 cv accuracy:  0.5118
depth: 15 ntrees:  50 cv accuracy:  0.5095
depth: 16 ntrees:  50 cv accuracy:  0.5084
depth: 17 ntrees:  50 cv accuracy:  0.5063
depth: 18 ntrees:  50 cv accuracy:  0.5032
depth: 19 ntrees:  50 cv accuracy:  0.4992
depth:  1 ntrees: 150 cv accuracy:  0.4386
depth:  2 ntrees: 150 cv accuracy:  0.4547
depth:  3 ntrees: 150 cv accuracy:  0.4649
depth:  4 ntrees: 150 cv accuracy:  0.4720
depth:  5 n

In [45]:
#
# Now, we re-create and re-run the  "Model-building and -training Cell"
#
from sklearn import tree      # for decision trees
from sklearn import ensemble  # for random forests

# we should have best_depth and best_num_trees
rforest_model_tuned = ensemble.RandomForestClassifier(max_depth=best_depth, 
                                                      n_estimators=best_num_trees)

# we train the model (it's one line!)
rforest_model_tuned.fit(X_train, y_train)                              # yay!  trained!
print(f"Built an RF classifier with depth={best_depth} and ntrees={best_num_trees}") 



Built an RF classifier with depth=12 and ntrees=250


In [46]:
#
# +++ This is our "Model-testing Cell"
#
# Now, let's see how well we did on our "held-out data" (the testing data)
#

# We run our test set!
predicted_labels = rforest_model_tuned.predict(X_test)
actual_labels = y_test

# Let's print them so we can compare...
print("Predicted labels:", predicted_labels)
print("Actual  labels  :", actual_labels)

# And, some overall results
num_correct = sum(predicted_labels == actual_labels)
total = len(actual_labels)
print(f"\nResults on test set:  {num_correct} correct out of {total} total.")

# and, let's print our table, too...
compare_labels(predicted_labels,actual_labels)



Predicted labels: [0. 0. 1. ... 1. 0. 2.]
Actual  labels  : [0. 2. 0. ... 2. 2. 1.]

Results on test set:  6157 correct out of 12000 total.

Correct: 6157 out of 12000


6157

In [47]:
#
# feature importances can be even more "important" than predictions!
#

print(rforest_model_tuned.feature_importances_)
print()

# let's see them with each feature name:
IMPs = rforest_model_tuned.feature_importances_

# # sort the importances in ascending order
# IMPs = np.sort(IMPs)

feature_importances_dict = {}
# enumerate is great when you want indices _and_ elements!
for i, importance in enumerate(IMPs):
    perc = importance*100
    feature_importances_dict[concat_data.columns[i]] = perc
    
sorted_dict = {}
sorted_keys = sorted(feature_importances_dict, key=feature_importances_dict.get)
for w in sorted_keys:
    sorted_dict[w] =  feature_importances_dict[w]

for keys in sorted_dict.keys():
    print(f"Feature {keys:>12s} has {sorted_dict[keys]:>7.2f}% of the decision-making importance.")
    

[0.17816122 0.27425495 0.54758383]

Feature        Title has   17.82% of the decision-making importance.
Feature         Body has   27.43% of the decision-making importance.
Feature         Tags has   54.76% of the decision-making importance.


### Random Tree Visualization

In [48]:
#
# we can get the individual trees, if we want...
#
i = 0
one_rf_tree = rforest_model_tuned.estimators_[i]
print(f"One of the forest's trees is {one_rf_tree}")

# From there, it's possible to create a graphical version...
filename = f'rf_tree_{i:03d}.gv'    # .gv preferred over .dot
tree.export_graphviz(one_rf_tree, out_file=filename,  # the filename constructed above...!
                            feature_names=concat_data.columns[:-1], # actual feature names, not species
                            filled=True,              # fun!
                            rotate=False,             # False for Up/Down; True for L/R
                            class_names=["HQ", "LQ_CLOSE", "LQ_EDIT"],      # We know that 0=HQ, 1=LQ_CLOSE, 2=LQ_EDIT
                            leaves_parallel=True )    # lots of options!
print(f"file {filename} written. Try copying the result to http://viz-js.com/ \n")
with open(filename, "r") as f:
    file_text = f.read()
    print(file_text)

One of the forest's trees is DecisionTreeClassifier(max_depth=12, max_features='auto',
                       random_state=1336748726)
file rf_tree_000.gv written. Try copying the result to http://viz-js.com/ 

digraph Tree {
node [shape=box, style="filled", color="black", fontname="helvetica"] ;
graph [ranksep=equally, splines=polyline] ;
edge [fontname="helvetica"] ;
0 [label="Title <= 0.03\ngini = 0.667\nsamples = 30385\nvalue = [15826, 16188, 15986]\nclass = LQ_CLOSE", fillcolor="#fefffe"] ;
1 [label="Title <= -0.482\ngini = 0.666\nsamples = 24199\nvalue = [13564, 12687, 12014]\nclass = HQ", fillcolor="#fefbf8"] ;
0 -> 1 [labeldistance=2.5, labelangle=45, headlabel="True"] ;
2 [label="Title <= -0.516\ngini = 0.646\nsamples = 791\nvalue = [548, 322, 349]\nclass = HQ", fillcolor="#f9e2d2"] ;
1 -> 2 ;
3 [label="Body <= -0.867\ngini = 0.659\nsamples = 611\nvalue = [376, 290, 266]\nclass = HQ", fillcolor="#fceee4"] ;
2 -> 3 ;
4 [label="Body <= -0.928\ngini = 0.52\nsamples = 92\nvalue = 