# Imports

In [4]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import re
import string

from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline
plt.style.use('seaborn')

from wordcloud import WordCloud

from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer

from nltk import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from nltk.tokenize import regexp_tokenize, word_tokenize, RegexpTokenizer

from sklearn.linear_model import Lasso
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, RepeatedStratifiedKFold, cross_val_score, train_test_split
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score, classification_report, plot_confusion_matrix
from sklearn.dummy import DummyClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from imblearn.pipeline import Pipeline as ImPipeline
from imblearn.over_sampling import SMOTE

In [10]:
pip install keras==2.9

Collecting keras==2.9
  Using cached keras-2.9.0-py2.py3-none-any.whl (1.6 MB)
Installing collected packages: keras
  Attempting uninstall: keras
    Found existing installation: keras 2.8.0
    Uninstalling keras-2.8.0:
      Successfully uninstalled keras-2.8.0
Successfully installed keras-2.9.0
Note: you may need to restart the kernel to use updated packages.


ERROR: After October 2020 you may experience errors when installing or updating packages. This is because pip will change the way that it resolves dependency conflicts.

We recommend you use --use-feature=2020-resolver to test your packages with the new resolver before it becomes the default.

tensorflow 2.8.0 requires keras<2.9,>=2.8.0rc0, but you'll have keras 2.9.0 which is incompatible.


In [11]:
from keras.layers import Dense, Dropout, Conv1D, MaxPool1D, GlobalMaxPool1D, Embedding, Activation
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential

ImportError: cannot import name 'dtensor' from 'tensorflow.compat.v2.experimental' (C:\Users\alvaro\anaconda3\envs\learn-env\lib\site-packages\tensorflow\_api\v2\compat\v2\experimental\__init__.py)

# Dataframe Initialization and Observations

In [2]:
# load data and initialize dataframe
df = pd.read_csv('./data/judge-1377884607_tweet_product_company.csv',
                 encoding="ISO-8859-1")
df

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion
...,...,...,...
9088,Ipad everywhere. #SXSW {link},iPad,Positive emotion
9089,"Wave, buzz... RT @mention We interrupt your re...",,No emotion toward brand or product
9090,"Google's Zeiger, a physician never reported po...",,No emotion toward brand or product
9091,Some Verizon iPhone customers complained their...,,No emotion toward brand or product


In [3]:
# check for nulls
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9093 entries, 0 to 9092
Data columns (total 3 columns):
 #   Column                                              Non-Null Count  Dtype 
---  ------                                              --------------  ----- 
 0   tweet_text                                          9092 non-null   object
 1   emotion_in_tweet_is_directed_at                     3291 non-null   object
 2   is_there_an_emotion_directed_at_a_brand_or_product  9093 non-null   object
dtypes: object(3)
memory usage: 213.2+ KB


In [4]:
# count nulls
df.isnull().sum()

tweet_text                                               1
emotion_in_tweet_is_directed_at                       5802
is_there_an_emotion_directed_at_a_brand_or_product       0
dtype: int64

In [5]:
# view nulls vs subcategories
df['emotion_in_tweet_is_directed_at'].value_counts(dropna=False)

NaN                                5802
iPad                                946
Apple                               661
iPad or iPhone App                  470
Google                              430
iPhone                              297
Other Google product or service     293
Android App                          81
Android                              78
Other Apple product or service       35
Name: emotion_in_tweet_is_directed_at, dtype: int64

### Summary

We have a large number of nulls in the data set. We will need to figure out how to handle these nulls in data cleaning in addition to the standard NLP data cleaning procedures such as RegEx, stopword removal, lemmatization,  and tokenization. 

# Data Cleanup

In [6]:
def process_string(text):
    """This function returns a processed list of words from the given text
    
    This function removes html elements and urls using regular expression, then
    converts string to list of workds, them find the stem of words in the list of words and
    finally removes stopwords and punctuation marks from list of words.
    
    Args:
        text(string): The text from which html elements, urls, stopwords, punctuation are removed and lemmatized
        
    Returns:
        clean_text(string): A text formed after text preprocessing.
    """
    
    # Remove any urls from the text
    text = re.sub(r'https:\/\/.*[\r\n]*',
                  "",
                  str(text))
    
    # Remove any urls starting from www. in the text
    text = re.sub(r'www\.\w*\.\w\w\w',
                  "",
                  str(text))
    
    # Remove any html elements from the text
    text = re.sub(r"<[\w]*[\s]*/>",
                  "",
                  str(text))
    
    # Remove prediods  marks
    text = re.sub(r"[\.]*",
                  "",
                  str(text))
    
 
    # Initialize RegexpTokenizer
    pattern = "([a-zA-Z]+(?:'[a-z]+)?)"
    tokenizer = RegexpTokenizer(pattern)

    
    # Tokenize text
    text_tokens = tokenizer.tokenize(text.lower())
    
    lemmatizer  = WordNetLemmatizer()
    # Get english stopwords
    english_stopwords = stopwords.words("english")
    new_list = ["mention", "sxsw", 'link', 'rt', 'quot']
    english_stopwords.extend(new_list)
    
    cleaned_text_tokens = [] # A list to hold cleaned text tokens
    
    for word in text_tokens:
        if((word not in english_stopwords) and # Remove stopwords
            (word not in string.punctuation)): # Remove punctuation marks
                
                lemmas = lemmatizer.lemmatize(word) # Get lemma of the current word
                cleaned_text_tokens.append(lemmas) # Appened lemma word to list of cleaned list
    
    # Combine list into single string
    clean_text = " ".join(cleaned_text_tokens)
    
    return clean_text

In [7]:
# apply cleanup function to tweet_text column
df['tweet_text'] = df['tweet_text'].apply(process_string)

In [8]:
# categorize Tweets by 'Manufacturer' based on if Tweet contains certain keywords words
# instantiate list of keywords
is_apple = ['ipad', "ipad's", 'iphone', 'iphones', "iphone's", 'apple', "apple's", 'mac', 'ios']
is_google = ['google', "google's", 'pixel', "pixel's", 'pixels', 'android', "android's", "androids", 'nest']

In [9]:
# define functions that loop through keyword lists and assign a category in a new column in the df
def apple_sorter(x):
    for i in is_apple:
        if i.lower() in x.lower():
            return 'Apple'
        else:
            continue
        
def google_sorter(x):
    for i in is_google:
        if i.lower() in x.lower():
            return 'Google'
        else:
            continue

In [11]:
# apply apple and google sorters to tweet_text
df['Manufacturer'] = df['tweet_text'].apply(apple_sorter)
df['Google'] = df['tweet_text'].apply(google_sorter)

In [12]:
# merge created columns into 1 master column, 'Manufacturer'
df['Manufacturer'] = df['Manufacturer'].combine_first(df['Google'])

In [13]:
# drop extraneous columns
df.drop('Google', axis=1, inplace=True)
df.drop('emotion_in_tweet_is_directed_at', axis=1, inplace=True)

In [14]:
# recheck null counts
df['Manufacturer'].value_counts(dropna=False)

Apple     5548
Google    2762
NaN        783
Name: Manufacturer, dtype: int64

In [15]:
# drop remaining NaN - does not contain major keywords related to Apple and Google
df.dropna(inplace=True)

In [16]:
# recheck null counts
df['Manufacturer'].value_counts(dropna=False)

Apple     5548
Google    2762
Name: Manufacturer, dtype: int64

In [17]:
# check cleaned data
df.head()

Unnamed: 0,tweet_text,is_there_an_emotion_directed_at_a_brand_or_product,Manufacturer
0,wesley g iphone hr tweeting rise austin dead n...,Negative emotion,Apple
1,jessedee know fludapp awesome ipad iphone app ...,Positive emotion,Apple
2,swonderlin wait ipad also sale,Positive emotion,Apple
3,hope year's festival crashy year's iphone app,Negative emotion,Apple
4,sxtxstate great stuff fri marissa mayer google...,Positive emotion,Google


In [19]:
# rename columns
df.rename(columns={'tweet_text':'Text', 'is_there_an_emotion_directed_at_a_brand_or_product' : 'Sentiment'}, inplace=True)

In [20]:
# quick check
df.head()

Unnamed: 0,Text,Sentiment,Manufacturer
0,wesley g iphone hr tweeting rise austin dead n...,Negative emotion,Apple
1,jessedee know fludapp awesome ipad iphone app ...,Positive emotion,Apple
2,swonderlin wait ipad also sale,Positive emotion,Apple
3,hope year's festival crashy year's iphone app,Negative emotion,Apple
4,sxtxstate great stuff fri marissa mayer google...,Positive emotion,Google


In [21]:
# check value counts for Sentiment column
df['Sentiment'].value_counts(dropna=False)

No emotion toward brand or product    4651
Positive emotion                      2940
Negative emotion                       569
I can't tell                           150
Name: Sentiment, dtype: int64

In [22]:
# drop 'I can't tell' as this will not be relevant for modeling
df = df[df['Sentiment'] != "I can't tell"]

In [24]:
# final check
df['Sentiment'].value_counts(normalize=True)

No emotion toward brand or product    0.569975
Positive emotion                      0.360294
Negative emotion                      0.069730
Name: Sentiment, dtype: float64

# Train/Test Split, Label Encoding, Vectorization

In [25]:
# train test split with test size = 33% and random_state for result interpretability
X = df.Text
y = df.Sentiment

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
# TFIDF vectorize X_train and X_test
tf_idf = TfidfVectorizer()

X_train = tf_idf.fit_transform(X_train.tolist())
X_test = tf_idf.transform(X_test.tolist())

In [None]:
# check shapes
print(X_train.shape, X_test.shape)

In [26]:
# LabelEncode
label_encoder = preprocessing.LabelEncoder()

y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.transform(y_test)

In [32]:
from gensim.models.word2vec import Word2Vec

In [30]:
wordvec = []

for i in X_train:
    wordvec.append(i.split())
    
print(wordvec[:9])

[['gabacustweets', 'ipad', 'sold', 'went', 'new', 'buyer', 'report', 'socialmedia', 'brk'], ['google', 'coming', 'location', 'dominance', 'pcmagcom'], ['holler', 'gram', 'ipad', 'itunes', 'app', 'store', 'via', 'sters', 'great', 'app', 'madebymany', 'free'], ['get', 'badge', 'find', 'food', 'drink', 'figure', 'iphone', 'roaming', 'unpack', 'priority'], ['wonder', 'many', 'ipads', "they'll", 'sell', 'opening', 'pop', 'store'], ['help', 'win', 'ipad', 'click', 'like', 'following', 'picture', 'party', 'thanks'], ['google', 'test', 'check', 'offer', 'via', 'google', 'check'], ['deviantart', 'buy', 'ipad', 'austin', 'test', 'muro', 'drawing', 'super', 'fast', 'deviantart', 'htt', 'cont'], ['google', 'asks', 'want', 'know']]


In [34]:
word_2_vec = Word2Vec(sentences=wordvec, 
                      size=100, 
                      window=5, 
                      min_count=5, 
                      workers=16)
print(word_2_vec)

Word2Vec(vocab=1496, size=100, alpha=0.025)


In [12]:
token = Tokenizer(1496)
token.fit_on_texts(df['Text'])
text = token.texts_to_sequences(df['Text'])
text = pad_sequences(text, 75)
print(text[:9])

NameError: name 'Tokenizer' is not defined

In [None]:
keras_model = Sequential()
keras_model.add(word_2_vec.wv.get_keras_embedding(True))
keras_model.add(Dropout(0.2))
keras_model.add(Conv1D(50, 3, activation='relu', padding='same', strides=1))
keras_model.add(Conv1D(50, 3, activation='relu', padding='same', strides=1))
keras_model.add(MaxPool1D())
keras_model.add(Dropout(0.2))
keras_model.add(Conv1D(100, 3, activation='relu', padding='same', strides=1))
keras_model.add(Conv1D(100, 3, activation='relu', padding='same', strides=1))
keras_model.add(MaxPool1D())
keras_model.add(Dropout(0.2))
keras_model.add(Conv1D(200, 3, activation='relu', padding='same', strides=1))
keras_model.add(Conv1D(200, 3, activation='relu', padding='same', strides=1))
keras_model.add(GlobalMaxPool1D())
keras_model.add(Dropout(0.2))
keras_model.add(Dense(200))
keras_model.add(Activation('relu'))
keras_model.add(Dropout(0.2))
keras_model.add(Dense(2))
keras_model.add(Activation('softmax'))
keras_model.compile(loss='categorical_crossentropy', metrics=['acc'], optimizer='adam')
keras_model.fit(X_train, y_train, batch_size=16, epochs=3, validation_data=(X_test, y_test))

# Models

#### Baseline Model (Dummy)

In [None]:
# Create Dummy/Baseliner
estimator = DummyClassifier(strategy='most_frequent')
estimator.fit(X_train, y_train)

# Get predictions with dummy model
y_pred = estimator.predict(X_test)

# Set figsize and font scale
sns.set(rc={'figure.figsize':(8, 8)})
sns.set(font_scale=1)

# Set diply labesl for confusion matrix
display_labels = ['-', 'Neutral', '+']

# Plot a confusion matrix on the test data
plot_confusion_matrix(estimator=estimator,
                      X=X_test,
                      y_true=y_test,
                      display_labels=display_labels)

# Save confusion matrix as png and place it in the images folder
plt.savefig('images/Confusion_Matrix_Dummy');

target_names = ['-', 'Neutral', '+']
print(classification_report(y_test, y_pred, target_names=target_names))

Dummy model predicts 'Neutral' for all and has an accuracy of 57%

#### Logistic Regression with SMOTE

In [None]:
pipe = ImPipeline(steps=[
    ('sm', SMOTE(random_state=42)),
    ('estimator', LogisticRegression(random_state=42))
])

param_grid = {}
param_grid['estimator__C'] = [100, 10, 1.0, 0.1, 0.01]
param_grid['estimator__solver'] = ['newton-cg', 'lbfgs', 'liblinear']
param_grid['estimator__penalty'] = ['l2']

grid_search = GridSearchCV(estimator=pipe, 
                           param_grid=param_grid, 
                           cv=RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42), 
                           return_train_score=True, 
                           scoring='accuracy', 
                           n_jobs=-1,
                           verbose=2)

# Fit models run gridsearch
grid_search.fit(X_train, y_train)

# Mean training score
grid_train_score = np.mean(grid_search.cv_results_['mean_train_score'])

# Mean test score
grid_test_score = grid_search.score(X_test, y_test)

best_grid = grid_search.best_estimator_
best_grid.fit(X_train, y_train)
y_pred = best_grid.predict(X_test)

print(f"Mean Training Score: {grid_train_score:.2%}\n")
print(f"Mean Test Score: {grid_test_score:.2%}\n")

print(f"Optimal Parameters: {grid_search.best_params_}\n")
print(f"Testing Accuracy: {grid_search.best_score_:.2%}\n")


# Set figsize and font scale
sns.set(rc={'figure.figsize':(8, 8)})
sns.set(font_scale=1)

# Set diply labesl for confusion matrix
display_labels = ['-', 'Neutral', '+']

# Plot a confusion matrix on the test data
plot_confusion_matrix(estimator=best_grid,
                      X=X_test,
                      y_true=y_test,
                      display_labels=display_labels)


target_names = ['-', 'Neutral', '+']
print(classification_report(y_test, y_pred, target_names=target_names))

# Save confusion matrix as png and place it in the images folder
plt.savefig('images/Confusion_Matrix_LogReg');

Performs well on Training Data, but may be overfit due to its worse performance on unseen Test Data, let's see the results for the same pipeline without SMOTE

#### Logistic without SMOTE

In [None]:
pipe = ImPipeline(steps=[
    ('estimator', LogisticRegression(random_state=42))
])

param_grid = {}
param_grid['estimator__C'] = [100, 10, 1.0, 0.1, 0.01]
param_grid['estimator__solver'] = ['newton-cg', 'lbfgs', 'liblinear']
param_grid['estimator__penalty'] = ['l2']

grid_search = GridSearchCV(estimator=pipe, 
                           param_grid=param_grid, 
                           cv=RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42), 
                           return_train_score=True, 
                           scoring='accuracy', 
                           n_jobs=-1,
                           verbose=2)

# Fit models run gridsearch
grid_search.fit(X_train, y_train)

# Mean training score
grid_train_score = np.mean(grid_search.cv_results_['mean_train_score'])

# Mean test score
grid_test_score = grid_search.score(X_test, y_test)

best_grid = grid_search.best_estimator_
best_grid.fit(X_train, y_train)
y_pred = best_grid.predict(X_test)

print(f"Mean Training Score: {grid_train_score:.2%}\n")
print(f"Mean Test Score: {grid_test_score:.2%}\n")

print(f"Optimal Parameters: {grid_search.best_params_}\n")
print(f"Testing Accuracy: {grid_search.best_score_:.2%}\n")

# Set figsize and font scale
sns.set(rc={'figure.figsize':(8, 8)})
sns.set(font_scale=1)

# Set diply labesl for confusion matrix
display_labels = ['-', 'Neutral', '+']

# Plot a confusion matrix on the test data
plot_confusion_matrix(estimator=best_grid,
                      X=X_test,
                      y_true=y_test,
                      display_labels=display_labels)


target_names = ['-', 'Neutral', '+']
print(classification_report(y_test, y_pred, target_names=target_names))

# Save confusion matrix as png and place it in the images folder
plt.savefig('images/Confusion_Matrix_LogReg_NoSMOTE');

Performs better on unseen data without SMOTE, we will not use SMOTE in our next models

#### Ridge Regression

In [None]:
pipe = ImPipeline(steps=[
    ('estimator', RidgeClassifier(random_state=42))
])

param_grid = {}
param_grid['estimator__alpha'] = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

grid_search = GridSearchCV(estimator=pipe, 
                           param_grid=param_grid, 
                           cv=RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42), 
                           return_train_score=True, 
                           scoring='accuracy', 
                           n_jobs=-1,
                           verbose=2)

# Fit models run gridsearch
grid_search.fit(X_train, y_train)

# Mean training score
grid_train_score = np.mean(grid_search.cv_results_['mean_train_score'])

# Mean test score
grid_test_score = grid_search.score(X_test, y_test)

best_grid = grid_search.best_estimator_
best_grid.fit(X_train, y_train)
y_pred = best_grid.predict(X_test)

print(f"Mean Training Score: {grid_train_score:.2%}\n")
print(f"Mean Test Score: {grid_test_score:.2%}\n")

print(f"Optimal Parameters: {grid_search.best_params_}\n")
print(f"Testing Accuracy: {grid_search.best_score_:.2%}\n")

# Set figsize and font scale
sns.set(rc={'figure.figsize':(8, 8)})
sns.set(font_scale=1)

# Set diply labesl for confusion matrix
display_labels = ['-', 'Neutral', '+']

# Plot a confusion matrix on the test data
plot_confusion_matrix(estimator=best_grid,
                      X=X_test,
                      y_true=y_test,
                      display_labels=display_labels)

target_names = ['-', 'Neutral', '+']
print(classification_report(y_test, y_pred, target_names=target_names))

# Save confusion matrix as png and place it in the images folder
plt.savefig('images/Confusion_Matrix_Ridge');

Ridge yields a more overfit model, but it performs slightly better

#### K-Nearest Neighbors (KNN)

In [None]:
pipe = ImPipeline(steps=[
    ('estimator', KNeighborsClassifier())])

param_grid = {}
param_grid['estimator__n_neighbors'] = [1, 5, 9, 13, 17, 21]
param_grid['estimator__metric'] = ['euclidean', 'manhattan', 'minkowski']
param_grid['estimator__weights'] = ['uniform', 'distance']
    
grid_search = GridSearchCV(estimator=pipe, 
                           param_grid=param_grid, 
                           cv=RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42), 
                           return_train_score=True, 
                           scoring='accuracy', 
                           n_jobs=-1,
                           verbose=2)

# Fit models run gridsearch
grid_search.fit(X_train, y_train)

# Mean training score
grid_train_score = np.mean(grid_search.cv_results_['mean_train_score'])

# Mean test score
grid_test_score = grid_search.score(X_test, y_test)

best_grid = grid_search.best_estimator_
best_grid.fit(X_train, y_train)
y_pred = best_grid.predict(X_test)

print(f"Mean Training Score: {grid_train_score:.2%}\n")
print(f"Mean Test Score: {grid_test_score:.2%}\n")

print(f"Optimal Parameters: {grid_search.best_params_}\n")
print(f"Testing Accuracy: {grid_search.best_score_:.2%}\n")

# Set figsize and font scale
sns.set(rc={'figure.figsize':(8, 8)})
sns.set(font_scale=1)

# Set diply labesl for confusion matrix
display_labels = ['-', 'Neutral', '+']

# Plot a confusion matrix on the test data
plot_confusion_matrix(estimator=best_grid,
                      X=X_test,
                      y_true=y_test,
                      display_labels=display_labels)

target_names = ['-', 'Neutral', '+']
print(classification_report(y_test, y_pred, target_names=target_names))

# Save confusion matrix as png and place it in the images folder
plt.savefig('images/Confusion_Matrix_KNN');

KNN not good

In [None]:
pipe = ImPipeline(steps=[
    ('estimator', SVC())])

param_grid = {}
param_grid['estimator__kernel'] = ['linear', 'poly', 'rbf', 'sigmoid']
param_grid['estimator__C'] = [10, 1.0, 0.1]

grid_search = GridSearchCV(estimator=pipe, 
                           param_grid=param_grid, 
                           cv=RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42), 
                           return_train_score=True, 
                           scoring='accuracy', 
                           n_jobs=-1,
                           verbose=2)

# Fit models run gridsearch
grid_search.fit(X_train, y_train)

# Mean training score
grid_train_score = np.mean(grid_search.cv_results_['mean_train_score'])

# Mean test score
grid_test_score = grid_search.score(X_test, y_test)

best_grid = grid_search.best_estimator_
best_grid.fit(X_train, y_train)
y_pred = best_grid.predict(X_test)

print(f"Mean Training Score: {grid_train_score:.2%}\n")
print(f"Mean Test Score: {grid_test_score:.2%}\n")

print(f"Optimal Parameters: {grid_search.best_params_}\n")
print(f"Testing Accuracy: {grid_search.best_score_:.2%}\n")

# Set figsize and font scale
sns.set(rc={'figure.figsize':(8, 8)})
sns.set(font_scale=1)

# Set diply labesl for confusion matrix
display_labels = ['-', 'Neutral', '+']

# Plot a confusion matrix on the test data
plot_confusion_matrix(estimator=best_grid,
                      X=X_test,
                      y_true=y_test,
                      display_labels=display_labels)

target_names = ['-', 'Neutral', '+']
print(classification_report(y_test, y_pred, target_names=target_names))

# Save confusion matrix as png and place it in the images folder
plt.savefig('images/Confusion_Matrix_KNN');

#### Random Forest

In [None]:
pipe = Pipeline(steps=[
    ('estimator', RandomForestClassifier(random_state=42))
])

param_grid = {}
param_grid['estimator__n_estimators'] = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]
param_grid['estimator__max_features'] = ['auto', 'sqrt', 'log2']
param_grid['estimator__max_depth'] = [int(x) for x in np.linspace(10, 110, num=11)]
param_grid['estimator__min_samples_split'] = [2, 5, 10]
param_grid['estimator__min_samples_leaf'] = [1, 2, 4]

grid_search = RandomizedSearchCV(estimator=pipe, 
                                 param_distributions=param_grid, 
                                 cv=RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42), 
                                 return_train_score=True, 
                                 scoring='accuracy', 
                                 n_iter=100, 
                                 random_state=42, 
                                 n_jobs=-1, 
                                 verbose=2)

# Fit models run gridsearch
grid_search.fit(X_train, y_train)

# Mean training score
grid_train_score = np.mean(grid_search.cv_results_['mean_train_score'])

# Mean test score
grid_test_score = grid_search.score(X_test, y_test)

best_grid = grid_search.best_estimator_
best_grid.fit(X_train, y_train)
y_pred = best_grid.predict(X_test)

print(f"Mean Training Score: {grid_train_score:.2%}\n")
print(f"Mean Test Score: {grid_test_score:.2%}\n")

print(f"Optimal Parameters: {grid_search.best_params_}\n")
print(f"Testing Accuracy: {grid_search.best_score_:.2%}\n")

# Set figsize and font scale
sns.set(rc={'figure.figsize':(8, 8)})
sns.set(font_scale=1)

# Set diply labesl for confusion matrix
display_labels = ['-', 'Neutral', '+']

# Plot a confusion matrix on the test data
plot_confusion_matrix(estimator=best_grid,
                      X=X_test,
                      y_true=y_test,
                      display_labels=display_labels)

target_names = ['-', 'Neutral', '+']
print(classification_report(y_test, y_pred, target_names=target_names))

# Save confusion matrix as png and place it in the images folder
plt.savefig('images/Confusion_Matrix_RandomForest');