An example executive summary can be found [here](https://www.proposify.biz/blog/executive-summary).

Revision changes:
1. changed "StockMarket" to "Paleontology"

In [2]:
import requests
import pandas as pd
import numpy as np

# Import train test split
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV

# Import CountVectorizer.
from sklearn.feature_extraction.text import CountVectorizer

#import confusion matrix fxn
from sklearn.metrics import confusion_matrix, plot_roc_curve

#### 1. Engineer a feature to turn `source_feature` into a 1/0 column, where 1 indicates the `stocks` subreddit.

In [4]:
# Read data.csv (resolved against the current working directory) into a DataFrame.
df = pd.read_csv(filepath_or_buffer='data.csv')

#### 2. Split our data into `X` and `y`.

In [5]:
# Features: a one-column DataFrame holding the post titles.
X = df.loc[:, ['title']]
# Target: the `stocks` column as a Series.
y = df.loc[:, 'stocks']

#### 3. Split our data into training and testing sets.

In [6]:
# Hold out 25% of the rows for testing. stratify=y keeps the class balance of
# y the same in both splits, and the fixed random_state makes the split repeatable.
split_options = dict(test_size=0.25, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

#### 4. Turn our text into features. [Documentation here](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html).

In [7]:
# Instantiate our CountVectorizer: keep at most 500 features and drop
# scikit-learn's built-in English stop words.
cvec = CountVectorizer(max_features=500, stop_words='english')

# Fit on the training titles only, then reuse the fitted vocabulary for the
# test titles so both matrices share the same columns.
train_counts = cvec.fit_transform(X_train['title'])
test_counts = cvec.transform(X_test['title'])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() when the installed version provides it and
# fall back to the old name otherwise.
if hasattr(cvec, 'get_feature_names_out'):
    feature_names = cvec.get_feature_names_out()
else:
    feature_names = cvec.get_feature_names()

# toarray() yields a plain ndarray; todense() returns the legacy np.matrix type.
X_train_cvec = pd.DataFrame(train_counts.toarray(), columns=feature_names)
X_test_cvec = pd.DataFrame(test_counts.toarray(), columns=feature_names)

#### 4.1 Fit a Naive Bayes model!

In [15]:
from sklearn.naive_bayes import MultinomialNB

In [16]:
# Train a multinomial Naive Bayes classifier on the word-count features.
nb = MultinomialNB()
nb.fit(X_train_cvec, y_train)
# fit() returns the estimator itself, so `model` and `nb` are the same object.
model = nb

# Keep the test-set predictions for the confusion matrix further down.
predictions = model.predict(X_test_cvec)

# Training score, shown as the cell's output.
model.score(X=X_train_cvec, y=y_train)

0.9115264797507788

In [17]:
# Score of the fitted Naive Bayes model on the held-out test split.
model.score(X=X_test_cvec, y=y_test)

0.8619402985074627

In [18]:
# Build the confusion matrix once and reuse it for the printout and the unpacking.
cm = confusion_matrix(y_test, predictions)
print(cm)

# ravel() flattens the matrix row by row into the four cell counts.
tn, fp, fn, tp = cm.ravel()
for template, count in (
    ("True Negatives: %s", tn),
    ("False Positives: %s", fp),
    ("False Negatives: %s", fn),
    ("True Positives: %s", tp),
):
    print(template % count)

[[255  16]
 [ 58 207]]
True Negatives: 255
False Positives: 16
False Negatives: 58
True Positives: 207


#### 4.2 Fit a Random Forest model!

In [19]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
# Seed NumPy's global random number generator so later cells that draw from it are repeatable.
np.random.seed(seed=42)

In [20]:
# Baseline comparison of two tree ensembles with 100 trees each, scored by the
# mean of 5-fold cross-validation on the training features.
rf = RandomForestClassifier(n_estimators=100)
et = ExtraTreesClassifier(n_estimators=100)
print(f'rf: {cross_val_score(rf, X_train_cvec, y_train, cv=5).mean()}')
# Label corrected from 'ef' to 'et' so it matches the ExtraTrees estimator above.
print(f'et: {cross_val_score(et, X_train_cvec, y_train, cv=5).mean()}')

rf: 0.864797507788162
ef: 0.8654205607476635


In [21]:
# Candidate forest sizes and tree depths (max_depth=None places no depth limit).
rf_params = dict(
    n_estimators=[100, 150, 200],
    max_depth=[None, 1, 5, 10, 20],
)

# 5-fold grid search over every combination, fitted on the training features.
gs = GridSearchCV(estimator=rf, param_grid=rf_params, cv=5)
gs.fit(X_train_cvec, y_train)

print(gs.best_score_)
gs.best_params_

0.8697819314641745


{'max_depth': None, 'n_estimators': 100}

In [22]:
# Score the fitted grid search on the training split first, then on the test split.
for features, labels in ((X_train_cvec, y_train), (X_test_cvec, y_test)):
    print(gs.score(features, labels))

0.9482866043613707
0.8432835820895522


In [23]:
# Test-set predictions from the fitted grid search.
predictions = gs.predict(X_test_cvec)

# Compute the confusion matrix a single time, then print it and unpack its cells.
cm = confusion_matrix(y_test, predictions)
print(cm)

# ravel() flattens the matrix row by row into the four cell counts.
tn, fp, fn, tp = cm.ravel()
for template, count in (
    ("True Negatives: %s", tn),
    ("False Positives: %s", fp),
    ("False Negatives: %s", fn),
    ("True Positives: %s", tp),
):
    print(template % count)

[[250  21]
 [ 63 202]]
True Negatives: 250
False Positives: 21
False Negatives: 63
True Positives: 202


#### 4.3 Fit a Voting Classifier model!

In [12]:
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier

In [9]:
# Three base learners combined by a VotingClassifier with its default settings.
base_learners = [
    ('tree', DecisionTreeClassifier()),
    ('ada', AdaBoostClassifier()),
    ('gb', GradientBoostingClassifier()),
]
vote = VotingClassifier(estimators=base_learners)

# Keys use the '<estimator name>__<parameter>' form to reach into each member.
vote_params = {
    'ada__n_estimators': [50, 75],
    'gb__n_estimators': [100, 125],
    'tree__max_depth': [None, 5],
}

# 3-fold grid search over the ensemble's settings.
# NOTE: this rebinds `gs`, replacing the random-forest search object.
gs = GridSearchCV(estimator=vote, param_grid=vote_params, cv=3)
gs.fit(X_train_cvec, y_train)

print(gs.best_score_)
gs.best_params_

0.5926686687207221


{'ada__n_estimators': 50, 'gb__n_estimators': 125, 'tree__max_depth': 5}

In [13]:
# Score of the most recently fitted grid search (`gs`) on the held-out test split.
gs.score(X=X_test_cvec, y=y_test)

0.5878932064499075

# Conclusion
The choice of subreddits affects the accuracy of the classifier.