# code 3

## Revision changes
#### 4 sep 
1. attempting to change from tvec to TF-IDF
2. removed api and extract. using data from csv
3. added results and csv generator to curate results  

example executive summary [here](https://www.proposify.biz/blog/executive-summary)

In [1]:
import requests
import pandas as pd
import numpy as np
import time

# Import train test split
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV

# Import CountVectorizer,TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

#import confusion matrix fxn
from sklearn.metrics import confusion_matrix, plot_roc_curve

#### 1. Engineer a feature to turn `source_feature` into a 1/0 column, where 1 indicates `stocks` subreddit.

In [36]:
df = pd.read_csv('data.csv')

#### 2. Split our data into `X` and `y`.

In [37]:
X = df[['title']]
y = df['stocks']

#### 3. Split our data into training and testing sets.

In [38]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.25,
                                                    random_state=42,
                                                    stratify=y)

#### 4. Turn our text into features. [Documentation here](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html).

In [39]:
# Instantiate our CountVectorizer.
tvec = tvec = TfidfVectorizer(max_features = 500, stop_words = 'english',ngram_range=(1, 1))
# Fit our CountVectorizer on the training data and transform training data.
X_train_tvec = pd.DataFrame(tvec.fit_transform(X_train['title']).todense(),
                          columns = tvec.get_feature_names())
# Transform our testing data with the already-fit CountVectorizer.
X_test_tvec = pd.DataFrame(tvec.transform(X_test['title']).todense(),
                         columns = tvec.get_feature_names())

In [40]:
results = pd.DataFrame(columns=['model','parameters','train','test'])

#### 4.1 Fit a Naive Bayes model!

In [41]:
from sklearn.naive_bayes import MultinomialNB

In [42]:
nb = MultinomialNB()
model = nb.fit(X_train_tvec, y_train)
predictions = model.predict(X_test_tvec)
model.score(X_train_tvec, y_train)

0.6371728210099585

In [43]:
new_model = pd.DataFrame([['nb',None,model.score(X_train_tvec, y_train),model.score(X_test_tvec, y_test)]],columns=results.columns.tolist())
results = results.append(new_model,ignore_index=True)

In [44]:
print(confusion_matrix(y_test, predictions))
tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()
print("True Negatives: %s" % tn)
print("False Positives: %s" % fp)
print("False Negatives: %s" % fn)
print("True Positives: %s" % tp)

[[1006  883]
 [ 665 1229]]
True Negatives: 1006
False Positives: 883
False Negatives: 665
True Positives: 1229


#### 4.2 Fit a Random Forest model!

In [45]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
np.random.seed(42)

In [46]:
rf = RandomForestClassifier(n_estimators=100)
et = ExtraTreesClassifier(n_estimators=100)
print(f'rf: {cross_val_score(rf, X_train_tvec, y_train, cv=5).mean()}')
print(f'ef: {cross_val_score(et, X_train_tvec, y_train, cv=5).mean()}')

rf: 0.602098189930164
ef: 0.5983088670706301


In [47]:
rf_params = {
    'n_estimators': [20,40,100, 200],
    'max_depth': [None, 1, 5,10,20],
}
gs = GridSearchCV(rf, param_grid=rf_params, cv=5)
gs.fit(X_train_tvec, y_train)
print(gs.best_score_)
gs.best_params_

0.610382535728639


{'max_depth': 20, 'n_estimators': 100}

In [48]:
print(gs.score(X_train_tvec, y_train))
print(gs.score(X_test_tvec, y_test))

0.6823830087247731
0.591329632566746


In [49]:
new_model = pd.DataFrame([[gs.best_estimator_,gs.best_params_,gs.score(X_train_tvec, y_train),gs.score(X_test_tvec, y_test)]],columns=results.columns.tolist())
results = results.append(new_model,ignore_index=True)

In [51]:
predictions = gs.predict(X_test_tvec)
print(confusion_matrix(y_test, predictions))
tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()
print("True Negatives: %s" % tn)
print("False Positives: %s" % fp)
print("False Negatives: %s" % fn)
print("True Positives: %s" % tp)

NotFittedError: This GridSearchCV instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

#### 4.3 Fit a Ada boost model!

In [52]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

In [53]:
# timer
start_time = time.time()

ada = AdaBoostClassifier(base_estimator=DecisionTreeClassifier())
ada_params = {
    'n_estimators': [30,50,100],
    'base_estimator__max_depth': [1,2],
    'learning_rate': [.9, 1.]
}
gs = GridSearchCV(ada, param_grid=ada_params, cv=3)
gs.fit(X_train_tvec, y_train)

# timer
elapsed_time = time.time() - start_time
print(f"Elapsed time to compute the importances: "
      f"{elapsed_time:.3f} seconds")

print(gs.best_score_)
gs.best_params_

Elapsed time to compute the importances: 872.165 seconds
0.5937255646404245


{'base_estimator__max_depth': 2, 'learning_rate': 1.0, 'n_estimators': 100}

In [54]:
print(gs.score(X_train_tvec, y_train))
print(gs.score(X_test_tvec, y_test))

0.6704855909050851
0.5892149088025377


In [55]:
new_model = pd.DataFrame([[gs.best_estimator_,gs.best_params_,gs.score(X_train_tvec, y_train),gs.score(X_test_tvec, y_test)]],columns=results.columns.tolist())
results = results.append(new_model,ignore_index=True)

#### 4.4 Fit a Vote Classifier model!

In [56]:
from sklearn.ensemble import VotingClassifier

In [57]:
vote = VotingClassifier([
    ('tree', RandomForestClassifier()),
    ('ada', AdaBoostClassifier(base_estimator=DecisionTreeClassifier()))
])
vote_params = {
    'ada__n_estimators': [30,50],
    'tree__n_estimators': [50,75],
    'tree__max_depth': [1, 3],
    'voting': ['hard', 'soft']
}

# timer
start_time = time.time()

gs = GridSearchCV(vote, param_grid=vote_params, cv=3)
gs.fit(X_train_tvec, y_train)

# timer
elapsed_time = time.time() - start_time
print(f"Elapsed time to compute the importances: "
      f"{elapsed_time/60:.1f} minutes")

Elapsed time to compute the importances: 54.0 minutes


In [58]:
print(gs.best_params_)
print(gs.best_score_)
print(gs.score(X_test_tvec, y_test))

{'ada__n_estimators': 50, 'tree__max_depth': 1, 'tree__n_estimators': 50, 'voting': 'hard'}
0.5825331710479481
0.54903515728258


In [59]:
new_model = pd.DataFrame([[gs.best_estimator_,gs.best_params_,gs.score(X_train_tvec, y_train),gs.score(X_test_tvec, y_test)]],columns=results.columns.tolist())
results = results.append(new_model,ignore_index=True)

In [60]:
results.sort_values(by='test')[['model','parameters','test']]

Unnamed: 0,model,parameters,test
3,"VotingClassifier(estimators=[('tree',\n ...","{'ada__n_estimators': 50, 'tree__max_depth': 1...",0.549035
2,"(DecisionTreeClassifier(max_depth=2, random_st...","{'base_estimator__max_depth': 2, 'learning_rat...",0.589215
0,nb,,0.590801
1,"(DecisionTreeClassifier(max_depth=20, max_feat...","{'max_depth': 20, 'n_estimators': 100}",0.59133


In [61]:
for x,y in zip(results.model,results.test):
    print(x)
    print(y)
    print('='*12)

nb
0.5908009516256939
RandomForestClassifier(max_depth=20)
0.591329632566746
AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=2),
                   n_estimators=100)
0.5892149088025377
VotingClassifier(estimators=[('tree',
                              RandomForestClassifier(max_depth=1,
                                                     n_estimators=50)),
                             ('ada',
                              AdaBoostClassifier(base_estimator=DecisionTreeClassifier()))])
0.54903515728258


In [62]:
results

Unnamed: 0,model,parameters,train,test
0,nb,,0.637173,0.590801
1,"(DecisionTreeClassifier(max_depth=20, max_feat...","{'max_depth': 20, 'n_estimators': 100}",0.682383,0.59133
2,"(DecisionTreeClassifier(max_depth=2, random_st...","{'base_estimator__max_depth': 2, 'learning_rat...",0.670486,0.589215
3,"VotingClassifier(estimators=[('tree',\n ...","{'ada__n_estimators': 50, 'tree__max_depth': 1...",0.617873,0.549035


In [63]:
results['vectorizer'] = 'TfidfVectorizer'

In [64]:
results.to_csv('./results/code3_results.csv',index_label=False)

TF-IDF and RandomForestClassifier with  
max_depth: 5 and n_estimators: 200  
seems to be the best classifier out of those tested here.

In [2]:
pd.read_csv('code3_results.csv')

Unnamed: 0.1,Unnamed: 0,model,parameters,train,test,vectorizer
0,0,nb,,0.637173,0.590801,TfidfVectorizer
1,1,RandomForestClassifier(max_depth=20),"{'max_depth': 20, 'n_estimators': 100}",0.682383,0.59133,TfidfVectorizer
2,2,AdaBoostClassifier(base_estimator=DecisionTree...,"{'base_estimator__max_depth': 2, 'learning_rat...",0.670486,0.589215,TfidfVectorizer
3,3,"VotingClassifier(estimators=[('tree',\n ...","{'ada__n_estimators': 50, 'tree__max_depth': 1...",0.617873,0.549035,TfidfVectorizer


countVectoriser with VotingClassifier(estimators=[('tree',
                              RandomForestClassifier(max_depth=5,
                                                     n_estimators=50)),
                             ('ada',
                              AdaBoostClassifier(base_estimator=DecisionTreeClassifier(),
                                                 n_estimators=30))])) socre = 0.594237 is better than TF-IDF based on gridsearchCV results with Random Forest