# Grid Search
[SXSW Tweets Sentiment Analysis](https://github.com/czarinagluna/Twitter-Sentiment-Analysis/blob/main/sxsw-sentiment-analysis.ipynb)

Authors: Marcelo Scatena, Czarina Luna, Piotr Czolpik, Ross McKim

In [1]:
%store -r X_train_processed

In [2]:
X_train_processed

6488                        putting gun head give iphone 
1944     virtualwallet nfc iphone5 bc standardization ...
6869    want win ticket 1 party rule simple android un...
3640    still big line outside apple pop shop 3 day ip...
7209    go without saying google bread going amazing g...
                              ...                        
8852    case wanna stalk google austinjs rocksauce sha...
2671    de ipad new ubersocial iphone app store includ...
9048          bet man kindle apple sure pre order amazon 
8257    joke keynote trying hard kind flat android spi...
5259    da verpixelungsrechtthe right house pixelated ...
Name: text, Length: 6681, dtype: object

In [3]:
%store -r X_test_processed

In [4]:
%store -r y_train

In [5]:
%store -r y_test

In [26]:
import time
import numpy as np

from imblearn.over_sampling import RandomOverSampler
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.svm import SVC

import warnings
warnings.filterwarnings('ignore')

In [34]:
baseline_models = {'LogisticRegression': {'classifier': LogisticRegression(max_iter=1000, random_state=20211122)}, 
                   'MultinomialNB': {'classifier': MultinomialNB()},
                   'DecisionTree': {'classifier': DecisionTreeClassifier(random_state=20211122)},
                   'RandomForest': {'classifier': RandomForestClassifier(random_state=20211122)},
                   'ExtraTrees': {'classifier': ExtraTreesClassifier(random_state=20211122)},
                   'GradientBoost': {'classifier': GradientBoostingClassifier(random_state=20211122)},
                   'VectorClass': {'classifier': SVC(random_state=20211122)},
                   'SGDClassifier': {'classifier': SGDClassifier(random_state=20211122)}
                  }

In [35]:
ros = RandomOverSampler(random_state=112221)
cv = CountVectorizer(lowercase=False)
tfidf = TfidfVectorizer(stop_words='english', lowercase=False)

tuned_params = {}

def run_gridsearch(params, name, vectorizer, models=baseline_models):
    for model, grid in params.items():
        print(model, 'Grid Search:')
        print(f'Time Started: {time.asctime()}')
        
        ros.fit_resample(np.array(X_train_processed).reshape(-1, 1), y_train)
        
        pipe = Pipeline(steps=[(str(vectorizer), vectorizer), ('classifier', models[model]['classifier'])])
        pipe.fit(X_train_processed, y_train)
        
        # scorer = make_scorer(accuracy_score, average='macro')
        gridsearch = GridSearchCV(estimator=pipe, param_grid=grid, scoring='recall_micro', cv=5, refit=False)
        gridsearch.fit(X_train_processed, y_train)
        
        print(f'Time Finished: {time.asctime()}')
        print(f'Cross validation scores: {gridsearch.cv_results_["mean_test_score"]}')
        print(f'Best cross validation score: {gridsearch.best_score_ :.2%}')
        print(f'Optimal parameters: {gridsearch.best_params_}')
        
        tuned_params[name] = gridsearch.best_params_

In [11]:
params_lr1 = {'LogisticRegression': [{
    'classifier__C':[0.001, 0.1, 1],
    'classifier__solver':['lbfgs', 'saga'],
    'classifier__fit_intercept':[True, False]
}]}

run_gridsearch(params_lr1, name='LogisticRegression1_cv', vectorizer=cv)

LogisticRegression Grid Search:
Time Started: Mon Feb 21 18:13:02 2022
Time Finished: Mon Feb 21 18:13:37 2022
Cross validation scores: [0.60290377 0.60290377 0.60844235 0.60844235 0.66905911 0.66905911
 0.66202408 0.66217378 0.67773973 0.67714116 0.66471645 0.66486615]
Best cross validation score: 67.77%
Optimal parameters: {'classifier__C': 1, 'classifier__fit_intercept': True, 'classifier__solver': 'lbfgs'}


In [12]:
params_lr1 = {'LogisticRegression': [{
    'classifier__C':[0.001, 0.1, 1],
    'classifier__solver':['lbfgs', 'saga'],
    'classifier__fit_intercept':[True, False]
}]}

run_gridsearch(params_lr1, name='LogisticRegression1_tfidf', vectorizer=tfidf)

LogisticRegression Grid Search:
Time Started: Mon Feb 21 18:14:32 2022
Time Finished: Mon Feb 21 18:14:47 2022
Cross validation scores: [0.60290377 0.60290377 0.61038846 0.61038846 0.63807758 0.63807758
 0.65095072 0.65095072 0.68013506 0.68013506 0.67355024 0.67355024]
Best cross validation score: 68.01%
Optimal parameters: {'classifier__C': 1, 'classifier__fit_intercept': True, 'classifier__solver': 'saga'}


In [13]:
params_lr2 = {'LogisticRegression': [{
    'classifier__C':[1, 1e10, 1e12, 1e15],
    'classifier__solver':['saga'],
    'classifier__fit_intercept':[True]
}]}

run_gridsearch(params_lr2, name='LogisticRegression2_tfidf', vectorizer=tfidf)

LogisticRegression Grid Search:
Time Started: Mon Feb 21 18:14:48 2022
Time Finished: Mon Feb 21 18:15:24 2022
Cross validation scores: [0.68013506 0.62684881 0.62684881 0.62684881]
Best cross validation score: 68.01%
Optimal parameters: {'classifier__C': 1, 'classifier__fit_intercept': True, 'classifier__solver': 'saga'}


In [14]:
LogisticRegression()

params_lr3 = {'LogisticRegression': [{
    'classifier__multi_class':['auto', 'ovr', 'multinomial'],
    'classifier__penalty':['l1', 'l2', 'elasticnet', 'none']
}]}

run_gridsearch(params_lr3, name='LogisticRegression3_tfidf', vectorizer=tfidf)

LogisticRegression Grid Search:
Time Started: Mon Feb 21 18:15:24 2022
Time Finished: Mon Feb 21 18:16:46 2022
Cross validation scores: [       nan 0.68013506        nan 0.61966329        nan 0.67384897
        nan 0.62175921        nan 0.68013506        nan 0.61966329]
Best cross validation score: 68.01%
Optimal parameters: {'classifier__multi_class': 'auto', 'classifier__penalty': 'l2'}


In [17]:
params_mn2 = {'MultinomialNB': [{
    'classifier__alpha':[.001, .01, .05, .1, .2, .4, .6, .8, 1]
}]}

run_gridsearch(params_mn2, name='MultinomialNB2_cv', vectorizer=cv)

MultinomialNB Grid Search:
Time Started: Mon Feb 21 18:17:04 2022
Time Finished: Mon Feb 21 18:17:08 2022
Cross validation scores: [0.63837284 0.64046843 0.64181517 0.64226461 0.64615694 0.64885155
 0.65334212 0.65438946 0.6563349 ]
Best cross validation score: 65.63%
Optimal parameters: {'classifier__alpha': 1}


In [15]:
params_mn1 = {'MultinomialNB': [{
    'classifier__alpha':[.001, .01, .05, .1, .2, .4, .6, .8, 1]
}]}

run_gridsearch(params_mn1, name='MultinomialNB1_tfidf', vectorizer=tfidf)

MultinomialNB Grid Search:
Time Started: Mon Feb 21 18:16:46 2022
Time Finished: Mon Feb 21 18:16:50 2022
Cross validation scores: [0.65049714 0.65169508 0.65573666 0.65887903 0.66411922 0.66322135
 0.66157588 0.65618755 0.65319466]
Best cross validation score: 66.41%
Optimal parameters: {'classifier__alpha': 0.2}


In [18]:
params_dtc1 = {'DecisionTree': [{
    'classifier__criterion':['gini', 'entropy'],
    'classifier__max_depth':[1, 2, 6, 10, 15],
    'classifier__ccp_alpha':[.001, .01, .1, .5]
}]}

run_gridsearch(params_dtc1, name='DecisionTree1_cv', vectorizer=cv)

DecisionTree Grid Search:
Time Started: Mon Feb 21 18:17:35 2022
Time Finished: Mon Feb 21 18:17:59 2022
Cross validation scores: [0.61083678 0.61323199 0.62071489 0.62954622 0.63238896 0.61083678
 0.61323199 0.62355965 0.62415823 0.6328384  0.60290377 0.60290377
 0.60290377 0.60290377 0.60290377 0.61083678 0.61083678 0.61083678
 0.61083678 0.61083678 0.60290377 0.60290377 0.60290377 0.60290377
 0.60290377 0.60290377 0.60290377 0.60290377 0.60290377 0.60290377
 0.60290377 0.60290377 0.60290377 0.60290377 0.60290377 0.60290377
 0.60290377 0.60290377 0.60290377 0.60290377]
Best cross validation score: 63.28%
Optimal parameters: {'classifier__ccp_alpha': 0.001, 'classifier__criterion': 'entropy', 'classifier__max_depth': 15}


In [19]:
params_dtc2 = {'DecisionTree': [{
    'classifier__criterion':['gini', 'entropy'],
    'classifier__max_depth':[1, 2, 6, 10, 15],
    'classifier__ccp_alpha':[.001, .01, .1, .5]
}]}

run_gridsearch(params_dtc2, name='DecisionTree1_tfidf', vectorizer=tfidf)

DecisionTree Grid Search:
Time Started: Mon Feb 21 18:17:59 2022
Time Finished: Mon Feb 21 18:18:30 2022
Cross validation scores: [0.61353139 0.6132321  0.61802028 0.62041515 0.62206175 0.61353139
 0.6133818  0.62086504 0.62550509 0.62475636 0.60290377 0.60290377
 0.60290377 0.60290377 0.60290377 0.61353139 0.61353139 0.61353139
 0.61353139 0.61353139 0.60290377 0.60290377 0.60290377 0.60290377
 0.60290377 0.60290377 0.60290377 0.60290377 0.60290377 0.60290377
 0.60290377 0.60290377 0.60290377 0.60290377 0.60290377 0.60290377
 0.60290377 0.60290377 0.60290377 0.60290377]
Best cross validation score: 62.55%
Optimal parameters: {'classifier__ccp_alpha': 0.001, 'classifier__criterion': 'entropy', 'classifier__max_depth': 10}


In [22]:
params_gbc1 = {'GradientBoost': [{
    'classifier__learning_rate':[.001, .01],
    'classifier__n_estimators':[100, 200],
    'classifier__max_depth':[5, 10]
}]}

run_gridsearch(params_gbc1, name='GradBoost1_cv', vectorizer=cv)

GradientBoost Grid Search:
Time Started: Mon Feb 21 18:19:33 2022
Time Finished: Mon Feb 21 18:27:31 2022
Cross validation scores: [0.60290377 0.60290377 0.60290377 0.61293259 0.63927609 0.65319533
 0.65828369 0.66921072]
Best cross validation score: 66.92%
Optimal parameters: {'classifier__learning_rate': 0.01, 'classifier__max_depth': 10, 'classifier__n_estimators': 200}


In [23]:
params_gbc1 = {'GradientBoost': [{
    'classifier__learning_rate':[.001, .01],
    'classifier__n_estimators':[100, 200],
    'classifier__max_depth':[5, 10]
}]}

run_gridsearch(params_gbc1, name='GradBoost1_tfidf', vectorizer=tfidf)

GradientBoost Grid Search:
Time Started: Mon Feb 21 18:27:31 2022
Time Finished: Mon Feb 21 18:39:39 2022
Cross validation scores: [0.60290377 0.60320294 0.60290377 0.61712331 0.63792833 0.65379458
 0.65753609 0.66861315]
Best cross validation score: 66.86%
Optimal parameters: {'classifier__learning_rate': 0.01, 'classifier__max_depth': 10, 'classifier__n_estimators': 200}


In [24]:
params_rf1 = {'RandomForest': [{
    'classifier__n_estimators':[100, 1000],
    'classifier__criterion':['gini', 'entropy'],
    'classifier__max_depth':[1, 2, 6, 10, 15],
    'classifier__ccp_alpha':[.001, .01, .1, .5]
}]}

run_gridsearch(params_rf1, name='RandomForest1_cv', vectorizer=cv)

RandomForest Grid Search:
Time Started: Mon Feb 21 18:39:39 2022
Time Finished: Mon Feb 21 18:51:56 2022
Cross validation scores: [0.60290377 0.60290377 0.60290377 0.60290377 0.60305347 0.60290377
 0.60619684 0.60529864 0.60859194 0.60814295 0.60290377 0.60290377
 0.60290377 0.60290377 0.60305347 0.60290377 0.60559804 0.60544834
 0.60844224 0.60769385 0.60290377 0.60290377 0.60290377 0.60290377
 0.60290377 0.60290377 0.60290377 0.60290377 0.60290377 0.60290377
 0.60290377 0.60290377 0.60290377 0.60290377 0.60290377 0.60290377
 0.60290377 0.60290377 0.60290377 0.60290377 0.60290377 0.60290377
 0.60290377 0.60290377 0.60290377 0.60290377 0.60290377 0.60290377
 0.60290377 0.60290377 0.60290377 0.60290377 0.60290377 0.60290377
 0.60290377 0.60290377 0.60290377 0.60290377 0.60290377 0.60290377
 0.60290377 0.60290377 0.60290377 0.60290377 0.60290377 0.60290377
 0.60290377 0.60290377 0.60290377 0.60290377 0.60290377 0.60290377
 0.60290377 0.60290377 0.60290377 0.60290377 0.60290377 0.60290377

In [25]:
params_rf1 = {'RandomForest': [{
    'classifier__n_estimators':[100, 1000],
    'classifier__criterion':['gini', 'entropy'],
    'classifier__max_depth':[1, 2, 6, 10, 15],
    'classifier__ccp_alpha':[.001, .01, .1, .5]
}]}

run_gridsearch(params_rf1, name='RandomForest2_tfidf', vectorizer=tfidf)

RandomForest Grid Search:
Time Started: Mon Feb 21 18:51:56 2022
Time Finished: Mon Feb 21 19:05:19 2022
Cross validation scores: [0.60290377 0.60290377 0.60290377 0.60290377 0.60469984 0.60425074
 0.60649613 0.60619684 0.61263285 0.60904093 0.60290377 0.60290377
 0.60290377 0.60290377 0.60410104 0.60350223 0.60619684 0.60589744
 0.6103879  0.60829243 0.60290377 0.60290377 0.60290377 0.60290377
 0.60290377 0.60290377 0.60290377 0.60290377 0.60290377 0.60290377
 0.60290377 0.60290377 0.60290377 0.60290377 0.60290377 0.60290377
 0.60290377 0.60290377 0.60290377 0.60290377 0.60290377 0.60290377
 0.60290377 0.60290377 0.60290377 0.60290377 0.60290377 0.60290377
 0.60290377 0.60290377 0.60290377 0.60290377 0.60290377 0.60290377
 0.60290377 0.60290377 0.60290377 0.60290377 0.60290377 0.60290377
 0.60290377 0.60290377 0.60290377 0.60290377 0.60290377 0.60290377
 0.60290377 0.60290377 0.60290377 0.60290377 0.60290377 0.60290377
 0.60290377 0.60290377 0.60290377 0.60290377 0.60290377 0.60290377

In [36]:
params_svc1 = {'VectorClass': [{
    'classifier__C':[1, 2, 3],
    'classifier__kernel':['linear', 'poly', 'rbf', 'sigmoid'],
    'classifier__gamma':['scale', 'auto'],
    'classifier__shrinking':[True, False],
    'classifier__class_weight':['balanced', None],
}]}

run_gridsearch(params_svc1, name='SVC1_cv', vectorizer=cv)

VectorClass Grid Search:
Time Started: Mon Feb 21 19:23:01 2022
Time Finished: Mon Feb 21 19:57:38 2022
Cross validation scores: [0.65588681 0.6557371  0.65304339 0.65304339 0.66471802 0.66471802
 0.5798492  0.57820294 0.65588681 0.6557371  0.38763431 0.38763431
 0.38763431 0.38763431 0.38763431 0.38763431 0.66980706 0.66965736
 0.643914   0.64406371 0.6826803  0.6823809  0.65139724 0.65244515
 0.66980706 0.66965736 0.60290377 0.60290377 0.60290377 0.60290377
 0.60290377 0.60290377 0.65528778 0.65543737 0.66097618 0.66097618
 0.68088221 0.68103192 0.56443452 0.56293751 0.65528778 0.65543737
 0.38763431 0.38763431 0.39167622 0.39167622 0.38763431 0.38763431
 0.66217277 0.66202307 0.65259418 0.65259418 0.69375456 0.69375456
 0.63643021 0.63717849 0.66217277 0.66202307 0.60290377 0.60290377
 0.60290377 0.60290377 0.60290377 0.60290377 0.64870174 0.64870174
 0.66636462 0.66636462 0.68537312 0.68537312 0.54332953 0.54692235
 0.64870174 0.64870174 0.38763431 0.38763431 0.41339501 0.41339501


In [37]:
params_svc1 = {'VectorClass': [{
    'classifier__C':[1, 2, 3],
    'classifier__kernel':['linear', 'poly', 'rbf', 'sigmoid'],
    'classifier__gamma':['scale', 'auto'],
    'classifier__shrinking':[True, False],
    'classifier__class_weight':['balanced', None],
}]}

run_gridsearch(params_svc1, name='SVC1_tfidf', vectorizer=tfidf)

VectorClass Grid Search:
Time Started: Mon Feb 21 19:57:38 2022
Time Finished: Mon Feb 21 20:31:33 2022
Cross validation scores: [0.63522958 0.63522958 0.66142629 0.66142629 0.66651286 0.66651286
 0.60619315 0.60619315 0.63522958 0.63522958 0.38763431 0.38763431
 0.38763431 0.38763431 0.38763431 0.38763431 0.68716998 0.68716998
 0.65843295 0.65843295 0.68792005 0.68792005 0.67145444 0.67100533
 0.68716998 0.68716998 0.60290377 0.60290377 0.60290377 0.60290377
 0.60290377 0.60290377 0.6415151  0.6415151  0.65843194 0.65843194
 0.67624228 0.67624228 0.59885704 0.59930647 0.6415151  0.6415151
 0.38763431 0.38763431 0.38763431 0.38763431 0.38763431 0.38763431
 0.67489587 0.67489587 0.66232438 0.66232438 0.69106208 0.69106208
 0.66007809 0.66007809 0.67489587 0.67489587 0.60290377 0.60290377
 0.60290377 0.60290377 0.60290377 0.60290377 0.64151476 0.64151476
 0.65888082 0.65903052 0.68028397 0.68028397 0.59391736 0.5930195
 0.64151476 0.64151476 0.38763431 0.38763431 0.38763431 0.38763431
 0

In [38]:
params_sgd1 = {'SGDClassifier': [{
    'classifier__penalty':['l1', 'l2'],
    'classifier__alpha':[0.0001, 0.001, 0.01],
    'classifier__shuffle':[True, False],
    'classifier__class_weight':['balanced', None],
}]}

run_gridsearch(params_sgd1, name='SGDClassifier1_cv', vectorizer=cv)

SGDClassifier Grid Search:
Time Started: Mon Feb 21 20:31:33 2022
Time Finished: Mon Feb 21 20:32:05 2022
Cross validation scores: [0.64196633 0.6422645  0.65573778 0.65843127 0.66112498 0.66501608
 0.66636383 0.67085418 0.62759899 0.62041247 0.6656141  0.66711099
 0.65379335 0.6537938  0.67788999 0.67863861 0.59197887 0.57476162
 0.63418313 0.63463156 0.60604636 0.60290377 0.63972418 0.6416704 ]
Best cross validation score: 67.86%
Optimal parameters: {'classifier__alpha': 0.001, 'classifier__class_weight': None, 'classifier__penalty': 'l2', 'classifier__shuffle': False}


In [39]:
params_sgd1 = {'SGDClassifier': [{
    'classifier__penalty':['l1', 'l2'],
    'classifier__alpha':[0.0001, 0.001, 0.01],
    'classifier__shuffle':[True, False],
    'classifier__class_weight':['balanced', None],
}]}

run_gridsearch(params_sgd1, name='SGDClassifier2_tfidf', vectorizer=tfidf)

SGDClassifier Grid Search:
Time Started: Mon Feb 21 20:32:05 2022
Time Finished: Mon Feb 21 20:32:31 2022
Cross validation scores: [0.6626221  0.65738348 0.65723321 0.660226   0.67908726 0.67998726
 0.67609359 0.67938678 0.61277986 0.62101306 0.644959   0.64436053
 0.63358735 0.63164247 0.63747968 0.63688088 0.60290377 0.60275418
 0.61173453 0.62669776 0.60290377 0.60290377 0.60290377 0.60290377]
Best cross validation score: 68.00%
Optimal parameters: {'classifier__alpha': 0.0001, 'classifier__class_weight': None, 'classifier__penalty': 'l1', 'classifier__shuffle': False}
