# Grid Search
[SXSW Tweets Sentiment Analysis](https://github.com/czarinagluna/Twitter-Sentiment-Analysis/blob/main/sxsw-sentiment-analysis.ipynb)

Authors: Marcelo Scatena, Czarina Luna, Piotr Czolpik, Ross McKim

In [1]:
%store -r X_train_processed

In [2]:
X_train_processed

6488                        putting gun head give iphone 
1944     virtualwallet nfc iphone5 bc standardization ...
6869    want win ticket 1 party rule simple android un...
3640    still big line outside apple pop shop 3 day ip...
7209    go without saying google bread going amazing g...
                              ...                        
8852    case wanna stalk google austinjs rocksauce sha...
2671    de ipad new ubersocial iphone app store includ...
9048          bet man kindle apple sure pre order amazon 
8257    joke keynote trying hard kind flat android spi...
5259    da verpixelungsrechtthe right house pixelated ...
Name: text, Length: 6681, dtype: object

In [3]:
%store -r X_test_processed

In [4]:
%store -r y_train

In [5]:
%store -r y_test

In [49]:
import time

from imblearn.over_sampling import RandomOverSampler
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier

import warnings
warnings.filterwarnings('ignore')

In [50]:
baseline_models = {'LogisticRegression': {'classifier': LogisticRegression(max_iter=1000, random_state=20211122)}, 
                   'MultinomialNB': {'classifier': MultinomialNB()},
                   'DecisionTree': {'classifier': DecisionTreeClassifier(random_state=20211122)},
                   'RandomForest': {'classifier': RandomForestClassifier(random_state=20211122)},
                   'ExtraTrees': {'classifier': ExtraTreesClassifier(random_state=20211122)},
                   'GradientBoost': {'classifier': GradientBoostingClassifier(random_state=20211122)}
                  }

In [57]:
ros = RandomOverSampler(random_state=112221)
cv = CountVectorizer(lowercase=False)
tfidf = TfidfVectorizer(stop_words='english', lowercase=False)

tuned_params = {}

def run_gridsearch(params, name, vectorizer, models=baseline_models):
    for model, grid in params.items():
        print(model, 'Grid Search:')
        print(f'Time Started: {time.asctime()}')
        
        ros.fit_resample(np.array(X_train_processed).reshape(-1, 1), y_train)
        
        pipe = Pipeline(steps=[(str(vectorizer), vectorizer), ('classifier', models[model]['classifier'])])
        pipe.fit(X_train_processed, y_train)
        
        # scorer = make_scorer(accuracy_score, average='macro')
        gridsearch = GridSearchCV(estimator=pipe, param_grid=grid, scoring='recall_micro', cv=5, refit=False)
        gridsearch.fit(X_train_processed, y_train)
        
        print(f'Time Finished: {time.asctime()}')
        print(f'Cross validation scores: {gridsearch.cv_results_["mean_test_score"]}')
        print(f'Best cross validation score: {gridsearch.best_score_ :.2%}')
        print(f'Optimal parameters: {gridsearch.best_params_}')
        
        tuned_params[name] = gridsearch.best_params_

In [58]:
params_lr1 = {'LogisticRegression': [{
    'classifier__C':[0.001, 0.1, 1],
    'classifier__solver':['lbfgs', 'saga'],
    'classifier__fit_intercept':[True, False]
}]}

run_gridsearch(params_lr1, name='LogisticRegression1_cv', vectorizer=cv)

LogisticRegression Grid Search:
Time Started: Wed Feb 16 13:44:36 2022
Time Finished: Wed Feb 16 13:45:37 2022
Cross validation scores: [0.60290377 0.60290377 0.60844235 0.60844235 0.66905911 0.66905911
 0.66202408 0.66217378 0.67773973 0.67714116 0.66471645 0.66486615]
Best cross validation score: 67.77%
Optimal parameters: {'classifier__C': 1, 'classifier__fit_intercept': True, 'classifier__solver': 'lbfgs'}


In [59]:
params_lr1 = {'LogisticRegression': [{
    'classifier__C':[0.001, 0.1, 1],
    'classifier__solver':['lbfgs', 'saga'],
    'classifier__fit_intercept':[True, False]
}]}

run_gridsearch(params_lr1, name='LogisticRegression1_tfidf', vectorizer=tfidf)

LogisticRegression Grid Search:
Time Started: Wed Feb 16 13:46:38 2022
Time Finished: Wed Feb 16 13:47:04 2022
Cross validation scores: [0.60290377 0.60290377 0.61038846 0.61038846 0.63807758 0.63807758
 0.65095072 0.65095072 0.68013506 0.68013506 0.67355024 0.67355024]
Best cross validation score: 68.01%
Optimal parameters: {'classifier__C': 1, 'classifier__fit_intercept': True, 'classifier__solver': 'saga'}


In [60]:
params_lr2 = {'LogisticRegression': [{
    'classifier__C':[1, 1e10, 1e12, 1e15],
    'classifier__solver':['saga'],
    'classifier__fit_intercept':[True]
}]}

run_gridsearch(params_lr2, name='LogisticRegression2_tfidf', vectorizer=tfidf)

LogisticRegression Grid Search:
Time Started: Wed Feb 16 13:48:05 2022
Time Finished: Wed Feb 16 13:49:18 2022
Cross validation scores: [0.68013506 0.62684881 0.62684881 0.62684881]
Best cross validation score: 68.01%
Optimal parameters: {'classifier__C': 1, 'classifier__fit_intercept': True, 'classifier__solver': 'saga'}


In [62]:
LogisticRegression()

params_lr3 = {'LogisticRegression': [{
    'classifier__multi_class':['auto', 'ovr', 'multinomial'],
    'classifier__penalty':['l1', 'l2', 'elasticnet', 'none']
}]}

run_gridsearch(params_lr3, name='LogisticRegression3_tfidf', vectorizer=tfidf)

LogisticRegression Grid Search:
Time Started: Wed Feb 16 13:54:46 2022
Time Finished: Wed Feb 16 13:57:24 2022
Cross validation scores: [       nan 0.68013506        nan 0.6189149         nan 0.67384897
        nan 0.6214597         nan 0.68013506        nan 0.6189149 ]
Best cross validation score: 68.01%
Optimal parameters: {'classifier__multi_class': 'auto', 'classifier__penalty': 'l2'}


In [63]:
params_mn1 = {'MultinomialNB': [{
    'classifier__alpha':[.001, .01, .05, .1, .2, .4, .6, .8, 1]
}]}

run_gridsearch(params_mn1, name='MultinomialNB1_tfidf', vectorizer=tfidf)

MultinomialNB Grid Search:
Time Started: Wed Feb 16 14:04:45 2022
Time Finished: Wed Feb 16 14:04:51 2022
Cross validation scores: [0.65049714 0.65169508 0.65573666 0.65887903 0.66411922 0.66322135
 0.66157588 0.65618755 0.65319466]
Best cross validation score: 66.41%
Optimal parameters: {'classifier__alpha': 0.2}
