In [1]:
import pandas as pd
import numpy as np
import nltk
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.metrics import classification_report, f1_score
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction import DictVectorizer
from nltk import pos_tag
# %pip install textblob
from textblob import TextBlob
from collections import Counter
import re
from scipy.sparse import hstack
from sklearn.feature_selection import chi2, SelectKBest
from nltk import word_tokenize
from gensim.models import Word2Vec
import warnings, time
warnings.filterwarnings('ignore')

In [2]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [3]:
from google.colab import drive
drive.mount('/content/drive')
GOOGLE_PATH_PREFIX = "drive/MyDrive/EECS595/project/"

Mounted at /content/drive


In [4]:
def load_data(path):
    df = pd.read_csv(path, sep='\t', encoding='ISO-8859-1')
    return df

In [5]:

train = pd.read_csv(GOOGLE_PATH_PREFIX + 'dataset/semeval2016-task6-trainingdata.txt', sep='\t', encoding='ISO-8859-1')
test = pd.read_csv(GOOGLE_PATH_PREFIX + 'dataset/semeval2016-task6-testdata-gold/SemEval2016-Task6-subtaskA-testdata-gold.txt', sep='\t', encoding='ISO-8859-1')

In [6]:
feminist = train[train["Target"] == 'Feminist Movement'][['Tweet', 'Target']]
hillary = train[train["Target"] == 'Hillary Clinton'][['Tweet', 'Target']]
abortion = train[train["Target"] == 'Legalization of Abortion'][['Tweet', 'Target']]
atheism = train[train["Target"] == 'Atheism'][['Tweet', 'Target']]
climate = train[train["Target"] == 'Climate Change is a Real Concern'][['Tweet', 'Target']]

In [7]:
stance_detail = pd.read_csv(GOOGLE_PATH_PREFIX + '/dataset/StanceDataset/train.csv', encoding = 'ISO-8859-1', engine='python')

In [8]:
stance_detail['Opinion Towards'].unique()

array(['1.  The tweet explicitly expresses opinion about the target, a part of the target, or an aspect of the target.',
       '3.  The tweet is not explicitly expressing opinion. (For example, the tweet is simply giving information.)',
       '2. The tweet does NOT expresses opinion about the target but it HAS opinion about something or someone other than the target.'],
      dtype=object)

In [9]:
def split_data(train, test, name):
    X_train = train[train['Target']==name][['Tweet', 'Target']]
    y_train = train[train['Target']==name]['Stance']
    X_test = test[test['Target']==name][['Tweet', 'Target']]
    y_test = test[test['Target']==name]['Stance']
    return X_train, y_train, X_test, y_test

In [10]:
def report_score(feature_union, pipeline, X_test, y_test):
    # X_test = feature_union.transform(X_test)
    prediction = pipeline.predict(X_test)
    report = classification_report(y_test, prediction, output_dict=True, zero_division=0)
    # print(classification_report(y_test, prediction, zero_division=0))
    f1_favor = report['FAVOR']['f1-score']
    f1_against = report['AGAINST']['f1-score']
    score = (f1_favor + f1_against)/2
    # print("The score of this model is {}.".format(score))

    return score

In [11]:
class TextPreprocessor(BaseEstimator, TransformerMixin):
    def __init__(self, lower=True, remove_at=True, lemmatize=False, remove_semst=True):
        self.lower = lower
        self.remove_at = remove_at
        self.lemmatize = lemmatize
        self.lemmatizer = WordNetLemmatizer()
        self.remove_semst = remove_semst

    def fit(self, X, y=None):
        return self

    def transform(self, text, y=None):
        processed_texts = []
        # for text in X:
        if self.lower:
            text = text.lower()
        if self.remove_at:
            text = re.sub(r'(@\w+\s?)', '', text)
        if self.remove_semst:
            text = re.sub(r'(\#semst\s?)', '', text)
        if self.lemmatize:
            text = ' '.join([self.lemmatizer.lemmatize(word) for word in text.split()])
            processed_texts.append(text)
        return text

In [12]:
def preprocess(text):
    return TextPreprocessor().transform(text)

In [13]:
def transform_all(data):
    return data.apply(preprocess)

In [14]:
class ModifiedTfidfVectorizer(BaseEstimator, TransformerMixin):
    def __init__(self, max_feature):
        self.max_feature = max_feature
        self.tfidf = TfidfVectorizer(max_features=self.max_feature)

    def fit(self, X, y=None):
        X = X['Tweet']
        self.tfidf.fit(X)
        return self
    def transform(self, X):
        X = X['Tweet']
        return self.tfidf.transform(X)

In [15]:
class SentimentExtractor(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return np.array([[TextBlob(text).sentiment.polarity, TextBlob(text).sentiment.subjectivity] for text in X['Tweet']])


In [40]:
C = [0.01, 0.1, 1, 10]
logistic_fit_intercept = [True, False]
logistic_class_weight = [None, 'balanced']
logistic_solver = ['lbfgs', 'liblinear', 'newton-cg']
n_neighbors = [3,4,5,6,7]
n_estimators = [10,25,50,75,100]
criterion =['gini', 'entropy']
max_depth =  [2,3,4,5,6,7,10]
decision_function_shape = ['ovo', 'ovr']
kernel = ['linear', 'rbf']
gamma = ['scale', 'auto', 0.1, 1, 10]
activation = ['relu', 'tanh']
hidden_layer_sizes = [(50,), (100,)]
alpha = [0.0001, 0.001, 0.01]
learning_rate = ['constant', 'adaptive']
classifiers = {
    'LogisticRegression': LogisticRegression(),
    'KNN': KNeighborsClassifier(),
    'RandomForest': RandomForestClassifier(),
    'SVM': SVC(probability=True),
    'GradientBoosting': GradientBoostingClassifier()
}
params_grid = {
    'LogisticRegression':{
        'classifier__C': C,
        'classifier__fit_intercept':logistic_fit_intercept,
        'classifier__class_weight': logistic_class_weight,
        'classifier__solver': logistic_solver
    },
    'KNN': {
        'classifier__n_neighbors':n_neighbors
    },
    'RandomForest': {
        'classifier__n_estimators':n_estimators,
        'classifier__criterion':criterion,
        'classifier__max_depth':max_depth
    },
    'SVM': {
        'classifier__class_weight': ['balanced', None],
        'classifier__decision_function_shape': decision_function_shape,
        'classifier__kernel':kernel,
        'classifier__gamma':gamma
    },
    'GradientBoosting': {
        'classifier__max_depth':max_depth,
        'classifier__n_estimators': n_estimators
    }
}

In [17]:
def train_word2vec(train):
    tweet = transform_all(train['Tweet'])
    tokenized_tweet = [word_tokenize(sentence) for sentence in tweet]
    wrod2vec_model = Word2Vec(tokenized_tweet, vector_size=200, window=5, min_count=1, workers=4)
    return wrod2vec_model

In [18]:
class Word2VecVectorizer(BaseEstimator, TransformerMixin):
    # Initialize with a pre-trained Word2Vec model
    def __init__(self, model):
        self.word2vec_model = model
        self.vector_size = model.vector_size

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return np.array([
            np.mean([self.word2vec_model.wv[word] for word in doc.split() if word in self.word2vec_model.wv]
                    or [np.random.rand(self.vector_size)], axis=0)
            for doc in X['Tweet']
        ])

In [19]:
class NGramVectorizer(BaseEstimator, TransformerMixin):
    def __init__(self, ngram_range_word=(1, 3), binary=True, k=200, ngram_range_char=None):
        self.binary = binary
        self.k = k
        self.ngram_range_char = ngram_range_char
        self.ngram_range_word = ngram_range_word
        self.word_vectorizer = CountVectorizer(ngram_range=self.ngram_range_word, binary=self.binary)
        if self.ngram_range_char is not None:
            self.char_vectorizer = CountVectorizer(analyzer='char', ngram_range=self.ngram_range_char, binary=self.binary)
        else:
            self.char_vectorizer = None
        self.selector = SelectKBest(chi2, k=self.k)
        self.is_fitted = False


    def fit(self, X, y=None):
        return self

    def fit_selector(self, X, y):
        word_features = self.word_vectorizer.fit_transform(X['Tweet'])
        if self.char_vectorizer is not None:
            char_features = self.char_vectorizer.fit_transform(X['Tweet'])
            combined_features = hstack([word_features, char_features])
        else:
            combined_features = word_features
        self.selector.fit(combined_features, y)
        self.is_fitted = True

    def transform(self, X):
        if not self.is_fitted:
            raise RuntimeError("You must call fit_selector before calling transform")
        word_features = self.word_vectorizer.transform(X['Tweet'])
        if self.char_vectorizer is not None:
            char_features = self.char_vectorizer.transform(X['Tweet'])
            combined_features = hstack([word_features, char_features])
            selected_features = self.selector.transform(combined_features)
        else:
            selected_features = self.selector.transform(word_features)
        return selected_features

In [20]:
target_words = {
    'Atheism': ['atheism', 'musili', 'god'],
    'Hillary Clinton': ['hillary', 'clinton'],
    'Climate Change is a Real Concern': ['climate'],
    'Feminist Movement': ['feminism', 'feminist', 'female', 'woman'],
    'Legalization of Abortion': ['abortion', 'prolife', 'youth']
}
unique_words = set()
for words in target_words.values():
    unique_words.update(words)
target_words = list(unique_words)

In [21]:
class TargetPresence_Single(BaseEstimator, TransformerMixin):
    def __init__(self, target_words_dict):
        self.target_words_dict = target_words_dict

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        features = []

        for _, row in X.iterrows():
            tweet = row['Tweet']
            target = row['Target']
            related_words = self.target_words_dict.get(target, [])

            # Check for the presence of each related word in the tweet
            presence = [int(word in tweet) for word in related_words]
            features.append(presence)

        return np.array(features)

In [22]:
class TargetPresence(BaseEstimator, TransformerMixin):
    def __init__(self, target_words_list):
        self.target_words_list = target_words_list

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        features = []
        for tweet in X['Tweet']:
            presence = [int(word in tweet.split()) for word in self.target_words_list]
            features.append(presence)

        return np.array(features)

In [23]:
class GloVeVectorizer(BaseEstimator, TransformerMixin):
    def __init__(self, glove_path, vector_size=200):
        self.glove_path = glove_path
        self.vector_size = vector_size
        self.embeddings = self.load_glove_embeddings()

    def load_glove_embeddings(self):
        embeddings = {}
        with open(self.glove_path, 'r', encoding='utf-8') as file:
            for line in file:
                values = line.split()
                word = values[0]
                vector = np.asarray(values[1:], dtype='float32')
                embeddings[word] = vector
        return embeddings

    def document_vector(self, doc):
        words = doc.split()
        word_vectors = [self.embeddings[word] for word in words if word in self.embeddings]

        if len(word_vectors) == 0:
            return np.zeros(self.vector_size)


        word_mean_vec = np.mean(word_vectors, axis=0)
        # if len(word_mean_vec) != self.vector_size:
        #     print(word_mean_vec)
        return word_mean_vec

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return np.array([self.document_vector(doc) for doc in X['Tweet']])

In [24]:
class PosTagVectorizer(BaseEstimator, TransformerMixin):
    def __init__(self):
      self.pos_tags = ['CC','CD','DT','EX','FW','IN','JJ','JJR','JJS','MD','NN','NNP','NNS','PDT','POS','PRP','PRP$','RB','RBR','RBS','RP','SYM','TO','UH','VB','VBD','VBG','VBN','VBP','VBZ','WDT','WP','WRB']
    def fit(self, X, y=None):
        return self

    def transform(self, X):
      transformed = []
      for tweet in X['Tweet']:
        tag_counts = {tag: 0 for tag in self.pos_tags}
        counts = Counter(tag for word, tag in pos_tag(word_tokenize(tweet)))
        for key, value in counts.items():
          if key in tag_counts.keys():
            tag_counts[key] = value
        transformed.append(tag_counts)
      return transformed

In [25]:
def classifier_grid(feature_union, classify_pipeline, classifiers, params_grid, X_train, y_train, X_test, y_test):
    best_score = 0
    best_classifier = None
    best_classifier_name = ''
    X_train_transformed = feature_union.fit_transform(X_train)
    X_test_transformed = feature_union.transform(X_test)
    for classifier_name, model in classifiers.items():
        classify_pipeline.set_params(classifier=model)
        grid_search = RandomizedSearchCV(classify_pipeline, param_distributions=params_grid[classifier_name], cv=5, verbose=0, random_state=595, n_jobs=-1)
        start = time.time()
        grid_search.fit(X_train_transformed, y_train)
        end = time.time()
        # print('The best parameter for {} is {}.'.format(classifier_name, grid_search.best_params_))
        # print("Grid Search for Model {} needs {} seconds.".format(classifier_name, end-start))
        # print("The score for {} is {:.2f}.".format(classifier_name, grid_search.best_score_))
        best_model = grid_search.best_estimator_
        score = report_score(feature_union, best_model, X_test_transformed, y_test)

        if score > best_score:
            best_score = score
            best_classifier = best_model
            best_classifier_name = classifier_name

    return feature_union, best_classifier, best_classifier_name, best_score

In [26]:
def find_best(feature_union, pipeline, train, test, name, classifiers, params_grid):
    X_train, y_train, X_test, y_test = split_data(train, test, name)
    feature_extraction, best_classifier, classifier_name, score = classifier_grid(feature_union, pipeline, classifiers, params_grid, X_train, y_train, X_test, y_test)
    print("The best model for {} is {} with {}".format(name, classifier_name, best_classifier.get_params()))
    # print("The avg F1-score is ", score)
    return feature_extraction, best_classifier, classifier_name, score

In [27]:
def train_test_all(train, test, feature_union, classify_pipeline, classifiers, params_grid):
    trained_classifiers = {}
    trained_feature_extraction = {}
    for name in train["Target"].unique():
        feature_extraction, classifier, classifier_name, score = find_best(feature_union, classify_pipeline, train, test, name, classifiers, params_grid)
        print("The best classifier for {} is {} with the average of F1 as {}.".format(name, classifier_name, score))
        trained_classifiers[name] = classifier
        trained_feature_extraction[name] = feature_extraction
    return trained_feature_extraction, trained_classifiers

In [28]:
def test_all(test_data, trained_feature_extraction, classifiers):
    predictions = pd.DataFrame(index=test_data.index)

    for target in classifiers:
        # Select the test data for the current target
        target_data = test_data[test_data['Target'] == target][['Tweet', 'Target']]

        if not target_data.empty:
            classifier = classifiers[target]
            feature_union = trained_feature_extraction[target]
            transformed_features = feature_union.transform(target_data)
            target_predictions = classifier.predict(transformed_features)
            predictions.loc[target_data.index, 'Prediction'] = target_predictions

    print(classification_report(test_data['Stance'], predictions))
    report = classification_report(test_data['Stance'], predictions, output_dict=True, zero_division=0)
    f1_favor = report['FAVOR']['f1-score']
    f1_against = report['AGAINST']['f1-score']
    score = (f1_favor + f1_against)/2
    print("The average F1-score for total test dataset is ", score)
    return score, predictions

In [29]:
feature_union = FeatureUnion([
    ('pos_tag', Pipeline([
            ('pos_extractor', PosTagVectorizer()),
            ('vectorizer', DictVectorizer())
        ]))
    # ('tfidf', ModifiedTfidfVectorizer(max_feature=500)),
    # ('sentiment', SentimentExtractor()),
    # ('bi-gram', CountVectorizer(ngram_range=(2,2))),
    # ('tri-gram', CountVectorizer(ngram_range=(3,3)))
])

# Model Pipeline
classify_pipeline = Pipeline([('classifier', None)])

In [30]:
def train_test_single(train, test, feature_union, pipeline, classifiers, params_grid):
    X_train = train[['Tweet', 'Target']]
    y_train = train['Stance']
    X_test = test[['Tweet', 'Target']]
    y_test = test['Stance']
    feature_extarction, best_classifier, classifier_name, score = classifier_grid(feature_union, pipeline, classifiers, params_grid, X_train, y_train, X_test, y_test)
    print("The best model for considering all targets is {} with {}".format(classifier_name, best_classifier.get_params()))
    print("The avg F1-score is ", score)

In [31]:
def test_on_opinionA(train, dataset, feature_union, classifiers):
  test_data = dataset[['Tweet', 'Target', 'Stance']]
  feature_union.fit(dataset[['Tweet', 'Target']])

  predictions = pd.DataFrame(index=test_data.index)

  for target, clf in classifiers.items():
    X = train[train['Target']==target][['Tweet', 'Target']]
    y = train[train['Target']==target]['Stance']
    X_transformed = feature_union.transform(X)
    clf.fit(X_transformed, y)
    target_data = test_data[test_data['Target'] == target][['Tweet', 'Target']]
    transformed_features = feature_union.transform(target_data)
    target_predictions = clf.predict(transformed_features)
    predictions.loc[target_data.index, 'Prediction'] = target_predictions

  print(classification_report(test_data['Stance'], predictions))
  report = classification_report(test_data['Stance'], predictions, output_dict=True, zero_division=0)
  f1_favor = report['FAVOR']['f1-score']
  f1_against = report['AGAINST']['f1-score']
  score = (f1_favor + f1_against)/2
  print("The average F1-score for total test dataset is ", score)

In [32]:
trump = load_data(GOOGLE_PATH_PREFIX + "dataset/semeval2016-task6-testdata-gold/SemEval2016-Task6-subtaskB-testdata-gold.txt")
trump["Tweet"] = transform_all(trump['Tweet'])
glove_vectorizer = GloVeVectorizer(GOOGLE_PATH_PREFIX+"glove.6B.200d.txt", vector_size=200)

In [33]:
def predict_model_taskB(tarin, test, feature_union, classifiers):

  feature_union.fit(train[['Tweet', 'Target']])
  for target, clf in classifiers.items():
    X = train[train['Target']==target][['Tweet', 'Target']]
    y = train[train['Target']==target]['Stance']
    X_transformed = feature_union.transform(X)
    clf.fit(X_transformed, y)

  X = test[['Tweet', 'Target']]
  y = test['Stance']
  X_transformed = feature_union.transform(X)

  probs = []
  for clf in classifiers.values():
    prob = clf.predict_proba(X_transformed)
    probs.append(prob)

  class_labels = list(classifiers.values())[0].classes_

  best_clf = None
  best_f1 = 0
  best_f1_report = None
  best_f1_targetmodel = None
  best_prediction = None
  i = 0
  for target, clf in classifiers.items():
    max_prob_indices = np.argmax(probs[i], axis=-1)
    predicted_stances = [class_labels[idx] for idx in max_prob_indices]
    report = classification_report(y, predicted_stances, output_dict=True)
    f1_favor = report['FAVOR']['f1-score']
    f1_against = report['AGAINST']['f1-score']
    score = (f1_favor + f1_against)/2
    i += 1
    if score > best_f1:
      best_f1 = score
      best_clf = clf
      best_f1_report = classification_report(y, predicted_stances)
      best_f1_targetmodel = target
      best_prediction = predicted_stances


  sum_probs = np.sum(probs, axis=0)
  max_prob_indices = np.argmax(sum_probs, axis=-1)
  predicted_stances = [class_labels[idx] for idx in max_prob_indices]
  report = classification_report(y, predicted_stances, output_dict=True)
  f1_favor = report['FAVOR']['f1-score']
  f1_against = report['AGAINST']['f1-score']
  score = (f1_favor + f1_against)/2

  if score > best_f1:
      best_f1 = score
      best_clf = "Soft Voting Classifier"
      best_f1_report = classification_report(y, predicted_stances)
      best_f1_targetmodel = "Five Combined"
      best_prediction = predicted_stances


  print("The average F1-score for task B is ", best_f1)
  print("The best Classifier for task B is ", best_clf)
  print("The classification reporst is: ", best_f1_report)
  print("The most similiar target for task B is ", best_f1_targetmodel)
  return predicted_stances, best_f1_targetmodel

In [34]:
stance_test = pd.read_csv(GOOGLE_PATH_PREFIX + '/dataset/StanceDataset/test.csv', encoding = 'ISO-8859-1', engine='python')
stance_test['Tweet'] = transform_all(stance_test['Tweet'])
stance_test_A = stance_test[stance_test['Target']!="Donald Trump"]
stance_test_B = stance_test[stance_test['Target']=="Donald Trump"]
opinion_to_target_A = stance_test_A[stance_test_A['Opinion Towards'].str.startswith('1')]
opinion_to_other_A = stance_test_A[stance_test_A['Opinion Towards'].str.startswith('2')]
opinion_to_target_B = stance_test_B[stance_test_B['Opinion Towards'].str.startswith('1')]
opinion_to_other_B = stance_test_B[stance_test_B['Opinion Towards'].str.startswith('2')]

In [35]:
def test_on_opinionB(train, dataset, feature_union, classifiers, best_f1_targetmodel):
  test_data = dataset[['Tweet', 'Target', 'Stance']]
  feature_union.fit(dataset[['Tweet', 'Target']])

  predictions = pd.DataFrame(index=test_data.index)
  if best_f1_targetmodel != "Five Combined":
    clf = classifiers[best_f1_targetmodel]
    target_data = test_data[['Tweet', 'Target']]
    transformed_features = feature_union.transform(target_data)
    target_predictions = clf.predict(transformed_features)
    predictions.loc[target_data.index, 'Prediction'] = target_predictions
  else:
    probs = []
    for target, clf in classifiers.items():
      target_data = test_data[['Tweet', 'Target']]
      transformed_features = feature_union.transform(target_data)
      prob = clf.predict_proba(transformed_features)
      probs.append(prob)
    sum_probs = np.sum(probs, axis=0)
    max_prob_indices = np.argmax(sum_probs, axis=-1)
    class_labels = list(classifiers.values())[0].classes_
    predicted_stances = [class_labels[idx] for idx in max_prob_indices]
    predictions.loc[target_data.index, 'Prediction'] = predicted_stances

  print(classification_report(test_data['Stance'], predictions))
  report = classification_report(test_data['Stance'], predictions, output_dict=True, zero_division=0)
  f1_favor = report['FAVOR']['f1-score']
  f1_against = report['AGAINST']['f1-score']
  score = (f1_favor + f1_against)/2
  print("The average F1-score for total test dataset is ", score)

In [36]:
train['Tweet'] = transform_all(train['Tweet'])
test['Tweet'] = transform_all(test['Tweet'])
word2vec_model = train_word2vec(train)
word2vec_vectorizer = Word2VecVectorizer(word2vec_model)

In [None]:
train['Tweet'] = transform_all(train['Tweet'])
test['Tweet'] = transform_all(test['Tweet'])
# only pos_tag
trained_feature_extraction, trained_classifiers = train_test_all(train, test, feature_union, classify_pipeline, classifiers, params_grid)
score, predictinos = test_all(test, trained_feature_extraction, trained_classifiers)
train_test_single(train, test, feature_union, classify_pipeline, classifiers, params_grid)

KeyboardInterrupt: ignored

In [None]:
# tfidf + bigram
# train = load_data(args.train_path)
transformers = []
transformers.append(('tfidf', ModifiedTfidfVectorizer(max_feature=500)))
train['Tweet'] = transform_all(train['Tweet'])
X_train = train[['Tweet', 'Target']]
y_train = train['Stance']
bigram_transformer = NGramVectorizer(ngram_range_word=(2,2))
bigram_transformer.fit_selector(X_train, y_train)
transformers.append(('bigram', bigram_transformer))

feature_union = FeatureUnion(transformers)
trained_feature_extraction, trained_classifiers = train_test_all(train, test, feature_union, classify_pipeline, classifiers, params_grid)
score, predictinos = test_all(test, trained_feature_extraction, trained_classifiers)
train_test_single(train, test, feature_union, classify_pipeline, classifiers, params_grid)

The best model for Atheism is GradientBoosting with {'memory': None, 'steps': [('classifier', GradientBoostingClassifier(max_depth=10))], 'verbose': False, 'classifier': GradientBoostingClassifier(max_depth=10), 'classifier__ccp_alpha': 0.0, 'classifier__criterion': 'friedman_mse', 'classifier__init': None, 'classifier__learning_rate': 0.1, 'classifier__loss': 'log_loss', 'classifier__max_depth': 10, 'classifier__max_features': None, 'classifier__max_leaf_nodes': None, 'classifier__min_impurity_decrease': 0.0, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__min_weight_fraction_leaf': 0.0, 'classifier__n_estimators': 100, 'classifier__n_iter_no_change': None, 'classifier__random_state': None, 'classifier__subsample': 1.0, 'classifier__tol': 0.0001, 'classifier__validation_fraction': 0.1, 'classifier__verbose': 0, 'classifier__warm_start': False}
The best classifier for Atheism is GradientBoosting with the average of F1 as 0.5845596310125192.
The best 

In [None]:
# tfidf + trigram
# train = load_data(args.train_path)
transformers = []
transformers.append(('tfidf', ModifiedTfidfVectorizer(max_feature=500)))
train['Tweet'] = transform_all(train['Tweet'])
X_train = train[['Tweet', 'Target']]
y_train = train['Stance']
bigram_transformer = NGramVectorizer(ngram_range_word=(3,3))
bigram_transformer.fit_selector(X_train, y_train)
transformers.append(('bigram', bigram_transformer))

feature_union = FeatureUnion(transformers)
trained_feature_extraction, trained_classifiers = train_test_all(train, test, feature_union, classify_pipeline, classifiers, params_grid)
score, predictinos = test_all(test, trained_feature_extraction, trained_classifiers)
train_test_single(train, test, feature_union, classify_pipeline, classifiers, params_grid)

The best model for Atheism is GradientBoosting with {'memory': None, 'steps': [('classifier', GradientBoostingClassifier(max_depth=4))], 'verbose': False, 'classifier': GradientBoostingClassifier(max_depth=4), 'classifier__ccp_alpha': 0.0, 'classifier__criterion': 'friedman_mse', 'classifier__init': None, 'classifier__learning_rate': 0.1, 'classifier__loss': 'log_loss', 'classifier__max_depth': 4, 'classifier__max_features': None, 'classifier__max_leaf_nodes': None, 'classifier__min_impurity_decrease': 0.0, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__min_weight_fraction_leaf': 0.0, 'classifier__n_estimators': 100, 'classifier__n_iter_no_change': None, 'classifier__random_state': None, 'classifier__subsample': 1.0, 'classifier__tol': 0.0001, 'classifier__validation_fraction': 0.1, 'classifier__verbose': 0, 'classifier__warm_start': False}
The best classifier for Atheism is GradientBoosting with the average of F1 as 0.5766905508284819.
The best mod

In [None]:
# tfidf + ngram
# train = load_data(args.train_path)
transformers = []
transformers.append(('tfidf', ModifiedTfidfVectorizer(max_feature=500)))
train['Tweet'] = transform_all(train['Tweet'])
X_train = train[['Tweet', 'Target']]
y_train = train['Stance']
bigram_transformer = NGramVectorizer(ngram_range_word=(1,3), ngram_range_char=(2, 5))
bigram_transformer.fit_selector(X_train, y_train)
transformers.append(('bigram', bigram_transformer))

feature_union = FeatureUnion(transformers)
trained_feature_extraction, trained_classifiers = train_test_all(train, test, feature_union, classify_pipeline, classifiers, params_grid)
score, predictinos = test_all(test, trained_feature_extraction, trained_classifiers)
train_test_single(train, test, feature_union, classify_pipeline, classifiers, params_grid)

The best model for Atheism is GradientBoosting with {'memory': None, 'steps': [('classifier', GradientBoostingClassifier())], 'verbose': False, 'classifier': GradientBoostingClassifier(), 'classifier__ccp_alpha': 0.0, 'classifier__criterion': 'friedman_mse', 'classifier__init': None, 'classifier__learning_rate': 0.1, 'classifier__loss': 'log_loss', 'classifier__max_depth': 3, 'classifier__max_features': None, 'classifier__max_leaf_nodes': None, 'classifier__min_impurity_decrease': 0.0, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__min_weight_fraction_leaf': 0.0, 'classifier__n_estimators': 100, 'classifier__n_iter_no_change': None, 'classifier__random_state': None, 'classifier__subsample': 1.0, 'classifier__tol': 0.0001, 'classifier__validation_fraction': 0.1, 'classifier__verbose': 0, 'classifier__warm_start': False}
The best classifier for Atheism is GradientBoosting with the average of F1 as 0.6645716346401525.
The best model for Climate Change 

In [None]:
# tfidf + sentiment
transformers = []
transformers.append(('tfidf', ModifiedTfidfVectorizer(max_feature=500)))
transformers.append(('sentiment', SentimentExtractor()))
# transformers.append(('bigram', bigram_transformer))

feature_union = FeatureUnion(transformers)
trained_feature_extraction, trained_classifiers = train_test_all(train, test, feature_union, classify_pipeline, classifiers, params_grid)
score, predictinos = test_all(test, trained_feature_extraction, trained_classifiers)
train_test_single(train, test, feature_union, classify_pipeline, classifiers, params_grid)

The best model for Atheism is GradientBoosting with {'memory': None, 'steps': [('classifier', GradientBoostingClassifier(max_depth=5, n_estimators=25))], 'verbose': False, 'classifier': GradientBoostingClassifier(max_depth=5, n_estimators=25), 'classifier__ccp_alpha': 0.0, 'classifier__criterion': 'friedman_mse', 'classifier__init': None, 'classifier__learning_rate': 0.1, 'classifier__loss': 'log_loss', 'classifier__max_depth': 5, 'classifier__max_features': None, 'classifier__max_leaf_nodes': None, 'classifier__min_impurity_decrease': 0.0, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__min_weight_fraction_leaf': 0.0, 'classifier__n_estimators': 25, 'classifier__n_iter_no_change': None, 'classifier__random_state': None, 'classifier__subsample': 1.0, 'classifier__tol': 0.0001, 'classifier__validation_fraction': 0.1, 'classifier__verbose': 0, 'classifier__warm_start': False}
The best classifier for Atheism is GradientBoosting with the average of F1 as

In [None]:
# tfidf + target
transformers = []
transformers.append(('tfidf', ModifiedTfidfVectorizer(max_feature=500)))
target_words = {
            'Atheism': ['atheism', 'god'],
            'Hillary Clinton': ['hillary', 'clinton'],
            'Climate Change is a Real Concern': ['climate'],
            'Feminist Movement': ['feminism', 'feminist', 'female', 'woman'],
            'Legalization of Abortion': ['abortion', 'prolife', 'youth']
        }
unique_words = set()
for words in target_words.values():
    unique_words.update(words)
target_words = list(unique_words)
transformers.append(('target_presence', TargetPresence(target_words)))

feature_union = FeatureUnion(transformers)
trained_feature_extraction, trained_classifiers = train_test_all(train, test, feature_union, classify_pipeline, classifiers, params_grid)
score, predictinos = test_all(test, trained_feature_extraction, trained_classifiers)
train_test_single(train, test, feature_union, classify_pipeline, classifiers, params_grid)

The best model for Atheism is GradientBoosting with {'memory': None, 'steps': [('classifier', GradientBoostingClassifier(max_depth=5, n_estimators=50))], 'verbose': False, 'classifier': GradientBoostingClassifier(max_depth=5, n_estimators=50), 'classifier__ccp_alpha': 0.0, 'classifier__criterion': 'friedman_mse', 'classifier__init': None, 'classifier__learning_rate': 0.1, 'classifier__loss': 'log_loss', 'classifier__max_depth': 5, 'classifier__max_features': None, 'classifier__max_leaf_nodes': None, 'classifier__min_impurity_decrease': 0.0, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__min_weight_fraction_leaf': 0.0, 'classifier__n_estimators': 50, 'classifier__n_iter_no_change': None, 'classifier__random_state': None, 'classifier__subsample': 1.0, 'classifier__tol': 0.0001, 'classifier__validation_fraction': 0.1, 'classifier__verbose': 0, 'classifier__warm_start': False}
The best classifier for Atheism is GradientBoosting with the average of F1 as

In [None]:
# tfidf + pos_tag
transformers = []
transformers.append(('tfidf', ModifiedTfidfVectorizer(max_feature=500)))
transformers.append(('pos_tag', Pipeline([
    ('pos_extractor', PosTagVectorizer()),
    ('vectorizer', DictVectorizer())
])))

feature_union = FeatureUnion(transformers)
trained_feature_extraction, trained_classifiers = train_test_all(train, test, feature_union, classify_pipeline, classifiers, params_grid)
score, predictinos = test_all(test, trained_feature_extraction, trained_classifiers)
train_test_single(train, test, feature_union, classify_pipeline, classifiers, params_grid)

The best model for Atheism is GradientBoosting with {'memory': None, 'steps': [('classifier', GradientBoostingClassifier(max_depth=4))], 'verbose': False, 'classifier': GradientBoostingClassifier(max_depth=4), 'classifier__ccp_alpha': 0.0, 'classifier__criterion': 'friedman_mse', 'classifier__init': None, 'classifier__learning_rate': 0.1, 'classifier__loss': 'log_loss', 'classifier__max_depth': 4, 'classifier__max_features': None, 'classifier__max_leaf_nodes': None, 'classifier__min_impurity_decrease': 0.0, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__min_weight_fraction_leaf': 0.0, 'classifier__n_estimators': 100, 'classifier__n_iter_no_change': None, 'classifier__random_state': None, 'classifier__subsample': 1.0, 'classifier__tol': 0.0001, 'classifier__validation_fraction': 0.1, 'classifier__verbose': 0, 'classifier__warm_start': False}
The best classifier for Atheism is GradientBoosting with the average of F1 as 0.6065232086525462.
The best mod

In [None]:
# tfidf + ngram + sentiment
# train = load_data(args.train_path)
transformers = []
transformers.append(('tfidf', ModifiedTfidfVectorizer(max_feature=500)))
transformers.append(('sentiment', SentimentExtractor()))
train['Tweet'] = transform_all(train['Tweet'])
X_train = train[['Tweet', 'Target']]
y_train = train['Stance']
bigram_transformer = NGramVectorizer(ngram_range_word=(1,3), ngram_range_char=(2, 5))
bigram_transformer.fit_selector(X_train, y_train)
transformers.append(('bigram', bigram_transformer))

feature_union = FeatureUnion(transformers)
trained_feature_extraction, trained_classifiers = train_test_all(train, test, feature_union, classify_pipeline, classifiers, params_grid)
score, predictinos = test_all(test, trained_feature_extraction, trained_classifiers)
train_test_single(train, test, feature_union, classify_pipeline, classifiers, params_grid)

The best model for Atheism is GradientBoosting with {'memory': None, 'steps': [('classifier', GradientBoostingClassifier())], 'verbose': False, 'classifier': GradientBoostingClassifier(), 'classifier__ccp_alpha': 0.0, 'classifier__criterion': 'friedman_mse', 'classifier__init': None, 'classifier__learning_rate': 0.1, 'classifier__loss': 'log_loss', 'classifier__max_depth': 3, 'classifier__max_features': None, 'classifier__max_leaf_nodes': None, 'classifier__min_impurity_decrease': 0.0, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__min_weight_fraction_leaf': 0.0, 'classifier__n_estimators': 100, 'classifier__n_iter_no_change': None, 'classifier__random_state': None, 'classifier__subsample': 1.0, 'classifier__tol': 0.0001, 'classifier__validation_fraction': 0.1, 'classifier__verbose': 0, 'classifier__warm_start': False}
The best classifier for Atheism is GradientBoosting with the average of F1 as 0.6597859327217125.
The best model for Climate Change 

In [None]:
# tfidf + bigram + sentiment
# train = load_data(args.train_path)
transformers = []
transformers.append(('tfidf', ModifiedTfidfVectorizer(max_feature=500)))
transformers.append(('sentiment', SentimentExtractor()))
train['Tweet'] = transform_all(train['Tweet'])
X_train = train[['Tweet', 'Target']]
y_train = train['Stance']
bigram_transformer = NGramVectorizer(ngram_range_word=(2,2))
bigram_transformer.fit_selector(X_train, y_train)
transformers.append(('bigram', bigram_transformer))

feature_union = FeatureUnion(transformers)
trained_feature_extraction, trained_classifiers = train_test_all(train, test, feature_union, classify_pipeline, classifiers, params_grid)
score, predictinos = test_all(test, trained_feature_extraction, trained_classifiers)
train_test_single(train, test, feature_union, classify_pipeline, classifiers, params_grid)

The best model for Atheism is GradientBoosting with {'memory': None, 'steps': [('classifier', GradientBoostingClassifier(max_depth=5, n_estimators=50))], 'verbose': False, 'classifier': GradientBoostingClassifier(max_depth=5, n_estimators=50), 'classifier__ccp_alpha': 0.0, 'classifier__criterion': 'friedman_mse', 'classifier__init': None, 'classifier__learning_rate': 0.1, 'classifier__loss': 'log_loss', 'classifier__max_depth': 5, 'classifier__max_features': None, 'classifier__max_leaf_nodes': None, 'classifier__min_impurity_decrease': 0.0, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__min_weight_fraction_leaf': 0.0, 'classifier__n_estimators': 50, 'classifier__n_iter_no_change': None, 'classifier__random_state': None, 'classifier__subsample': 1.0, 'classifier__tol': 0.0001, 'classifier__validation_fraction': 0.1, 'classifier__verbose': 0, 'classifier__warm_start': False}
The best classifier for Atheism is GradientBoosting with the average of F1 as

In [None]:
# tfidf + trigram + sentiment
# train = load_data(args.train_path)
transformers = []
transformers.append(('tfidf', ModifiedTfidfVectorizer(max_feature=500)))
transformers.append(('sentiment', SentimentExtractor()))
train['Tweet'] = transform_all(train['Tweet'])
X_train = train[['Tweet', 'Target']]
y_train = train['Stance']
bigram_transformer = NGramVectorizer(ngram_range_word=(3,3))
bigram_transformer.fit_selector(X_train, y_train)
transformers.append(('bigram', bigram_transformer))

feature_union = FeatureUnion(transformers)
trained_feature_extraction, trained_classifiers = train_test_all(train, test, feature_union, classify_pipeline, classifiers, params_grid)
score, predictinos = test_all(test, trained_feature_extraction, trained_classifiers)
train_test_single(train, test, feature_union, classify_pipeline, classifiers, params_grid)

The best model for Atheism is GradientBoosting with {'memory': None, 'steps': [('classifier', GradientBoostingClassifier(max_depth=5, n_estimators=50))], 'verbose': False, 'classifier': GradientBoostingClassifier(max_depth=5, n_estimators=50), 'classifier__ccp_alpha': 0.0, 'classifier__criterion': 'friedman_mse', 'classifier__init': None, 'classifier__learning_rate': 0.1, 'classifier__loss': 'log_loss', 'classifier__max_depth': 5, 'classifier__max_features': None, 'classifier__max_leaf_nodes': None, 'classifier__min_impurity_decrease': 0.0, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__min_weight_fraction_leaf': 0.0, 'classifier__n_estimators': 50, 'classifier__n_iter_no_change': None, 'classifier__random_state': None, 'classifier__subsample': 1.0, 'classifier__tol': 0.0001, 'classifier__validation_fraction': 0.1, 'classifier__verbose': 0, 'classifier__warm_start': False}
The best classifier for Atheism is GradientBoosting with the average of F1 as

In [None]:
# tfidf + pos_tag + bigram
transformers = []
transformers.append(('tfidf', ModifiedTfidfVectorizer(max_feature=500)))
transformers.append(('pos_tag', Pipeline([
    ('pos_extractor', PosTagVectorizer()),
    ('vectorizer', DictVectorizer())
])))
train['Tweet'] = transform_all(train['Tweet'])
X_train = train[['Tweet', 'Target']]
y_train = train['Stance']
bigram_transformer = NGramVectorizer(ngram_range_word=(2,2))
bigram_transformer.fit_selector(X_train, y_train)
transformers.append(('bigram', bigram_transformer))

feature_union = FeatureUnion(transformers)
trained_feature_extraction, trained_classifiers = train_test_all(train, test, feature_union, classify_pipeline, classifiers, params_grid)
score, predictinos = test_all(test, trained_feature_extraction, trained_classifiers)
train_test_single(train, test, feature_union, classify_pipeline, classifiers, params_grid)

The best model for Atheism is GradientBoosting with {'memory': None, 'steps': [('classifier', GradientBoostingClassifier(max_depth=10))], 'verbose': False, 'classifier': GradientBoostingClassifier(max_depth=10), 'classifier__ccp_alpha': 0.0, 'classifier__criterion': 'friedman_mse', 'classifier__init': None, 'classifier__learning_rate': 0.1, 'classifier__loss': 'log_loss', 'classifier__max_depth': 10, 'classifier__max_features': None, 'classifier__max_leaf_nodes': None, 'classifier__min_impurity_decrease': 0.0, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__min_weight_fraction_leaf': 0.0, 'classifier__n_estimators': 100, 'classifier__n_iter_no_change': None, 'classifier__random_state': None, 'classifier__subsample': 1.0, 'classifier__tol': 0.0001, 'classifier__validation_fraction': 0.1, 'classifier__verbose': 0, 'classifier__warm_start': False}
The best classifier for Atheism is GradientBoosting with the average of F1 as 0.5556187443130118.
The best 

In [None]:
# tfidf + pos_tag + trigram
transformers = []
transformers.append(('tfidf', ModifiedTfidfVectorizer(max_feature=500)))
transformers.append(('pos_tag', Pipeline([
    ('pos_extractor', PosTagVectorizer()),
    ('vectorizer', DictVectorizer())
])))
train['Tweet'] = transform_all(train['Tweet'])
X_train = train[['Tweet', 'Target']]
y_train = train['Stance']
trigram_transformer = NGramVectorizer(ngram_range_word=(3,3))
trigram_transformer.fit_selector(X_train, y_train)
transformers.append(('trigram', trigram_transformer))

feature_union = FeatureUnion(transformers)
trained_feature_extraction, trained_classifiers = train_test_all(train, test, feature_union, classify_pipeline, classifiers, params_grid)
score, predictinos = test_all(test, trained_feature_extraction, trained_classifiers)
train_test_single(train, test, feature_union, classify_pipeline, classifiers, params_grid)

The best model for Atheism is GradientBoosting with {'memory': None, 'steps': [('classifier', GradientBoostingClassifier(max_depth=5, n_estimators=50))], 'verbose': False, 'classifier': GradientBoostingClassifier(max_depth=5, n_estimators=50), 'classifier__ccp_alpha': 0.0, 'classifier__criterion': 'friedman_mse', 'classifier__init': None, 'classifier__learning_rate': 0.1, 'classifier__loss': 'log_loss', 'classifier__max_depth': 5, 'classifier__max_features': None, 'classifier__max_leaf_nodes': None, 'classifier__min_impurity_decrease': 0.0, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__min_weight_fraction_leaf': 0.0, 'classifier__n_estimators': 50, 'classifier__n_iter_no_change': None, 'classifier__random_state': None, 'classifier__subsample': 1.0, 'classifier__tol': 0.0001, 'classifier__validation_fraction': 0.1, 'classifier__verbose': 0, 'classifier__warm_start': False}
The best classifier for Atheism is GradientBoosting with the average of F1 as

In [None]:
# tfidf + pos_tag + ngram
transformers = []
transformers.append(('tfidf', ModifiedTfidfVectorizer(max_feature=500)))
transformers.append(('pos_tag', Pipeline([
    ('pos_extractor', PosTagVectorizer()),
    ('vectorizer', DictVectorizer())
])))
train['Tweet'] = transform_all(train['Tweet'])
X_train = train[['Tweet', 'Target']]
y_train = train['Stance']
ngram_transformer = NGramVectorizer(ngram_range_word=(1,3), ngram_range_char=(2, 5))
ngram_transformer.fit_selector(X_train, y_train)
transformers.append(('trigram', ngram_transformer))

feature_union = FeatureUnion(transformers)
trained_feature_extraction, trained_classifiers = train_test_all(train, test, feature_union, classify_pipeline, classifiers, params_grid)
score, predictinos = test_all(test, trained_feature_extraction, trained_classifiers)
train_test_single(train, test, feature_union, classify_pipeline, classifiers, params_grid)

The best model for Atheism is RandomForest with {'memory': None, 'steps': [('classifier', RandomForestClassifier(max_depth=10, n_estimators=10))], 'verbose': False, 'classifier': RandomForestClassifier(max_depth=10, n_estimators=10), 'classifier__bootstrap': True, 'classifier__ccp_alpha': 0.0, 'classifier__class_weight': None, 'classifier__criterion': 'gini', 'classifier__max_depth': 10, 'classifier__max_features': 'sqrt', 'classifier__max_leaf_nodes': None, 'classifier__max_samples': None, 'classifier__min_impurity_decrease': 0.0, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__min_weight_fraction_leaf': 0.0, 'classifier__n_estimators': 10, 'classifier__n_jobs': None, 'classifier__oob_score': False, 'classifier__random_state': None, 'classifier__verbose': 0, 'classifier__warm_start': False}
The best classifier for Atheism is RandomForest with the average of F1 as 0.6986885245901638.
The best model for Climate Change is a Real Concern is GradientBoos

In [None]:
# tfidf + target + bigram
transformers = []
transformers.append(('tfidf', ModifiedTfidfVectorizer(max_feature=500)))
transformers.append(('target_presence', TargetPresence(target_words)))
train['Tweet'] = transform_all(train['Tweet'])
X_train = train[['Tweet', 'Target']]
y_train = train['Stance']
bigram_transformer = NGramVectorizer(ngram_range_word=(2,2))
bigram_transformer.fit_selector(X_train, y_train)
transformers.append(('bigram', bigram_transformer))

feature_union = FeatureUnion(transformers)
trained_feature_extraction, trained_classifiers = train_test_all(train, test, feature_union, classify_pipeline, classifiers, params_grid)
score, predictinos = test_all(test, trained_feature_extraction, trained_classifiers)
train_test_single(train, test, feature_union, classify_pipeline, classifiers, params_grid)

The best model for Atheism is GradientBoosting with {'memory': None, 'steps': [('classifier', GradientBoostingClassifier(max_depth=5, n_estimators=25))], 'verbose': False, 'classifier': GradientBoostingClassifier(max_depth=5, n_estimators=25), 'classifier__ccp_alpha': 0.0, 'classifier__criterion': 'friedman_mse', 'classifier__init': None, 'classifier__learning_rate': 0.1, 'classifier__loss': 'log_loss', 'classifier__max_depth': 5, 'classifier__max_features': None, 'classifier__max_leaf_nodes': None, 'classifier__min_impurity_decrease': 0.0, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__min_weight_fraction_leaf': 0.0, 'classifier__n_estimators': 25, 'classifier__n_iter_no_change': None, 'classifier__random_state': None, 'classifier__subsample': 1.0, 'classifier__tol': 0.0001, 'classifier__validation_fraction': 0.1, 'classifier__verbose': 0, 'classifier__warm_start': False}
The best classifier for Atheism is GradientBoosting with the average of F1 as

In [None]:
# tfidf + target + trigram
transformers = []
transformers.append(('tfidf', ModifiedTfidfVectorizer(max_feature=500)))
transformers.append(('target_presence', TargetPresence(target_words)))
train['Tweet'] = transform_all(train['Tweet'])
X_train = train[['Tweet', 'Target']]
y_train = train['Stance']
trigram_transformer = NGramVectorizer(ngram_range_word=(3,3))
trigram_transformer.fit_selector(X_train, y_train)
transformers.append(('trigram', trigram_transformer))

feature_union = FeatureUnion(transformers)
trained_feature_extraction, trained_classifiers = train_test_all(train, test, feature_union, classify_pipeline, classifiers, params_grid)
score, predictinos = test_all(test, trained_feature_extraction, trained_classifiers)
train_test_single(train, test, feature_union, classify_pipeline, classifiers, params_grid)

The best model for Atheism is GradientBoosting with {'memory': None, 'steps': [('classifier', GradientBoostingClassifier(n_estimators=25))], 'verbose': False, 'classifier': GradientBoostingClassifier(n_estimators=25), 'classifier__ccp_alpha': 0.0, 'classifier__criterion': 'friedman_mse', 'classifier__init': None, 'classifier__learning_rate': 0.1, 'classifier__loss': 'log_loss', 'classifier__max_depth': 3, 'classifier__max_features': None, 'classifier__max_leaf_nodes': None, 'classifier__min_impurity_decrease': 0.0, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__min_weight_fraction_leaf': 0.0, 'classifier__n_estimators': 25, 'classifier__n_iter_no_change': None, 'classifier__random_state': None, 'classifier__subsample': 1.0, 'classifier__tol': 0.0001, 'classifier__validation_fraction': 0.1, 'classifier__verbose': 0, 'classifier__warm_start': False}
The best classifier for Atheism is GradientBoosting with the average of F1 as 0.5701420969533945.
The b

In [None]:
# tfidf + target + ngram
transformers = []
transformers.append(('tfidf', ModifiedTfidfVectorizer(max_feature=500)))
transformers.append(('target_presence', TargetPresence(target_words)))

train['Tweet'] = transform_all(train['Tweet'])
X_train = train[['Tweet', 'Target']]
y_train = train['Stance']
ngram_transformer = NGramVectorizer(ngram_range_word=(1,3), ngram_range_char=(2, 5))
ngram_transformer.fit_selector(X_train, y_train)
transformers.append(('trigram', ngram_transformer))

feature_union = FeatureUnion(transformers)
trained_feature_extraction, trained_classifiers = train_test_all(train, test, feature_union, classify_pipeline, classifiers, params_grid)
score, predictinos = test_all(test, trained_feature_extraction, trained_classifiers)
train_test_single(train, test, feature_union, classify_pipeline, classifiers, params_grid)

The best model for Atheism is GradientBoosting with {'memory': None, 'steps': [('classifier', GradientBoostingClassifier(n_estimators=25))], 'verbose': False, 'classifier': GradientBoostingClassifier(n_estimators=25), 'classifier__ccp_alpha': 0.0, 'classifier__criterion': 'friedman_mse', 'classifier__init': None, 'classifier__learning_rate': 0.1, 'classifier__loss': 'log_loss', 'classifier__max_depth': 3, 'classifier__max_features': None, 'classifier__max_leaf_nodes': None, 'classifier__min_impurity_decrease': 0.0, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__min_weight_fraction_leaf': 0.0, 'classifier__n_estimators': 25, 'classifier__n_iter_no_change': None, 'classifier__random_state': None, 'classifier__subsample': 1.0, 'classifier__tol': 0.0001, 'classifier__validation_fraction': 0.1, 'classifier__verbose': 0, 'classifier__warm_start': False}
The best classifier for Atheism is GradientBoosting with the average of F1 as 0.6634897360703812.
The b

In [None]:
# tfidf + pos_tag + sentiment
transformers = []
transformers.append(('tfidf', ModifiedTfidfVectorizer(max_feature=500)))
transformers.append(('pos_tag', Pipeline([
    ('pos_extractor', PosTagVectorizer()),
    ('vectorizer', DictVectorizer())
])))
transformers.append(('sentiment', SentimentExtractor()))
# train['Tweet'] = transform_all(train['Tweet'])
# X_train = train[['Tweet', 'Target']]
# y_train = train['Stance']
# ngram_transformer = NGramVectorizer(ngram_range_word=(1,3), ngram_range_char=(2, 5))
# ngram_transformer.fit_selector(X_train, y_train)
# transformers.append(('trigram', ngram_transformer))

feature_union = FeatureUnion(transformers)
trained_feature_extraction, trained_classifiers = train_test_all(train, test, feature_union, classify_pipeline, classifiers, params_grid)
score, predictinos = test_all(test, trained_feature_extraction, trained_classifiers)
train_test_single(train, test, feature_union, classify_pipeline, classifiers, params_grid)

The best model for Atheism is GradientBoosting with {'memory': None, 'steps': [('classifier', GradientBoostingClassifier(max_depth=4))], 'verbose': False, 'classifier': GradientBoostingClassifier(max_depth=4), 'classifier__ccp_alpha': 0.0, 'classifier__criterion': 'friedman_mse', 'classifier__init': None, 'classifier__learning_rate': 0.1, 'classifier__loss': 'log_loss', 'classifier__max_depth': 4, 'classifier__max_features': None, 'classifier__max_leaf_nodes': None, 'classifier__min_impurity_decrease': 0.0, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__min_weight_fraction_leaf': 0.0, 'classifier__n_estimators': 100, 'classifier__n_iter_no_change': None, 'classifier__random_state': None, 'classifier__subsample': 1.0, 'classifier__tol': 0.0001, 'classifier__validation_fraction': 0.1, 'classifier__verbose': 0, 'classifier__warm_start': False}
The best classifier for Atheism is GradientBoosting with the average of F1 as 0.585978835978836.
The best mode

In [None]:
# tfidf + pos_tag + sentiment + ngram
transformers = []
transformers.append(('tfidf', ModifiedTfidfVectorizer(max_feature=500)))
transformers.append(('pos_tag', Pipeline([
    ('pos_extractor', PosTagVectorizer()),
    ('vectorizer', DictVectorizer())
])))
transformers.append(('sentiment', SentimentExtractor()))
train['Tweet'] = transform_all(train['Tweet'])
X_train = train[['Tweet', 'Target']]
y_train = train['Stance']
ngram_transformer = NGramVectorizer(ngram_range_word=(1,3), ngram_range_char=(2, 5))
ngram_transformer.fit_selector(X_train, y_train)
transformers.append(('trigram', ngram_transformer))

feature_union = FeatureUnion(transformers)
trained_feature_extraction, trained_classifiers = train_test_all(train, test, feature_union, classify_pipeline, classifiers, params_grid)
score, predictinos = test_all(test, trained_feature_extraction, trained_classifiers)
train_test_single(train, test, feature_union, classify_pipeline, classifiers, params_grid)

The best model for Atheism is GradientBoosting with {'memory': None, 'steps': [('classifier', GradientBoostingClassifier())], 'verbose': False, 'classifier': GradientBoostingClassifier(), 'classifier__ccp_alpha': 0.0, 'classifier__criterion': 'friedman_mse', 'classifier__init': None, 'classifier__learning_rate': 0.1, 'classifier__loss': 'log_loss', 'classifier__max_depth': 3, 'classifier__max_features': None, 'classifier__max_leaf_nodes': None, 'classifier__min_impurity_decrease': 0.0, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__min_weight_fraction_leaf': 0.0, 'classifier__n_estimators': 100, 'classifier__n_iter_no_change': None, 'classifier__random_state': None, 'classifier__subsample': 1.0, 'classifier__tol': 0.0001, 'classifier__validation_fraction': 0.1, 'classifier__verbose': 0, 'classifier__warm_start': False}
The best classifier for Atheism is GradientBoosting with the average of F1 as 0.6508683825068006.
The best model for Climate Change 

In [None]:
# tfidf + target + sentiment
transformers = []
transformers.append(('tfidf', ModifiedTfidfVectorizer(max_feature=500)))
# transformers.append(('pos_tag', Pipeline([
#     ('pos_extractor', PosTagVectorizer()),
#     ('vectorizer', DictVectorizer())
# ])))
transformers.append(('target_presence', TargetPresence(target_words)))
transformers.append(('sentiment', SentimentExtractor()))
# train['Tweet'] = transform_all(train['Tweet'])
# X_train = train[['Tweet', 'Target']]
# y_train = train['Stance']
# ngram_transformer = NGramVectorizer(ngram_range_word=(1,3), ngram_range_char=(2, 5))
# ngram_transformer.fit_selector(X_train, y_train)
# transformers.append(('trigram', ngram_transformer))

feature_union = FeatureUnion(transformers)
trained_feature_extraction, trained_classifiers = train_test_all(train, test, feature_union, classify_pipeline, classifiers, params_grid)
score, predictinos = test_all(test, trained_feature_extraction, trained_classifiers)
train_test_single(train, test, feature_union, classify_pipeline, classifiers, params_grid)

The best model for Atheism is GradientBoosting with {'memory': None, 'steps': [('classifier', GradientBoostingClassifier(max_depth=5, n_estimators=25))], 'verbose': False, 'classifier': GradientBoostingClassifier(max_depth=5, n_estimators=25), 'classifier__ccp_alpha': 0.0, 'classifier__criterion': 'friedman_mse', 'classifier__init': None, 'classifier__learning_rate': 0.1, 'classifier__loss': 'log_loss', 'classifier__max_depth': 5, 'classifier__max_features': None, 'classifier__max_leaf_nodes': None, 'classifier__min_impurity_decrease': 0.0, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__min_weight_fraction_leaf': 0.0, 'classifier__n_estimators': 25, 'classifier__n_iter_no_change': None, 'classifier__random_state': None, 'classifier__subsample': 1.0, 'classifier__tol': 0.0001, 'classifier__validation_fraction': 0.1, 'classifier__verbose': 0, 'classifier__warm_start': False}
The best classifier for Atheism is GradientBoosting with the average of F1 as

In [None]:
# tfidf + target + sentiment + ngram
transformers = []
transformers.append(('tfidf', ModifiedTfidfVectorizer(max_feature=500)))
# transformers.append(('pos_tag', Pipeline([
#     ('pos_extractor', PosTagVectorizer()),
#     ('vectorizer', DictVectorizer())
# ])))
transformers.append(('target_presence', TargetPresence(target_words)))
transformers.append(('sentiment', SentimentExtractor()))
train['Tweet'] = transform_all(train['Tweet'])
X_train = train[['Tweet', 'Target']]
y_train = train['Stance']
ngram_transformer = NGramVectorizer(ngram_range_word=(1,3), ngram_range_char=(2, 5))
ngram_transformer.fit_selector(X_train, y_train)
transformers.append(('trigram', ngram_transformer))

feature_union = FeatureUnion(transformers)
trained_feature_extraction, trained_classifiers = train_test_all(train, test, feature_union, classify_pipeline, classifiers, params_grid)
score, predictinos = test_all(test, trained_feature_extraction, trained_classifiers)
train_test_single(train, test, feature_union, classify_pipeline, classifiers, params_grid)

The best model for Atheism is GradientBoosting with {'memory': None, 'steps': [('classifier', GradientBoostingClassifier(max_depth=4))], 'verbose': False, 'classifier': GradientBoostingClassifier(max_depth=4), 'classifier__ccp_alpha': 0.0, 'classifier__criterion': 'friedman_mse', 'classifier__init': None, 'classifier__learning_rate': 0.1, 'classifier__loss': 'log_loss', 'classifier__max_depth': 4, 'classifier__max_features': None, 'classifier__max_leaf_nodes': None, 'classifier__min_impurity_decrease': 0.0, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__min_weight_fraction_leaf': 0.0, 'classifier__n_estimators': 100, 'classifier__n_iter_no_change': None, 'classifier__random_state': None, 'classifier__subsample': 1.0, 'classifier__tol': 0.0001, 'classifier__validation_fraction': 0.1, 'classifier__verbose': 0, 'classifier__warm_start': False}
The best classifier for Atheism is GradientBoosting with the average of F1 as 0.5911931818181818.
The best mod

In [None]:
# tfidf + target + pos_tag
transformers = []
transformers.append(('tfidf', ModifiedTfidfVectorizer(max_feature=500)))
transformers.append(('pos_tag', Pipeline([
    ('pos_extractor', PosTagVectorizer()),
    ('vectorizer', DictVectorizer())
])))
transformers.append(('target_presence', TargetPresence(target_words)))
# transformers.append(('sentiment', SentimentExtractor()))
# train['Tweet'] = transform_all(train['Tweet'])
# X_train = train[['Tweet', 'Target']]
# y_train = train['Stance']
# ngram_transformer = NGramVectorizer(ngram_range_word=(1,3), ngram_range_char=(2, 5))
# ngram_transformer.fit_selector(X_train, y_train)
# transformers.append(('trigram', ngram_transformer))

feature_union = FeatureUnion(transformers)
trained_feature_extraction, trained_classifiers = train_test_all(train, test, feature_union, classify_pipeline, classifiers, params_grid)
score, predictinos = test_all(test, trained_feature_extraction, trained_classifiers)
train_test_single(train, test, feature_union, classify_pipeline, classifiers, params_grid)

The best model for Atheism is GradientBoosting with {'memory': None, 'steps': [('classifier', GradientBoostingClassifier(max_depth=4))], 'verbose': False, 'classifier': GradientBoostingClassifier(max_depth=4), 'classifier__ccp_alpha': 0.0, 'classifier__criterion': 'friedman_mse', 'classifier__init': None, 'classifier__learning_rate': 0.1, 'classifier__loss': 'log_loss', 'classifier__max_depth': 4, 'classifier__max_features': None, 'classifier__max_leaf_nodes': None, 'classifier__min_impurity_decrease': 0.0, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__min_weight_fraction_leaf': 0.0, 'classifier__n_estimators': 100, 'classifier__n_iter_no_change': None, 'classifier__random_state': None, 'classifier__subsample': 1.0, 'classifier__tol': 0.0001, 'classifier__validation_fraction': 0.1, 'classifier__verbose': 0, 'classifier__warm_start': False}
The best classifier for Atheism is GradientBoosting with the average of F1 as 0.6560134566862911.
The best mod

In [None]:
# tfidf + target + pos_tag + ngram
transformers = []
transformers.append(('tfidf', ModifiedTfidfVectorizer(max_feature=500)))
transformers.append(('pos_tag', Pipeline([
    ('pos_extractor', PosTagVectorizer()),
    ('vectorizer', DictVectorizer())
])))
transformers.append(('target_presence', TargetPresence(target_words)))
# transformers.append(('sentiment', SentimentExtractor()))
train['Tweet'] = transform_all(train['Tweet'])
X_train = train[['Tweet', 'Target']]
y_train = train['Stance']
ngram_transformer = NGramVectorizer(ngram_range_word=(1,3), ngram_range_char=(2, 5))
ngram_transformer.fit_selector(X_train, y_train)
transformers.append(('trigram', ngram_transformer))

feature_union = FeatureUnion(transformers)
trained_feature_extraction, trained_classifiers = train_test_all(train, test, feature_union, classify_pipeline, classifiers, params_grid)
score, predictinos = test_all(test, trained_feature_extraction, trained_classifiers)
train_test_single(train, test, feature_union, classify_pipeline, classifiers, params_grid)

The best model for Atheism is GradientBoosting with {'memory': None, 'steps': [('classifier', GradientBoostingClassifier(max_depth=2, n_estimators=50))], 'verbose': False, 'classifier': GradientBoostingClassifier(max_depth=2, n_estimators=50), 'classifier__ccp_alpha': 0.0, 'classifier__criterion': 'friedman_mse', 'classifier__init': None, 'classifier__learning_rate': 0.1, 'classifier__loss': 'log_loss', 'classifier__max_depth': 2, 'classifier__max_features': None, 'classifier__max_leaf_nodes': None, 'classifier__min_impurity_decrease': 0.0, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__min_weight_fraction_leaf': 0.0, 'classifier__n_estimators': 50, 'classifier__n_iter_no_change': None, 'classifier__random_state': None, 'classifier__subsample': 1.0, 'classifier__tol': 0.0001, 'classifier__validation_fraction': 0.1, 'classifier__verbose': 0, 'classifier__warm_start': False}
The best classifier for Atheism is GradientBoosting with the average of F1 as

In [None]:
# tfidf + target + pos_tag + sentiment
transformers = []
transformers.append(('tfidf', ModifiedTfidfVectorizer(max_feature=500)))
transformers.append(('pos_tag', Pipeline([
    ('pos_extractor', PosTagVectorizer()),
    ('vectorizer', DictVectorizer())
])))
transformers.append(('target_presence', TargetPresence(target_words)))
transformers.append(('sentiment', SentimentExtractor()))
# train['Tweet'] = transform_all(train['Tweet'])
# X_train = train[['Tweet', 'Target']]
# y_train = train['Stance']
# ngram_transformer = NGramVectorizer(ngram_range_word=(1,3), ngram_range_char=(2, 5))
# ngram_transformer.fit_selector(X_train, y_train)
# transformers.append(('trigram', ngram_transformer))

feature_union = FeatureUnion(transformers)
trained_feature_extraction, trained_classifiers = train_test_all(train, test, feature_union, classify_pipeline, classifiers, params_grid)
score, predictinos = test_all(test, trained_feature_extraction, trained_classifiers)
train_test_single(train, test, feature_union, classify_pipeline, classifiers, params_grid)

The best model for Atheism is GradientBoosting with {'memory': None, 'steps': [('classifier', GradientBoostingClassifier(max_depth=4))], 'verbose': False, 'classifier': GradientBoostingClassifier(max_depth=4), 'classifier__ccp_alpha': 0.0, 'classifier__criterion': 'friedman_mse', 'classifier__init': None, 'classifier__learning_rate': 0.1, 'classifier__loss': 'log_loss', 'classifier__max_depth': 4, 'classifier__max_features': None, 'classifier__max_leaf_nodes': None, 'classifier__min_impurity_decrease': 0.0, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__min_weight_fraction_leaf': 0.0, 'classifier__n_estimators': 100, 'classifier__n_iter_no_change': None, 'classifier__random_state': None, 'classifier__subsample': 1.0, 'classifier__tol': 0.0001, 'classifier__validation_fraction': 0.1, 'classifier__verbose': 0, 'classifier__warm_start': False}
The best classifier for Atheism is GradientBoosting with the average of F1 as 0.6012405382674517.
The best mod

In [None]:
# tfidf + target + pos_tag + sentiment + ngram
transformers = []
transformers.append(('tfidf', ModifiedTfidfVectorizer(max_feature=500)))
transformers.append(('pos_tag', Pipeline([
    ('pos_extractor', PosTagVectorizer()),
    ('vectorizer', DictVectorizer())
])))
transformers.append(('target_presence', TargetPresence(target_words)))
transformers.append(('sentiment', SentimentExtractor()))
train['Tweet'] = transform_all(train['Tweet'])
X_train = train[['Tweet', 'Target']]
y_train = train['Stance']
ngram_transformer = NGramVectorizer(ngram_range_word=(1,3), ngram_range_char=(2, 5))
ngram_transformer.fit_selector(X_train, y_train)
transformers.append(('trigram', ngram_transformer))

feature_union = FeatureUnion(transformers)
trained_feature_extraction, trained_classifiers = train_test_all(train, test, feature_union, classify_pipeline, classifiers, params_grid)
score, predictinos = test_all(test, trained_feature_extraction, trained_classifiers)
train_test_single(train, test, feature_union, classify_pipeline, classifiers, params_grid)

The best model for Atheism is GradientBoosting with {'memory': None, 'steps': [('classifier', GradientBoostingClassifier())], 'verbose': False, 'classifier': GradientBoostingClassifier(), 'classifier__ccp_alpha': 0.0, 'classifier__criterion': 'friedman_mse', 'classifier__init': None, 'classifier__learning_rate': 0.1, 'classifier__loss': 'log_loss', 'classifier__max_depth': 3, 'classifier__max_features': None, 'classifier__max_leaf_nodes': None, 'classifier__min_impurity_decrease': 0.0, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__min_weight_fraction_leaf': 0.0, 'classifier__n_estimators': 100, 'classifier__n_iter_no_change': None, 'classifier__random_state': None, 'classifier__subsample': 1.0, 'classifier__tol': 0.0001, 'classifier__validation_fraction': 0.1, 'classifier__verbose': 0, 'classifier__warm_start': False}
The best classifier for Atheism is GradientBoosting with the average of F1 as 0.6493026204564667.
The best model for Climate Change 

In [None]:
# word2vec era!!!

In [None]:
# word2vec + bigram
transformers = []
word2vec_model = train_word2vec(train)
word2vec_vectorizer = Word2VecVectorizer(word2vec_model)
transformers.append(('word2vec', word2vec_vectorizer))

# transformers.append(('pos_tag', Pipeline([
#     ('pos_extractor', PosTagVectorizer()),
#     ('vectorizer', DictVectorizer())
# ])))

# transformers.append(('target_presence', TargetPresence(target_words)))

# transformers.append(('sentiment', SentimentExtractor()))

train['Tweet'] = transform_all(train['Tweet'])
X_train = train[['Tweet', 'Target']]
y_train = train['Stance']

bigram_transformer = NGramVectorizer(ngram_range_word=(2,2))
bigram_transformer.fit_selector(X_train, y_train)
transformers.append(('bigram', bigram_transformer))

# trigram_transformer = NGramVectorizer(ngram_range_word=(3,3))
# trigram_transformer.fit_selector(X_train, y_train)
# transformers.append(('trigram', trigram_transformer))

# ngram_transformer = NGramVectorizer(ngram_range_word=(1,3), ngram_range_char=(2, 5))
# ngram_transformer.fit_selector(X_train, y_train)
# transformers.append(('ngram', ngram_transformer))

feature_union = FeatureUnion(transformers)
trained_feature_extraction, trained_classifiers = train_test_all(train, test, feature_union, classify_pipeline, classifiers, params_grid)
score, predictinos = test_all(test, trained_feature_extraction, trained_classifiers)
train_test_single(train, test, feature_union, classify_pipeline, classifiers, params_grid)

The best model for Atheism is LogisticRegression with {'memory': None, 'steps': [('classifier', LogisticRegression(C=1, class_weight='balanced', fit_intercept=False,
                   solver='liblinear'))], 'verbose': False, 'classifier': LogisticRegression(C=1, class_weight='balanced', fit_intercept=False,
                   solver='liblinear'), 'classifier__C': 1, 'classifier__class_weight': 'balanced', 'classifier__dual': False, 'classifier__fit_intercept': False, 'classifier__intercept_scaling': 1, 'classifier__l1_ratio': None, 'classifier__max_iter': 100, 'classifier__multi_class': 'auto', 'classifier__n_jobs': None, 'classifier__penalty': 'l2', 'classifier__random_state': None, 'classifier__solver': 'liblinear', 'classifier__tol': 0.0001, 'classifier__verbose': 0, 'classifier__warm_start': False}
The best classifier for Atheism is LogisticRegression with the average of F1 as 0.49742562929061784.
The best model for Climate Change is a Real Concern is KNN with {'memory': None, 'st

In [None]:
# word2vec + trigram
transformers = []
word2vec_model = train_word2vec(train)
word2vec_vectorizer = Word2VecVectorizer(word2vec_model)
transformers.append(('word2vec', word2vec_vectorizer))

# transformers.append(('pos_tag', Pipeline([
#     ('pos_extractor', PosTagVectorizer()),
#     ('vectorizer', DictVectorizer())
# ])))

# transformers.append(('target_presence', TargetPresence(target_words)))

# transformers.append(('sentiment', SentimentExtractor()))

train['Tweet'] = transform_all(train['Tweet'])
X_train = train[['Tweet', 'Target']]
y_train = train['Stance']

# bigram_transformer = NGramVectorizer(ngram_range_word=(2,2))
# bigram_transformer.fit_selector(X_train, y_train)
# transformers.append(('bigram', bigram_transformer))

trigram_transformer = NGramVectorizer(ngram_range_word=(3,3))
trigram_transformer.fit_selector(X_train, y_train)
transformers.append(('trigram', trigram_transformer))

# ngram_transformer = NGramVectorizer(ngram_range_word=(1,3), ngram_range_char=(2, 5))
# ngram_transformer.fit_selector(X_train, y_train)
# transformers.append(('ngram', ngram_transformer))

feature_union = FeatureUnion(transformers)
trained_feature_extraction, trained_classifiers = train_test_all(train, test, feature_union, classify_pipeline, classifiers, params_grid)
score, predictinos = test_all(test, trained_feature_extraction, trained_classifiers)
train_test_single(train, test, feature_union, classify_pipeline, classifiers, params_grid)

The best model for Atheism is LogisticRegression with {'memory': None, 'steps': [('classifier', LogisticRegression(C=1))], 'verbose': False, 'classifier': LogisticRegression(C=1), 'classifier__C': 1, 'classifier__class_weight': None, 'classifier__dual': False, 'classifier__fit_intercept': True, 'classifier__intercept_scaling': 1, 'classifier__l1_ratio': None, 'classifier__max_iter': 100, 'classifier__multi_class': 'auto', 'classifier__n_jobs': None, 'classifier__penalty': 'l2', 'classifier__random_state': None, 'classifier__solver': 'lbfgs', 'classifier__tol': 0.0001, 'classifier__verbose': 0, 'classifier__warm_start': False}
The best classifier for Atheism is LogisticRegression with the average of F1 as 0.4516363636363636.
The best model for Climate Change is a Real Concern is RandomForest with {'memory': None, 'steps': [('classifier', RandomForestClassifier(max_depth=6, n_estimators=50))], 'verbose': False, 'classifier': RandomForestClassifier(max_depth=6, n_estimators=50), 'classifi

In [None]:
# word2vec + ngram
transformers = []
# word2vec_model = train_word2vec(train)
# word2vec_vectorizer = Word2VecVectorizer(word2vec_model)
transformers.append(('word2vec', word2vec_vectorizer))

# transformers.append(('pos_tag', Pipeline([
#     ('pos_extractor', PosTagVectorizer()),
#     ('vectorizer', DictVectorizer())
# ])))

# transformers.append(('target_presence', TargetPresence(target_words)))

# transformers.append(('sentiment', SentimentExtractor()))

train['Tweet'] = transform_all(train['Tweet'])
X_train = train[['Tweet', 'Target']]
y_train = train['Stance']

# bigram_transformer = NGramVectorizer(ngram_range_word=(2,2))
# bigram_transformer.fit_selector(X_train, y_train)
# transformers.append(('bigram', bigram_transformer))

# trigram_transformer = NGramVectorizer(ngram_range_word=(3,3))
# trigram_transformer.fit_selector(X_train, y_train)
# transformers.append(('trigram', trigram_transformer))

ngram_transformer = NGramVectorizer(ngram_range_word=(1,3), ngram_range_char=(2, 5))
ngram_transformer.fit_selector(X_train, y_train)
transformers.append(('ngram', ngram_transformer))

feature_union = FeatureUnion(transformers)
classify_pipeline = Pipeline([('classifier', None)])

start = time.time()
trained_feature_extraction, trained_classifiers = train_test_all(train, test, feature_union, classify_pipeline, classifiers, params_grid)
end = time.time()
print(end-start, "for train_test_all.")

start = time.time()
score, predictinos = test_all(test, trained_feature_extraction, trained_classifiers)
end = time.time()
print(end-start, "for test_all.")
# train_test_single(train, test, feature_union, classify_pipeline, classifiers, params_grid)

The best model for Atheism is SVM with {'memory': None, 'steps': [('classifier', SVC(gamma=0.1, kernel='linear'))], 'verbose': False, 'classifier': SVC(gamma=0.1, kernel='linear'), 'classifier__C': 1.0, 'classifier__break_ties': False, 'classifier__cache_size': 200, 'classifier__class_weight': None, 'classifier__coef0': 0.0, 'classifier__decision_function_shape': 'ovr', 'classifier__degree': 3, 'classifier__gamma': 0.1, 'classifier__kernel': 'linear', 'classifier__max_iter': -1, 'classifier__probability': False, 'classifier__random_state': None, 'classifier__shrinking': True, 'classifier__tol': 0.001, 'classifier__verbose': False}
The best classifier for Atheism is SVM with the average of F1 as 0.6652046783625731.
The best model for Climate Change is a Real Concern is RandomForest with {'memory': None, 'steps': [('classifier', RandomForestClassifier(max_depth=7, n_estimators=50))], 'verbose': False, 'classifier': RandomForestClassifier(max_depth=7, n_estimators=50), 'classifier__bootst

In [None]:
# word2vec + sentiment
transformers = []
# word2vec_model = train_word2vec(train)
# word2vec_vectorizer = Word2VecVectorizer(word2vec_model)
transformers.append(('word2vec', word2vec_vectorizer))

# transformers.append(('pos_tag', Pipeline([
#     ('pos_extractor', PosTagVectorizer()),
#     ('vectorizer', DictVectorizer())
# ])))

# transformers.append(('target_presence', TargetPresence(target_words)))

transformers.append(('sentiment', SentimentExtractor()))

# train['Tweet'] = transform_all(train['Tweet'])
# X_train = train[['Tweet', 'Target']]
# y_train = train['Stance']

# bigram_transformer = NGramVectorizer(ngram_range_word=(2,2))
# bigram_transformer.fit_selector(X_train, y_train)
# transformers.append(('bigram', bigram_transformer))

# trigram_transformer = NGramVectorizer(ngram_range_word=(3,3))
# trigram_transformer.fit_selector(X_train, y_train)
# transformers.append(('trigram', trigram_transformer))

# ngram_transformer = NGramVectorizer(ngram_range_word=(1,3), ngram_range_char=(2, 5))
# ngram_transformer.fit_selector(X_train, y_train)
# transformers.append(('ngram', ngram_transformer))

feature_union = FeatureUnion(transformers)
classify_pipeline = Pipeline([('classifier', None)])

start = time.time()
trained_feature_extraction, trained_classifiers = train_test_all(train, test, feature_union, classify_pipeline, classifiers, params_grid)
end = time.time()
print(end-start, "for train_test_all.")

start = time.time()
score, predictinos = test_all(test, trained_feature_extraction, trained_classifiers)
end = time.time()
print(end-start, "for test_all.")
# train_test_single(train, test, feature_union, classify_pipeline, classifiers, params_grid)

The best model for Atheism is GradientBoosting with {'memory': None, 'steps': [('classifier', GradientBoostingClassifier(max_depth=4, n_estimators=10))], 'verbose': False, 'classifier': GradientBoostingClassifier(max_depth=4, n_estimators=10), 'classifier__ccp_alpha': 0.0, 'classifier__criterion': 'friedman_mse', 'classifier__init': None, 'classifier__learning_rate': 0.1, 'classifier__loss': 'log_loss', 'classifier__max_depth': 4, 'classifier__max_features': None, 'classifier__max_leaf_nodes': None, 'classifier__min_impurity_decrease': 0.0, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__min_weight_fraction_leaf': 0.0, 'classifier__n_estimators': 10, 'classifier__n_iter_no_change': None, 'classifier__random_state': None, 'classifier__subsample': 1.0, 'classifier__tol': 0.0001, 'classifier__validation_fraction': 0.1, 'classifier__verbose': 0, 'classifier__warm_start': False}
The best classifier for Atheism is GradientBoosting with the average of F1 as

In [None]:
# word2vec + pos_tag
transformers = []
word2vec_model = train_word2vec(train)
word2vec_vectorizer = Word2VecVectorizer(word2vec_model)
transformers.append(('word2vec', word2vec_vectorizer))

transformers.append(('pos_tag', Pipeline([
    ('pos_extractor', PosTagVectorizer()),
    ('vectorizer', DictVectorizer())
])))

# transformers.append(('target_presence', TargetPresence(target_words)))

# transformers.append(('sentiment', SentimentExtractor()))

# train['Tweet'] = transform_all(train['Tweet'])
# X_train = train[['Tweet', 'Target']]
# y_train = train['Stance']

# bigram_transformer = NGramVectorizer(ngram_range_word=(2,2))
# bigram_transformer.fit_selector(X_train, y_train)
# transformers.append(('bigram', bigram_transformer))

# trigram_transformer = NGramVectorizer(ngram_range_word=(3,3))
# trigram_transformer.fit_selector(X_train, y_train)
# transformers.append(('trigram', trigram_transformer))

# ngram_transformer = NGramVectorizer(ngram_range_word=(1,3), ngram_range_char=(2, 5))
# ngram_transformer.fit_selector(X_train, y_train)
# transformers.append(('ngram', ngram_transformer))

feature_union = FeatureUnion(transformers)
classify_pipeline = Pipeline([('classifier', None)])

start = time.time()
trained_feature_extraction, trained_classifiers = train_test_all(train, test, feature_union, classify_pipeline, classifiers, params_grid)
end = time.time()
print(end-start, "for train_test_all.")

start = time.time()
score, predictinos = test_all(test, trained_feature_extraction, trained_classifiers)
end = time.time()
print(end-start, "for test_all.")
# train_test_single(train, test, feature_union, classify_pipeline, classifiers, params_grid)

The best model for Atheism is KNN with {'memory': None, 'steps': [('classifier', KNeighborsClassifier(n_neighbors=7))], 'verbose': False, 'classifier': KNeighborsClassifier(n_neighbors=7), 'classifier__algorithm': 'auto', 'classifier__leaf_size': 30, 'classifier__metric': 'minkowski', 'classifier__metric_params': None, 'classifier__n_jobs': None, 'classifier__n_neighbors': 7, 'classifier__p': 2, 'classifier__weights': 'uniform'}
The best classifier for Atheism is KNN with the average of F1 as 0.4448484848484848.
The best model for Climate Change is a Real Concern is LogisticRegression with {'memory': None, 'steps': [('classifier', LogisticRegression(C=0.01, solver='newton-cg'))], 'verbose': False, 'classifier': LogisticRegression(C=0.01, solver='newton-cg'), 'classifier__C': 0.01, 'classifier__class_weight': None, 'classifier__dual': False, 'classifier__fit_intercept': True, 'classifier__intercept_scaling': 1, 'classifier__l1_ratio': None, 'classifier__max_iter': 100, 'classifier__mult

In [None]:
# word2vec + target
transformers = []
# word2vec_model = train_word2vec(train)
# word2vec_vectorizer = Word2VecVectorizer(word2vec_model)
transformers.append(('word2vec', word2vec_vectorizer))

# transformers.append(('pos_tag', Pipeline([
#     ('pos_extractor', PosTagVectorizer()),
#     ('vectorizer', DictVectorizer())
# ])))

transformers.append(('target_presence', TargetPresence(target_words)))

# transformers.append(('sentiment', SentimentExtractor()))

# train['Tweet'] = transform_all(train['Tweet'])
# X_train = train[['Tweet', 'Target']]
# y_train = train['Stance']

# bigram_transformer = NGramVectorizer(ngram_range_word=(2,2))
# bigram_transformer.fit_selector(X_train, y_train)
# transformers.append(('bigram', bigram_transformer))

# trigram_transformer = NGramVectorizer(ngram_range_word=(3,3))
# trigram_transformer.fit_selector(X_train, y_train)
# transformers.append(('trigram', trigram_transformer))

# ngram_transformer = NGramVectorizer(ngram_range_word=(1,3), ngram_range_char=(2, 5))
# ngram_transformer.fit_selector(X_train, y_train)
# transformers.append(('ngram', ngram_transformer))

feature_union = FeatureUnion(transformers)
classify_pipeline = Pipeline([('classifier', None)])

start = time.time()
trained_feature_extraction, trained_classifiers = train_test_all(train, test, feature_union, classify_pipeline, classifiers, params_grid)
end = time.time()
print(end-start, "for train_test_all.")

start = time.time()
score, predictinos = test_all(test, trained_feature_extraction, trained_classifiers)
end = time.time()
print(end-start, "for test_all.")
# train_test_single(train, test, feature_union, classify_pipeline, classifiers, params_grid)

The best model for Atheism is GradientBoosting with {'memory': None, 'steps': [('classifier', GradientBoostingClassifier(n_estimators=25))], 'verbose': False, 'classifier': GradientBoostingClassifier(n_estimators=25), 'classifier__ccp_alpha': 0.0, 'classifier__criterion': 'friedman_mse', 'classifier__init': None, 'classifier__learning_rate': 0.1, 'classifier__loss': 'log_loss', 'classifier__max_depth': 3, 'classifier__max_features': None, 'classifier__max_leaf_nodes': None, 'classifier__min_impurity_decrease': 0.0, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__min_weight_fraction_leaf': 0.0, 'classifier__n_estimators': 25, 'classifier__n_iter_no_change': None, 'classifier__random_state': None, 'classifier__subsample': 1.0, 'classifier__tol': 0.0001, 'classifier__validation_fraction': 0.1, 'classifier__verbose': 0, 'classifier__warm_start': False}
The best classifier for Atheism is GradientBoosting with the average of F1 as 0.44034608378870677.
The 

In [None]:
# word2vec + sentiment + ngram
transformers = []
# word2vec_model = train_word2vec(train)
# word2vec_vectorizer = Word2VecVectorizer(word2vec_model)
transformers.append(('word2vec', word2vec_vectorizer))

# transformers.append(('pos_tag', Pipeline([
#     ('pos_extractor', PosTagVectorizer()),
#     ('vectorizer', DictVectorizer())
# ])))

# transformers.append(('target_presence', TargetPresence(target_words)))

transformers.append(('sentiment', SentimentExtractor()))

train['Tweet'] = transform_all(train['Tweet'])
X_train = train[['Tweet', 'Target']]
y_train = train['Stance']

# bigram_transformer = NGramVectorizer(ngram_range_word=(2,2))
# bigram_transformer.fit_selector(X_train, y_train)
# transformers.append(('bigram', bigram_transformer))

# trigram_transformer = NGramVectorizer(ngram_range_word=(3,3))
# trigram_transformer.fit_selector(X_train, y_train)
# transformers.append(('trigram', trigram_transformer))

ngram_transformer = NGramVectorizer(ngram_range_word=(1,3), ngram_range_char=(2, 5))
ngram_transformer.fit_selector(X_train, y_train)
transformers.append(('ngram', ngram_transformer))

feature_union = FeatureUnion(transformers)
classify_pipeline = Pipeline([('classifier', None)])

start = time.time()
trained_feature_extraction, trained_classifiers = train_test_all(train, test, feature_union, classify_pipeline, classifiers, params_grid)
end = time.time()
print(end-start, "for train_test_all.")

start = time.time()
score, predictinos = test_all(test, trained_feature_extraction, trained_classifiers)
end = time.time()
print(end-start, "for test_all.")0.6449
# train_test_single(train, test, feature_union, classify_pipeline, classifiers, params_grid)

The best model for Atheism is SVM with {'memory': None, 'steps': [('classifier', SVC(gamma=0.1, kernel='linear'))], 'verbose': False, 'classifier': SVC(gamma=0.1, kernel='linear'), 'classifier__C': 1.0, 'classifier__break_ties': False, 'classifier__cache_size': 200, 'classifier__class_weight': None, 'classifier__coef0': 0.0, 'classifier__decision_function_shape': 'ovr', 'classifier__degree': 3, 'classifier__gamma': 0.1, 'classifier__kernel': 'linear', 'classifier__max_iter': -1, 'classifier__probability': False, 'classifier__random_state': None, 'classifier__shrinking': True, 'classifier__tol': 0.001, 'classifier__verbose': False}
The best classifier for Atheism is SVM with the average of F1 as 0.6449843260188088.
The best model for Climate Change is a Real Concern is LogisticRegression with {'memory': None, 'steps': [('classifier', LogisticRegression(C=0.1, solver='newton-cg'))], 'verbose': False, 'classifier': LogisticRegression(C=0.1, solver='newton-cg'), 'classifier__C': 0.1, 'clas

In [None]:
# word2vec + target + ngram
transformers = []
# word2vec_model = train_word2vec(train)
# word2vec_vectorizer = Word2VecVectorizer(word2vec_model)
transformers.append(('word2vec', word2vec_vectorizer))

# transformers.append(('pos_tag', Pipeline([
#     ('pos_extractor', PosTagVectorizer()),
#     ('vectorizer', DictVectorizer())
# ])))

transformers.append(('target_presence', TargetPresence(target_words)))

# transformers.append(('sentiment', SentimentExtractor()))

# train['Tweet'] = transform_all(train['Tweet'])
# X_train = train[['Tweet', 'Target']]
# y_train = train['Stance']

# bigram_transformer = NGramVectorizer(ngram_range_word=(2,2))
# bigram_transformer.fit_selector(X_train, y_train)
# transformers.append(('bigram', bigram_transformer))

# trigram_transformer = NGramVectorizer(ngram_range_word=(3,3))
# trigram_transformer.fit_selector(X_train, y_train)
# transformers.append(('trigram', trigram_transformer))

ngram_transformer = NGramVectorizer(ngram_range_word=(1,3), ngram_range_char=(2, 5))
ngram_transformer.fit_selector(X_train, y_train)
transformers.append(('ngram', ngram_transformer))

feature_union = FeatureUnion(transformers)
classify_pipeline = Pipeline([('classifier', None)])

start = time.time()
trained_feature_extraction, trained_classifiers = train_test_all(train, test, feature_union, classify_pipeline, classifiers, params_grid)
end = time.time()
print(end-start, "for train_test_all.")

start = time.time()
score, predictinos = test_all(test, trained_feature_extraction, trained_classifiers)
end = time.time()
print(end-start, "for test_all.")
# train_test_single(train, test, feature_union, classify_pipeline, classifiers, params_grid)

The best model for Atheism is SVM with {'memory': None, 'steps': [('classifier', SVC(gamma=0.1, kernel='linear'))], 'verbose': False, 'classifier': SVC(gamma=0.1, kernel='linear'), 'classifier__C': 1.0, 'classifier__break_ties': False, 'classifier__cache_size': 200, 'classifier__class_weight': None, 'classifier__coef0': 0.0, 'classifier__decision_function_shape': 'ovr', 'classifier__degree': 3, 'classifier__gamma': 0.1, 'classifier__kernel': 'linear', 'classifier__max_iter': -1, 'classifier__probability': False, 'classifier__random_state': None, 'classifier__shrinking': True, 'classifier__tol': 0.001, 'classifier__verbose': False}
The best classifier for Atheism is SVM with the average of F1 as 0.6601796407185628.
The best model for Climate Change is a Real Concern is KNN with {'memory': None, 'steps': [('classifier', KNeighborsClassifier(n_neighbors=6))], 'verbose': False, 'classifier': KNeighborsClassifier(n_neighbors=6), 'classifier__algorithm': 'auto', 'classifier__leaf_size': 30, 

In [None]:
# word2vec + pos_tag + ngram
transformers = []
word2vec_model = train_word2vec(train)
word2vec_vectorizer = Word2VecVectorizer(word2vec_model)
transformers.append(('word2vec', word2vec_vectorizer))

transformers.append(('pos_tag', Pipeline([
    ('pos_extractor', PosTagVectorizer()),
    ('vectorizer', DictVectorizer())
])))

# transformers.append(('target_presence', TargetPresence(target_words)))

# transformers.append(('sentiment', SentimentExtractor()))

train['Tweet'] = transform_all(train['Tweet'])
X_train = train[['Tweet', 'Target']]
y_train = train['Stance']

# bigram_transformer = NGramVectorizer(ngram_range_word=(2,2))
# bigram_transformer.fit_selector(X_train, y_train)
# transformers.append(('bigram', bigram_transformer))

# trigram_transformer = NGramVectorizer(ngram_range_word=(3,3))
# trigram_transformer.fit_selector(X_train, y_train)
# transformers.append(('trigram', trigram_transformer))

ngram_transformer = NGramVectorizer(ngram_range_word=(1,3), ngram_range_char=(2, 5))
ngram_transformer.fit_selector(X_train, y_train)
transformers.append(('ngram', ngram_transformer))

feature_union = FeatureUnion(transformers)
classify_pipeline = Pipeline([('classifier', None)])

start = time.time()
trained_feature_extraction, trained_classifiers = train_test_all(train, test, feature_union, classify_pipeline, classifiers, params_grid)
end = time.time()
print(end-start, "for train_test_all.")

start = time.time()
score, predictinos = test_all(test, trained_feature_extraction, trained_classifiers)
end = time.time()
print(end-start, "for test_all.")
# train_test_single(train, test, feature_union, classify_pipeline, classifiers, params_grid)

The best model for Atheism is GradientBoosting with {'memory': None, 'steps': [('classifier', GradientBoostingClassifier(max_depth=2, n_estimators=50))], 'verbose': False, 'classifier': GradientBoostingClassifier(max_depth=2, n_estimators=50), 'classifier__ccp_alpha': 0.0, 'classifier__criterion': 'friedman_mse', 'classifier__init': None, 'classifier__learning_rate': 0.1, 'classifier__loss': 'log_loss', 'classifier__max_depth': 2, 'classifier__max_features': None, 'classifier__max_leaf_nodes': None, 'classifier__min_impurity_decrease': 0.0, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__min_weight_fraction_leaf': 0.0, 'classifier__n_estimators': 50, 'classifier__n_iter_no_change': None, 'classifier__random_state': None, 'classifier__subsample': 1.0, 'classifier__tol': 0.0001, 'classifier__validation_fraction': 0.1, 'classifier__verbose': 0, 'classifier__warm_start': False}
The best classifier for Atheism is GradientBoosting with the average of F1 as

In [None]:
# word2vec + sentiment + POS_TAG
transformers = []
# word2vec_model = train_word2vec(train)
# word2vec_vectorizer = Word2VecVectorizer(word2vec_model)
transformers.append(('word2vec', word2vec_vectorizer))

transformers.append(('pos_tag', Pipeline([
    ('pos_extractor', PosTagVectorizer()),
    ('vectorizer', DictVectorizer())
])))

# transformers.append(('target_presence', TargetPresence(target_words)))

transformers.append(('sentiment', SentimentExtractor()))

# train['Tweet'] = transform_all(train['Tweet'])
# X_train = train[['Tweet', 'Target']]
# y_train = train['Stance']

# bigram_transformer = NGramVectorizer(ngram_range_word=(2,2))
# bigram_transformer.fit_selector(X_train, y_train)
# transformers.append(('bigram', bigram_transformer))

# trigram_transformer = NGramVectorizer(ngram_range_word=(3,3))
# trigram_transformer.fit_selector(X_train, y_train)
# transformers.append(('trigram', trigram_transformer))

# ngram_transformer = NGramVectorizer(ngram_range_word=(1,3), ngram_range_char=(2, 5))
# ngram_transformer.fit_selector(X_train, y_train)
# transformers.append(('ngram', ngram_transformer))

feature_union = FeatureUnion(transformers)
classify_pipeline = Pipeline([('classifier', None)])

start = time.time()
trained_feature_extraction, trained_classifiers = train_test_all(train, test, feature_union, classify_pipeline, classifiers, params_grid)
end = time.time()
print(end-start, "for train_test_all.")

start = time.time()
score, predictinos = test_all(test, trained_feature_extraction, trained_classifiers)
end = time.time()
print(end-start, "for test_all.")
# train_test_single(train, test, feature_union, classify_pipeline, classifiers, params_grid)

The best model for Atheism is GradientBoosting with {'memory': None, 'steps': [('classifier', GradientBoostingClassifier(max_depth=4, n_estimators=10))], 'verbose': False, 'classifier': GradientBoostingClassifier(max_depth=4, n_estimators=10), 'classifier__ccp_alpha': 0.0, 'classifier__criterion': 'friedman_mse', 'classifier__init': None, 'classifier__learning_rate': 0.1, 'classifier__loss': 'log_loss', 'classifier__max_depth': 4, 'classifier__max_features': None, 'classifier__max_leaf_nodes': None, 'classifier__min_impurity_decrease': 0.0, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__min_weight_fraction_leaf': 0.0, 'classifier__n_estimators': 10, 'classifier__n_iter_no_change': None, 'classifier__random_state': None, 'classifier__subsample': 1.0, 'classifier__tol': 0.0001, 'classifier__validation_fraction': 0.1, 'classifier__verbose': 0, 'classifier__warm_start': False}
The best classifier for Atheism is GradientBoosting with the average of F1 as

In [None]:
# word2vec + sentiment + pos_tag + ngram
transformers = []
# word2vec_model = train_word2vec(train)
# word2vec_vectorizer = Word2VecVectorizer(word2vec_model)
transformers.append(('word2vec', word2vec_vectorizer))

transformers.append(('pos_tag', Pipeline([
    ('pos_extractor', PosTagVectorizer()),
    ('vectorizer', DictVectorizer())
])))

# transformers.append(('target_presence', TargetPresence(target_words)))

transformers.append(('sentiment', SentimentExtractor()))

train['Tweet'] = transform_all(train['Tweet'])
X_train = train[['Tweet', 'Target']]
y_train = train['Stance']

# bigram_transformer = NGramVectorizer(ngram_range_word=(2,2))
# bigram_transformer.fit_selector(X_train, y_train)
# transformers.append(('bigram', bigram_transformer))

# trigram_transformer = NGramVectorizer(ngram_range_word=(3,3))
# trigram_transformer.fit_selector(X_train, y_train)
# transformers.append(('trigram', trigram_transformer))

ngram_transformer = NGramVectorizer(ngram_range_word=(1,3), ngram_range_char=(2, 5))
ngram_transformer.fit_selector(X_train, y_train)
transformers.append(('ngram', ngram_transformer))

feature_union = FeatureUnion(transformers)
classify_pipeline = Pipeline([('classifier', None)])

start = time.time()
trained_feature_extraction, trained_classifiers = train_test_all(train, test, feature_union, classify_pipeline, classifiers, params_grid)
end = time.time()
print(end-start, "for train_test_all.")

start = time.time()
score, predictinos = test_all(test, trained_feature_extraction, trained_classifiers)
end = time.time()
print(end-start, "for test_all.")
# train_test_single(train, test, feature_union, classify_pipeline, classifiers, params_grid)

The best model for Atheism is GradientBoosting with {'memory': None, 'steps': [('classifier', GradientBoostingClassifier(max_depth=2, n_estimators=50))], 'verbose': False, 'classifier': GradientBoostingClassifier(max_depth=2, n_estimators=50), 'classifier__ccp_alpha': 0.0, 'classifier__criterion': 'friedman_mse', 'classifier__init': None, 'classifier__learning_rate': 0.1, 'classifier__loss': 'log_loss', 'classifier__max_depth': 2, 'classifier__max_features': None, 'classifier__max_leaf_nodes': None, 'classifier__min_impurity_decrease': 0.0, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__min_weight_fraction_leaf': 0.0, 'classifier__n_estimators': 50, 'classifier__n_iter_no_change': None, 'classifier__random_state': None, 'classifier__subsample': 1.0, 'classifier__tol': 0.0001, 'classifier__validation_fraction': 0.1, 'classifier__verbose': 0, 'classifier__warm_start': False}
The best classifier for Atheism is GradientBoosting with the average of F1 as

In [None]:
# word2vec + sentiment + target
transformers = []
# word2vec_model = train_word2vec(train)
# word2vec_vectorizer = Word2VecVectorizer(word2vec_model)
transformers.append(('word2vec', word2vec_vectorizer))

# transformers.append(('pos_tag', Pipeline([
#     ('pos_extractor', PosTagVectorizer()),
#     ('vectorizer', DictVectorizer())
# ])))

transformers.append(('target_presence', TargetPresence(target_words)))

transformers.append(('sentiment', SentimentExtractor()))

# train['Tweet'] = transform_all(train['Tweet'])
# X_train = train[['Tweet', 'Target']]
# y_train = train['Stance']

# bigram_transformer = NGramVectorizer(ngram_range_word=(2,2))
# bigram_transformer.fit_selector(X_train, y_train)
# transformers.append(('bigram', bigram_transformer))

# trigram_transformer = NGramVectorizer(ngram_range_word=(3,3))
# trigram_transformer.fit_selector(X_train, y_train)
# transformers.append(('trigram', trigram_transformer))

# ngram_transformer = NGramVectorizer(ngram_range_word=(1,3), ngram_range_char=(2, 5))
# ngram_transformer.fit_selector(X_train, y_train)
# transformers.append(('ngram', ngram_transformer))

feature_union = FeatureUnion(transformers)
classify_pipeline = Pipeline([('classifier', None)])

start = time.time()
trained_feature_extraction, trained_classifiers = train_test_all(train, test, feature_union, classify_pipeline, classifiers, params_grid)
end = time.time()
print(end-start, "for train_test_all.")

start = time.time()
score, predictinos = test_all(test, trained_feature_extraction, trained_classifiers)
end = time.time()
print(end-start, "for test_all.")
# train_test_single(train, test, feature_union, classify_pipeline, classifiers, params_grid)

The best model for Atheism is GradientBoosting with {'memory': None, 'steps': [('classifier', GradientBoostingClassifier(max_depth=4, n_estimators=10))], 'verbose': False, 'classifier': GradientBoostingClassifier(max_depth=4, n_estimators=10), 'classifier__ccp_alpha': 0.0, 'classifier__criterion': 'friedman_mse', 'classifier__init': None, 'classifier__learning_rate': 0.1, 'classifier__loss': 'log_loss', 'classifier__max_depth': 4, 'classifier__max_features': None, 'classifier__max_leaf_nodes': None, 'classifier__min_impurity_decrease': 0.0, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__min_weight_fraction_leaf': 0.0, 'classifier__n_estimators': 10, 'classifier__n_iter_no_change': None, 'classifier__random_state': None, 'classifier__subsample': 1.0, 'classifier__tol': 0.0001, 'classifier__validation_fraction': 0.1, 'classifier__verbose': 0, 'classifier__warm_start': False}
The best classifier for Atheism is GradientBoosting with the average of F1 as

In [None]:
# word2vec + sentiment + target + ngram
transformers = []
# word2vec_model = train_word2vec(train)
# word2vec_vectorizer = Word2VecVectorizer(word2vec_model)
transformers.append(('word2vec', word2vec_vectorizer))

# transformers.append(('pos_tag', Pipeline([
#     ('pos_extractor', PosTagVectorizer()),
#     ('vectorizer', DictVectorizer())
# ])))

transformers.append(('target_presence', TargetPresence(target_words)))

transformers.append(('sentiment', SentimentExtractor()))

# train['Tweet'] = transform_all(train['Tweet'])
# X_train = train[['Tweet', 'Target']]
# y_train = train['Stance']

# bigram_transformer = NGramVectorizer(ngram_range_word=(2,2))
# bigram_transformer.fit_selector(X_train, y_train)
# transformers.append(('bigram', bigram_transformer))

# trigram_transformer = NGramVectorizer(ngram_range_word=(3,3))
# trigram_transformer.fit_selector(X_train, y_train)
# transformers.append(('trigram', trigram_transformer))

ngram_transformer = NGramVectorizer(ngram_range_word=(1,3), ngram_range_char=(2, 5))
ngram_transformer.fit_selector(X_train, y_train)
transformers.append(('ngram', ngram_transformer))

feature_union = FeatureUnion(transformers)
classify_pipeline = Pipeline([('classifier', None)])

start = time.time()
trained_feature_extraction, trained_classifiers = train_test_all(train, test, feature_union, classify_pipeline, classifiers, params_grid)
end = time.time()
print(end-start, "for train_test_all.")

start = time.time()
score, predictinos = test_all(test, trained_feature_extraction, trained_classifiers)
end = time.time()
print(end-start, "for test_all.")
# train_test_single(train, test, feature_union, classify_pipeline, classifiers, params_grid)

The best model for Atheism is SVM with {'memory': None, 'steps': [('classifier', SVC(gamma=0.1, kernel='linear'))], 'verbose': False, 'classifier': SVC(gamma=0.1, kernel='linear'), 'classifier__C': 1.0, 'classifier__break_ties': False, 'classifier__cache_size': 200, 'classifier__class_weight': None, 'classifier__coef0': 0.0, 'classifier__decision_function_shape': 'ovr', 'classifier__degree': 3, 'classifier__gamma': 0.1, 'classifier__kernel': 'linear', 'classifier__max_iter': -1, 'classifier__probability': False, 'classifier__random_state': None, 'classifier__shrinking': True, 'classifier__tol': 0.001, 'classifier__verbose': False}
The best classifier for Atheism is SVM with the average of F1 as 0.650611620795107.
The best model for Climate Change is a Real Concern is KNN with {'memory': None, 'steps': [('classifier', KNeighborsClassifier(n_neighbors=6))], 'verbose': False, 'classifier': KNeighborsClassifier(n_neighbors=6), 'classifier__algorithm': 'auto', 'classifier__leaf_size': 30, '

In [None]:
# word2vec + pos_tag + target
transformers = []
# word2vec_model = train_word2vec(train)
# word2vec_vectorizer = Word2VecVectorizer(word2vec_model)
transformers.append(('word2vec', word2vec_vectorizer))

transformers.append(('pos_tag', Pipeline([
    ('pos_extractor', PosTagVectorizer()),
    ('vectorizer', DictVectorizer())
])))

transformers.append(('target_presence', TargetPresence(target_words)))

# transformers.append(('sentiment', SentimentExtractor()))

# train['Tweet'] = transform_all(train['Tweet'])
# X_train = train[['Tweet', 'Target']]
# y_train = train['Stance']

# bigram_transformer = NGramVectorizer(ngram_range_word=(2,2))
# bigram_transformer.fit_selector(X_train, y_train)
# transformers.append(('bigram', bigram_transformer))

# trigram_transformer = NGramVectorizer(ngram_range_word=(3,3))
# trigram_transformer.fit_selector(X_train, y_train)
# transformers.append(('trigram', trigram_transformer))

# ngram_transformer = NGramVectorizer(ngram_range_word=(1,3), ngram_range_char=(2, 5))
# ngram_transformer.fit_selector(X_train, y_train)
# transformers.append(('ngram', ngram_transformer))

feature_union = FeatureUnion(transformers)
classify_pipeline = Pipeline([('classifier', None)])

start = time.time()
trained_feature_extraction, trained_classifiers = train_test_all(train, test, feature_union, classify_pipeline, classifiers, params_grid)
end = time.time()
print(end-start, "for train_test_all.")

start = time.time()
score, predictinos = test_all(test, trained_feature_extraction, trained_classifiers)
end = time.time()
print(end-start, "for test_all.")
# train_test_single(train, test, feature_union, classify_pipeline, classifiers, params_grid)

The best model for Atheism is KNN with {'memory': None, 'steps': [('classifier', KNeighborsClassifier(n_neighbors=6))], 'verbose': False, 'classifier': KNeighborsClassifier(n_neighbors=6), 'classifier__algorithm': 'auto', 'classifier__leaf_size': 30, 'classifier__metric': 'minkowski', 'classifier__metric_params': None, 'classifier__n_jobs': None, 'classifier__n_neighbors': 6, 'classifier__p': 2, 'classifier__weights': 'uniform'}
The best classifier for Atheism is KNN with the average of F1 as 0.46397532940846653.
The best model for Climate Change is a Real Concern is LogisticRegression with {'memory': None, 'steps': [('classifier', LogisticRegression(C=0.01, solver='newton-cg'))], 'verbose': False, 'classifier': LogisticRegression(C=0.01, solver='newton-cg'), 'classifier__C': 0.01, 'classifier__class_weight': None, 'classifier__dual': False, 'classifier__fit_intercept': True, 'classifier__intercept_scaling': 1, 'classifier__l1_ratio': None, 'classifier__max_iter': 100, 'classifier__mul

In [None]:
# word2vec + pos_tag + target + ngram
transformers = []
# word2vec_model = train_word2vec(train)
# word2vec_vectorizer = Word2VecVectorizer(word2vec_model)
transformers.append(('word2vec', word2vec_vectorizer))

transformers.append(('pos_tag', Pipeline([
    ('pos_extractor', PosTagVectorizer()),
    ('vectorizer', DictVectorizer())
])))

transformers.append(('target_presence', TargetPresence(target_words)))

# transformers.append(('sentiment', SentimentExtractor()))

train['Tweet'] = transform_all(train['Tweet'])
X_train = train[['Tweet', 'Target']]
y_train = train['Stance']

# bigram_transformer = NGramVectorizer(ngram_range_word=(2,2))
# bigram_transformer.fit_selector(X_train, y_train)
# transformers.append(('bigram', bigram_transformer))

# trigram_transformer = NGramVectorizer(ngram_range_word=(3,3))
# trigram_transformer.fit_selector(X_train, y_train)
# transformers.append(('trigram', trigram_transformer))

ngram_transformer = NGramVectorizer(ngram_range_word=(1,3), ngram_range_char=(2, 5))
ngram_transformer.fit_selector(X_train, y_train)
transformers.append(('ngram', ngram_transformer))

feature_union = FeatureUnion(transformers)
classify_pipeline = Pipeline([('classifier', None)])

start = time.time()
trained_feature_extraction, trained_classifiers = train_test_all(train, test, feature_union, classify_pipeline, classifiers, params_grid)
end = time.time()
print(end-start, "for train_test_all.")

start = time.time()
score, predictinos = test_all(test, trained_feature_extraction, trained_classifiers)
end = time.time()
print(end-start, "for test_all.")
# train_test_single(train, test, feature_union, classify_pipeline, classifiers, params_grid)

The best model for Atheism is GradientBoosting with {'memory': None, 'steps': [('classifier', GradientBoostingClassifier(n_estimators=25))], 'verbose': False, 'classifier': GradientBoostingClassifier(n_estimators=25), 'classifier__ccp_alpha': 0.0, 'classifier__criterion': 'friedman_mse', 'classifier__init': None, 'classifier__learning_rate': 0.1, 'classifier__loss': 'log_loss', 'classifier__max_depth': 3, 'classifier__max_features': None, 'classifier__max_leaf_nodes': None, 'classifier__min_impurity_decrease': 0.0, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__min_weight_fraction_leaf': 0.0, 'classifier__n_estimators': 25, 'classifier__n_iter_no_change': None, 'classifier__random_state': None, 'classifier__subsample': 1.0, 'classifier__tol': 0.0001, 'classifier__validation_fraction': 0.1, 'classifier__verbose': 0, 'classifier__warm_start': False}
The best classifier for Atheism is GradientBoosting with the average of F1 as 0.6728571428571429.
The b

In [None]:
# word2vec + pos_tag + target + sentiment
transformers = []
# word2vec_model = train_word2vec(train)
# word2vec_vectorizer = Word2VecVectorizer(word2vec_model)
transformers.append(('word2vec', word2vec_vectorizer))

transformers.append(('pos_tag', Pipeline([
    ('pos_extractor', PosTagVectorizer()),
    ('vectorizer', DictVectorizer())
])))

transformers.append(('target_presence', TargetPresence(target_words)))

transformers.append(('sentiment', SentimentExtractor()))

# train['Tweet'] = transform_all(train['Tweet'])
# X_train = train[['Tweet', 'Target']]
# y_train = train['Stance']

# bigram_transformer = NGramVectorizer(ngram_range_word=(2,2))
# bigram_transformer.fit_selector(X_train, y_train)
# transformers.append(('bigram', bigram_transformer))

# trigram_transformer = NGramVectorizer(ngram_range_word=(3,3))
# trigram_transformer.fit_selector(X_train, y_train)
# transformers.append(('trigram', trigram_transformer))

# ngram_transformer = NGramVectorizer(ngram_range_word=(1,3), ngram_range_char=(2, 5))
# ngram_transformer.fit_selector(X_train, y_train)
# transformers.append(('ngram', ngram_transformer))

feature_union = FeatureUnion(transformers)
classify_pipeline = Pipeline([('classifier', None)])

start = time.time()
trained_feature_extraction, trained_classifiers = train_test_all(train, test, feature_union, classify_pipeline, classifiers, params_grid)
end = time.time()
print(end-start, "for train_test_all.")

start = time.time()
score, predictinos = test_all(test, trained_feature_extraction, trained_classifiers)
end = time.time()
print(end-start, "for test_all.")
# train_test_single(train, test, feature_union, classify_pipeline, classifiers, params_grid)

The best model for Atheism is GradientBoosting with {'memory': None, 'steps': [('classifier', GradientBoostingClassifier(max_depth=4, n_estimators=10))], 'verbose': False, 'classifier': GradientBoostingClassifier(max_depth=4, n_estimators=10), 'classifier__ccp_alpha': 0.0, 'classifier__criterion': 'friedman_mse', 'classifier__init': None, 'classifier__learning_rate': 0.1, 'classifier__loss': 'log_loss', 'classifier__max_depth': 4, 'classifier__max_features': None, 'classifier__max_leaf_nodes': None, 'classifier__min_impurity_decrease': 0.0, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__min_weight_fraction_leaf': 0.0, 'classifier__n_estimators': 10, 'classifier__n_iter_no_change': None, 'classifier__random_state': None, 'classifier__subsample': 1.0, 'classifier__tol': 0.0001, 'classifier__validation_fraction': 0.1, 'classifier__verbose': 0, 'classifier__warm_start': False}
The best classifier for Atheism is GradientBoosting with the average of F1 as

In [None]:
# word2vec + pos_tag + target + sentiment + ngram
transformers = []
# word2vec_model = train_word2vec(train)
# word2vec_vectorizer = Word2VecVectorizer(word2vec_model)
transformers.append(('word2vec', word2vec_vectorizer))

transformers.append(('pos_tag', Pipeline([
    ('pos_extractor', PosTagVectorizer()),
    ('vectorizer', DictVectorizer())
])))

transformers.append(('target_presence', TargetPresence(target_words)))

transformers.append(('sentiment', SentimentExtractor()))

# train['Tweet'] = transform_all(train['Tweet'])
# X_train = train[['Tweet', 'Target']]
# y_train = train['Stance']

# bigram_transformer = NGramVectorizer(ngram_range_word=(2,2))
# bigram_transformer.fit_selector(X_train, y_train)
# transformers.append(('bigram', bigram_transformer))

# trigram_transformer = NGramVectorizer(ngram_range_word=(3,3))
# trigram_transformer.fit_selector(X_train, y_train)
# transformers.append(('trigram', trigram_transformer))

ngram_transformer = NGramVectorizer(ngram_range_word=(1,3), ngram_range_char=(2, 5))
ngram_transformer.fit_selector(X_train, y_train)
transformers.append(('ngram', ngram_transformer))

feature_union = FeatureUnion(transformers)
classify_pipeline = Pipeline([('classifier', None)])

start = time.time()
trained_feature_extraction, trained_classifiers = train_test_all(train, test, feature_union, classify_pipeline, classifiers, params_grid)
end = time.time()
print(end-start, "for train_test_all.")

start = time.time()
score, predictinos = test_all(test, trained_feature_extraction, trained_classifiers)
end = time.time()
print(end-start, "for test_all.")
# train_test_single(train, test, feature_union, classify_pipeline, classifiers, params_grid)

KeyboardInterrupt: ignored

In [None]:
## 0.6674 glove pos-tag bigram
transformers = []
# glove_vectorizer = GloVeVectorizer(glove_path=args.glove_path, vector_size=200)
transformers.append(('glove', glove_vectorizer))
transformers.append(('pos_tag', Pipeline([
    ('pos_extractor', PosTagVectorizer()),
    ('vectorizer', DictVectorizer())
])))

train['Tweet'] = transform_all(train['Tweet'])
X_train = train[['Tweet', 'Target']]
y_train = train['Stance']

bigram_transformer = NGramVectorizer(ngram_range_word=(2,2))
bigram_transformer.fit_selector(X_train, y_train)
transformers.append(('bigram', bigram_transformer))

feature_union = FeatureUnion(transformers)

classifiers = {
    'Atheism': GradientBoostingClassifier(max_depth=4),
    'Climate Change is a Real Concern': GradientBoostingClassifier(n_estimators=25),
    'Feminist Movement': LogisticRegression(C=0.1, max_iter=1000, solver='newton-cg'),
    'Hillary Clinton': LogisticRegression(C=0.01, class_weight='balanced', fit_intercept=False,max_iter=1000, solver='liblinear'),
    'Legalization of Abortion': LogisticRegression(C=1, max_iter=1000)
}

transformers = []
transformers.append(('glove', glove_vectorizer))
# word2vec_model = train_word2vec(train)
# word2vec_vectorizer = Word2VecVectorizer(word2vec_model)
# transformers.append(('word2vec', word2vec_vectorizer))

transformers.append(('pos_tag', Pipeline([
    ('pos_extractor', PosTagVectorizer()),
    ('vectorizer', DictVectorizer())
])))
# transformers.append(('target_presence', TargetPresence(target_words)))

# transformers.append(('sentiment', SentimentExtractor()))

trump['Tweet'] = transform_all(trump['Tweet'])
X_train = trump[['Tweet', 'Target']]
y_train = trump['Stance']

bigram_transformer = NGramVectorizer(ngram_range_word=(2,2))
bigram_transformer.fit_selector(X_train, y_train)
transformers.append(('bigram', bigram_transformer))
# trigram_transformer = NGramVectorizer(ngram_range_word=(3,3))
# trigram_transformer.fit_selector(X_train, y_train)
# transformers.append(('trigram', trigram_transformer))

# ngram_transformer = NGramVectorizer(ngram_range_word=(1,3), ngram_range_char=(2, 5))
# ngram_transformer.fit_selector(X_train, y_train)
# transformers.append(('ngram', ngram_transformer))

trump_feature_union = FeatureUnion(transformers)

test_on_opinionA(train, opinion_to_target_A, feature_union, classifiers)
test_on_opinionA(train, opinion_to_other_A, feature_union, classifiers)
predictions, best_f1_targetmodel = predict_model_taskB(train, trump, trump_feature_union, classifiers)
test_on_opinionB(train, opinion_to_target_B, trump_feature_union, classifiers, best_f1_targetmodel)
test_on_opinionB(train, opinion_to_other_B, trump_feature_union, classifiers, best_f1_targetmodel)

              precision    recall  f1-score   support

     AGAINST       0.81      0.81      0.81       535
       FAVOR       0.76      0.53      0.62       289
        NONE       0.00      0.00      0.00         0

    accuracy                           0.71       824
   macro avg       0.52      0.45      0.48       824
weighted avg       0.79      0.71      0.74       824

The average F1-score for total test dataset is  0.7166604112566897
              precision    recall  f1-score   support

     AGAINST       0.53      0.76      0.63       173
       FAVOR       0.15      0.53      0.24        15
        NONE       0.70      0.29      0.41       194

    accuracy                           0.51       382
   macro avg       0.46      0.53      0.42       382
weighted avg       0.60      0.51      0.50       382

The average F1-score for total test dataset is  0.43044326735433514
The average F1-score for task B is  0.392631203777956
The best Classifier for task B is  LogisticRegres

In [None]:
## 0.6627 glove pos-tag trigram
transformers = []
# glove_vectorizer = GloVeVectorizer(glove_path=args.glove_path, vector_size=200)
transformers.append(('glove', glove_vectorizer))
transformers.append(('pos_tag', Pipeline([
    ('pos_extractor', PosTagVectorizer()),
    ('vectorizer', DictVectorizer())
])))

train['Tweet'] = transform_all(train['Tweet'])
X_train = train[['Tweet', 'Target']]
y_train = train['Stance']

trigram_transformer = NGramVectorizer(ngram_range_word=(3,3))
trigram_transformer.fit_selector(X_train, y_train)
transformers.append(('trigram', trigram_transformer))

feature_union = FeatureUnion(transformers)

classifiers = {
    'Atheism':  SVC(gamma=0.1, kernel='linear', probability=True),
    'Climate Change is a Real Concern': GradientBoostingClassifier(max_depth=4),
    'Feminist Movement': GradientBoostingClassifier(n_estimators=25),
    'Hillary Clinton': LogisticRegression(C=0.01, class_weight='balanced', fit_intercept=False,max_iter=1000, solver='liblinear'),
    'Legalization of Abortion': LogisticRegression(C=1, max_iter=1000)
}

transformers = []
transformers.append(('glove', glove_vectorizer))
# word2vec_model = train_word2vec(train)
# word2vec_vectorizer = Word2VecVectorizer(word2vec_model)
# transformers.append(('word2vec', word2vec_vectorizer))

transformers.append(('pos_tag', Pipeline([
    ('pos_extractor', PosTagVectorizer()),
    ('vectorizer', DictVectorizer())
])))
# transformers.append(('target_presence', TargetPresence(target_words)))

# transformers.append(('sentiment', SentimentExtractor()))

trump['Tweet'] = transform_all(trump['Tweet'])
X_train = trump[['Tweet', 'Target']]
y_train = trump['Stance']

# bigram_transformer = NGramVectorizer(ngram_range_word=(2,2))
# bigram_transformer.fit_selector(X_train, y_train)
# transformers.append(('bigram', bigram_transformer))
trigram_transformer = NGramVectorizer(ngram_range_word=(3,3))
trigram_transformer.fit_selector(X_train, y_train)
transformers.append(('trigram', trigram_transformer))

# ngram_transformer = NGramVectorizer(ngram_range_word=(1,3), ngram_range_char=(2, 5))
# ngram_transformer.fit_selector(X_train, y_train)
# transformers.append(('ngram', ngram_transformer))

trump_feature_union = FeatureUnion(transformers)

test_on_opinionA(train, opinion_to_target_A, feature_union, classifiers)
test_on_opinionA(train, opinion_to_other_A, feature_union, classifiers)
predictions, best_f1_targetmodel = predict_model_taskB(train, trump, trump_feature_union, classifiers)
test_on_opinionB(train, opinion_to_target_B, trump_feature_union, classifiers, best_f1_targetmodel)
test_on_opinionB(train, opinion_to_other_B, trump_feature_union, classifiers, best_f1_targetmodel)

              precision    recall  f1-score   support

     AGAINST       0.80      0.82      0.81       535
       FAVOR       0.76      0.52      0.62       289
        NONE       0.00      0.00      0.00         0

    accuracy                           0.71       824
   macro avg       0.52      0.45      0.48       824
weighted avg       0.79      0.71      0.74       824

The average F1-score for total test dataset is  0.7143169207916467
              precision    recall  f1-score   support

     AGAINST       0.53      0.76      0.62       173
       FAVOR       0.18      0.53      0.27        15
        NONE       0.69      0.32      0.44       194

    accuracy                           0.53       382
   macro avg       0.47      0.54      0.44       382
weighted avg       0.60      0.53      0.51       382

The average F1-score for total test dataset is  0.4467571158259189
The average F1-score for task B is  0.39109956147474423
The best Classifier for task B is  GradientBoost

In [None]:
## 0.6522 glove
transformers = []
# glove_vectorizer = GloVeVectorizer(glove_path=args.glove_path, vector_size=200)
transformers.append(('glove', glove_vectorizer))
# transformers.append(('pos_tag', Pipeline([
#     ('pos_extractor', PosTagVectorizer()),
#     ('vectorizer', DictVectorizer())
# ])))

# train['Tweet'] = transform_all(train['Tweet'])
# X_train = train[['Tweet', 'Target']]
# y_train = train['Stance']

# trigram_transformer = NGramVectorizer(ngram_range_word=(3,3))
# trigram_transformer.fit_selector(X_train, y_train)
# transformers.append(('trigram', trigram_transformer))

feature_union = FeatureUnion(transformers)

classifiers = {
    'Atheism': LogisticRegression(C=1),
    'Climate Change is a Real Concern': RandomForestClassifier(max_depth=6),
    'Feminist Movement': LogisticRegression(C=1, class_weight='balanced', fit_intercept=False,solver='liblinear'),
    'Hillary Clinton': LogisticRegression(C=0.1, class_weight='balanced', solver='liblinear'),
    'Legalization of Abortion': KNeighborsClassifier()
}

transformers = []
transformers.append(('glove', glove_vectorizer))
# word2vec_model = train_word2vec(train)
# word2vec_vectorizer = Word2VecVectorizer(word2vec_model)
# transformers.append(('word2vec', word2vec_vectorizer))

# transformers.append(('pos_tag', Pipeline([
#     ('pos_extractor', PosTagVectorizer()),
#     ('vectorizer', DictVectorizer())
# ])))
# transformers.append(('target_presence', TargetPresence(target_words)))

# transformers.append(('sentiment', SentimentExtractor()))

trump['Tweet'] = transform_all(trump['Tweet'])
X_train = trump[['Tweet', 'Target']]
y_train = trump['Stance']

# bigram_transformer = NGramVectorizer(ngram_range_word=(2,2))
# bigram_transformer.fit_selector(X_train, y_train)
# transformers.append(('bigram', bigram_transformer))
trigram_transformer = NGramVectorizer(ngram_range_word=(3,3))
trigram_transformer.fit_selector(X_train, y_train)
transformers.append(('trigram', trigram_transformer))

# ngram_transformer = NGramVectorizer(ngram_range_word=(1,3), ngram_range_char=(2, 5))
# ngram_transformer.fit_selector(X_train, y_train)
# transformers.append(('ngram', ngram_transformer))

trump_feature_union = FeatureUnion(transformers)

test_on_opinionA(train, opinion_to_target_A, feature_union, classifiers)
test_on_opinionA(train, opinion_to_other_A, feature_union, classifiers)
predictions, best_f1_targetmodel = predict_model_taskB(train, trump, trump_feature_union, classifiers)
test_on_opinionB(train, opinion_to_target_B, trump_feature_union, classifiers, best_f1_targetmodel)
test_on_opinionB(train, opinion_to_other_B, trump_feature_union, classifiers, best_f1_targetmodel)

              precision    recall  f1-score   support

     AGAINST       0.84      0.77      0.80       535
       FAVOR       0.68      0.58      0.63       289
        NONE       0.00      0.00      0.00         0

    accuracy                           0.70       824
   macro avg       0.51      0.45      0.48       824
weighted avg       0.78      0.70      0.74       824

The average F1-score for total test dataset is  0.7142134335556789
              precision    recall  f1-score   support

     AGAINST       0.54      0.65      0.59       173
       FAVOR       0.12      0.53      0.20        15
        NONE       0.74      0.40      0.52       194

    accuracy                           0.52       382
   macro avg       0.47      0.53      0.44       382
weighted avg       0.62      0.52      0.54       382

The average F1-score for total test dataset is  0.3938045965896271
The average F1-score for task B is  0.3606060606060606
The best Classifier for task B is  LogisticRegres

In [None]:
## 0.6563 bigram
transformers = []
# glove_vectorizer = GloVeVectorizer(glove_path=args.glove_path, vector_size=200)
# transformers.append(('glove', glove_vectorizer))
# transformers.append(('pos_tag', Pipeline([
#     ('pos_extractor', PosTagVectorizer()),
#     ('vectorizer', DictVectorizer())
# ])))

# train['Tweet'] = transform_all(train['Tweet'])
# X_train = train[['Tweet', 'Target']]
# y_train = train['Stance']

trigram_transformer = NGramVectorizer(ngram_range_word=(2,2))
trigram_transformer.fit_selector(X_train, y_train)
transformers.append(('bigram', trigram_transformer))

feature_union = FeatureUnion(transformers)

classifiers = {
    'Atheism': LogisticRegression(C=0.01, class_weight='balanced', fit_intercept=False,solver='liblinear'),
    'Climate Change is a Real Concern': LogisticRegression(C=0.1, solver='newton-cg'),
    'Feminist Movement': SVC(gamma=10, probability = True),
    'Hillary Clinton': GradientBoostingClassifier(max_depth=6, n_estimators=25),
    'Legalization of Abortion': KNeighborsClassifier()
}

transformers = []
# transformers.append(('glove', glove_vectorizer))
# word2vec_model = train_word2vec(train)
# word2vec_vectorizer = Word2VecVectorizer(word2vec_model)
# transformers.append(('word2vec', word2vec_vectorizer))

# transformers.append(('pos_tag', Pipeline([
#     ('pos_extractor', PosTagVectorizer()),
#     ('vectorizer', DictVectorizer())
# ])))
# transformers.append(('target_presence', TargetPresence(target_words)))

# transformers.append(('sentiment', SentimentExtractor()))

trump['Tweet'] = transform_all(trump['Tweet'])
X_train = trump[['Tweet', 'Target']]
y_train = trump['Stance']

bigram_transformer = NGramVectorizer(ngram_range_word=(2,2))
bigram_transformer.fit_selector(X_train, y_train)
transformers.append(('bigram', bigram_transformer))
# trigram_transformer = NGramVectorizer(ngram_range_word=(3,3))
# trigram_transformer.fit_selector(X_train, y_train)
# transformers.append(('trigram', trigram_transformer))

# ngram_transformer = NGramVectorizer(ngram_range_word=(1,3), ngram_range_char=(2, 5))
# ngram_transformer.fit_selector(X_train, y_train)
# transformers.append(('ngram', ngram_transformer))

trump_feature_union = FeatureUnion(transformers)

test_on_opinionA(train, opinion_to_target_A, feature_union, classifiers)
test_on_opinionA(train, opinion_to_other_A, feature_union, classifiers)
predictions, best_f1_targetmodel = predict_model_taskB(train, trump, trump_feature_union, classifiers)
test_on_opinionB(train, opinion_to_target_B, trump_feature_union, classifiers, best_f1_targetmodel)
test_on_opinionB(train, opinion_to_other_B, trump_feature_union, classifiers, best_f1_targetmodel)

              precision    recall  f1-score   support

     AGAINST       0.76      0.95      0.85       535
       FAVOR       0.87      0.44      0.58       289
        NONE       0.00      0.00      0.00         0

    accuracy                           0.77       824
   macro avg       0.54      0.46      0.48       824
weighted avg       0.80      0.77      0.75       824

The average F1-score for total test dataset is  0.7139106205122709
              precision    recall  f1-score   support

     AGAINST       0.49      0.94      0.65       173
       FAVOR       0.12      0.33      0.18        15
        NONE       0.82      0.05      0.09       194

    accuracy                           0.46       382
   macro avg       0.48      0.44      0.30       382
weighted avg       0.64      0.46      0.34       382

The average F1-score for total test dataset is  0.4104284615922276
The average F1-score for task B is  0.39055677779658154
The best Classifier for task B is  Soft Voting C

In [None]:
## 0.6589 trigram
transformers = []
# glove_vectorizer = GloVeVectorizer(glove_path=args.glove_path, vector_size=200)
# transformers.append(('glove', glove_vectorizer))
# transformers.append(('pos_tag', Pipeline([
#     ('pos_extractor', PosTagVectorizer()),
#     ('vectorizer', DictVectorizer())
# ])))

# train['Tweet'] = transform_all(train['Tweet'])
# X_train = train[['Tweet', 'Target']]
# y_train = train['Stance']

trigram_transformer = NGramVectorizer(ngram_range_word=(3,3))
trigram_transformer.fit_selector(X_train, y_train)
transformers.append(('trigram', trigram_transformer))

feature_union = FeatureUnion(transformers)

classifiers = {
    'Atheism': LogisticRegression(C=10, class_weight='balanced', solver='liblinear'),
    'Climate Change is a Real Concern': LogisticRegression(C=1),
    'Feminist Movement': LogisticRegression(C=10, class_weight='balanced', solver='liblinear'),
    'Hillary Clinton': LogisticRegression(C=0.01, class_weight='balanced', fit_intercept=False,solver='liblinear'),
    'Legalization of Abortion': GradientBoostingClassifier(max_depth=5, n_estimators=25)
}

transformers = []
# transformers.append(('glove', glove_vectorizer))
# word2vec_model = train_word2vec(train)
# word2vec_vectorizer = Word2VecVectorizer(word2vec_model)
# transformers.append(('word2vec', word2vec_vectorizer))

# transformers.append(('pos_tag', Pipeline([
#     ('pos_extractor', PosTagVectorizer()),
#     ('vectorizer', DictVectorizer())
# ])))
# transformers.append(('target_presence', TargetPresence(target_words)))

# transformers.append(('sentiment', SentimentExtractor()))

trump['Tweet'] = transform_all(trump['Tweet'])
X_train = trump[['Tweet', 'Target']]
y_train = trump['Stance']

# bigram_transformer = NGramVectorizer(ngram_range_word=(2,2))
# bigram_transformer.fit_selector(X_train, y_train)
# transformers.append(('bigram', bigram_transformer))
trigram_transformer = NGramVectorizer(ngram_range_word=(3,3))
trigram_transformer.fit_selector(X_train, y_train)
transformers.append(('trigram', trigram_transformer))

# ngram_transformer = NGramVectorizer(ngram_range_word=(1,3), ngram_range_char=(2, 5))
# ngram_transformer.fit_selector(X_train, y_train)
# transformers.append(('ngram', ngram_transformer))

trump_feature_union = FeatureUnion(transformers)

test_on_opinionA(train, opinion_to_target_A, feature_union, classifiers)
test_on_opinionA(train, opinion_to_other_A, feature_union, classifiers)
predictions, best_f1_targetmodel = predict_model_taskB(train, trump, trump_feature_union, classifiers)
test_on_opinionB(train, opinion_to_target_B, trump_feature_union, classifiers, best_f1_targetmodel)
test_on_opinionB(train, opinion_to_other_B, trump_feature_union, classifiers, best_f1_targetmodel)

              precision    recall  f1-score   support

     AGAINST       0.76      0.98      0.85       535
       FAVOR       0.93      0.41      0.57       289

    accuracy                           0.78       824
   macro avg       0.84      0.70      0.71       824
weighted avg       0.82      0.78      0.76       824

The average F1-score for total test dataset is  0.7126665848474754
              precision    recall  f1-score   support

     AGAINST       0.49      0.99      0.66       173
       FAVOR       0.12      0.27      0.17        15
        NONE       1.00      0.01      0.01       194

    accuracy                           0.46       382
   macro avg       0.54      0.42      0.28       382
weighted avg       0.73      0.46      0.31       382

The average F1-score for total test dataset is  0.4126925898752752
The average F1-score for task B is  0.36885070493454175
The best Classifier for task B is  Soft Voting Classifier
The classification reporst is:              

In [None]:
## 0.6537 target
transformers = []
# glove_vectorizer = GloVeVectorizer(glove_path=args.glove_path, vector_size=200)
# transformers.append(('glove', glove_vectorizer))
# transformers.append(('pos_tag', Pipeline([
#     ('pos_extractor', PosTagVectorizer()),
#     ('vectorizer', DictVectorizer())
# ])))
transformers.append(('target_presence', TargetPresence(target_words)))

# train['Tweet'] = transform_all(train['Tweet'])
# X_train = train[['Tweet', 'Target']]
# y_train = train['Stance']

trigram_transformer = NGramVectorizer(ngram_range_word=(3,3))
trigram_transformer.fit_selector(X_train, y_train)
transformers.append(('trigram', trigram_transformer))

feature_union = FeatureUnion(transformers)

classifiers = {
    'Atheism': LogisticRegression(C=0.01, solver='newton-cg'),
    'Climate Change is a Real Concern': LogisticRegression(C=0.01, solver='newton-cg'),
    'Feminist Movement': KNeighborsClassifier(n_neighbors=4),
    'Hillary Clinton': KNeighborsClassifier(n_neighbors=7),
    'Legalization of Abortion': LogisticRegression(C=0.01, solver='newton-cg')
}

transformers = []
# transformers.append(('glove', glove_vectorizer))
# word2vec_model = train_word2vec(train)
# word2vec_vectorizer = Word2VecVectorizer(word2vec_model)
# transformers.append(('word2vec', word2vec_vectorizer))

# transformers.append(('pos_tag', Pipeline([
#     ('pos_extractor', PosTagVectorizer()),
#     ('vectorizer', DictVectorizer())
# ])))
transformers.append(('target_presence', TargetPresence(target_words)))

# transformers.append(('sentiment', SentimentExtractor()))

# trump['Tweet'] = transform_all(trump['Tweet'])
# X_train = trump[['Tweet', 'Target']]
# y_train = trump['Stance']

# bigram_transformer = NGramVectorizer(ngram_range_word=(2,2))
# bigram_transformer.fit_selector(X_train, y_train)
# transformers.append(('bigram', bigram_transformer))
# trigram_transformer = NGramVectorizer(ngram_range_word=(3,3))
# trigram_transformer.fit_selector(X_train, y_train)
# transformers.append(('trigram', trigram_transformer))

# ngram_transformer = NGramVectorizer(ngram_range_word=(1,3), ngram_range_char=(2, 5))
# ngram_transformer.fit_selector(X_train, y_train)
# transformers.append(('ngram', ngram_transformer))

trump_feature_union = FeatureUnion(transformers)

test_on_opinionA(train, opinion_to_target_A, feature_union, classifiers)
test_on_opinionA(train, opinion_to_other_A, feature_union, classifiers)
predictions, best_f1_targetmodel = predict_model_taskB(train, trump, trump_feature_union, classifiers)
test_on_opinionB(train, opinion_to_target_B, trump_feature_union, classifiers, best_f1_targetmodel)
test_on_opinionB(train, opinion_to_other_B, trump_feature_union, classifiers, best_f1_targetmodel)

              precision    recall  f1-score   support

     AGAINST       0.75      0.76      0.76       535
       FAVOR       0.88      0.42      0.57       289
        NONE       0.00      0.00      0.00         0

    accuracy                           0.64       824
   macro avg       0.54      0.39      0.44       824
weighted avg       0.80      0.64      0.69       824

The average F1-score for total test dataset is  0.6622024489050381
              precision    recall  f1-score   support

     AGAINST       0.52      0.79      0.62       173
       FAVOR       0.11      0.27      0.16        15
        NONE       0.54      0.23      0.32       194

    accuracy                           0.48       382
   macro avg       0.39      0.43      0.37       382
weighted avg       0.51      0.48      0.45       382

The average F1-score for total test dataset is  0.39192660550458713
The average F1-score for task B is  0.298
The best Classifier for task B is  KNeighborsClassifier(n_nei

In [None]:
## 0.6551 trigram target
transformers = []
# glove_vectorizer = GloVeVectorizer(glove_path=args.glove_path, vector_size=200)
# transformers.append(('glove', glove_vectorizer))
# transformers.append(('pos_tag', Pipeline([
#     ('pos_extractor', PosTagVectorizer()),
#     ('vectorizer', DictVectorizer())
# ])))
transformers.append(('target_presence', TargetPresence(target_words)))

# train['Tweet'] = transform_all(train['Tweet'])
# X_train = train[['Tweet', 'Target']]
# y_train = train['Stance']

trigram_transformer = NGramVectorizer(ngram_range_word=(3,3))
trigram_transformer.fit_selector(X_train, y_train)
transformers.append(('trigram', trigram_transformer))

feature_union = FeatureUnion(transformers)

classifiers = {
    'Atheism': LogisticRegression(C=10, class_weight='balanced', solver='liblinear'),
    'Climate Change is a Real Concern': KNeighborsClassifier(n_neighbors=4),
    'Feminist Movement': LogisticRegression(C=10, class_weight='balanced', solver='liblinear'),
    'Hillary Clinton': SVC(gamma=10, probability=True),
    'Legalization of Abortion': LogisticRegression(C=10, class_weight='balanced', fit_intercept=False, solver='newton-cg')
}

transformers = []
# transformers.append(('glove', glove_vectorizer))
# word2vec_model = train_word2vec(train)
# word2vec_vectorizer = Word2VecVectorizer(word2vec_model)
# transformers.append(('word2vec', word2vec_vectorizer))

# transformers.append(('pos_tag', Pipeline([
#     ('pos_extractor', PosTagVectorizer()),
#     ('vectorizer', DictVectorizer())
# ])))
transformers.append(('target_presence', TargetPresence(target_words)))

# transformers.append(('sentiment', SentimentExtractor()))

# trump['Tweet'] = transform_all(trump['Tweet'])
# X_train = trump[['Tweet', 'Target']]
# y_train = trump['Stance']

# bigram_transformer = NGramVectorizer(ngram_range_word=(2,2))
# bigram_transformer.fit_selector(X_train, y_train)
# transformers.append(('bigram', bigram_transformer))
trigram_transformer = NGramVectorizer(ngram_range_word=(3,3))
trigram_transformer.fit_selector(X_train, y_train)
transformers.append(('trigram', trigram_transformer))

# ngram_transformer = NGramVectorizer(ngram_range_word=(1,3), ngram_range_char=(2, 5))
# ngram_transformer.fit_selector(X_train, y_train)
# transformers.append(('ngram', ngram_transformer))

trump_feature_union = FeatureUnion(transformers)

test_on_opinionA(train, opinion_to_target_A, feature_union, classifiers)
test_on_opinionA(train, opinion_to_other_A, feature_union, classifiers)
predictions, best_f1_targetmodel = predict_model_taskB(train, trump, trump_feature_union, classifiers)
test_on_opinionB(train, opinion_to_target_B, trump_feature_union, classifiers, best_f1_targetmodel)
test_on_opinionB(train, opinion_to_other_B, trump_feature_union, classifiers, best_f1_targetmodel)

              precision    recall  f1-score   support

     AGAINST       0.76      0.96      0.85       535
       FAVOR       0.86      0.44      0.59       289
        NONE       0.00      0.00      0.00         0

    accuracy                           0.78       824
   macro avg       0.54      0.47      0.48       824
weighted avg       0.80      0.78      0.76       824

The average F1-score for total test dataset is  0.7185260132004463
              precision    recall  f1-score   support

     AGAINST       0.48      0.95      0.64       173
       FAVOR       0.11      0.27      0.15        15
        NONE       1.00      0.01      0.01       194

    accuracy                           0.45       382
   macro avg       0.53      0.41      0.27       382
weighted avg       0.73      0.45      0.30       382

The average F1-score for total test dataset is  0.39523913997367266
The average F1-score for task B is  0.33826033121807775
The best Classifier for task B is  LogisticRegr

In [None]:
## 0.6546 glove target
transformers = []
# glove_vectorizer = GloVeVectorizer(glove_path=args.glove_path, vector_size=200)
transformers.append(('glove', glove_vectorizer))
# transformers.append(('pos_tag', Pipeline([
#     ('pos_extractor', PosTagVectorizer()),
#     ('vectorizer', DictVectorizer())
# ])))
transformers.append(('target_presence', TargetPresence(target_words)))

# train['Tweet'] = transform_all(train['Tweet'])
# X_train = train[['Tweet', 'Target']]
# y_train = train['Stance']

# trigram_transformer = NGramVectorizer(ngram_range_word=(3,3))
# trigram_transformer.fit_selector(X_train, y_train)
# transformers.append(('trigram', trigram_transformer))

feature_union = FeatureUnion(transformers)

classifiers = {
    'Atheism': LogisticRegression(C=1, max_iter=1000),
    'Climate Change is a Real Concern': GradientBoostingClassifier(max_depth=4),
    'Feminist Movement': LogisticRegression(C=0.1, max_iter=1000, solver='newton-cg'),
    'Hillary Clinton': LogisticRegression(C=0.1, class_weight='balanced', max_iter=1000,solver='liblinear'),
    'Legalization of Abortion': SVC(gamma=0.1, kernel='linear', probability=True)
}

transformers = []
transformers.append(('glove', glove_vectorizer))
# word2vec_model = train_word2vec(train)
# word2vec_vectorizer = Word2VecVectorizer(word2vec_model)
# transformers.append(('word2vec', word2vec_vectorizer))

# transformers.append(('pos_tag', Pipeline([
#     ('pos_extractor', PosTagVectorizer()),
#     ('vectorizer', DictVectorizer())
# ])))
transformers.append(('target_presence', TargetPresence(target_words)))

# transformers.append(('sentiment', SentimentExtractor()))

# trump['Tweet'] = transform_all(trump['Tweet'])
# X_train = trump[['Tweet', 'Target']]
# y_train = trump['Stance']

# bigram_transformer = NGramVectorizer(ngram_range_word=(2,2))
# bigram_transformer.fit_selector(X_train, y_train)
# transformers.append(('bigram', bigram_transformer))
# trigram_transformer = NGramVectorizer(ngram_range_word=(3,3))
# trigram_transformer.fit_selector(X_train, y_train)
# transformers.append(('trigram', trigram_transformer))

# ngram_transformer = NGramVectorizer(ngram_range_word=(1,3), ngram_range_char=(2, 5))
# ngram_transformer.fit_selector(X_train, y_train)
# transformers.append(('ngram', ngram_transformer))

trump_feature_union = FeatureUnion(transformers)

test_on_opinionA(train, opinion_to_target_A, feature_union, classifiers)
test_on_opinionA(train, opinion_to_other_A, feature_union, classifiers)
predictions, best_f1_targetmodel = predict_model_taskB(train, trump, trump_feature_union, classifiers)
test_on_opinionB(train, opinion_to_target_B, trump_feature_union, classifiers, best_f1_targetmodel)
test_on_opinionB(train, opinion_to_other_B, trump_feature_union, classifiers, best_f1_targetmodel)

              precision    recall  f1-score   support

     AGAINST       0.81      0.80      0.81       535
       FAVOR       0.73      0.54      0.62       289
        NONE       0.00      0.00      0.00         0

    accuracy                           0.71       824
   macro avg       0.51      0.45      0.48       824
weighted avg       0.78      0.71      0.74       824

The average F1-score for total test dataset is  0.7129168349715005
              precision    recall  f1-score   support

     AGAINST       0.52      0.68      0.59       173
       FAVOR       0.14      0.40      0.20        15
        NONE       0.67      0.39      0.49       194

    accuracy                           0.52       382
   macro avg       0.44      0.49      0.43       382
weighted avg       0.58      0.52      0.52       382

The average F1-score for total test dataset is  0.3974342636251646
The average F1-score for task B is  0.4429113712063415
The best Classifier for task B is  LogisticRegres

In [None]:
## 0.6597 glove bigram
transformers = []
# glove_vectorizer = GloVeVectorizer(glove_path=args.glove_path, vector_size=200)
transformers.append(('glove', glove_vectorizer))
# transformers.append(('pos_tag', Pipeline([
#     ('pos_extractor', PosTagVectorizer()),
#     ('vectorizer', DictVectorizer())
# ])))
# transformers.append(('target_presence', TargetPresence(target_words)))

# train['Tweet'] = transform_all(train['Tweet'])
# X_train = train[['Tweet', 'Target']]
# y_train = train['Stance']

trigram_transformer = NGramVectorizer(ngram_range_word=(2,2))
trigram_transformer.fit_selector(X_train, y_train)
transformers.append(('bigram', trigram_transformer))

feature_union = FeatureUnion(transformers)

classifiers = {
    'Atheism': GradientBoostingClassifier(max_depth=4),
    'Climate Change is a Real Concern': GradientBoostingClassifier(n_estimators=25),
    'Feminist Movement': LogisticRegression(C=1, class_weight='balanced', fit_intercept=False,solver='liblinear'),
    'Hillary Clinton': LogisticRegression(C=0.1, class_weight='balanced', solver='liblinear'),
    'Legalization of Abortion': KNeighborsClassifier(n_neighbors=4)
}

transformers = []
transformers.append(('glove', glove_vectorizer))
# word2vec_model = train_word2vec(train)
# word2vec_vectorizer = Word2VecVectorizer(word2vec_model)
# transformers.append(('word2vec', word2vec_vectorizer))

# transformers.append(('pos_tag', Pipeline([
#     ('pos_extractor', PosTagVectorizer()),
#     ('vectorizer', DictVectorizer())
# ])))
# transformers.append(('target_presence', TargetPresence(target_words)))

# transformers.append(('sentiment', SentimentExtractor()))

# trump['Tweet'] = transform_all(trump['Tweet'])
# X_train = trump[['Tweet', 'Target']]
# y_train = trump['Stance']

bigram_transformer = NGramVectorizer(ngram_range_word=(2,2))
bigram_transformer.fit_selector(X_train, y_train)
transformers.append(('bigram', bigram_transformer))
# trigram_transformer = NGramVectorizer(ngram_range_word=(3,3))
# trigram_transformer.fit_selector(X_train, y_train)
# transformers.append(('trigram', trigram_transformer))

# ngram_transformer = NGramVectorizer(ngram_range_word=(1,3), ngram_range_char=(2, 5))
# ngram_transformer.fit_selector(X_train, y_train)
# transformers.append(('ngram', ngram_transformer))

trump_feature_union = FeatureUnion(transformers)

test_on_opinionA(train, opinion_to_target_A, feature_union, classifiers)
test_on_opinionA(train, opinion_to_other_A, feature_union, classifiers)
predictions, best_f1_targetmodel = predict_model_taskB(train, trump, trump_feature_union, classifiers)
test_on_opinionB(train, opinion_to_target_B, trump_feature_union, classifiers, best_f1_targetmodel)
test_on_opinionB(train, opinion_to_other_B, trump_feature_union, classifiers, best_f1_targetmodel)

              precision    recall  f1-score   support

     AGAINST       0.82      0.79      0.80       535
       FAVOR       0.72      0.53      0.61       289
        NONE       0.00      0.00      0.00         0

    accuracy                           0.70       824
   macro avg       0.51      0.44      0.47       824
weighted avg       0.78      0.70      0.74       824

The average F1-score for total test dataset is  0.7073091536739449
              precision    recall  f1-score   support

     AGAINST       0.56      0.69      0.62       173
       FAVOR       0.16      0.60      0.25        15
        NONE       0.71      0.40      0.51       194

    accuracy                           0.54       382
   macro avg       0.47      0.56      0.46       382
weighted avg       0.62      0.54      0.55       382

The average F1-score for total test dataset is  0.4334832904884319
The average F1-score for task B is  0.35351624093345946
The best Classifier for task B is  KNeighborsCla

In [None]:
## 0.6526 glove trigram
transformers = []
# glove_vectorizer = GloVeVectorizer(glove_path=args.glove_path, vector_size=200)
transformers.append(('glove', glove_vectorizer))
# transformers.append(('pos_tag', Pipeline([
#     ('pos_extractor', PosTagVectorizer()),
#     ('vectorizer', DictVectorizer())
# ])))
# transformers.append(('target_presence', TargetPresence(target_words)))

# train['Tweet'] = transform_all(train['Tweet'])
# X_train = train[['Tweet', 'Target']]
# y_train = train['Stance']

trigram_transformer = NGramVectorizer(ngram_range_word=(3,3))
trigram_transformer.fit_selector(X_train, y_train)
transformers.append(('trigram', trigram_transformer))

feature_union = FeatureUnion(transformers)

classifiers = {
    'Atheism': SVC(gamma=1, kernel='linear', probability=True),
    'Climate Change is a Real Concern': LogisticRegression(C=1),
    'Feminist Movement': LogisticRegression(C=1, class_weight='balanced', fit_intercept=False,solver='liblinear'),
    'Hillary Clinton': LogisticRegression(C=0.1, class_weight='balanced', solver='liblinear'),
    'Legalization of Abortion': SVC(gamma=0.1, kernel='linear', probability=True)
}

transformers = []
transformers.append(('glove', glove_vectorizer))
# word2vec_model = train_word2vec(train)
# word2vec_vectorizer = Word2VecVectorizer(word2vec_model)
# transformers.append(('word2vec', word2vec_vectorizer))

# transformers.append(('pos_tag', Pipeline([
#     ('pos_extractor', PosTagVectorizer()),
#     ('vectorizer', DictVectorizer())
# ])))
# transformers.append(('target_presence', TargetPresence(target_words)))

# transformers.append(('sentiment', SentimentExtractor()))

# trump['Tweet'] = transform_all(trump['Tweet'])
# X_train = trump[['Tweet', 'Target']]
# y_train = trump['Stance']

# bigram_transformer = NGramVectorizer(ngram_range_word=(2,2))
# bigram_transformer.fit_selector(X_train, y_train)
# transformers.append(('bigram', bigram_transformer))
trigram_transformer = NGramVectorizer(ngram_range_word=(3,3))
trigram_transformer.fit_selector(X_train, y_train)
transformers.append(('trigram', trigram_transformer))

# ngram_transformer = NGramVectorizer(ngram_range_word=(1,3), ngram_range_char=(2, 5))
# ngram_transformer.fit_selector(X_train, y_train)
# transformers.append(('ngram', ngram_transformer))

trump_feature_union = FeatureUnion(transformers)

test_on_opinionA(train, opinion_to_target_A, feature_union, classifiers)
test_on_opinionA(train, opinion_to_other_A, feature_union, classifiers)
predictions, best_f1_targetmodel = predict_model_taskB(train, trump, trump_feature_union, classifiers)
test_on_opinionB(train, opinion_to_target_B, trump_feature_union, classifiers, best_f1_targetmodel)
test_on_opinionB(train, opinion_to_other_B, trump_feature_union, classifiers, best_f1_targetmodel)

              precision    recall  f1-score   support

     AGAINST       0.82      0.76      0.79       535
       FAVOR       0.70      0.53      0.61       289
        NONE       0.00      0.00      0.00         0

    accuracy                           0.68       824
   macro avg       0.51      0.43      0.46       824
weighted avg       0.78      0.68      0.72       824

The average F1-score for total test dataset is  0.697324363580766
              precision    recall  f1-score   support

     AGAINST       0.54      0.60      0.57       173
       FAVOR       0.14      0.53      0.22        15
        NONE       0.65      0.45      0.53       194

    accuracy                           0.52       382
   macro avg       0.44      0.53      0.44       382
weighted avg       0.58      0.52      0.53       382

The average F1-score for total test dataset is  0.39407814407814407
The average F1-score for task B is  0.3606060606060606
The best Classifier for task B is  LogisticRegres

In [None]:
## 0.6572 glove sentiment
transformers = []
# glove_vectorizer = GloVeVectorizer(glove_path=args.glove_path, vector_size=200)
transformers.append(('glove', glove_vectorizer))
transformers.append(('sentiment', SentimentExtractor()))

# transformers.append(('pos_tag', Pipeline([
#     ('pos_extractor', PosTagVectorizer()),
#     ('vectorizer', DictVectorizer())
# ])))
# transformers.append(('target_presence', TargetPresence(target_words)))

# train['Tweet'] = transform_all(train['Tweet'])
# X_train = train[['Tweet', 'Target']]
# y_train = train['Stance']

# trigram_transformer = NGramVectorizer(ngram_range_word=(3,3))
# trigram_transformer.fit_selector(X_train, y_train)
# transformers.append(('trigram', trigram_transformer))

feature_union = FeatureUnion(transformers)

classifiers = {
    'Atheism': SVC(gamma=0.1, kernel='linear', probability=True),
    'Climate Change is a Real Concern': LogisticRegression(C=1, class_weight='balanced', fit_intercept=False,solver='liblinear'),
    'Feminist Movement': GradientBoostingClassifier(max_depth=4, n_estimators=10),
    'Hillary Clinton': LogisticRegression(C=0.1, class_weight='balanced', solver='liblinear'),
    'Legalization of Abortion': KNeighborsClassifier()
}

transformers = []
transformers.append(('glove', glove_vectorizer))
# word2vec_model = train_word2vec(train)
# word2vec_vectorizer = Word2VecVectorizer(word2vec_model)
# transformers.append(('word2vec', word2vec_vectorizer))

# transformers.append(('pos_tag', Pipeline([
#     ('pos_extractor', PosTagVectorizer()),
#     ('vectorizer', DictVectorizer())
# ])))
# transformers.append(('target_presence', TargetPresence(target_words)))

transformers.append(('sentiment', SentimentExtractor()))

# trump['Tweet'] = transform_all(trump['Tweet'])
# X_train = trump[['Tweet', 'Target']]
# y_train = trump['Stance']

# bigram_transformer = NGramVectorizer(ngram_range_word=(2,2))
# bigram_transformer.fit_selector(X_train, y_train)
# transformers.append(('bigram', bigram_transformer))
# trigram_transformer = NGramVectorizer(ngram_range_word=(3,3))
# trigram_transformer.fit_selector(X_train, y_train)
# transformers.append(('trigram', trigram_transformer))

# ngram_transformer = NGramVectorizer(ngram_range_word=(1,3), ngram_range_char=(2, 5))
# ngram_transformer.fit_selector(X_train, y_train)
# transformers.append(('ngram', ngram_transformer))

trump_feature_union = FeatureUnion(transformers)

test_on_opinionA(train, opinion_to_target_A, feature_union, classifiers)
test_on_opinionA(train, opinion_to_other_A, feature_union, classifiers)
predictions, best_f1_targetmodel = predict_model_taskB(train, trump, trump_feature_union, classifiers)
test_on_opinionB(train, opinion_to_target_B, trump_feature_union, classifiers, best_f1_targetmodel)
test_on_opinionB(train, opinion_to_other_B, trump_feature_union, classifiers, best_f1_targetmodel)

              precision    recall  f1-score   support

     AGAINST       0.81      0.83      0.82       535
       FAVOR       0.71      0.52      0.60       289
        NONE       0.00      0.00      0.00         0

    accuracy                           0.72       824
   macro avg       0.51      0.45      0.47       824
weighted avg       0.77      0.72      0.74       824

The average F1-score for total test dataset is  0.7070999670320308
              precision    recall  f1-score   support

     AGAINST       0.55      0.79      0.65       173
       FAVOR       0.13      0.40      0.20        15
        NONE       0.74      0.34      0.47       194

    accuracy                           0.54       382
   macro avg       0.47      0.51      0.44       382
weighted avg       0.63      0.54      0.54       382

The average F1-score for total test dataset is  0.423040380047506
The average F1-score for task B is  0.3839093110279551
The best Classifier for task B is  GradientBoostin

In [None]:
## 0.6536 glove target trigram
transformers = []
# glove_vectorizer = GloVeVectorizer(glove_path=args.glove_path, vector_size=200)
transformers.append(('glove', glove_vectorizer))
# transformers.append(('sentiment', SentimentExtractor()))

# transformers.append(('pos_tag', Pipeline([
#     ('pos_extractor', PosTagVectorizer()),
#     ('vectorizer', DictVectorizer())
# ])))
transformers.append(('target_presence', TargetPresence(target_words)))

# train['Tweet'] = transform_all(train['Tweet'])
# X_train = train[['Tweet', 'Target']]
# y_train = train['Stance']

trigram_transformer = NGramVectorizer(ngram_range_word=(3,3))
trigram_transformer.fit_selector(X_train, y_train)
transformers.append(('trigram', trigram_transformer))

feature_union = FeatureUnion(transformers)

classifiers = {
    'Atheism': SVC(gamma=0.1, kernel='linear', probability=True),
    'Climate Change is a Real Concern': LogisticRegression(C=1, max_iter=1000),
    'Feminist Movement': LogisticRegression(C=0.1, max_iter=1000, solver='newton-cg'),
    'Hillary Clinton': LogisticRegression(C=0.1, class_weight='balanced', max_iter=1000,solver='liblinear'),
    'Legalization of Abortion': SVC(gamma=0.1, kernel='linear', probability=True)
}

transformers = []
transformers.append(('glove', glove_vectorizer))
# word2vec_model = train_word2vec(train)
# word2vec_vectorizer = Word2VecVectorizer(word2vec_model)
# transformers.append(('word2vec', word2vec_vectorizer))

# transformers.append(('pos_tag', Pipeline([
#     ('pos_extractor', PosTagVectorizer()),
#     ('vectorizer', DictVectorizer())
# ])))
transformers.append(('target_presence', TargetPresence(target_words)))

# transformers.append(('sentiment', SentimentExtractor()))

# trump['Tweet'] = transform_all(trump['Tweet'])
# X_train = trump[['Tweet', 'Target']]
# y_train = trump['Stance']

# bigram_transformer = NGramVectorizer(ngram_range_word=(2,2))
# bigram_transformer.fit_selector(X_train, y_train)
# transformers.append(('bigram', bigram_transformer))
trigram_transformer = NGramVectorizer(ngram_range_word=(3,3))
trigram_transformer.fit_selector(X_train, y_train)
transformers.append(('trigram', trigram_transformer))

# ngram_transformer = NGramVectorizer(ngram_range_word=(1,3), ngram_range_char=(2, 5))
# ngram_transformer.fit_selector(X_train, y_train)
# transformers.append(('ngram', ngram_transformer))

trump_feature_union = FeatureUnion(transformers)

test_on_opinionA(train, opinion_to_target_A, feature_union, classifiers)
test_on_opinionA(train, opinion_to_other_A, feature_union, classifiers)
predictions, best_f1_targetmodel = predict_model_taskB(train, trump, trump_feature_union, classifiers)
test_on_opinionB(train, opinion_to_target_B, trump_feature_union, classifiers, best_f1_targetmodel)
test_on_opinionB(train, opinion_to_other_B, trump_feature_union, classifiers, best_f1_targetmodel)

              precision    recall  f1-score   support

     AGAINST       0.80      0.79      0.80       535
       FAVOR       0.71      0.51      0.59       289
        NONE       0.00      0.00      0.00         0

    accuracy                           0.69       824
   macro avg       0.51      0.43      0.46       824
weighted avg       0.77      0.69      0.73       824

The average F1-score for total test dataset is  0.696182820987467
              precision    recall  f1-score   support

     AGAINST       0.51      0.66      0.58       173
       FAVOR       0.11      0.27      0.15        15
        NONE       0.64      0.39      0.49       194

    accuracy                           0.51       382
   macro avg       0.42      0.44      0.40       382
weighted avg       0.56      0.51      0.51       382

The average F1-score for total test dataset is  0.36441642173129796
The average F1-score for task B is  0.4429113712063415
The best Classifier for task B is  LogisticRegres

In [None]:
## 0.6542 glove target pos_tag sentimet ngram
transformers = []
# glove_vectorizer = GloVeVectorizer(glove_path=args.glove_path, vector_size=200)
transformers.append(('glove', glove_vectorizer))
transformers.append(('sentiment', SentimentExtractor()))

transformers.append(('pos_tag', Pipeline([
    ('pos_extractor', PosTagVectorizer()),
    ('vectorizer', DictVectorizer())
])))
transformers.append(('target_presence', TargetPresence(target_words)))

train['Tweet'] = transform_all(train['Tweet'])
X_train = train[['Tweet', 'Target']]
y_train = train['Stance']

ngram_transformer = NGramVectorizer(ngram_range_word=(1,3), ngram_range_char=(2, 5))
ngram_transformer.fit_selector(X_train, y_train)
transformers.append(('ngram', ngram_transformer))

feature_union = FeatureUnion(transformers)

classifiers = {
    'Atheism': LogisticRegression(C=0.1, class_weight='balanced', max_iter=1000, solver='liblinear'),
    'Climate Change is a Real Concern': GradientBoostingClassifier(),
    'Feminist Movement':  RandomForestClassifier(max_depth=6),
    'Hillary Clinton':  LogisticRegression(C=0.1, max_iter=1000, solver='newton-cg'),
    'Legalization of Abortion': LogisticRegression(C=0.1, max_iter=1000, solver='newton-cg')
}

transformers = []
transformers.append(('glove', glove_vectorizer))
# word2vec_model = train_word2vec(train)
# word2vec_vectorizer = Word2VecVectorizer(word2vec_model)
# transformers.append(('word2vec', word2vec_vectorizer))

transformers.append(('pos_tag', Pipeline([
    ('pos_extractor', PosTagVectorizer()),
    ('vectorizer', DictVectorizer())
])))
transformers.append(('target_presence', TargetPresence(target_words)))

transformers.append(('sentiment', SentimentExtractor()))

trump['Tweet'] = transform_all(trump['Tweet'])
X_train = trump[['Tweet', 'Target']]
y_train = trump['Stance']

# bigram_transformer = NGramVectorizer(ngram_range_word=(2,2))
# bigram_transformer.fit_selector(X_train, y_train)
# transformers.append(('bigram', bigram_transformer))
# trigram_transformer = NGramVectorizer(ngram_range_word=(3,3))
# trigram_transformer.fit_selector(X_train, y_train)
# transformers.append(('trigram', trigram_transformer))

ngram_transformer = NGramVectorizer(ngram_range_word=(1,3), ngram_range_char=(2, 5))
ngram_transformer.fit_selector(X_train, y_train)
transformers.append(('ngram', ngram_transformer))

trump_feature_union = FeatureUnion(transformers)

test_on_opinionA(train, opinion_to_target_A, feature_union, classifiers)
test_on_opinionA(train, opinion_to_other_A, feature_union, classifiers)
predictions, best_f1_targetmodel = predict_model_taskB(train, trump, trump_feature_union, classifiers)
test_on_opinionB(train, opinion_to_target_B, trump_feature_union, classifiers, best_f1_targetmodel)
test_on_opinionB(train, opinion_to_other_B, trump_feature_union, classifiers, best_f1_targetmodel)

              precision    recall  f1-score   support

     AGAINST       0.83      0.75      0.79       535
       FAVOR       0.68      0.55      0.61       289
        NONE       0.00      0.00      0.00         0

    accuracy                           0.68       824
   macro avg       0.50      0.43      0.47       824
weighted avg       0.78      0.68      0.73       824

The average F1-score for total test dataset is  0.6986715314809618
              precision    recall  f1-score   support

     AGAINST       0.61      0.58      0.59       173
       FAVOR       0.15      0.60      0.23        15
        NONE       0.70      0.56      0.62       194

    accuracy                           0.57       382
   macro avg       0.48      0.58      0.48       382
weighted avg       0.64      0.57      0.59       382

The average F1-score for total test dataset is  0.41361902192762734
The average F1-score for task B is  0.3586214539007093
The best Classifier for task B is  RandomForestC

In [None]:
## tfidf ngram target
transformers = []
transformers.append(('tfidf', ModifiedTfidfVectorizer(max_feature=500)))
transformers.append(('target_presence', TargetPresence(target_words)))

train['Tweet'] = transform_all(train['Tweet'])
X_train = train[['Tweet', 'Target']]
y_train = train['Stance']
ngram_transformer = NGramVectorizer(ngram_range_word=(1,3), ngram_range_char=(2, 5))
ngram_transformer.fit_selector(X_train, y_train)
transformers.append(('ngram', ngram_transformer))
# glove_vectorizer = GloVeVectorizer(glove_path=args.glove_path, vector_size=200)
# transformers.append(('glove', glove_vectorizer))
# transformers.append(('pos_tag', Pipeline([
#     ('pos_extractor', PosTagVectorizer()),
#     ('vectorizer', DictVectorizer())
# ])))
# transformers.append(('target_presence', TargetPresence(target_words)))

feature_union = FeatureUnion(transformers)

classifiers = {
    'Atheism': GradientBoostingClassifier(n_estimators=25),
    'Climate Change is a Real Concern': SVC(gamma=0.1, kernel='linear', probability = True),
    'Feminist Movement': GradientBoostingClassifier(max_depth=5, n_estimators=25),
    'Hillary Clinton': GradientBoostingClassifier(max_depth=4),
    'Legalization of Abortion': RandomForestClassifier(max_depth=7, n_estimators=50)
}

transformers = []
transformers.append(('tfidf', ModifiedTfidfVectorizer(max_feature=500)))
# transformers.append(('glove', glove_vectorizer))
# word2vec_model = train_word2vec(train)
# word2vec_vectorizer = Word2VecVectorizer(word2vec_model)
# transformers.append(('word2vec', word2vec_vectorizer))

# transformers.append(('pos_tag', Pipeline([
#     ('pos_extractor', PosTagVectorizer()),
#     ('vectorizer', DictVectorizer())
# ])))
transformers.append(('target_presence', TargetPresence(target_words)))

# transformers.append(('sentiment', SentimentExtractor()))

trump['Tweet'] = transform_all(trump['Tweet'])
X_train = trump[['Tweet', 'Target']]
y_train = trump['Stance']

# bigram_transformer = NGramVectorizer(ngram_range_word=(2,2))
# bigram_transformer.fit_selector(X_train, y_train)
# transformers.append(('bigram', bigram_transformer))
# trigram_transformer = NGramVectorizer(ngram_range_word=(3,3))
# trigram_transformer.fit_selector(X_train, y_train)
# transformers.append(('trigram', trigram_transformer))

ngram_transformer = NGramVectorizer(ngram_range_word=(1,3), ngram_range_char=(2, 5))
ngram_transformer.fit_selector(X_train, y_train)
transformers.append(('ngram', ngram_transformer))

trump_feature_union = FeatureUnion(transformers)

test_on_opinionA(train, opinion_to_target_A, feature_union, classifiers)
test_on_opinionA(train, opinion_to_other_A, feature_union, classifiers)
predictions, best_f1_targetmodel = predict_model_taskB(train, trump, trump_feature_union, classifiers)
test_on_opinionB(train, opinion_to_target_B, trump_feature_union, classifiers, best_f1_targetmodel)
test_on_opinionB(train, opinion_to_other_B, trump_feature_union, classifiers, best_f1_targetmodel)

              precision    recall  f1-score   support

     AGAINST       0.82      0.78      0.80       535
       FAVOR       0.69      0.50      0.58       289
        NONE       0.00      0.00      0.00         0

    accuracy                           0.68       824
   macro avg       0.50      0.43      0.46       824
weighted avg       0.77      0.68      0.72       824

The average F1-score for total test dataset is  0.6893441725253
              precision    recall  f1-score   support

     AGAINST       0.55      0.66      0.60       173
       FAVOR       0.15      0.40      0.22        15
        NONE       0.68      0.46      0.55       194

    accuracy                           0.55       382
   macro avg       0.46      0.51      0.46       382
weighted avg       0.60      0.55      0.56       382

The average F1-score for total test dataset is  0.41013802950975725
The average F1-score for task B is  0.4219521125863661
The best Classifier for task B is  GradientBoosting

In [None]:
## ngram
transformers = []
transformers.append(('tfidf', ModifiedTfidfVectorizer(max_feature=500)))
transformers.append(('target_presence', TargetPresence(target_words)))

train['Tweet'] = transform_all(train['Tweet'])
X_train = train[['Tweet', 'Target']]
y_train = train['Stance']
ngram_transformer = NGramVectorizer(ngram_range_word=(1,3), ngram_range_char=(2, 5))
ngram_transformer.fit_selector(X_train, y_train)
transformers.append(('ngram', ngram_transformer))
# glove_vectorizer = GloVeVectorizer(glove_path=args.glove_path, vector_size=200)
# transformers.append(('glove', glove_vectorizer))
# transformers.append(('pos_tag', Pipeline([
#     ('pos_extractor', PosTagVectorizer()),
#     ('vectorizer', DictVectorizer())
# ])))
# transformers.append(('target_presence', TargetPresence(target_words)))

feature_union = FeatureUnion(transformers)

classifiers = {
    'Atheism': GradientBoostingClassifier(n_estimators=25),
    'Climate Change is a Real Concern': SVC(gamma=0.1, kernel='linear', probability = True),
    'Feminist Movement': GradientBoostingClassifier(max_depth=5, n_estimators=25),
    'Hillary Clinton': GradientBoostingClassifier(max_depth=4),
    'Legalization of Abortion': RandomForestClassifier(max_depth=7, n_estimators=50)
}

transformers = []
transformers.append(('tfidf', ModifiedTfidfVectorizer(max_feature=500)))
# transformers.append(('glove', glove_vectorizer))
# word2vec_model = train_word2vec(train)
# word2vec_vectorizer = Word2VecVectorizer(word2vec_model)
# transformers.append(('word2vec', word2vec_vectorizer))

# transformers.append(('pos_tag', Pipeline([
#     ('pos_extractor', PosTagVectorizer()),
#     ('vectorizer', DictVectorizer())
# ])))
transformers.append(('target_presence', TargetPresence(target_words)))

# transformers.append(('sentiment', SentimentExtractor()))

trump['Tweet'] = transform_all(trump['Tweet'])
X_train = trump[['Tweet', 'Target']]
y_train = trump['Stance']

# bigram_transformer = NGramVectorizer(ngram_range_word=(2,2))
# bigram_transformer.fit_selector(X_train, y_train)
# transformers.append(('bigram', bigram_transformer))
# trigram_transformer = NGramVectorizer(ngram_range_word=(3,3))
# trigram_transformer.fit_selector(X_train, y_train)
# transformers.append(('trigram', trigram_transformer))

ngram_transformer = NGramVectorizer(ngram_range_word=(1,3), ngram_range_char=(2, 5))
ngram_transformer.fit_selector(X_train, y_train)
transformers.append(('ngram', ngram_transformer))

trump_feature_union = FeatureUnion(transformers)

test_on_opinionA(train, opinion_to_target_A, feature_union, classifiers)
test_on_opinionA(train, opinion_to_other_A, feature_union, classifiers)
predictions, best_f1_targetmodel = predict_model_taskB(train, trump, trump_feature_union, classifiers)
test_on_opinionB(train, opinion_to_target_B, trump_feature_union, classifiers, best_f1_targetmodel)
test_on_opinionB(train, opinion_to_other_B, trump_feature_union, classifiers, best_f1_targetmodel)

              precision    recall  f1-score   support

     AGAINST       0.84      0.77      0.80       535
       FAVOR       0.68      0.51      0.58       289
        NONE       0.00      0.00      0.00         0

    accuracy                           0.68       824
   macro avg       0.51      0.42      0.46       824
weighted avg       0.78      0.68      0.72       824

The average F1-score for total test dataset is  0.6904531912710687
              precision    recall  f1-score   support

     AGAINST       0.54      0.64      0.59       173
       FAVOR       0.15      0.40      0.21        15
        NONE       0.65      0.45      0.53       194

    accuracy                           0.54       382
   macro avg       0.44      0.50      0.44       382
weighted avg       0.58      0.54      0.55       382

The average F1-score for total test dataset is  0.4007936507936508
The average F1-score for task B is  0.4208620333446379
The best Classifier for task B is  GradientBoosti

## Try something new!

Apply voting classfier training and selection for each target

In [None]:
def voting_train(train, test, feature_union, classify_pipeline, classifiers, params_grid, num=3):
    trained_classifiers = {}
    trained_feature_extraction = {}
    trained_voting = {}
    for name in train["Target"].unique():
      trained_classifiers[name] = []
      X_train, y_train, X_test, y_test = split_data(train, test, name)
      best_score = 0
      X_train_transformed = feature_union.fit_transform(X_train)
      X_test_transformed = feature_union.transform(X_test)
      trained_feature_extraction[name] = feature_union
      for classifier_name, model in classifiers.items():
          classify_pipeline.set_params(classifier=model)
          grid_search = RandomizedSearchCV(classify_pipeline, param_distributions=params_grid[classifier_name], cv=5, verbose=0, random_state=595, n_jobs=-1)
          start = time.time()
          grid_search.fit(X_train_transformed, y_train)
          end = time.time()
          # print('The best parameter for {} is {}.'.format(classifier_name, grid_search.best_params_))
          # print("Grid Search for Model {} needs {} seconds.".format(classifier_name, end-start))
          # print("The score for {} is {:.2f}.".format(classifier_name, grid_search.best_score_))
          best_model = grid_search.best_estimator_
          score = report_score(feature_union, best_model, X_test_transformed, y_test)
          print(trained_classifiers[name])
          if len(trained_classifiers[name]) < num:
            trained_classifiers[name].append((best_model, score))
            trained_classifiers[name] = sorted(trained_classifiers[name], key = lambda x:x[1], reverse=True)
            best_score = trained_classifiers[name][-1][1]
          elif score > best_score:
              trained_classifiers[name].pop()
              trained_classifiers[name].append((best_model, score))
              trained_classifiers[name] = sorted(trained_classifiers[name], key = lambda x:x[1], reverse=True)
              best_score = trained_classifiers[name][-1][1]

      voting_classifier = VotingClassifier(estimators= [(f"clf{i}", trained_classifiers[name][i][0]) for i in range(len(trained_classifiers[name]))])
      voting_classifier.fit(X_train_transformed, y_train)
      voting_score = report_score(feature_union, voting_classifier, X_test_transformed, y_test)
      print(f"Voting score for {name} is {voting_score}.")
      trained_voting[name] = voting_classifier
    print("Finish training and voting model selection.")

    ## test part
    predictions = pd.DataFrame(index=test.index)

    for target in trained_voting:
        # Select the test data for the current target
        target_data = test[test['Target'] == target][['Tweet', 'Target']]

        clf = trained_voting[target]
        feature_union = trained_feature_extraction[target]
        transformed_features = feature_union.transform(target_data)
        target_predictions = clf.predict(transformed_features)
        predictions.loc[target_data.index, 'Prediction'] = target_predictions

    print(classification_report(test['Stance'], predictions))
    report = classification_report(test['Stance'], predictions, output_dict=True, zero_division=0)
    f1_favor = report['FAVOR']['f1-score']
    f1_against = report['AGAINST']['f1-score']
    score = (f1_favor + f1_against)/2
    print("The average F1-score for total test dataset is ", score)

    return score, predictions


In [None]:
trained_classifiers = {}
trained_feature_extraction = {}
trained_voting = {}
for name in train["Target"].unique():
  trained_classifiers[name] = []
  X_train, y_train, X_test, y_test = split_data(train, test, name)
  best_score = 0
  X_train_transformed = feature_union.fit_transform(X_train)
  X_test_transformed = feature_union.transform(X_test)
  trained_feature_extraction[name] = feature_union
  for classifier_name, model in classifiers.items():
      classify_pipeline.set_params(classifier=model)
      grid_search = RandomizedSearchCV(classify_pipeline, param_distributions=params_grid[classifier_name], cv=5, verbose=0, random_state=595, n_jobs=-1)
      start = time.time()
      grid_search.fit(X_train_transformed, y_train)
      end = time.time()
      # print('The best parameter for {} is {}.'.format(classifier_name, grid_search.best_params_))
      # print("Grid Search for Model {} needs {} seconds.".format(classifier_name, end-start))
      # print("The score for {} is {:.2f}.".format(classifier_name, grid_search.best_score_))
      best_model = grid_search.best_estimator_
      score = report_score(feature_union, best_model, X_test_transformed, y_test)
      print(trained_classifiers[name])
      if len(trained_classifiers[name]) < 3:
        trained_classifiers[name].append((best_model, score))
        trained_classifiers[name] = sorted(trained_classifiers[name], key = lambda x:x[1], reverse=True)
        best_score = trained_classifiers[name][-1][1]

[]


TypeError: ignored

In [None]:
trained_classifiers

['Atheism']

In [None]:
## glove pos-tag bigram on voting classifier
transformers = []
# glove_vectorizer = GloVeVectorizer(glove_path=args.glove_path, vector_size=200)
transformers.append(('glove', glove_vectorizer))
transformers.append(('pos_tag', Pipeline([
    ('pos_extractor', PosTagVectorizer()),
    ('vectorizer', DictVectorizer())
])))

train['Tweet'] = transform_all(train['Tweet'])
X_train = train[['Tweet', 'Target']]
y_train = train['Stance']

bigram_transformer = NGramVectorizer(ngram_range_word=(2,2))
bigram_transformer.fit_selector(X_train, y_train)
transformers.append(('bigram', bigram_transformer))

feature_union = FeatureUnion(transformers)

voting_train(train, test, feature_union, classify_pipeline, classifiers, params_grid)
# transformers = []
# transformers.append(('glove', glove_vectorizer))
# # word2vec_model = train_word2vec(train)
# # word2vec_vectorizer = Word2VecVectorizer(word2vec_model)
# # transformers.append(('word2vec', word2vec_vectorizer))

# transformers.append(('pos_tag', Pipeline([
#     ('pos_extractor', PosTagVectorizer()),
#     ('vectorizer', DictVectorizer())
# ])))
# # transformers.append(('target_presence', TargetPresence(target_words)))

# # transformers.append(('sentiment', SentimentExtractor()))

# trump['Tweet'] = transform_all(trump['Tweet'])
# X_train = trump[['Tweet', 'Target']]
# y_train = trump['Stance']

# bigram_transformer = NGramVectorizer(ngram_range_word=(2,2))
# bigram_transformer.fit_selector(X_train, y_train)
# transformers.append(('bigram', bigram_transformer))
# trigram_transformer = NGramVectorizer(ngram_range_word=(3,3))
# trigram_transformer.fit_selector(X_train, y_train)
# transformers.append(('trigram', trigram_transformer))

# ngram_transformer = NGramVectorizer(ngram_range_word=(1,3), ngram_range_char=(2, 5))
# ngram_transformer.fit_selector(X_train, y_train)
# transformers.append(('ngram', ngram_transformer))

# trump_feature_union = FeatureUnion(transformers)

# test_on_opinionA(train, opinion_to_target_A, feature_union, classifiers)
# test_on_opinionA(train, opinion_to_other_A, feature_union, classifiers)
# predictions, best_f1_targetmodel = predict_model_taskB(train, trump, trump_feature_union, classifiers)
# test_on_opinionB(train, opinion_to_target_B, trump_feature_union, classifiers, best_f1_targetmodel)
# test_on_opinionB(train, opinion_to_other_B, trump_feature_union, classifiers, best_f1_targetmodel)

[]
[(Pipeline(steps=[('classifier', LogisticRegression(C=0.1, solver='newton-cg'))]), 0.5377415458937198)]
[(Pipeline(steps=[('classifier', LogisticRegression(C=0.1, solver='newton-cg'))]), 0.5377415458937198), (Pipeline(steps=[('classifier', KNeighborsClassifier(n_neighbors=6))]), 0.4657937601804285)]
[(Pipeline(steps=[('classifier', LogisticRegression(C=0.1, solver='newton-cg'))]), 0.5377415458937198), (Pipeline(steps=[('classifier', KNeighborsClassifier(n_neighbors=6))]), 0.4657937601804285), (Pipeline(steps=[('classifier', RandomForestClassifier(max_depth=6))]), 0.44474153297682717)]
[(Pipeline(steps=[('classifier', SVC(class_weight='balanced', kernel='linear'))]), 0.6575235109717869), (Pipeline(steps=[('classifier', LogisticRegression(C=0.1, solver='newton-cg'))]), 0.5377415458937198), (Pipeline(steps=[('classifier', KNeighborsClassifier(n_neighbors=6))]), 0.4657937601804285)]
Voting score for Atheism is 0.624929178470255.
[]
[(Pipeline(steps=[('classifier', LogisticRegression(C=1

(0.6536983448930536,
      Prediction
 0       AGAINST
 1       AGAINST
 2       AGAINST
 3          NONE
 4       AGAINST
 ...         ...
 1244       NONE
 1245       NONE
 1246       NONE
 1247       NONE
 1248    AGAINST
 
 [1249 rows x 1 columns])

In [None]:
## only. sentiment
transformers = []
# glove_vectorizer = GloVeVectorizer(glove_path=args.glove_path, vector_size=200)
# transformers.append(('glove', glove_vectorizer))
transformers.append(('sentiment', SentimentExtractor()))

# transformers.append(('pos_tag', Pipeline([
#     ('pos_extractor', PosTagVectorizer()),
#     ('vectorizer', DictVectorizer())
# ])))
# transformers.append(('target_presence', TargetPresence(target_words)))

train['Tweet'] = transform_all(train['Tweet'])
X_train = train[['Tweet', 'Target']]
y_train = train['Stance']

# ngram_transformer = NGramVectorizer(ngram_range_word=(1,3), ngram_range_char=(2, 5))
# ngram_transformer.fit_selector(X_train, y_train)
# transformers.append(('ngram', ngram_transformer))

feature_union = FeatureUnion(transformers)
trained_feature_extraction, trained_classifiers = train_test_all(train, test, feature_union, classify_pipeline, classifiers, params_grid)
score, predictinos = test_all(test, trained_feature_extraction, trained_classifiers)

# classifiers = {
#     'Atheism': LogisticRegression(C=0.1, class_weight='balanced', max_iter=1000, solver='liblinear'),
#     'Climate Change is a Real Concern': GradientBoostingClassifier(),
#     'Feminist Movement':  RandomForestClassifier(max_depth=6),
#     'Hillary Clinton':  LogisticRegression(C=0.1, max_iter=1000, solver='newton-cg'),
#     'Legalization of Abortion': LogisticRegression(C=0.1, max_iter=1000, solver='newton-cg')
# }

# transformers = []
# transformers.append(('glove', glove_vectorizer))
# word2vec_model = train_word2vec(train)
# word2vec_vectorizer = Word2VecVectorizer(word2vec_model)
# transformers.append(('word2vec', word2vec_vectorizer))

# transformers.append(('pos_tag', Pipeline([
#     ('pos_extractor', PosTagVectorizer()),
#     ('vectorizer', DictVectorizer())
# ])))
# transformers.append(('target_presence', TargetPresence(target_words)))

# transformers.append(('sentiment', SentimentExtractor()))

# trump['Tweet'] = transform_all(trump['Tweet'])
# X_train = trump[['Tweet', 'Target']]
# y_train = trump['Stance']

# bigram_transformer = NGramVectorizer(ngram_range_word=(2,2))
# bigram_transformer.fit_selector(X_train, y_train)
# transformers.append(('bigram', bigram_transformer))
# trigram_transformer = NGramVectorizer(ngram_range_word=(3,3))
# trigram_transformer.fit_selector(X_train, y_train)
# transformers.append(('trigram', trigram_transformer))

# ngram_transformer = NGramVectorizer(ngram_range_word=(1,3), ngram_range_char=(2, 5))
# ngram_transformer.fit_selector(X_train, y_train)
# transformers.append(('ngram', ngram_transformer))

# trump_feature_union = FeatureUnion(transformers)

# test_on_opinionA(train, opinion_to_target_A, feature_union, classifiers)
# test_on_opinionA(train, opinion_to_other_A, feature_union, classifiers)
# predictions, best_f1_targetmodel = predict_model_taskB(train, trump, trump_feature_union, classifiers)
# test_on_opinionB(train, opinion_to_target_B, trump_feature_union, classifiers, best_f1_targetmodel)
# test_on_opinionB(train, opinion_to_other_B, trump_feature_union, classifiers, best_f1_targetmodel)

The best model for Atheism is RandomForest with {'memory': None, 'steps': [('classifier', RandomForestClassifier(max_depth=4, n_estimators=10))], 'verbose': False, 'classifier': RandomForestClassifier(max_depth=4, n_estimators=10), 'classifier__bootstrap': True, 'classifier__ccp_alpha': 0.0, 'classifier__class_weight': None, 'classifier__criterion': 'gini', 'classifier__max_depth': 4, 'classifier__max_features': 'sqrt', 'classifier__max_leaf_nodes': None, 'classifier__max_samples': None, 'classifier__min_impurity_decrease': 0.0, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__min_weight_fraction_leaf': 0.0, 'classifier__n_estimators': 10, 'classifier__n_jobs': None, 'classifier__oob_score': False, 'classifier__random_state': None, 'classifier__verbose': 0, 'classifier__warm_start': False}
The best classifier for Atheism is RandomForest with the average of F1 as 0.4487841945288754.
The best model for Climate Change is a Real Concern is SVM with {'memo

In [37]:
## only. sentiment
transformers = []
# glove_vectorizer = GloVeVectorizer(glove_path=args.glove_path, vector_size=200)
# transformers.append(('glove', glove_vectorizer))
# transformers.append(('sentiment', SentimentExtractor()))

# transformers.append(('pos_tag', Pipeline([
#     ('pos_extractor', PosTagVectorizer()),
#     ('vectorizer', DictVectorizer())
# ])))
# transformers.append(('target_presence', TargetPresence(target_words)))

train['Tweet'] = transform_all(train['Tweet'])
X_train = train[['Tweet', 'Target']]
y_train = train['Stance']

ngram_transformer = NGramVectorizer(ngram_range_word=(1,3), ngram_range_char=(2, 5))
ngram_transformer.fit_selector(X_train, y_train)
transformers.append(('ngram', ngram_transformer))

feature_union = FeatureUnion(transformers)
trained_feature_extraction, trained_classifiers = train_test_all(train, test, feature_union, classify_pipeline, classifiers, params_grid)
score, predictinos = test_all(test, trained_feature_extraction, trained_classifiers)

# classifiers = {
#     'Atheism': LogisticRegression(C=0.1, class_weight='balanced', max_iter=1000, solver='liblinear'),
#     'Climate Change is a Real Concern': GradientBoostingClassifier(),
#     'Feminist Movement':  RandomForestClassifier(max_depth=6),
#     'Hillary Clinton':  LogisticRegression(C=0.1, max_iter=1000, solver='newton-cg'),
#     'Legalization of Abortion': LogisticRegression(C=0.1, max_iter=1000, solver='newton-cg')
# }

transformers = []
# transformers.append(('glove', glove_vectorizer))
# word2vec_model = train_word2vec(train)
# word2vec_vectorizer = Word2VecVectorizer(word2vec_model)
# transformers.append(('word2vec', word2vec_vectorizer))

# transformers.append(('pos_tag', Pipeline([
#     ('pos_extractor', PosTagVectorizer()),
#     ('vectorizer', DictVectorizer())
# ])))
# transformers.append(('target_presence', TargetPresence(target_words)))

# transformers.append(('sentiment', SentimentExtractor()))

trump['Tweet'] = transform_all(trump['Tweet'])
X_train = trump[['Tweet', 'Target']]
y_train = trump['Stance']

# bigram_transformer = NGramVectorizer(ngram_range_word=(2,2))
# bigram_transformer.fit_selector(X_train, y_train)
# transformers.append(('bigram', bigram_transformer))
# trigram_transformer = NGramVectorizer(ngram_range_word=(3,3))
# trigram_transformer.fit_selector(X_train, y_train)
# transformers.append(('trigram', trigram_transformer))

ngram_transformer = NGramVectorizer(ngram_range_word=(1,3), ngram_range_char=(2, 5))
ngram_transformer.fit_selector(X_train, y_train)
transformers.append(('ngram', ngram_transformer))

trump_feature_union = FeatureUnion(transformers)

test_on_opinionA(train, opinion_to_target_A, feature_union, trained_classifiers)
test_on_opinionA(train, opinion_to_other_A, feature_union, trained_classifiers)
predictions, best_f1_targetmodel = predict_model_taskB(train, trump, trump_feature_union, trained_classifiers)
test_on_opinionB(train, opinion_to_target_B, trump_feature_union, trained_classifiers, best_f1_targetmodel)
test_on_opinionB(train, opinion_to_other_B, trump_feature_union, trained_classifiers, best_f1_targetmodel)

The best model for Atheism is GradientBoosting with {'memory': None, 'steps': [('classifier', GradientBoostingClassifier(max_depth=4, n_estimators=10))], 'verbose': False, 'classifier': GradientBoostingClassifier(max_depth=4, n_estimators=10), 'classifier__ccp_alpha': 0.0, 'classifier__criterion': 'friedman_mse', 'classifier__init': None, 'classifier__learning_rate': 0.1, 'classifier__loss': 'log_loss', 'classifier__max_depth': 4, 'classifier__max_features': None, 'classifier__max_leaf_nodes': None, 'classifier__min_impurity_decrease': 0.0, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__min_weight_fraction_leaf': 0.0, 'classifier__n_estimators': 10, 'classifier__n_iter_no_change': None, 'classifier__random_state': None, 'classifier__subsample': 1.0, 'classifier__tol': 0.0001, 'classifier__validation_fraction': 0.1, 'classifier__verbose': 0, 'classifier__warm_start': False}
The best classifier for Atheism is GradientBoosting with the average of F1 as

In [41]:
## glove ngram
transformers = []
# glove_vectorizer = GloVeVectorizer(glove_path=args.glove_path, vector_size=200)
transformers.append(('glove', glove_vectorizer))
# transformers.append(('sentiment', SentimentExtractor()))

# transformers.append(('pos_tag', Pipeline([
#     ('pos_extractor', PosTagVectorizer()),
#     ('vectorizer', DictVectorizer())
# ])))
# transformers.append(('target_presence', TargetPresence(target_words)))

train['Tweet'] = transform_all(train['Tweet'])
X_train = train[['Tweet', 'Target']]
y_train = train['Stance']

ngram_transformer = NGramVectorizer(ngram_range_word=(1,3), ngram_range_char=(2, 5))
ngram_transformer.fit_selector(X_train, y_train)
transformers.append(('ngram', ngram_transformer))

feature_union = FeatureUnion(transformers)
trained_feature_extraction, trained_classifiers = train_test_all(train, test, feature_union, classify_pipeline, classifiers, params_grid)
score, predictinos = test_all(test, trained_feature_extraction, trained_classifiers)

# classifiers = {
#     'Atheism': LogisticRegression(C=0.1, class_weight='balanced', max_iter=1000, solver='liblinear'),
#     'Climate Change is a Real Concern': GradientBoostingClassifier(),
#     'Feminist Movement':  RandomForestClassifier(max_depth=6),
#     'Hillary Clinton':  LogisticRegression(C=0.1, max_iter=1000, solver='newton-cg'),
#     'Legalization of Abortion': LogisticRegression(C=0.1, max_iter=1000, solver='newton-cg')
# }

transformers = []
transformers.append(('glove', glove_vectorizer))
# word2vec_model = train_word2vec(train)
# word2vec_vectorizer = Word2VecVectorizer(word2vec_model)
# transformers.append(('word2vec', word2vec_vectorizer))

# transformers.append(('pos_tag', Pipeline([
#     ('pos_extractor', PosTagVectorizer()),
#     ('vectorizer', DictVectorizer())
# ])))
# transformers.append(('target_presence', TargetPresence(target_words)))

# transformers.append(('sentiment', SentimentExtractor()))

trump['Tweet'] = transform_all(trump['Tweet'])
X_train = trump[['Tweet', 'Target']]
y_train = trump['Stance']

# bigram_transformer = NGramVectorizer(ngram_range_word=(2,2))
# bigram_transformer.fit_selector(X_train, y_train)
# transformers.append(('bigram', bigram_transformer))
# trigram_transformer = NGramVectorizer(ngram_range_word=(3,3))
# trigram_transformer.fit_selector(X_train, y_train)
# transformers.append(('trigram', trigram_transformer))

ngram_transformer = NGramVectorizer(ngram_range_word=(1,3), ngram_range_char=(2, 5))
ngram_transformer.fit_selector(X_train, y_train)
transformers.append(('ngram', ngram_transformer))

trump_feature_union = FeatureUnion(transformers)

test_on_opinionA(train, opinion_to_target_A, feature_union, trained_classifiers)
test_on_opinionA(train, opinion_to_other_A, feature_union, trained_classifiers)
predictions, best_f1_targetmodel = predict_model_taskB(train, trump, trump_feature_union, trained_classifiers)
test_on_opinionB(train, opinion_to_target_B, trump_feature_union, trained_classifiers, best_f1_targetmodel)
test_on_opinionB(train, opinion_to_other_B, trump_feature_union, trained_classifiers, best_f1_targetmodel)

The best model for Atheism is LogisticRegression with {'memory': None, 'steps': [('classifier', LogisticRegression(C=0.1, class_weight='balanced', solver='liblinear'))], 'verbose': False, 'classifier': LogisticRegression(C=0.1, class_weight='balanced', solver='liblinear'), 'classifier__C': 0.1, 'classifier__class_weight': 'balanced', 'classifier__dual': False, 'classifier__fit_intercept': True, 'classifier__intercept_scaling': 1, 'classifier__l1_ratio': None, 'classifier__max_iter': 100, 'classifier__multi_class': 'auto', 'classifier__n_jobs': None, 'classifier__penalty': 'l2', 'classifier__random_state': None, 'classifier__solver': 'liblinear', 'classifier__tol': 0.0001, 'classifier__verbose': 0, 'classifier__warm_start': False}
The best classifier for Atheism is LogisticRegression with the average of F1 as 0.6528862348798039.
The best model for Climate Change is a Real Concern is LogisticRegression with {'memory': None, 'steps': [('classifier', LogisticRegression(C=0.1, class_weight=

In [42]:
## glove ngram
transformers = []
# glove_vectorizer = GloVeVectorizer(glove_path=args.glove_path, vector_size=200)
# transformers.append(('glove', glove_vectorizer))
transformers.append(('tfidf', ModifiedTfidfVectorizer(max_feature=500)))
# transformers.append(('sentiment', SentimentExtractor()))

# transformers.append(('pos_tag', Pipeline([
#     ('pos_extractor', PosTagVectorizer()),
#     ('vectorizer', DictVectorizer())
# ])))
transformers.append(('target_presence', TargetPresence(target_words)))

train['Tweet'] = transform_all(train['Tweet'])
X_train = train[['Tweet', 'Target']]
y_train = train['Stance']

ngram_transformer = NGramVectorizer(ngram_range_word=(1,3), ngram_range_char=(2, 5))
ngram_transformer.fit_selector(X_train, y_train)
transformers.append(('ngram', ngram_transformer))

feature_union = FeatureUnion(transformers)
trained_feature_extraction, trained_classifiers = train_test_all(train, test, feature_union, classify_pipeline, classifiers, params_grid)
score, predictinos = test_all(test, trained_feature_extraction, trained_classifiers)

# classifiers = {
#     'Atheism': LogisticRegression(C=0.1, class_weight='balanced', max_iter=1000, solver='liblinear'),
#     'Climate Change is a Real Concern': GradientBoostingClassifier(),
#     'Feminist Movement':  RandomForestClassifier(max_depth=6),
#     'Hillary Clinton':  LogisticRegression(C=0.1, max_iter=1000, solver='newton-cg'),
#     'Legalization of Abortion': LogisticRegression(C=0.1, max_iter=1000, solver='newton-cg')
# }

transformers = []
transformers.append(('tfidf', ModifiedTfidfVectorizer(max_feature=500)))
# transformers.append(('glove', glove_vectorizer))
# word2vec_model = train_word2vec(train)
# word2vec_vectorizer = Word2VecVectorizer(word2vec_model)
# transformers.append(('word2vec', word2vec_vectorizer))

# transformers.append(('pos_tag', Pipeline([
#     ('pos_extractor', PosTagVectorizer()),
#     ('vectorizer', DictVectorizer())
# ])))
transformers.append(('target_presence', TargetPresence(target_words)))

# transformers.append(('sentiment', SentimentExtractor()))

trump['Tweet'] = transform_all(trump['Tweet'])
X_train = trump[['Tweet', 'Target']]
y_train = trump['Stance']

# bigram_transformer = NGramVectorizer(ngram_range_word=(2,2))
# bigram_transformer.fit_selector(X_train, y_train)
# transformers.append(('bigram', bigram_transformer))
# trigram_transformer = NGramVectorizer(ngram_range_word=(3,3))
# trigram_transformer.fit_selector(X_train, y_train)
# transformers.append(('trigram', trigram_transformer))

ngram_transformer = NGramVectorizer(ngram_range_word=(1,3), ngram_range_char=(2, 5))
ngram_transformer.fit_selector(X_train, y_train)
transformers.append(('ngram', ngram_transformer))

trump_feature_union = FeatureUnion(transformers)

test_on_opinionA(train, opinion_to_target_A, feature_union, trained_classifiers)
test_on_opinionA(train, opinion_to_other_A, feature_union, trained_classifiers)
predictions, best_f1_targetmodel = predict_model_taskB(train, trump, trump_feature_union, trained_classifiers)
test_on_opinionB(train, opinion_to_target_B, trump_feature_union, trained_classifiers, best_f1_targetmodel)
test_on_opinionB(train, opinion_to_other_B, trump_feature_union, trained_classifiers, best_f1_targetmodel)

The best model for Atheism is GradientBoosting with {'memory': None, 'steps': [('classifier', GradientBoostingClassifier(n_estimators=25))], 'verbose': False, 'classifier': GradientBoostingClassifier(n_estimators=25), 'classifier__ccp_alpha': 0.0, 'classifier__criterion': 'friedman_mse', 'classifier__init': None, 'classifier__learning_rate': 0.1, 'classifier__loss': 'log_loss', 'classifier__max_depth': 3, 'classifier__max_features': None, 'classifier__max_leaf_nodes': None, 'classifier__min_impurity_decrease': 0.0, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__min_weight_fraction_leaf': 0.0, 'classifier__n_estimators': 25, 'classifier__n_iter_no_change': None, 'classifier__random_state': None, 'classifier__subsample': 1.0, 'classifier__tol': 0.0001, 'classifier__validation_fraction': 0.1, 'classifier__verbose': 0, 'classifier__warm_start': False}
The best classifier for Atheism is GradientBoosting with the average of F1 as 0.6647058823529413.
The b