### Reading the data 

In [1]:
import pandas as pd

In [2]:
files = ['twitter-2013train.txt','twitter-2015train.txt','twitter-2016train.txt']


In [3]:
df0, df1, df2 = [pd.read_csv(name, delimiter = '\t', header = None) for name in files]

In [4]:
data = pd.concat([df0, df1, df2]) #concatinating the tweets data in 1 dataframe


In [5]:
df0.columns

Int64Index([0, 1, 2], dtype='int64')

In [6]:
data.columns = ['serial', 'opinion', 'tweet_text']

In [7]:
data.groupby(by = 'opinion').count()

Unnamed: 0_level_0,serial,tweet_text
opinion,Unnamed: 1_level_1,Unnamed: 2_level_1
negative,2374,2374
neutral,6840,6840
positive,6827,6827


In [8]:
data.head()

Unnamed: 0,serial,opinion,tweet_text
0,264183816548130816,positive,Gas by my house hit $3.39!!!! I\u2019m going t...
1,263405084770172928,negative,Theo Walcott is still shit\u002c watch Rafa an...
2,262163168678248449,negative,its not that I\u2019m a GSP fan\u002c i just h...
3,264249301910310912,negative,Iranian general says Israel\u2019s Iron Dome c...
4,262682041215234048,neutral,Tehran\u002c Mon Amour: Obama Tried to Establi...


### Baseline Experiment

We will neglect any preprocessing in this part. Just vectorization then classification using different classifiers without intensive parameter tuning

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from sklearn import naive_bayes
from sklearn.model_selection import cross_val_score

  from numpy.core.umath_tests import inner1d


In [10]:
classifier = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('clf', naive_bayes.MultinomialNB())])
    

In [11]:
search_space = [{'vectorizer': [CountVectorizer()],'vectorizer__ngram_range': [(1,1),(1,2)]},
                {'vectorizer': [TfidfVectorizer()]},
                {'clf': [LogisticRegression()]},
                {'clf': [naive_bayes.MultinomialNB()]},
                {'clf': [SVC()],'clf__kernel': ['rbf', 'linear']}
               ]

In [48]:
clf = GridSearchCV(classifier, search_space, cv=10, verbose=0)

In [49]:
best_acc = clf.fit(data.tweet_text, data.opinion)

In [50]:
best_acc.best_score_

0.5072002992332149

In [51]:
best_acc.best_params_

{'clf': LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
           intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
           penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
           verbose=0, warm_start=False)}

In [62]:
means = best_acc.cv_results_['mean_test_score']
stds = best_acc.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, best_acc.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
          % (mean, std * 2, params))

0.465 (+/-0.240) for {'vectorizer': CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None), 'vectorizer__ngram_range': (1, 1)}
0.467 (+/-0.241) for {'vectorizer': CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None), 'vectorizer__ngram_range': (1, 2)}
0.449 (+/-0.238) for {'vectorizer': TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',


In [53]:
clf2 = GridSearchCV(classifier, search_space, cv=10, verbose=0, scoring = 'f1_macro')
best_f1 = clf2.fit(data.tweet_text, data.opinion)
best_f1.best_score_

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


0.47289766218193363

In [54]:
best_f1.best_params_

{'clf': LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
           intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
           penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
           verbose=0, warm_start=False)}

In [63]:
means = best_f1.cv_results_['mean_test_score']
stds = best_f1.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, best_f1.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
          % (mean, std * 2, params))

0.392 (+/-0.189) for {'vectorizer': CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None), 'vectorizer__ngram_range': (1, 1)}
0.373 (+/-0.192) for {'vectorizer': CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None), 'vectorizer__ngram_range': (1, 2)}
0.320 (+/-0.174) for {'vectorizer': TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',


#### Logistic Regression vs bigram count vectorizer and TF-IDF vectorizer

In [1]:
classifier = Pipeline([
    ('vectorizer', CountVectorizer(ngram_range= (1,2))),
    ('clf', LogisticRegression())])

search_space = [{'vectorizer': [CountVectorizer(ngram_range= (1,2))]},
                {'vectorizer': [TfidfVectorizer()]},
                {'clf': [LogisticRegression()]},
                {'clf': [SVC()],'clf__kernel': ['rbf', 'linear']}
               ]

clf = GridSearchCV(classifier, search_space, cv=10, verbose=0)

NameError: name 'Pipeline' is not defined

In [13]:
best_acc = clf.fit(data.tweet_text, data.opinion)

In [14]:
best_acc.best_score_

0.5188579265631819

In [15]:
best_acc.best_params_

{'vectorizer': CountVectorizer(analyzer='word', binary=False, decode_error='strict',
         dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
         lowercase=True, max_df=1.0, max_features=None, min_df=1,
         ngram_range=(1, 2), preprocessor=None, stop_words=None,
         strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
         tokenizer=None, vocabulary=None)}

In [16]:
means = best_acc.cv_results_['mean_test_score']
stds = best_acc.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, best_acc.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
          % (mean, std * 2, params))

0.519 (+/-0.295) for {'vectorizer': CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)}
0.509 (+/-0.271) for {'vectorizer': TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)}
0.519 (+/-0.295) for {'clf': LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,


In [17]:
clf2 = GridSearchCV(classifier, search_space, cv=10, verbose=0, scoring = 'f1_macro')
best_f1 = clf2.fit(data.tweet_text, data.opinion)
best_f1.best_score_
best_f1.best_params_
means = best_f1.cv_results_['mean_test_score']
stds = best_f1.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, best_f1.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
          % (mean, std * 2, params))

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


0.480 (+/-0.265) for {'vectorizer': CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)}
0.439 (+/-0.234) for {'vectorizer': TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)}
0.480 (+/-0.265) for {'clf': LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,


#### SVM VS TF-IDF vectorizer

In [18]:
classifier = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('clf', SVC())])

search_space = [{'clf__kernel': ['rbf', 'linear']}]
clf = GridSearchCV(classifier, search_space, cv=10, verbose=0)

best_acc = clf.fit(data.tweet_text, data.opinion)

means = best_acc.cv_results_['mean_test_score']
stds = best_acc.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, best_acc.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
          % (mean, std * 2, params))

0.426 (+/-0.000) for {'clf__kernel': 'rbf'}
0.506 (+/-0.282) for {'clf__kernel': 'linear'}


In [19]:
clf2 = GridSearchCV(classifier, search_space, cv=10, verbose=0, scoring = 'f1_macro')
best_f1 = clf2.fit(data.tweet_text, data.opinion)
means = best_f1.cv_results_['mean_test_score']
stds = best_f1.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, best_f1.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
          % (mean, std * 2, params))

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


0.199 (+/-0.000) for {'clf__kernel': 'rbf'}
0.465 (+/-0.252) for {'clf__kernel': 'linear'}
