In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# sklearn

from sklearn import svm
from sklearn.naive_bayes import BernoulliNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import confusion_matrix, classification_report

In [2]:
# Prerequisite: run data_processing.ipynb first; it writes the three CSV splits read below.
# Each split is loaded into a dataframe and separated into raw text (X) and label (y).
TRAIN_DATA = 'data/training.1440000.csv'
VAL_DATA = 'data/validation.80000.csv'
TEST_DATA = 'data/test.80000.csv'

df_train, df_val, df_test = (
    pd.read_csv(csv_path) for csv_path in (TRAIN_DATA, VAL_DATA, TEST_DATA)
)

# 'text' column -> model input, 'target' column -> label
X_train, X_val, X_test = (frame['text'] for frame in (df_train, df_val, df_test))
y_train, y_val, y_test = (frame['target'] for frame in (df_train, df_val, df_test))

## Baseline

Use a naive model or cite a previous paper?
1. Use a naive model - e.g. a sentiment lexicon that classifies a tweet as positive if most of its words are positive
- pros: can be replicated
2. Use previous research results
- concerns: may not be able to find the original source code to replicate the baseline results

#### Twittratr - same baseline as https://www-cs.stanford.edu/people/alecmgo/papers/TwitterDistantSupervision09.pdf

In [3]:
# TODO: could not find the list of positive/negative words mentioned in the paper - locate it or drop this baseline.

#### Opinion lexicon - https://github.com/jeffreybreen/twitter-sentiment-analysis-tutorial-201107/blob/master/data/opinion-lexicon-English/negative-words.txt

In [4]:
# Load the positive opinion lexicon: the first 34 lines of the file are skipped
# and every remaining line is read as one entry of the 'words' column.
POSITIVE_FILE = 'data/positive-words.txt'
positive_words = list(
    pd.read_table(
        POSITIVE_FILE,
        names=['words'],
        skiprows=34,
        encoding='ISO-8859-1',
    )['words']
)
positive_words[:10]

['a+',
 'abound',
 'abounds',
 'abundance',
 'abundant',
 'accessable',
 'accessible',
 'acclaim',
 'acclaimed',
 'acclamation']

In [5]:
# Load the negative opinion lexicon: the first 34 lines of the file are skipped
# and every remaining line is read as one entry of the 'words' column.
NEGATIVE_FILE = 'data/negative-words.txt'
negative_words = list(
    pd.read_table(
        NEGATIVE_FILE,
        names=['words'],
        skiprows=34,
        encoding='ISO-8859-1',
    )['words']
)
negative_words[:10]

['2-faced',
 '2-faces',
 'abnormal',
 'abolish',
 'abominable',
 'abominably',
 'abominate',
 'abomination',
 'abort',
 'aborted']

In [6]:
# Tokenise on every character that is not a letter (punctuation, digits, whitespace).
def split_tweet(tweet):
    """Split a tweet into purely alphabetic tokens.

    Every non-alphabetic character is treated as a separator, so
    "whatevs...haha" becomes ["whatevs", "haha"]. Case is left untouched.

    A value that is not a string (e.g. None, or the float NaN pandas produces
    for an empty CSV cell) yields an empty token list instead of raising.

    NOTE: because tokens contain letters only, lexicon entries that include
    digits or symbols (such as 'a+' or '2-faced') can never be matched.
    """
    if not isinstance(tweet, str):
        return []
    return "".join((char if char.isalpha() else " ") for char in tweet).split()


# Cache of frozenset views of the lexicon lists, keyed by list identity.
# Each entry keeps a reference to the list so its id cannot be recycled.
_LEXICON_LOOKUPS = {}


def _lexicon_lookup(words):
    """Return a frozenset of `words`, rebuilt only when the list is replaced or resized."""
    entry = _LEXICON_LOOKUPS.get(id(words))
    if entry is None or entry[1] != len(words):
        entry = (words, len(words), frozenset(words))
        _LEXICON_LOOKUPS[id(words)] = entry
    return entry[2]


def _count_lexicon_hits(tweet, words):
    """Count the tokens of `tweet` (duplicates included) that appear in `words`."""
    lookup = _lexicon_lookup(words)
    return sum(1 for token in split_tweet(tweet) if token in lookup)


def count_pos(tweet):
    """Return how many tokens of `tweet` are in the global `positive_words` lexicon.

    Matching is exact and case-sensitive; repeated tokens are counted each time.
    Uses a hashed lookup rather than scanning the lexicon list for every token.
    """
    return _count_lexicon_hits(tweet, positive_words)


def count_neg(tweet):
    """Return how many tokens of `tweet` are in the global `negative_words` lexicon.

    Matching is exact and case-sensitive; repeated tokens are counted each time.
    Uses a hashed lookup rather than scanning the lexicon list for every token.
    """
    return _count_lexicon_hits(tweet, negative_words)

In [7]:
# Show the test tweet at index label 0 as a sample input for the lexicon counters.
df_test.text[0]

'USERNAME yeah sure whatevs...haha you have to admit the sweet and innocent are by far the best and a epic win '

In [8]:
# Sanity check: number of positive-lexicon hits in the sample tweet.
count_pos(df_test.text[0])

3

In [9]:
# Sanity check: number of negative-lexicon hits in the sample tweet.
count_neg(df_test.text[0])

0

In [10]:
# Lexicon baseline on the test set: predict 0 only when negative hits outnumber
# positive hits; ties (including tweets with no hits at all) are predicted 1.
pred = df_test.text.apply(lambda tweet: 0 if count_neg(tweet) > count_pos(tweet) else 1)

In [11]:
# Display the baseline predictions (one 0/1 value per test tweet).
pred

0        1
1        1
2        1
3        1
4        1
        ..
79995    1
79996    1
79997    1
79998    1
79999    0
Name: text, Length: 80000, dtype: int64

Naive Bayes Baseline


In [18]:
def model_Evaluate(model, mode = 'val'):
    """Report classification metrics for a fitted model on one data split.

    Prints scikit-learn's classification report and draws the confusion
    matrix as an annotated heat map.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict``.
    mode : {'val', 'test'}, default 'val'
        Split to evaluate on: the notebook-level ``X_val``/``y_val`` or
        ``X_test``/``y_test``.

    Raises
    ------
    ValueError
        If ``mode`` is neither 'val' nor 'test'. Previously any other value
        silently fell back to the validation split, so a typo such as 'Test'
        reported validation numbers as if they were test numbers.
    """
    if mode == 'test':
        eval_x, eval_y = X_test, y_test
    elif mode == 'val':
        eval_x, eval_y = X_val, y_val
    else:
        raise ValueError("mode must be 'val' or 'test', got {!r}".format(mode))

    # Predict values for given dataset
    y_pred = model.predict(eval_x)

    # Print the evaluation metrics for the dataset.
    print(classification_report(eval_y, y_pred))

    # Compute and plot the Confusion matrix
    # (rows = true class, columns = predicted class, classes in sorted label order)
    cf_matrix = confusion_matrix(eval_y, y_pred)
    n_classes = cf_matrix.shape[0]

    fig, ax = plt.subplots()
    ax.imshow(cf_matrix, cmap='Blues')
    ax.set_xticks(list(range(n_classes)))
    ax.set_yticks(list(range(n_classes)))

    # Write the raw count into every cell; switch to white text on dark cells.
    half_max = cf_matrix.max() / 2
    for row in range(n_classes):
        for col in range(n_classes):
            count = cf_matrix[row, col]
            ax.text(col, row, str(count), ha='center', va='center',
                    color='white' if count > half_max else 'black')

    ax.set_xlabel('Predicted class (sorted label order)')
    ax.set_ylabel('True class (sorted label order)')
    ax.set_title('Confusion matrix ({} set)'.format(mode))
    plt.show()

In [16]:
# vectorize the raw input into a matrix of TF-IDF features
#
# The raw text is read from the dataframes instead of from X_train / X_val /
# X_test. Those names are overwritten below with sparse matrices, so reading
# them here made the cell fail when run a second time (the vectoriser would be
# fitted on a sparse matrix instead of on text).

# 1. train a tf-idf on the training text only and vectorize it in the same pass
vectoriser = TfidfVectorizer(ngram_range=(1,2), max_features=500000)
X_train = vectoriser.fit_transform(df_train['text'])

# 2. vectorize the held-out splits with the vocabulary learned from training
X_val = vectoriser.transform(df_val['text'])
X_test  = vectoriser.transform(df_test['text'])

In [19]:

# Bernoulli Naive Bayes (alpha = 2) trained on the TF-IDF features,
# then scored on the validation split.
NBmodel = BernoulliNB(alpha=2).fit(X_train, y_train)
model_Evaluate(NBmodel, mode='val')

              precision    recall  f1-score   support

           0       0.81      0.79      0.80     40215
           1       0.79      0.81      0.80     39785

    accuracy                           0.80     80000
   macro avg       0.80      0.80      0.80     80000
weighted avg       0.80      0.80      0.80     80000



SVM Model


In [None]:

# Linear SVM.
# svm.SVC(kernel='linear') trains a kernel SVM, and scikit-learn documents its
# fit time as scaling at least quadratically with the number of samples, which
# is impractical for a training set of this size. LinearSVC is the
# linear-kernel implementation scikit-learn recommends for large datasets.
# dual=False is the documented choice when n_samples > n_features
# (presumably ~1.44M tweets, going by the file name, vs. at most 500,000 features).
# NOTE: LinearSVC optimises the squared hinge loss by default, so results are
# close to, but not identical with, SVC(kernel='linear').
SVMmodel = svm.LinearSVC(dual=False)
SVMmodel.fit(X_train, y_train)
model_Evaluate(SVMmodel)

## Evaluation Metric

TODO: Which evaluation metric should we pick?

#### F1-score and Confusion Matrix

In [12]:
# Confusion matrix of the lexicon baseline predictions (`pred`) against the test labels.
# NOTE(review): confusion_matrix is already imported in the first cell; this re-import is redundant.
from sklearn.metrics import confusion_matrix
confusion_matrix(df_test.target, pred)

array([[14698, 25448],
       [ 3985, 35869]])

In [13]:
# Macro-averaged F1 of the lexicon baseline predictions (`pred`) on the test labels.
# NOTE(review): f1_score is imported here rather than with the other imports in the first cell.
from sklearn.metrics import f1_score
f1_score(df_test.target, pred, average='macro')

0.6043811204758519