<b>Load sentiment140 dataset</b>

In [1]:
import pandas, numpy

In [2]:
# Fixed seed so the shuffled order -- and every slice taken from it later --
# is the same on each run of the notebook.
SHUFFLE_SEED = 42

dataset = pandas.read_csv('../../data/processed_tweets_140.csv', encoding='latin-1')
# Pair each tweet text (coerced to str) with its target so both are shuffled together.
feature_sets = list(zip(dataset.text.astype(str), dataset.target))
# A private RandomState makes the shuffle reproducible without reseeding
# numpy's global generator.
numpy.random.RandomState(SHUFFLE_SEED).shuffle(feature_sets)
tweets, labels = zip(*feature_sets)
# Binarize the targets: a target of 0 becomes 1, every other target becomes 0.
# NOTE(review): presumably target 0 marks the negative class -- confirm against the dataset.
labels = [1 if label == 0 else 0 for label in labels]

<b>Tokenize tweet</b>

In [3]:
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords

In [4]:
# Bound method of a single TweetTokenizer instance, reused for every tweet.
tweet_tokenizer = TweetTokenizer()
tokenize = tweet_tokenizer.tokenize

# English stop-word list from the NLTK corpus.
stop_words = stopwords.words('english')

<b>Denoise tokenized tweet</b>

In [5]:
import re, string
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tag import pos_tag

In [6]:
def remove_noise(tweet_tokens, stop_words = ()):
    """Strip URLs and @-mentions from tokens, lemmatize them and drop noise.

    Args:
        tweet_tokens: Sequence of string tokens belonging to one tweet.
        stop_words: Collection of lowercase words to discard. Defaults to
            an empty tuple, i.e. nothing is filtered as a stop word.

    Returns:
        A list of lowercased, lemmatized tokens. Tokens are omitted when
        they are empty after URL/mention removal, when they occur as a
        substring of ``string.punctuation``, or when their lowercase form
        is in ``stop_words``.
    """
    # Build the lemmatizer once per call instead of once per token.
    lemmatizer = WordNetLemmatizer()
    # Hashed membership test instead of scanning the stop-word list per token.
    stop_words = frozenset(stop_words)
    cleaned_tokens = []

    for token, tag in pos_tag(tweet_tokens):
        # Raw strings keep the pattern identical while avoiding the invalid
        # "\(" / "\)" escape sequences of a normal string literal.
        token = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'
                       r'(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', token)
        token = re.sub(r"(@[A-Za-z0-9_]+)", "", token)

        # Nothing left after stripping: the token would be discarded below
        # anyway, so skip the lemmatizer lookup.
        if not token:
            continue

        # Map the Penn Treebank tag prefix to the WordNet POS code; anything
        # that is neither a noun nor a verb is lemmatized as an adjective.
        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'

        token = lemmatizer.lemmatize(token, pos)
        lowered = token.lower()

        if len(token) > 0 and token not in string.punctuation and lowered not in stop_words:
            cleaned_tokens.append(lowered)
    return cleaned_tokens

<b>Tokenize and denoise tweets</b>

In [7]:
import time

In [8]:
# Time the tokenize + denoise pass over the first 10000 tweets.
start = time.time()
tweet_sample = tweets[:10000]
tokenized_tweets = [remove_noise(tokenize(text), stop_words) for text in tweet_sample]
elapsed = time.time()-start
print("Took " + str(elapsed) + " seconds to denoise and tokenize tweets")

Took 39.89064836502075 seconds to denoise and tokenize tweets


<b>Vectorize the tokenized tweets with TFIDF</b>

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
def dummy_fun(doc):
    """Identity pass-through: hand back ``doc`` exactly as it was received."""
    return doc

In [None]:
# Both the preprocessing and the tokenizing hooks are the identity function,
# so each document reaches the vectorizer exactly as supplied.
tfidf = TfidfVectorizer(
    token_pattern=None,
    preprocessor=dummy_fun,
    tokenizer=dummy_fun,
    analyzer='word',
)

In [None]:
X = tfidf.fit_transform(tokenized_tweets)
# Only a leading slice of the tweets was tokenized, so trim the labels to the
# number of rows in X; otherwise X and y disagree in length.
y = labels[:X.shape[0]]

<b>Train sentiment classifier on Twitter data (sentiment140 dataset)</b>

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords

# TF-IDF settings: English stop words removed, document-frequency bounds of
# min_df=7 and max_df=0.8, and a cap of 10000 features.
english_stop_words = stopwords.words('english')
vectorizer = TfidfVectorizer(
    stop_words=english_stop_words,
    max_df=0.8,
    min_df=7,
    max_features=10000,
)

# NOTE(review): `sentiment140` is not defined in any visible cell -- confirm
# where it is loaded before relying on Restart & Run All.
X = vectorizer.fit_transform(sentiment140.normalized_tweets)
y = sentiment140.labels

In [3]:
from sklearn.linear_model import LogisticRegression

In [4]:
# Fit on the first 100000 rows only.
X_train, y_train = X[:100000], y[:100000]
lr = LogisticRegression()
lr.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [8]:
# Predict for every row from index 100000 onward.
X_rest = X[100000:]
y_pred = lr.predict(X_rest)

In [9]:
from sklearn.metrics import confusion_matrix
# Store the result under its own name so the imported confusion_matrix
# function is not overwritten by the array it returns.
conf_matrix = confusion_matrix(y[100000:], y_pred)
conf_matrix

array([[586657, 163349],
       [192664, 557330]], dtype=int64)

In [10]:
from sklearn.metrics import classification_report
# Text report comparing the labels from index 100000 onward with y_pred.
report = classification_report(y[100000:], y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.75      0.78      0.77    750006
           1       0.77      0.74      0.76    749994

    accuracy                           0.76   1500000
   macro avg       0.76      0.76      0.76   1500000
weighted avg       0.76      0.76      0.76   1500000

