# TweetClassification.ipynb

### This notebook trains the selected SVM models on the gold-standard Tweets and uses them to classify the unlabeled Tweets.

Author: Erik Puijk <br>
Date  : May 3, 2022

In [1]:
pip install -U scikit-learn

Requirement already up-to-date: scikit-learn in /home/erik/anaconda3/lib/python3.8/site-packages (1.0.2)
Note: you may need to restart the kernel to use updated packages.


In [22]:
import numpy as np
import json, csv, math, random
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn import svm

In [23]:
# Notebook-level label encoder. classify_tweets() re-fits it on the training labels at
# every call and uses it afterwards to decode the predictions back to the original labels.
Encoder = LabelEncoder()

In [24]:
def read_tweets(path):
    """ Read the Tweets stored as JSON in a text file and return the parsed content.

    Parameters:
        path: location of the text file; it is read as UTF-8 and must hold one JSON document.

    Returns:
        The decoded JSON document, or an empty string if the file could not be
        opened or read (the I/O error is printed, not raised).

    Raises:
        ValueError: if the content is not valid JSON (json.JSONDecodeError) or not
            valid UTF-8 (UnicodeDecodeError). Only I/O errors are handled here.
    """

    content = ""

    try:
        # Read explicitly as UTF-8 instead of relying on the platform's default
        # encoding, which may not be able to decode non-ASCII Tweet text.
        with open(path, 'r', encoding='utf-8') as f:
            content = json.load(f)
    except IOError as err:
        # Best effort: report what went wrong and fall back to the empty default.
        print("I/O error: %s" % err)

    return content

In [120]:
def write_tweets(tweets_w, path):
    """ Write obtained Tweets to a text file in JSON-format.

    Parameters:
        tweets_w: JSON-serializable collection of Tweets; its length is reported on success.
        path: location of the text file to (over)write.

    Returns:
        None. Prints "Wrote <n> Tweets" when the file was written, or an I/O error
        message when it could not be written (the error is printed, not raised).
    """

    try:
        with open(path, 'w', encoding='utf-8') as f:
            json.dump(tweets_w, f)
    except IOError as err:
        print("I/O error: %s" % err)
    else:
        # Report success only if the write did not fail; previously this message
        # was printed even after an I/O error.
        print("Wrote %s Tweets" % len(tweets_w))

In [123]:
def classify_tweets(X_train, y_train, X_unlabeled, c_type):
    """ Train an SVM on the gold-standard data and predict labels for the unlabeled Tweets.

    Parameters:
        X_train: documents of the labeled Tweets, used to fit the vectorizer and the model.
        y_train: labels belonging to X_train.
        X_unlabeled: documents of the Tweets that have to be labeled.
        c_type: 'content' selects word counts with a LinearSVC; any other value
            selects TF-IDF features with a linear-kernel SVC.

    Returns:
        List with one predicted label per item of X_unlabeled, decoded back to the
        values used in y_train.
    """

    # Turn the labels into codes with the notebook-level Encoder (re-fitted on every call)
    encoded_labels = Encoder.fit_transform(y_train)

    # Pick the feature extractor and the SVM variant (with their hyperparameters) together
    if c_type == 'content':
        vectorizer = CountVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, max_df=0.4, max_features=3000)
        classifier = svm.LinearSVC(C=0.5, class_weight='balanced', dual=False)
    else:
        vectorizer = TfidfVectorizer(ngram_range=(1, 2), min_df=1, max_df=0.4, max_features=450, sublinear_tf=True)
        classifier = svm.SVC(kernel='linear', C=0.6, class_weight='balanced')

    # The vocabulary is learned from the training documents only and reused for the unlabeled ones
    train_features = vectorizer.fit_transform(X_train)
    unlabeled_features = vectorizer.transform(X_unlabeled)

    # Train on the gold standard, then predict the unlabeled Tweets
    classifier.fit(train_features, encoded_labels)
    predicted_codes = classifier.predict(unlabeled_features)

    # Hand back the original label values instead of the encoded ones
    return list(Encoder.inverse_transform(predicted_codes))

In [124]:
def merge_labels(tweets, labels_con, labels_act):
    """ Add the new labels to the Tweets that are not part of the gold standard.

    The n-th Tweet whose 'memo' is not 'gold_standard' receives labels_con[n] as
    'cat_con' and labels_act[n] as 'cat_act'. Gold-standard Tweets are left untouched.
    The Tweets are modified in place and the same structure is returned.
    """

    # Lazily walk over the Tweets that still need a label, in their original order
    to_label = (tweet for tweet in tweets if tweet['memo'] != 'gold_standard')

    # The position among the unlabeled Tweets is the index into both label lists
    for position, tweet in enumerate(to_label):
        tweet['cat_con'] = labels_con[position]
        tweet['cat_act'] = labels_act[position]

    return tweets

In [None]:
# Read all Tweets
tweets_preprocessed = read_tweets('source/tweets_all_preprocessed_exc_stopwords.txt')
tweets_raw = read_tweets('source/tweets_all.txt')

# Split Tweets into labeled and unlabeled
tweets_labeled = [tweet for tweet in tweets_preprocessed if tweet['memo'] == 'gold_standard']
tweets_unlabeled = [tweet for tweet in tweets_preprocessed if tweet['memo'] != 'gold_standard']

# Select the text from those Tweets
tweets_labeled_text = [tweet['text'] for tweet in tweets_labeled]
tweets_unlabeled_text = [tweet['text'] for tweet in tweets_unlabeled]

# Select the labels from those Tweets
labels_con = [tweet['cat_con'] for tweet in tweets_labeled]
labels_act = [tweet['cat_act'] for tweet in tweets_labeled]

# Label Tweets using classification model
new_labels_con = classify_tweets(tweets_labeled_text, labels_con, tweets_unlabeled_text, 'content')
new_labels_act = classify_tweets(tweets_labeled_text, labels_act, tweets_unlabeled_text, 'activation')

# The labels were predicted from the preprocessed file but are merged, purely by position,
# into the raw file. Make sure both files hold the same number of unlabeled Tweets, so that
# a mismatch cannot silently drop labels or write a partially labeled file.
n_unlabeled_raw = sum(1 for tweet in tweets_raw if tweet['memo'] != 'gold_standard')
assert len(new_labels_con) == len(new_labels_act) == n_unlabeled_raw, \
    "Predicted %s content and %s activation labels for %s unlabeled raw Tweets" % (
        len(new_labels_con), len(new_labels_act), n_unlabeled_raw)

# Merge the new labels with the (non-preprocessed) JSON-structure and write it to a txt file
# NOTE: from here on tweets_labeled holds ALL raw Tweets (gold standard + newly labeled)
tweets_labeled = merge_labels(tweets_raw, new_labels_con, new_labels_act)
write_tweets(tweets_labeled, 'source/tweets_all_labeled.txt')