# MachineLearning.ipynb

### This notebook contains the feature generation, supervised machine learning and validation for automatic classification of Tweets.

Author: Erik Puijk <br>
Date  : March 28, 2022

In [28]:
pip install -U scikit-learn

Requirement already up-to-date: scikit-learn in /home/erik/anaconda3/lib/python3.8/site-packages (1.0.2)
Note: you may need to restart the kernel to use updated packages.


In [1]:
import numpy as np
import json, csv, math, random
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score, classification_report, accuracy_score, make_scorer, cohen_kappa_score
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.utils import resample
from sklearn import svm, metrics

In [2]:
# Module-level label encoder shared by cross_val and test_configurations,
# both of which fit it on their own labels before using it.
Encoder = LabelEncoder()

In [3]:
# https://scikit-learn.org/stable/modules/svm.html
# https://machinelearningmastery.com/overfitting-and-underfitting-with-machine-learning-algorithms/
# https://vitalflux.com/hold-out-method-for-training-machine-learning-model/
# https://towardsdatascience.com/cross-validation-in-machine-learning-72924a69872f
# https://scikit-learn.org/stable/modules/cross_validation.html
# https://medium.com/@bedigunjit/simple-guide-to-text-classification-nlp-using-svm-and-naive-bayes-with-python-421db3a72d34

In [4]:
def read_tweets(path):
    """ Read Tweets from a JSON text file and return the parsed content.

    Args:
        path: Location of a file holding a JSON document (the notebook stores a list
            of Tweet dictionaries in it).

    Returns:
        The decoded JSON content, or an empty list when the file cannot be opened or
        read. The number of items read is printed as well.
    """

    # Fall back to an empty list rather than an empty string, so callers that
    # iterate over or filter the result always get the same kind of object.
    content = []

    try:
        # JSON text is UTF-8; do not depend on the platform's default encoding.
        with open(path, 'r', encoding='utf-8') as f:
            content = json.load(f)
    except IOError:
        print("I/O error")

    print("Total Tweets read: %s\n" % (len(content)))

    return content

In [5]:
def write_tweets(tweets_w, path):
    """ Serialise the given Tweets as JSON and store them in the text file at path.

    A failure to open or write the file is reported on stdout instead of being raised.
    """

    try:
        with open(path, 'w') as out_file:
            json.dump(tweets_w, fp=out_file)
    except OSError:
        # IOError is an alias of OSError on Python 3, so the same errors are caught.
        print("I/O error")

In [6]:
def stats_count(cat):
    """ Count the occurrences of each category in a list of categories and print them.

    Args:
        cat: Iterable of hashable category labels.

    Prints a header line, one '<category>: <count>' line per distinct category in
    order of first appearance, and a closing empty line. Returns nothing.
    """

    # Count in a single pass instead of one cat.count() scan per distinct category.
    # A dict keeps first-seen order, so the report is the same on every run
    # (iterating over a set depends on hash order).
    occurrences = {}
    for label in cat:
        occurrences[label] = occurrences.get(label, 0) + 1

    print('Number of training data')

    for label, count in occurrences.items():
        print('%s: %s' % (label, count))

    print('')

In [7]:
def resample_tweets(text, labels, random_state=0):
    """ Upsample Tweets from categories that are less occurring to create a balanced training set.

    Args:
        text: Sequence of Tweet texts.
        labels: Sequence of category labels aligned with text.
        random_state: Integer seed used for both the upsampling and the final shuffle,
            so repeated calls on the same data give the same result. Pass None for an
            unseeded (non-reproducible) run.

    Returns:
        A pair of tuples (texts, labels) in which every category occurs as often as
        the largest one, in shuffled order. Empty input yields two empty tuples.
    """

    # Group the [text, label] pairs per category in one pass. The dict keeps the
    # categories in first-seen order, which makes the pre-shuffle order stable.
    tweets_divided = {}
    for tweet, label in zip(text, labels):
        tweets_divided.setdefault(label, []).append([tweet, label])

    # Nothing to balance; avoids indexing into an empty unzip result below.
    if not tweets_divided:
        return (), ()

    max_len = max(len(group) for group in tweets_divided.values())

    tweets_joined = []
    for group in tweets_divided.values():
        if len(group) < max_len:
            # Draw with replacement until the category is as large as the biggest one.
            group = resample(group, replace=True, n_samples=max_len, random_state=random_state)
        tweets_joined.extend(group)

    # Use a dedicated generator: the shuffle is reproducible and the global
    # random state is left untouched.
    random.Random(random_state).shuffle(tweets_joined)

    texts_out, labels_out = zip(*tweets_joined)

    return texts_out, labels_out

In [8]:
def get_score(y_test, pred):
    """ Return the micro-averaged F1 score of the predictions against the true labels. """

    # Alternative metric kept for reference: cohen_kappa_score(y_test, pred)
    return f1_score(y_test, pred, average='micro', zero_division=0)

In [9]:
def calc_avg_score(scores):
    """ Return the arithmetic mean of a list of scores. """

    n_scores = len(scores)

    # Plain left-to-right accumulation, starting from integer zero.
    accumulated = 0
    for value in scores:
        accumulated = accumulated + value

    mean = accumulated / n_scores

    return mean

In [35]:
def cross_val(v_type, X_train, y_train, ngram_min, ngram_max, min_df, max_df, max_features, n_splits=5):
    """ Score one vectorizer configuration with stratified k-fold cross validation on the training set.

    For every fold the training part is upsampled, vectorized (count or tf-idf) and used to
    fit a linear SVM; the validation part is then scored with the micro-averaged F1.

    Args:
        v_type: 'count' selects CountVectorizer, any other value selects TfidfVectorizer.
        X_train: List of Tweet texts (accessed by position).
        y_train: List of category labels aligned with X_train.
        ngram_min: Lower bound of the n-gram range.
        ngram_max: Upper bound of the n-gram range.
        min_df: Passed to the vectorizer unchanged.
        max_df: Passed to the vectorizer unchanged.
        max_features: Passed to the vectorizer unchanged.
        n_splits: Number of folds (default 5).

    Returns:
        [[ngram_min, ngram_max, min_df, max_df, max_features], average score over the folds].

    Side effect: the module-level Encoder is refitted on y_train.
    """

    scores = []

    # Fit the label encoding once on all training labels so that a label maps to the
    # same integer in every fold. Fitting a second time on the validation labels would
    # shift the mapping whenever a fold misses a category, and the comparison with the
    # predictions would then be made between mismatched integers.
    Encoder.fit(y_train)

    # Run cross-validation and calculate scores
    kf = StratifiedKFold(n_splits=n_splits)
    for train, valid in kf.split(X_train, y_train):
        X_train1 = [X_train[i] for i in train]
        y_train1 = [y_train[i] for i in train]
        X_valid1 = [X_train[i] for i in valid]
        y_valid1 = [y_train[i] for i in valid]

        # A fresh vectorizer per fold: the vocabulary must come from this fold's training part only
        if v_type == 'count':
            vectorizer = CountVectorizer(analyzer='word', ngram_range=(ngram_min, ngram_max), min_df=min_df, max_df=max_df, max_features=max_features)
        else:
            vectorizer = TfidfVectorizer(ngram_range=(ngram_min, ngram_max), min_df=min_df, max_df=max_df, max_features=max_features)

        # Oversample tweets with less occurring categories to balance training set
        # (only the training part, so no duplicated Tweet ends up in the validation part)
        X_train1, y_train1 = resample_tweets(X_train1, y_train1)

        # Encode labels with the shared, already fitted mapping
        y_train1 = Encoder.transform(y_train1)
        y_valid1 = Encoder.transform(y_valid1)

        # Vectorize data
        X_train1 = vectorizer.fit_transform(X_train1)
        X_valid1 = vectorizer.transform(X_valid1)

        # Train classifier
        clf = svm.SVC(kernel='linear', C=1, class_weight=None)
        clf.fit(X_train1, y_train1)

        # Predict and compare with the validation part
        pred = clf.predict(X_valid1)
        scores.append(f1_score(y_valid1, pred, average='micro', zero_division=0))

    # Return the configuration of the model together with the average of scores
    return [[ngram_min, ngram_max, min_df, max_df, max_features], calc_avg_score(scores)]

In [36]:
def test_configurations(v_type, test_validation, tweets, labels):
    """ Test different configurations for different models and compare the micro-f1 scores to select the best
        model.

    The data is split 70/30 into a training and a test set. Every vectorizer configuration in
    the grid is scored with cross_val on the training set. All (configuration, score) pairs,
    sorted by descending score, are printed at the end.

    Args:
        v_type: 'count' for CountVectorizer, any other value for TfidfVectorizer.
        test_validation: When true, every configuration that shares the best cross-validation
            score is retrained on the upsampled training set and a classification report on
            the test set is printed for it.
        tweets: List of Tweet texts.
        labels: List of category labels aligned with tweets. Three distinct labels select the
            'content' grid, anything else the 'activation' grid.

    Returns nothing. Side effect: the module-level Encoder is refitted when test_validation is true.
    """

    # Define hyper parameters per categorization
    if len(set(labels)) == 3:
        # Content
        ngrams = [1, 2]
        min_dfs = [1, 2, 3, 4, 5]
        max_dfs = [0.05, 0.1, 0.2, 0.4, 0.6, 0.8, 1.0]
        max_features = [None, 1000, 2000, 3000]
    else:
        # Activation
        ngrams = [1, 2, 3]
        min_dfs = [1, 2, 3, 4, 5]
        max_dfs = [0.05, 0.1, 0.2, 0.4, 0.6, 0.8, 1.0]
        # Only the unlimited vocabulary is searched here (1000, 2000, 3000 are left out)
        max_features = [None]

    # Split data set into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(tweets, labels, test_size=0.3, random_state=0)

    # Enumerate the valid configurations up front (n-gram ranges need min <= max), in the
    # same order as nested loops would visit them. Taking the total from this list keeps the
    # progress counter exact; a closed formula is easy to get wrong (three n-gram sizes give
    # 6 valid ranges, not 3**2 - 2! = 7).
    configs = [
        (ngram_min, ngram_max, min_df, max_df, max_feature)
        for ngram_min in ngrams
        for ngram_max in ngrams
        if ngram_min <= ngram_max
        for min_df in min_dfs
        for max_df in max_dfs
        for max_feature in max_features
    ]
    i_max = len(configs)

    # Perform hyperparameter optimization by testing every configuration with k-fold cross validation.
    scores = []
    for i, (ngram_min, ngram_max, min_df, max_df, max_feature) in enumerate(configs, start=1):
        print('Progress: %s/%s configurations' % (i, i_max), end='\r')

        # Call the function for the model and append the score
        scores.append(cross_val(v_type, X_train, y_train, ngram_min, ngram_max, min_df, max_df, max_feature))

    # Sort the scores descending
    scores.sort(key=lambda x: x[1], reverse=True)

    if test_validation:
        # Select the best configurations (all that tie on the highest score)
        best_score = scores[0][1]
        best_configs = [config for config, score in scores if score == best_score]

        X_train, y_train = resample_tweets(X_train, y_train)

        # Encode labels. The encoder is fitted once on the complete label list so that the
        # training and test labels share one mapping; fitting it again on the test labels
        # would shift the integers whenever the test split misses a category.
        Encoder.fit(labels)
        y_train = Encoder.transform(y_train)
        y_test = Encoder.transform(y_test)

        # Report rows are tied to the encoder's classes explicitly, so the names stay
        # aligned even if a category is absent from the test labels or the predictions.
        class_ids = list(range(len(Encoder.classes_)))
        class_names = [str(name) for name in Encoder.classes_]

        # Use the best configurations to validate model on the test set
        for config in best_configs:
            print(config)

            # Select appropriate vectorizer
            if v_type == 'count':
                vectorizer = CountVectorizer(analyzer='word', ngram_range=(config[0], config[1]), min_df=config[2], max_df=config[3], max_features=config[4])
            else:
                vectorizer = TfidfVectorizer(ngram_range=(config[0], config[1]), min_df=config[2], max_df=config[3], max_features=config[4])

            # Train with vectorized features
            X_train_t = vectorizer.fit_transform(X_train)
            X_test_t = vectorizer.transform(X_test)
            clf = svm.SVC(kernel='linear', C=1, class_weight=None)
            clf.fit(X_train_t, y_train)

            # Predict and compare with test set
            pred = clf.predict(X_test_t)
            print(classification_report(y_test, pred, labels=class_ids, target_names=class_names, zero_division=0))

    print(scores)

In [38]:
# Load the preprocessed Tweets and keep only the entries whose memo is 'gold_standard'
tweets_all = read_tweets('source/tweets_all_preprocessed.txt')
tweets_gs = [entry for entry in tweets_all if entry['memo'] == 'gold_standard']

# Pull the text and both label fields out of the gold-standard entries
tweets_text, labels_con, labels_act = (
    [entry[field] for entry in tweets_gs]
    for field in ('text', 'cat_con', 'cat_act')
)

# Run the model selection for one categorization at a time (swap the comment to switch)
#test_configurations('tfidf', True, tweets_text, labels_con)
test_configurations('tfidf', True, tweets_text, labels_act)

Total Tweets read: 4664

[1, 1, 4, 0.6, None]nfigurations
              precision    recall  f1-score   support

         CON       1.00      0.17      0.29         6
         FOL       0.68      0.48      0.57        27
        NONE       0.80      0.89      0.84        93
         SUP       0.50      0.57      0.53        14

    accuracy                           0.75       140
   macro avg       0.75      0.53      0.56       140
weighted avg       0.75      0.75      0.73       140

[1, 1, 4, 0.8, None]
              precision    recall  f1-score   support

         CON       1.00      0.17      0.29         6
         FOL       0.68      0.48      0.57        27
        NONE       0.80      0.89      0.84        93
         SUP       0.50      0.57      0.53        14

    accuracy                           0.75       140
   macro avg       0.75      0.53      0.56       140
weighted avg       0.75      0.75      0.73       140

[1, 1, 4, 1.0, None]
              precision    rec