# MachineLearning.ipynb

### This notebook contains the feature generation, supervised machine learning and validation for automatic classification of Tweets.

Author: Erik Puijk <br>
Date  : March 28, 2022

In [1]:
pip install -U scikit-learn

Requirement already up-to-date: scikit-learn in /home/erik/anaconda3/lib/python3.8/site-packages (1.0.2)
Note: you may need to restart the kernel to use updated packages.


In [1]:
import numpy as np
import json, csv, math, random
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, StratifiedKFold, KFold
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score, classification_report, accuracy_score, make_scorer, cohen_kappa_score
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.utils import resample
from sklearn import svm, metrics, linear_model
import pyperclip as pc

In [2]:
# Module-level label encoder shared by the cells below; test_configurations() (re)fits it on every call.
Encoder = LabelEncoder()

In [3]:
# https://scikit-learn.org/stable/modules/svm.html
# https://machinelearningmastery.com/overfitting-and-underfitting-with-machine-learning-algorithms/
# https://vitalflux.com/hold-out-method-for-training-machine-learning-model/
# https://towardsdatascience.com/cross-validation-in-machine-learning-72924a69872f
# https://scikit-learn.org/stable/modules/cross_validation.html
# https://medium.com/@bedigunjit/simple-guide-to-text-classification-nlp-using-svm-and-naive-bayes-with-python-421db3a72d34

In [4]:
def read_tweets(path):
    """ Load the Tweets stored as JSON in the file at `path`.

    Returns the decoded JSON content. If the file cannot be opened or read, "I/O error"
    is printed and an empty string is returned instead. """

    try:
        with open(path, 'r') as handle:
            return json.load(handle)
    except IOError:
        print("I/O error")

    # Fallback value when the file could not be read
    return ""

In [5]:
def write_tweets(tweets_w, path):
    """ Serialize the given Tweets as JSON into the file at `path`.

    The file is opened in write mode, so existing content is replaced. If the file cannot
    be opened or written, "I/O error" is printed and no exception is raised. """

    try:
        with open(path, 'w') as out_file:
            json.dump(tweets_w, out_file)
    except IOError:
        print("I/O error")

In [6]:
def stats_count(cat):
    """ Print how often each category occurs in a list of categories.

    cat: iterable of hashable category labels.

    Prints a header line, one '<category>: <count>' line per distinct category (in order of
    first appearance, so the output is the same on every run) and a closing empty line.
    Returns nothing. """

    # Local import: collections is not among the notebook-level imports
    from collections import Counter

    # One pass over the data instead of one cat.count() scan per distinct category
    occurrences = Counter(cat)

    print('Number of training data')

    for label, count in occurrences.items():
        print('%s: %s' % (label, count))

    print('')

In [7]:
def calc_avg_scores(scores, cats, k):
    """ Calculate an average score given a list of scores.

    scores: either a list of k lists (one list of per-label scores per fold) or a flat
            list of already averaged scores.
    cats  : list of categories; only its length is used (number of labels to average).
    k     : number of folds, i.e. the number of inner lists that are averaged.

    Returns a list with one average per label when a list of lists is given; a flat list
    (or an empty input) is returned unchanged. """

    # Nothing to average; also avoids an IndexError on scores[0] below
    if len(scores) == 0:
        return scores

    # If a list of average (f1) scores is given, just return that list
    if not isinstance(scores[0], list):
        return scores

    # Average label i over the first k folds
    return [sum(scores[j][i] for j in range(k)) / k for i in range(len(cats))]

In [53]:
def keysort(x):
    """ Sort key for a [configuration, scores] entry: the lowest value in its list of scores. """

    # Alternative key (mean of the scores): sum(x[1]) / len(x[1])
    entry_scores = x[1]
    return min(entry_scores)

In [75]:
def cross_val(v_type, X, y, ngram_min, ngram_max, min_df, max_df, max_features, cats, alg, k=3):
    """ Vectorize the given data (count or tf-idf) and run k-fold cross-validation with a linear SVM.

    v_type   : 'count' selects CountVectorizer; any other value selects TfidfVectorizer.
    X, y     : texts and their labels, indexed by position (X[i] belongs to y[i]).
    ngram_min, ngram_max, min_df, max_df, max_features: passed on to the vectorizer.
    cats     : list of categories; only its length is used (by calc_avg_scores).
    alg      : 'svc' selects svm.SVC with a linear kernel; any other value selects svm.LinearSVC.
    k        : number of folds (default 3).

    Returns [[ngram_min, ngram_max, min_df, max_df, max_features], avg_scores], where
    avg_scores holds the f1 score per label averaged over the k folds. """

    # Fix the label set once so that every fold reports its f1 scores for the same labels in
    # the same (sorted) order. Without it, a fold in which a label occurs in neither the
    # validation labels nor the predictions yields a shorter list and the per-label averages
    # become misaligned.
    all_labels = sorted(set(y))

    scores = []

    # NOTE(review): KFold is used without shuffling or stratification; confirm this is intended.
    kf = KFold(n_splits=k)
    for train, valid in kf.split(X, y):
        X_train_t = [X[i] for i in train]
        y_train_t = [y[i] for i in train]
        X_valid_t = [X[i] for i in valid]
        y_valid_t = [y[i] for i in valid]

        # Select appropriate vectorizer (fitted on the training part of the fold only)
        if v_type == 'count':
            vectorizer = CountVectorizer(analyzer='word', ngram_range=(ngram_min, ngram_max),
                                         min_df=min_df, max_df=max_df, max_features=max_features)
        else:
            vectorizer = TfidfVectorizer(ngram_range=(ngram_min, ngram_max), min_df=min_df,
                                         max_df=max_df, max_features=max_features,
                                         sublinear_tf=True)

        X_train_t = vectorizer.fit_transform(X_train_t)
        X_valid_t = vectorizer.transform(X_valid_t)

        # Create and fit SVM
        if alg == 'svc':
            clf = svm.SVC(kernel='linear', C=1, class_weight='balanced')
        else:
            clf = svm.LinearSVC(C=1, class_weight='balanced', random_state=0)

        clf.fit(X_train_t, y_train_t)
        pred = clf.predict(X_valid_t)

        # One f1 score per label; labels that do not occur in this fold score 0
        scores.append(list(f1_score(y_valid_t, pred, labels=all_labels, average=None,
                                    zero_division=0)))

    # Return the configuration of the model together with the average of scores
    return [[ngram_min, ngram_max, min_df, max_df, max_features], calc_avg_scores(scores, cats, k)]

In [76]:
def _build_vectorizer(v_type, config):
    """ Create the vectorizer for a [ngram_min, ngram_max, min_df, max_df, max_features] configuration. """

    ngram_min, ngram_max, min_df, max_df, max_features = config

    if v_type == 'count':
        return CountVectorizer(analyzer='word', ngram_range=(ngram_min, ngram_max), min_df=min_df,
                               max_df=max_df, max_features=max_features)

    return TfidfVectorizer(ngram_range=(ngram_min, ngram_max), min_df=min_df, max_df=max_df,
                           max_features=max_features, sublinear_tf=True)


def _build_classifier(alg):
    """ Create the linear SVM: svm.SVC for 'svc', svm.LinearSVC for any other value. """

    if alg == 'svc':
        return svm.SVC(kernel='linear', C=1, class_weight='balanced')

    return svm.LinearSVC(C=1, class_weight='balanced', random_state=0)


def test_configurations(v_type, test_validation, tweets, labels, alg, rs):
    """ Cross-validate a grid of vectorizer configurations and optionally validate the best ones.

    v_type         : 'count' for CountVectorizer; any other value for TfidfVectorizer.
    test_validation: if true, the best configurations are retrained on the full training split
                     and a classification report on the held-out test split is printed.
    tweets, labels : texts and their category labels.
    alg            : 'svc' for svm.SVC (linear kernel); any other value for svm.LinearSVC.
    rs             : random_state used for the train/test split.

    Configurations are ranked by their lowest per-label cross-validated f1 score (see keysort);
    all configurations that tie for the best value are validated. The result is printed and
    nothing is returned. As a side effect the module-level Encoder is refitted. """

    cats = list(set(labels))

    # Define model parameters according to which categorization
    if len(cats) == 3:
        # Content
        ngrams = [1, 2]
        min_dfs = [1, 2, 3]
        max_dfs = [1.0]
        max_features = [None]
    else:
        # Activation
        ngrams = [1, 2]
        min_dfs = [1, 2, 3, 4, 5]
        max_dfs = [0.1, 0.2, 0.4, 0.6, 0.8, 1.0]
        max_features = [None, 3000]

    # Split data set into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(tweets, labels, test_size=0.3, random_state=rs)

    # Encode labels. The encoder is fitted once on all labels and both splits are only
    # transformed: fitting it separately on each split would give the same category different
    # integer codes whenever one split lacks a category.
    Encoder.fit(labels)
    y_train = Encoder.transform(y_train)
    y_test = Encoder.transform(y_test)

    scores = []

    # Combine each parameter and run the model with cross validation
    for ngram_min in ngrams:
        for ngram_max in ngrams:
            if ngram_min > ngram_max:
                continue
            for min_df in min_dfs:
                for max_df in max_dfs:
                    for max_feature in max_features:
                        scores.append(cross_val(v_type, X_train, y_train, ngram_min, ngram_max,
                                                min_df, max_df, max_feature, cats, alg))

    # Sort the scores descending (best configuration first)
    scores.sort(key=keysort, reverse=True)

    prnt = ''

    if test_validation:
        # Select the best configurations: every entry that ties with the top-ranked one
        best_score = min(scores[0][1])
        best_configs = [entry for entry in scores if min(entry[1]) == best_score]

        # Use the best configurations to validate model on the test set
        for config, score in best_configs:
            prnt += (str([config, score]) + '\n')

            # Train with vectorized features
            vectorizer = _build_vectorizer(v_type, config)
            X_train_t = vectorizer.fit_transform(X_train)
            X_test_t = vectorizer.transform(X_test)

            clf = _build_classifier(alg)
            clf.fit(X_train_t, y_train)

            # Predict and compare with test set
            pred = clf.predict(X_test_t)

            # Report on the labels that occur in the test labels or in the predictions, in
            # sorted order, so that the names always line up with the rows of the report
            # (also when a category is predicted that is absent from the test labels).
            present = sorted(set(y_test) | set(pred))
            prnt += classification_report(y_test, pred, labels=present,
                                          target_names=Encoder.inverse_transform(present),
                                          zero_division=0)

    print(prnt)

In [77]:
# Load the two preprocessed variants of the data set
tweets_con = read_tweets('source/tweets_all_preprocessed_no_stopwords.txt')
tweets_act = read_tweets('source/tweets_all_preprocessed_exc_stopwords.txt')

# Keep only the Tweets that are marked as gold standard
tweets_con_gs = [entry for entry in tweets_con if entry['memo'] == 'gold_standard']
tweets_act_gs = [entry for entry in tweets_act if entry['memo'] == 'gold_standard']

# Texts and labels for the 'cat_con' categorization
tweets_con_text = [entry['text'] for entry in tweets_con_gs]
labels_con = [entry['cat_con'] for entry in tweets_con_gs]

# Texts and labels for the 'cat_act' categorization
tweets_act_text = [entry['text'] for entry in tweets_act_gs]
labels_act = [entry['cat_act'] for entry in tweets_act_gs]

# Number of runs; the run index doubles as the random state of the train/test split
i_max = 20

for i in range(i_max):
    print('RS=%s:' % i)
    test_configurations('count', True, tweets_con_text, labels_con, 'linear_svc', i)
    # Variant for the other categorization (tf-idf features, SVC):
    # test_configurations('tfidf', True, tweets_act_text, labels_act, 'svc', i)

RS=0:
[[1, 1, 2, 1.0, None], [0.6393650793650795, 0.861496687303139, 0.5002513826043238]]
              precision    recall  f1-score   support

         CAM       0.64      0.78      0.70        32
         POL       0.90      0.86      0.88        95
         SOC       0.40      0.31      0.35        13

    accuracy                           0.79       140
   macro avg       0.65      0.65      0.64       140
weighted avg       0.80      0.79      0.79       140

RS=1:
[[1, 1, 1, 1.0, None], [0.6479591836734694, 0.8675678901242811, 0.5675500969618618]]
              precision    recall  f1-score   support

         CAM       0.83      0.63      0.72        38
         POL       0.85      0.95      0.90        96
         SOC       0.25      0.17      0.20         6

    accuracy                           0.83       140
   macro avg       0.64      0.58      0.60       140
weighted avg       0.82      0.83      0.82       140

RS=2:
[[1, 2, 2, 1.0, None], [0.6362625139043381, 0.85264

[[1, 2, 2, 1.0, None], [0.6554360812425329, 0.8710433582077877, 0.22377622377622375]]
              precision    recall  f1-score   support

         CAM       0.61      0.64      0.62        36
         POL       0.84      0.90      0.87        91
         SOC       0.75      0.23      0.35        13

    accuracy                           0.77       140
   macro avg       0.73      0.59      0.61       140
weighted avg       0.77      0.77      0.76       140

RS=19:
[[2, 2, 3, 1.0, None], [0.47307494878843265, 0.7693268609239982, 0.26583293249959916]]
              precision    recall  f1-score   support

         CAM       0.45      0.39      0.42        33
         POL       0.78      0.71      0.74        95
         SOC       0.24      0.50      0.32        12

    accuracy                           0.61       140
   macro avg       0.49      0.53      0.49       140
weighted avg       0.65      0.61      0.63       140

