# MachineLearning.ipynb

### This notebook covers feature generation, supervised machine learning, and validation for the automatic classification of Tweets.

Author: Erik Puijk <br>
Date  : March 28, 2022

In [1]:
pip install -U scikit-learn

Requirement already up-to-date: scikit-learn in /home/erik/anaconda3/lib/python3.8/site-packages (1.0.2)
Note: you may need to restart the kernel to use updated packages.


In [14]:
import numpy as np
import json, csv, math, random
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score, classification_report, accuracy_score, make_scorer, cohen_kappa_score
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.utils import resample
from sklearn import svm, metrics

In [15]:
# Shared label encoder: test_configurations() uses it to convert the category labels to integer
# codes and to map those codes back to the category names for the classification report.
Encoder = LabelEncoder()

In [16]:
# https://scikit-learn.org/stable/modules/svm.html
# https://machinelearningmastery.com/overfitting-and-underfitting-with-machine-learning-algorithms/
# https://vitalflux.com/hold-out-method-for-training-machine-learning-model/
# https://towardsdatascience.com/cross-validation-in-machine-learning-72924a69872f
# https://scikit-learn.org/stable/modules/cross_validation.html
# https://medium.com/@bedigunjit/simple-guide-to-text-classification-nlp-using-svm-and-naive-bayes-with-python-421db3a72d34

In [17]:
def read_tweets(path):
    """ Read the Tweets from a given text file and return the decoded JSON content.

        Parameters:
        path -- location of a text file that holds one JSON document

        Returns the decoded JSON content of the file. When the file cannot be opened or read, an
        "I/O error" notice is printed and an empty list is returned instead, so callers that iterate
        over the result keep working. The number of items read is always printed.
    """
    
    # Fall back to an empty list (not an empty string): callers treat the result as a list of Tweets
    content = []
    
    try:
        # Read as UTF-8 explicitly instead of relying on the platform's default encoding
        with open(path, 'r', encoding='utf-8') as f:
            content = json.load(f)
    except IOError:
        print("I/O error")
        
    print("Total Tweets read: %s\n" % (len(content)))

    return content

In [18]:
def write_tweets(tweets_w, path):
    """ Serialise the given Tweets as JSON and store them in a text file.

        tweets_w -- JSON-serialisable object holding the Tweets
        path     -- location of the file to write (opened in 'w' mode)

        An I/O failure is reported on standard output instead of being raised.
    """
    
    try:
        with open(path, mode='w') as out_file:
            json.dump(tweets_w, fp=out_file)
    except OSError:
        # IOError is an alias of OSError in Python 3
        print("I/O error")

In [19]:
def stats_count(cat):
    """ Count the occurrences of each category in a collection of categories and print them.

        Parameters:
        cat -- iterable of hashable category labels (a list, but any iterable works)

        Prints a header line, one '<category>: <count>' line per distinct category in order of
        first appearance, and a closing empty line. Returns nothing.
    """
    
    # Single pass; a dict keeps insertion order, so the output is the same on every run
    # (iterating over a set of strings is not stable between interpreter sessions)
    occurrences = {}
    for label in cat:
        occurrences[label] = occurrences.get(label, 0) + 1
    
    print('Number of training data')
    
    for label, count in occurrences.items():
        print('%s: %s' % (label, count))
        
    print('')

In [20]:
def calc_avg_scores(scores, cats, k):
    """ Average the scores per category position over the first k score lists.

        scores -- sequence of score lists (one per fold), each indexable by category position
        cats   -- the categories; only their number is used
        k      -- number of score lists to average over

        Returns a list with one mean score per category position.
    """
    
    n_cats = len(cats)
    totals = [0] * n_cats
    
    # Walk the folds in order so every category adds up its scores as fold 0, 1, ..., k-1
    for fold in range(k):
        for pos in range(n_cats):
            totals[pos] += scores[fold][pos]

    return [total / k for total in totals]

In [73]:
def keysort(x):
    """ Sort key for a [configuration, scores] pair: the lowest value among its scores. """
    
    pair_scores = x[1]
    return min(pair_scores)

In [74]:
def cross_val(v_type, X, y, ngram_min, ngram_max, min_df, max_df, max_features, cats):
    """ Vectorize the given data (count or tf-idf) and evaluate a linear SVM on it with stratified
        5-fold cross validation.

        Parameters:
        v_type       -- 'count' selects CountVectorizer, any other value selects TfidfVectorizer
        X            -- indexable sequence of documents (Tweet texts)
        y            -- indexable sequence of labels, aligned with X
        ngram_min    -- lower bound of the n-gram range of the vectorizer
        ngram_max    -- upper bound of the n-gram range of the vectorizer
        min_df       -- min_df setting of the vectorizer
        max_df       -- max_df setting of the vectorizer
        max_features -- max_features setting of the vectorizer
        cats         -- the categories; their number determines how many averaged scores are returned

        Returns [[ngram_min, ngram_max, min_df, max_df, max_features], averages], where averages holds
        the per-class F1 score averaged over the folds, ordered like the sorted distinct labels of y.
    """
    
    scores = []
    k = 5
    
    # Fixed class list for every fold: without it f1_score only reports the classes that occur in a
    # fold's validation labels or predictions, so a fold lacking a class would yield a shorter,
    # shifted score vector and corrupt the per-class averages
    class_labels = sorted(set(y))
        
    # Run cross-validation and calculate scores
    kf = StratifiedKFold(n_splits=k)
    for train, valid in kf.split(X, y):
        X_train_t = [X[i] for i in train]
        y_train_t = [y[i] for i in train]
        X_valid_t = [X[i] for i in valid]
        y_valid_t = [y[i] for i in valid]
        
        # Select appropriate vectorizer
        if v_type == 'count':
            vectorizer = CountVectorizer(analyzer='word', ngram_range=(ngram_min, ngram_max), min_df=min_df,
                                         max_df=max_df, max_features=max_features)
        else:
            vectorizer = TfidfVectorizer(ngram_range=(ngram_min, ngram_max), min_df=min_df, max_df=max_df,
                                         max_features=max_features, sublinear_tf=True)
    
        # Fit the vocabulary on the training part only; the validation part is merely transformed
        X_train_t = vectorizer.fit_transform(X_train_t)
        X_valid_t = vectorizer.transform(X_valid_t)
        
        # Create and fit SVM
        clf = svm.SVC(kernel='linear', C=1, class_weight='balanced')
        clf.fit(X_train_t, y_train_t)
        pred = clf.predict(X_valid_t)
        
        # One F1 score per class; a class without samples and predictions in this fold scores 0
        scores.append(list(f1_score(y_valid_t, pred, labels=class_labels, average=None, zero_division=0)))
    
    # Return the configuration of the model together with the average of scores
    return [[ngram_min, ngram_max, min_df, max_df, max_features], calc_avg_scores(scores, cats, k)]

In [83]:
def test_configurations(v_type, test_validation, tweets, labels):
    """ Try every vectorizer configuration of a fixed parameter grid with cross validation on the
        training split and rank the configurations by their lowest per-class F1 score.

        Parameters:
        v_type          -- 'count' selects CountVectorizer, any other value selects TfidfVectorizer
        test_validation -- when true, every configuration that ties for the best ranking is retrained
                           on the whole training split and a classification report on the held-out
                           test split is printed for it
        tweets          -- list of Tweet texts
        labels          -- list of category labels, aligned with tweets

        Prints the progress, optionally the reports, and finally all [configuration, scores] pairs
        sorted from best to worst. Returns nothing.
    """
    
    cats = list(set(labels))
    
    # Parameter grid; the content (3 categories) and activation categorizations currently share it
    ngrams = [1, 2]
    min_dfs = [1, 2, 3, 4, 5]
    max_dfs = [0.05, 0.1, 0.2, 0.4, 0.6, 0.8, 1.0]
    max_features = [None, 1000, 2000, 3000]
    
    # Split data set into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(tweets, labels, test_size=0.3, random_state=0)

    # Encode labels: fit the encoder once on all labels and only transform the splits, so training
    # and test labels share one mapping even when a split lacks one of the categories
    Encoder.fit(labels)
    y_train = Encoder.transform(y_train)
    y_test = Encoder.transform(y_test)
    
    # Only n-gram ranges with min <= max are valid; enumerate them so the progress total is exact
    # for any number of n-gram sizes
    ngram_ranges = [(low, high) for low in ngrams for high in ngrams if low <= high]
    
    scores = []
    i = 0
    i_max = len(ngram_ranges) * len(min_dfs) * len(max_dfs) * len(max_features)
    
    # Combine each parameter and run the model with cross validation
    for ngram_min, ngram_max in ngram_ranges:
        for min_df in min_dfs:
            for max_df in max_dfs:
                for max_feature in max_features:
                    i += 1
                    print('Progress: %s/%s configurations' % (i, i_max), end='\r')
                    
                    # Call the function for the model and append the score
                    scores.append(cross_val(v_type, X_train, y_train, ngram_min, ngram_max, min_df,
                                            max_df, max_feature, cats))
    
    # Sort descending by the lowest per-class F1 score of each configuration
    scores.sort(key=keysort, reverse=True)
    
    if test_validation:
        # Select the best configurations: all that tie with the top-ranked one
        best_score = keysort(scores[0])
        best_configs = [entry for entry in scores if keysort(entry) == best_score]
        
        # Report every encoded class under its original name, in encoder order
        class_ids = list(range(len(Encoder.classes_)))
        class_names = [str(name) for name in Encoder.classes_]

        # Use the best configurations to validate model on the test set
        for config, score in best_configs:
            print([config, score])
            
            # Select appropriate vectorizer
            if v_type == 'count':
                vectorizer = CountVectorizer(analyzer='word', ngram_range=(config[0], config[1]),
                                             min_df=config[2], max_df=config[3], max_features=config[4])
            else:
                vectorizer = TfidfVectorizer(ngram_range=(config[0], config[1]), min_df=config[2],
                                             max_df=config[3], max_features=config[4], sublinear_tf=True)
            
            # Train with vectorized features
            X_train_t = vectorizer.fit_transform(X_train)
            X_test_t = vectorizer.transform(X_test)
            clf = svm.SVC(kernel='linear', C=1, class_weight='balanced')
            clf.fit(X_train_t, y_train)
            
            # Predict and compare with test set
            pred = clf.predict(X_test_t)
            print(classification_report(y_test, pred, labels=class_ids, target_names=class_names,
                                        zero_division=0))

    print(scores)

In [84]:
# Load all preprocessed Tweets and keep the entries whose memo marks them as gold standard
tweets_all = read_tweets('source/tweets_all_preprocessed_exc_stopwords.txt')
tweets_gs = [entry for entry in tweets_all if entry['memo'] == 'gold_standard']

# Texts plus the two label sets stored per Tweet ('cat_con' and 'cat_act')
tweets_text = [entry['text'] for entry in tweets_gs]
labels_con = [entry['cat_con'] for entry in tweets_gs]
labels_act = [entry['cat_act'] for entry in tweets_gs]

# Alternative run, count vectorizer on the 'cat_con' labels:
#test_configurations('count', True, tweets_text, labels_con)
test_configurations(v_type='tfidf', test_validation=True, tweets=tweets_text, labels=labels_act)

Total Tweets read: 4664

[[1, 2, 5, 0.6, None], [0.47333333333333333, 0.6013416149068324, 0.8028395727433006, 0.5575504404660415]]
              precision    recall  f1-score   support

         CON       0.71      0.83      0.77         6
         FOL       0.74      0.52      0.61        27
        NONE       0.88      0.91      0.89        93
         SUP       0.59      0.71      0.65        14

    accuracy                           0.81       140
   macro avg       0.73      0.75      0.73       140
weighted avg       0.81      0.81      0.81       140

[[1, 2, 5, 0.6, 1000], [0.47333333333333333, 0.6013416149068324, 0.8028395727433006, 0.5575504404660415]]
              precision    recall  f1-score   support

         CON       0.71      0.83      0.77         6
         FOL       0.74      0.52      0.61        27
        NONE       0.88      0.91      0.89        93
         SUP       0.59      0.71      0.65        14

    accuracy                           0.81       140
  