# MachineLearning.ipynb

### This notebook contains the feature generation, supervised machine learning and validation for automatic classification of Tweets.

Author: Erik Puijk <br>
Date  : March 28, 2022

In [198]:
pip install -U scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.0.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.7 MB)
[K     |████████████████████████████████| 26.7 MB 4.7 MB/s eta 0:00:01
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 0.23.2
    Uninstalling scikit-learn-0.23.2:
      Successfully uninstalled scikit-learn-0.23.2
Successfully installed scikit-learn-1.0.2
Note: you may need to restart the kernel to use updated packages.


In [48]:
import csv
import json
import math
from collections import Counter

import numpy as np
from sklearn import metrics
from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import f1_score, classification_report, accuracy_score, make_scorer, cohen_kappa_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder

In [2]:
# Shared label encoder; test_configurations uses it to turn the category labels
# into integer codes and to map those codes back to names for the report
Encoder = LabelEncoder()

In [3]:
# https://scikit-learn.org/stable/modules/svm.html
# https://machinelearningmastery.com/overfitting-and-underfitting-with-machine-learning-algorithms/
# https://vitalflux.com/hold-out-method-for-training-machine-learning-model/
# https://towardsdatascience.com/cross-validation-in-machine-learning-72924a69872f
# https://scikit-learn.org/stable/modules/cross_validation.html
# https://medium.com/@bedigunjit/simple-guide-to-text-classification-nlp-using-svm-and-naive-bayes-with-python-421db3a72d34

In [4]:
def read_tweets(path):
    """ Read the Tweets from a given JSON text file and return the parsed content.

        path: location of a file that holds one JSON document.

        Returns the decoded JSON content. When the file cannot be opened or read,
        "I/O error" is printed and an empty list is returned, so callers can
        iterate over the result either way. Invalid JSON still raises.
    """
    
    # Default to an empty list (not an empty string) so the failure path has the
    # same sequence type as the callers in this notebook iterate over
    content = []
    
    try:
        # JSON text is UTF-8; do not depend on the locale's default encoding
        with open(path, 'r', encoding='utf-8') as f:
            content = json.load(f)
    except IOError:
        print("I/O error")
        
    print("Total Tweets read: %s\n" % (len(content)))

    return content

In [5]:
def write_tweets(tweets_w, path):
    """ Write obtained Tweets to a text file in JSON-format.

        tweets_w: JSON-serializable object to store.
        path:     location of the file to (over)write.

        Prints "I/O error" when the file cannot be written. Objects that cannot
        be serialized raise the json module's error, and the existing file is
        left untouched in that case.
    """
    
    # Serialize first: opening with 'w' truncates the file, so a serialization
    # failure has to surface before the existing content is thrown away
    serialized = json.dumps(tweets_w)
    
    try:
        with open(path, 'w', encoding='utf-8') as f:
            f.write(serialized)
    except IOError:
        print("I/O error")

In [6]:
def stats_count(cat):
    """ Count the occurrences of each category in a collection of categories and print them.

        cat: iterable of hashable category labels.

        Prints a header line, one "<category>: <count>" line per distinct
        category (in order of first appearance) and a closing empty line.
    """
    
    # Counter tallies everything in a single pass and remembers the order in
    # which categories were first seen, so the printed order is reproducible
    occurrences = Counter(cat)
    
    print('Number of training data')
    
    for category, count in occurrences.items():
        print('%s: %s' % (category, count))
        
    print('')

In [92]:
def get_score(y_test, pred):
    """ Score predictions against the true labels with the micro-averaged f1-score.

        y_test: the true labels.
        pred:   the predicted labels.

        Returns the value computed by f1_score.
    """
    
    # Alternative metric that can be swapped in: cohen_kappa_score(y_test, pred)
    return f1_score(y_true=y_test, y_pred=pred, average='micro', zero_division=0)

In [93]:
def calc_avg_score(scores):
    """ Return the arithmetic mean of a sized collection of scores.

        An empty collection results in a ZeroDivisionError.
    """
    
    n_scores = len(scores)
    
    # Plain left-to-right accumulation starting at 0
    running_sum = 0
    for value in scores:
        running_sum = running_sum + value
    
    return running_sum / n_scores

In [94]:
def cross_val(v_type, X_train, y_train, ngram_min, ngram_max, min_df, max_df, max_features):
    """ Evaluate one vectorizer configuration with 5-fold cross validation on the training set.

        The count/tfidf vectorizer and the linear SVM are chained in a pipeline, so
        in every fold the vectorizer is fitted on the training part of that fold
        only. Fitting it once on the complete training set would let vocabulary and
        document-frequency statistics of the validation part leak into the model.

        v_type:                'count' selects CountVectorizer, any other value
                               selects TfidfVectorizer.
        X_train:               the raw training texts.
        y_train:               the training labels.
        ngram_min, ngram_max:  bounds of the vectorizer's ngram_range.
        min_df, max_df:        document-frequency limits passed to the vectorizer.
        max_features:          vocabulary size limit passed to the vectorizer.

        Returns [[ngram_min, ngram_max, min_df, max_df, max_features], average score].
    """
    
    ngram_range = (ngram_min, ngram_max)
    
    # Select appropriate vectorizer
    if v_type == 'count':
        vectorizer = CountVectorizer(analyzer='word', ngram_range=ngram_range, min_df=min_df, max_df=max_df, max_features=max_features)
    else:
        vectorizer = TfidfVectorizer(ngram_range=ngram_range, min_df=min_df, max_df=max_df, max_features=max_features)
    
    # Create SVM and chain it behind the vectorizer
    clf = svm.SVC(kernel='linear', C=1, class_weight=None)
    model = make_pipeline(vectorizer, clf)
        
    # Run cross-validation and calculate scores. error_score='raise' keeps a failing
    # fold loud; the default would put NaN into the average and upset the ranking
    scores = cross_val_score(model, X=X_train, y=y_train, cv=5, scoring=make_scorer(get_score), error_score='raise')
    
    # Return the configuration of the model together with the average of scores
    return [[ngram_min, ngram_max, min_df, max_df, max_features], calc_avg_score(scores)]

In [104]:
def test_configurations(v_type, test_validation, tweets, cat):
    """ Test different configurations for different models and compare the micro-f1 scores to select the best
        model.

        v_type:          'count' selects CountVectorizer, any other value selects
                         TfidfVectorizer.
        test_validation: when true, every configuration that reaches the best
                         cross-validation score is additionally trained on the
                         training set and reported on the held-out test set.
        tweets:          the texts to classify.
        cat:             the category label of each text.

        Prints the progress, the optional classification reports and finally the
        list of [configuration, average score] pairs sorted from best to worst.
        The module-level Encoder is (re)fitted on the labels in cat.
    """
    
    # Define model parameters according to which categorization; only the n-gram
    # sizes differ, the rest of the grid is shared
    if len(set(cat)) == 3:
        # Content
        ngrams = [1, 2]
    else:
        # Activation
        ngrams = [1, 2, 3]
    min_dfs = [1, 2, 3, 4, 5]
    max_dfs = [0.05, 0.1, 0.2, 0.4, 0.6, 0.8, 1.0]
    max_features = [None, 1000, 2000, 3000]
    
    # Split data set into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(tweets, cat, test_size=0.3, random_state=0)
    
    #stats_count(list(y_train))

    # Encode labels. The encoder is fitted once on all labels so that training and
    # test labels share a single label -> code mapping; refitting it on the test
    # labels could assign different codes when a category is missing from a subset
    Encoder.fit(cat)
    y_train = Encoder.transform(y_train)
    y_test = Encoder.transform(y_test)
    
    # Enumerate every valid parameter combination up front (ngram_min <= ngram_max),
    # which also gives the exact total for the progress indicator
    configurations = [
        (ngram_min, ngram_max, min_df, max_df, max_feature)
        for ngram_min in ngrams
        for ngram_max in ngrams
        if ngram_min <= ngram_max
        for min_df in min_dfs
        for max_df in max_dfs
        for max_feature in max_features
    ]
    i_max = len(configurations)
    
    # Run the model with cross validation for each combination
    scores = []
    for i, configuration in enumerate(configurations, start=1):
        print('Progress: %s/%s configurations' % (i, i_max), end='\r')
        
        # Call the function for the model and append the score
        scores.append(cross_val(v_type, X_train, y_train, *configuration))
    
    # Terminate the '\r' progress line so later output does not overwrite it
    print()
    
    # Sort the scores descending
    scores.sort(key=lambda x: x[1], reverse=True)
    
    if test_validation:
        # Select the best configurations (all that tie with the top score)
        best_score = scores[0][1]
        best_configs = [config for config, score in scores if score == best_score]

        # Report on every known category, so the names always line up with the
        # label codes even when a category is absent from the test labels
        report_labels = list(range(len(Encoder.classes_)))
        report_names = [str(name) for name in Encoder.classes_]

        # Use the best configurations to validate model on the test set
        for config in best_configs:
            print(config)
            ngram_min, ngram_max, min_df, max_df, max_feature = config
            
            # Select appropriate vectorizer
            if v_type == 'count':
                vectorizer = CountVectorizer(analyzer='word', ngram_range=(ngram_min, ngram_max), min_df=min_df, max_df=max_df, max_features=max_feature)
            else:
                vectorizer = TfidfVectorizer(ngram_range=(ngram_min, ngram_max), min_df=min_df, max_df=max_df, max_features=max_feature)
            
            # Train with vectorized features; the test set is only transformed
            X_train_t = vectorizer.fit_transform(X_train)
            X_test_t = vectorizer.transform(X_test)
            clf = svm.SVC(kernel='linear', C=1, class_weight=None)
            clf.fit(X_train_t, y_train)
            
            # Predict and compare with test set
            pred = clf.predict(X_test_t)
            print(classification_report(y_test, pred, labels=report_labels, target_names=report_names, zero_division=0))

    print(scores)

In [105]:
# Load all preprocessed Tweets and keep the entries whose memo is 'gold_standard'
tweets_r = read_tweets('source/tweets_all_preprocessed.txt')
gold_standard = [tweet_r for tweet_r in tweets_r if tweet_r['memo'] == 'gold_standard']

# Parallel lists: text, whitespace-split tokens and both category labels per Tweet
tweets = [entry['text'] for entry in gold_standard]
tweets_tokens = [text.split() for text in tweets]
cat_con = [entry['cat_con'] for entry in gold_standard]
cat_act = [entry['cat_act'] for entry in gold_standard]

# Search the configurations for one categorization at a time
test_configurations('count', True, tweets, cat_con)
#test_configurations('tfidf', True, tweets, cat_act)

Total Tweets read: 4664

[1, 2, 2, 0.4, None]nfigurations
              precision    recall  f1-score   support

         CAM       0.64      0.72      0.68        32
         POL       0.85      0.91      0.88        95
         SOC       0.67      0.15      0.25        13

    accuracy                           0.79       140
   macro avg       0.72      0.59      0.60       140
weighted avg       0.79      0.79      0.77       140

[1, 2, 2, 0.4, 1000]
              precision    recall  f1-score   support

         CAM       0.64      0.72      0.68        32
         POL       0.85      0.91      0.88        95
         SOC       0.67      0.15      0.25        13

    accuracy                           0.79       140
   macro avg       0.72      0.59      0.60       140
weighted avg       0.79      0.79      0.77       140

[1, 2, 2, 0.4, 2000]
              precision    recall  f1-score   support

         CAM       0.64      0.72      0.68        32
         POL       0.85      