# MachineLearning.ipynb

### This notebook contains the feature generation, supervised machine learning and validation for automatic classification of Tweets.

Author: Erik Puijk <br>
Date  : March 28, 2022

In [198]:
pip install -U scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.0.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.7 MB)
[K     |████████████████████████████████| 26.7 MB 4.7 MB/s eta 0:00:01
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 0.23.2
    Uninstalling scikit-learn-0.23.2:
      Successfully uninstalled scikit-learn-0.23.2
Successfully installed scikit-learn-1.0.2
Note: you may need to restart the kernel to use updated packages.


In [45]:
import json
import csv
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score, classification_report, accuracy_score, make_scorer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import svm
from sklearn import metrics

In [91]:
# Module-level label encoder shared by the cells below; test_configurations
# uses it to convert the category labels of the Tweets into integer classes.
Encoder = LabelEncoder()

In [92]:
# https://scikit-learn.org/stable/modules/svm.html
# https://machinelearningmastery.com/overfitting-and-underfitting-with-machine-learning-algorithms/
# https://vitalflux.com/hold-out-method-for-training-machine-learning-model/
# https://towardsdatascience.com/cross-validation-in-machine-learning-72924a69872f
# https://scikit-learn.org/stable/modules/cross_validation.html
# https://medium.com/@bedigunjit/simple-guide-to-text-classification-nlp-using-svm-and-naive-bayes-with-python-421db3a72d34

In [93]:
def read_tweets(path):
    """ Read the Tweets from a given text file and return the decoded JSON content.

        path: location of a text file that holds a single JSON document.

        Returns the object decoded from the file. When the file cannot be opened
        or read, "I/O error" is printed and an empty string is returned instead.
        The number of items read is always printed. """

    content = ""

    try:
        # Tweets regularly contain emoji and accented characters. JSON text is
        # UTF-8, so decode it explicitly rather than relying on the platform
        # default encoding, which differs between systems.
        with open(path, 'r', encoding='utf-8') as f:
            content = json.load(f)
    except IOError:
        print("I/O error")

    print("Total Tweets read: %s\n" % (len(content)))

    return content

In [94]:
def write_tweets(tweets_w, path):
    """ Serialise the given Tweets as JSON and store them in a text file.

        tweets_w: the JSON-serialisable object to store.
        path: location of the file, which is created or overwritten.

        An IOError is reported on stdout instead of being raised. """

    try:
        with open(path, mode='w') as out_file:
            json.dump(tweets_w, fp=out_file)
    except IOError:
        print("I/O error")

In [95]:
def stats_count(cat):
    """ Count the occurrences of each category in a list of categories and print them.

        cat: iterable of hashable category labels.

        Prints a header line, then one "<category>: <count>" line per distinct
        category in order of first appearance, followed by an empty line. """

    # Local import keeps this cell self-contained.
    from collections import Counter

    # A single pass over the labels instead of one full .count() scan per
    # distinct category; Counter also keeps a deterministic (insertion) order.
    occurrences = Counter(cat)

    print('Number of training data')

    for category, amount in occurrences.items():
        print('%s: %s' % (category, amount))

    print('')

In [172]:
def get_score(y_test, pred):
    """ Return the micro-averaged F1 score of the predictions against the true labels.

        y_test: the true labels.
        pred: the labels predicted by the model.

        zero_division=0 is passed through to sklearn's f1_score. """

    return f1_score(
        y_test,
        pred,
        average='micro',
        zero_division=0,
    )

In [173]:
def calc_avg_score(scores):
    """ Calculate an average score given a collection of scores.

        scores: iterable of numbers (a list, a NumPy array, a generator, ...).

        Returns the arithmetic mean of the scores.
        Raises ZeroDivisionError when no scores are given. """

    # Materialise once so that generators and arrays are handled uniformly.
    values = list(scores)

    if not values:
        # Same exception type an empty list has always produced, but with a
        # message that points at the actual problem.
        raise ZeroDivisionError("cannot average an empty collection of scores")

    return sum(values) / len(values)

In [197]:
def count_vectorize(X_train, y_train, ngram_min, ngram_max, min_df, max_df, max_features):
    """ Perform the count vectorization on the training set and run 5-fold cross validation on the
        training set to score this configuration.

        X_train: the raw Tweet texts of the training set.
        y_train: the encoded category labels belonging to X_train.
        ngram_min, ngram_max: lower and upper bound of the word n-gram range.
        min_df, max_df: document-frequency cut-offs handed to the vectorizer.
        max_features: cap on the vocabulary size (None for no cap).

        Returns [[ngram_min, ngram_max, min_df, max_df, max_features], average score], where the
        score is the mean of get_score over the five folds. """

    # Transform training set. max_df is handed to the vectorizer as well, so the reported
    # configuration is the one that was actually evaluated.
    # NOTE(review): the vocabulary is fitted on the whole training set before the folds are made,
    # so each validation fold has influenced the features - consider a Pipeline inside the CV.
    vectorizer = CountVectorizer(analyzer='word', ngram_range=(ngram_min, ngram_max), min_df=min_df,
                                 max_df=max_df, max_features=max_features)
    X_train = vectorizer.fit_transform(X_train)

    # Create SVM
    clf = svm.SVC(kernel='linear', C=1)

    # Run cross-validation and calculate scores
    scores = cross_val_score(clf, X=X_train, y=y_train, cv=5, scoring=make_scorer(get_score))

    # Return the configuration of the model together with the average of scores
    return [[ngram_min, ngram_max, min_df, max_df, max_features], calc_avg_score(scores)]

In [213]:
def tfidf_vectorize(X_train, y_train, ngram_min, ngram_max, min_df, max_df, max_features):
    """ Turn the training Tweets into TF-IDF features and score a linear SVM on them with 5-fold
        cross validation.

        X_train: the raw Tweet texts of the training set.
        y_train: the encoded category labels belonging to X_train.
        ngram_min, ngram_max: lower and upper bound of the n-gram range.
        min_df, max_df: document-frequency cut-offs handed to the vectorizer.
        max_features: cap on the vocabulary size (None for no cap).

        Returns [[ngram_min, ngram_max, min_df, max_df, max_features], average score], where the
        score is the mean of get_score over the five folds. """

    configuration = [ngram_min, ngram_max, min_df, max_df, max_features]

    # Build the TF-IDF representation of the training set
    tfidf = TfidfVectorizer(
        ngram_range=(ngram_min, ngram_max),
        min_df=min_df,
        max_df=max_df,
        max_features=max_features,
    )
    features = tfidf.fit_transform(X_train)

    # Linear SVM, evaluated per fold with the scorer built around get_score
    classifier = svm.SVC(kernel='linear', C=1)
    fold_scores = cross_val_score(
        classifier,
        X=features,
        y=y_train,
        cv=5,
        scoring=make_scorer(get_score),
    )

    # Pair the configuration with its mean score over the folds
    return [configuration, calc_avg_score(fold_scores)]

In [226]:
def test_configurations(tweets, cat):
    """ Test different configurations for different models and compare the micro-f1 scores to select the best
        model.

        tweets: the Tweet texts.
        cat: the category label of each Tweet, in the same order as tweets.

        Every combination of n-gram range, min_df, max_df and max_features is scored with
        tfidf_vectorize on the training split. The results are printed as a list of
        [configuration, score] pairs, best score first. Nothing is returned. """

    # Define model parameters
    ngrams = [1, 2, 3]
    min_dfs = [1, 2, 3, 4, 5]
    max_dfs = [0.2, 0.4, 0.6, 0.8, 1.0]
    max_features = [None, 1000, 2000, 3000]

    # Only ranges whose lower bound does not exceed the upper bound are valid
    ngram_ranges = [(low, high) for low in ngrams for high in ngrams if low <= high]

    # Split data set into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(tweets, cat, test_size=0.3, random_state=0)

    #stats_count(list(y_train))

    # Encode labels. The encoder is fitted once on all labels, so that the training and testing
    # sets share one label-to-integer mapping; refitting it on the test labels could assign
    # different integers whenever a category is missing from one of the two splits.
    Encoder.fit(cat)
    y_train = Encoder.transform(y_train)
    y_test = Encoder.transform(y_test)

    scores = []
    i = 0
    # Derived from the ranges that are really evaluated, so the progress total stays right
    # when the parameter lists above are changed.
    i_max = len(ngram_ranges) * len(min_dfs) * len(max_dfs) * len(max_features)

    # Combine each parameter and run the model with cross validation
    for ngram_min, ngram_max in ngram_ranges:
        for min_df in min_dfs:
            for max_df in max_dfs:
                for max_feature in max_features:
                    i += 1
                    print('Progress: %s/%s configurations' % (i, i_max), end='\r')

                    # Call the function for the model and append the score
                    scores.append(tfidf_vectorize(X_train, y_train, ngram_min, ngram_max, min_df, max_df, max_feature))

    # Sort the scores descending and print them
    scores.sort(key=lambda x: x[1], reverse=True)
    print(scores)

    # TODO: final evaluation on the held-out test set (not wired up yet; needs a fitted vectorizer
    # and the vectorized training set in this scope).
    #X_test = vectorizer.transform(X_test)
    #clf = svm.SVC(kernel='linear', C=1)
    #clf.fit(X_train, y_train)
    #pred = clf.predict(X_test)
    #print(classification_report(y_test, pred, target_names=Encoder.inverse_transform(list(set(y_test))), zero_division=0))


In [227]:
tweets_r = read_tweets('source/tweets_all_preprocessed.txt')

# Select the gold-standard Tweets once, then pull the individual fields out of them
gold_standard = [tweet_r for tweet_r in tweets_r if tweet_r['memo'] == 'gold_standard']

tweets_tokens = [entry['text'].split() for entry in gold_standard]
tweets = [entry['text'] for entry in gold_standard]
cat_con = [entry['cat_con'] for entry in gold_standard]
cat_act = [entry['cat_act'] for entry in gold_standard]

# Evaluate the configurations on the 'cat_con' labels; swap in the line below for 'cat_act'
test_configurations(tweets, cat_con)
#test_configurations(tweets, cat_act)

Total Tweets read: 4664

[[[1, 1, 3, 0.2, None], 0.8037762237762237], [[1, 1, 3, 0.2, 1000], 0.8037762237762237], [[1, 1, 3, 0.2, 2000], 0.8037762237762237], [[1, 1, 3, 0.2, 3000], 0.8037762237762237], [[1, 1, 3, 0.4, None], 0.8007459207459208], [[1, 1, 3, 0.4, 1000], 0.8007459207459208], [[1, 1, 3, 0.4, 2000], 0.8007459207459208], [[1, 1, 3, 0.4, 3000], 0.8007459207459208], [[1, 1, 1, 0.2, 1000], 0.8006060606060605], [[1, 1, 2, 0.2, None], 0.8006060606060605], [[1, 1, 2, 0.2, 1000], 0.8006060606060605], [[1, 1, 2, 0.2, 2000], 0.8006060606060605], [[1, 1, 2, 0.2, 3000], 0.8006060606060605], [[1, 1, 3, 0.6, None], 0.7976223776223776], [[1, 1, 3, 0.6, 1000], 0.7976223776223776], [[1, 1, 3, 0.6, 2000], 0.7976223776223776], [[1, 1, 3, 0.6, 3000], 0.7976223776223776], [[1, 1, 3, 0.8, None], 0.7976223776223776], [[1, 1, 3, 0.8, 1000], 0.7976223776223776], [[1, 1, 3, 0.8, 2000], 0.7976223776223776], [[1, 1, 3, 0.8, 3000], 0.7976223776223776], [[1, 1, 3, 1.0, None], 0.7976223776223776], [[1, 1